I am doing StandardScaler normalization to my dataframe but it is inserting NaN values into some of the rows
Main Function
games = prepare_data()
train_size = int(0.7 * len(games))
test_size = len(games) - train_size
train_set = games.iloc[:train_size]
test_set = games.iloc[-test_size:]
X_train = train_set.iloc[:,:-1]
y_train = train_set.iloc[:,-1]
X_test = test_set.iloc[:,:-1]
y_test = test_set.iloc[:,-1]
train_set_normalized = normalize_data(train_set,'HOME_TEAM_WINS')
prepare_data
def prepare_data():
#Import data from pkl files
games = pd.read_pickle('cleaned_data/merged.pkl')
teams = pd.read_pickle('cleaned_data/teams.pkl')
#Add foundation year of home team
games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left')
games['FOUNDATION_HOME'] = games['YEARFOUNDED']
games.drop('TEAM_ID', axis=1, inplace=True)
games.drop('YEARFOUNDED', axis=1, inplace=True)
#Add foundation year of away team
games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='VISITOR_TEAM_ID', right_on='TEAM_ID', how='left')
games['FOUNDATION_VISITOR'] = games['YEARFOUNDED']
games.drop('TEAM_ID', axis=1, inplace=True)
games.drop('YEARFOUNDED', axis=1, inplace=True)
games = games[games['SEASON'] != 2003]
#Remove IDs and redundant columns
# Lista das colunas que você deseja remover
remove = [
'GAME_DATE_EST',
'GAME_ID',
'HOME_TEAM_ID',
'VISITOR_TEAM_ID',
'SEASON',
'HOME_TEAM_REGULAR_SEASON_GAMES',
'HOME_TEAM_REGULAR_SEASON_TOTAL_WINS',
'HOME_TEAM_REGULAR_SEASON_TOTAL_LOSSES',
'HOME_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE',
'HOME_TEAM_REGULAR_SEASON_WINS_AT_HOME',
'HOME_TEAM_REGULAR_SEASON_LOSSES_AT_HOME',
'HOME_TEAM_REGULAR_SEASON_WINS_AWAY',
'HOME_TEAM_REGULAR_SEASON_AWAY_LOSSES',
'STANDINGSDATE',
'AWAY_TEAM_REGULAR_SEASON_GAMES',
'AWAY_TEAM_REGULAR_SEASON_TOTAL_WINS',
'AWAY_TEAM_REGULAR_SEASON_TOTAL_LOSSES',
'AWAY_TEAM_REGULAR_SEASON_WINNING_PERCENTAGE',
'AWAY_TEAM_REGULAR_SEASON_WINS_AT_HOME',
'AWAY_TEAM_REGULAR_SEASON_LOSSES_AT_HOME',
'AWAY_TEAM_REGULAR_SEASON_WINS_AWAY',
'AWAY_TEAM_REGULAR_SEASON_AWAY_LOSSES'
]
games = games.drop(columns=remove)
#HOME_TEAM_WINS TO LAST COLUMN
target = games.pop('HOME_TEAM_WINS')
games.insert(len(games.columns), 'HOME_TEAM_WINS', target)
print(games.head(50))
print("HOME TEAM WINS VALUE COUNTS:")
print(games['HOME_TEAM_WINS'].value_counts())
games.to_csv('2.csv', index=False)
return games
normalize_data
def normalize_data(data, target_column):
result = data.isna().sum()
result.to_csv('nan_counts.csv', header=['NaN_count'])
data_normalized = data.copy()
subset = data_normalized[[
'HOME_TEAM_TOTAL_GAMES',
'AWAY_TEAM_TOTAL_GAMES',
'HOME_TEAM_TOTAL_WINS',
'AWAY_TEAM_TOTAL_WINS',
'HOME_TEAM_TOTAL_LOSSES',
'AWAY_TEAM_TOTAL_LOSSES',
'HOME_TEAM_WIN_PERCENTAGE',
'AWAY_TEAM_WIN_PERCENTAGE',
'HOME_TEAM_TOTAL_GAMES_AT_HOME',
'AWAY_TEAM_TOTAL_GAMES_AWAY',
'HOME_TEAM_TOTAL_WINS_AT_HOME',
'AWAY_TEAM_TOTAL_WINS_AWAY',
'HOME_TEAM_TOTAL_LOSSES_AT_HOME',
'AWAY_TEAM_TOTAL_LOSSES_AWAY',
'HOME_TEAM_WIN_PERCENTAGE_AT_HOME',
'AWAY_TEAM_WIN_PERCENTAGE_AWAY',
'HOME_TEAM_WIN/LOSS_STREAK',
'VISITOR_TEAM_WIN/LOSS_STREAK',
'HOME_TEAM_WIN/LOSS_STREAK_AT_HOME',
'VISITOR_TEAM_WIN/LOSS_STREAK_AWAY',
'AVG_POINTS_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'FOUNDATION_HOME',
'FOUNDATION_VISITOR'
]]
scaler = StandardScaler()
scaler.fit(subset)
scaled_data = scaler.transform(subset)
scaled_df = pd.DataFrame(scaled_data, columns=[
'HOME_TEAM_TOTAL_GAMES',
'AWAY_TEAM_TOTAL_GAMES',
'HOME_TEAM_TOTAL_WINS',
'AWAY_TEAM_TOTAL_WINS',
'HOME_TEAM_TOTAL_LOSSES',
'AWAY_TEAM_TOTAL_LOSSES',
'HOME_TEAM_WIN_PERCENTAGE',
'AWAY_TEAM_WIN_PERCENTAGE',
'HOME_TEAM_TOTAL_GAMES_AT_HOME',
'AWAY_TEAM_TOTAL_GAMES_AWAY',
'HOME_TEAM_TOTAL_WINS_AT_HOME',
'AWAY_TEAM_TOTAL_WINS_AWAY',
'HOME_TEAM_TOTAL_LOSSES_AT_HOME',
'AWAY_TEAM_TOTAL_LOSSES_AWAY',
'HOME_TEAM_WIN_PERCENTAGE_AT_HOME',
'AWAY_TEAM_WIN_PERCENTAGE_AWAY',
'HOME_TEAM_WIN/LOSS_STREAK',
'VISITOR_TEAM_WIN/LOSS_STREAK',
'HOME_TEAM_WIN/LOSS_STREAK_AT_HOME',
'VISITOR_TEAM_WIN/LOSS_STREAK_AWAY',
'AVG_POINTS_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'FOUNDATION_HOME',
'FOUNDATION_VISITOR'
])
data_normalized[[
'HOME_TEAM_TOTAL_GAMES',
'AWAY_TEAM_TOTAL_GAMES',
'HOME_TEAM_TOTAL_WINS',
'AWAY_TEAM_TOTAL_WINS',
'HOME_TEAM_TOTAL_LOSSES',
'AWAY_TEAM_TOTAL_LOSSES',
'HOME_TEAM_WIN_PERCENTAGE',
'AWAY_TEAM_WIN_PERCENTAGE',
'HOME_TEAM_TOTAL_GAMES_AT_HOME',
'AWAY_TEAM_TOTAL_GAMES_AWAY',
'HOME_TEAM_TOTAL_WINS_AT_HOME',
'AWAY_TEAM_TOTAL_WINS_AWAY',
'HOME_TEAM_TOTAL_LOSSES_AT_HOME',
'AWAY_TEAM_TOTAL_LOSSES_AWAY',
'HOME_TEAM_WIN_PERCENTAGE_AT_HOME',
'AWAY_TEAM_WIN_PERCENTAGE_AWAY',
'HOME_TEAM_WIN/LOSS_STREAK',
'VISITOR_TEAM_WIN/LOSS_STREAK',
'HOME_TEAM_WIN/LOSS_STREAK_AT_HOME',
'VISITOR_TEAM_WIN/LOSS_STREAK_AWAY',
'AVG_POINTS_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_VISITOR_TEAM',
'AVG_POINTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_REB_LAST_4_GAMES_AT_HOME_HOME_TEAM',
'AVG_POINTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_POINTS_CONCEDED_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_ASSISTS_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FGPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FTPCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_FG3PCT_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'AVG_REB_LAST_4_GAMES_AWAY_VISITOR_TEAM',
'FOUNDATION_HOME',
'FOUNDATION_VISITOR'
]] = scaled_df
result = data_normalized.isna().sum()
result.to_csv('nan_counts2.csv', header=['NaN_count'])
return data_normalized
I checked the csv file that is generate in the beginning of the normalize_data
function and there are no NaN, but in the csv file at the end of the function there are around 7000 lines with NaN values.
Am I doing some mistake when I am normalizing my data?
Also, I tried deleting the rows where Season is 2003 before merging the two dataframes and the problem doesn’t happen.
#Import data from pkl files
games = pd.read_pickle('cleaned_data/merged.pkl')
teams = pd.read_pickle('cleaned_data/teams.pkl')
games = games[games['SEASON'] != 2003]
#Add foundation year of home team
games = games.merge(teams[['TEAM_ID', 'YEARFOUNDED']], left_on='HOME_TEAM_ID', right_on='TEAM_ID', how='left')
games['FOUNDATION_HOME'] = games['YEARFOUNDED']
games.drop('TEAM_ID', axis=1, inplace=True)
games.drop('YEARFOUNDED', axis=1, inplace=True)
I runned both versions while saving the resultant games dataframe to a csv file, and when comparing them I reached the conclusion that they are equal.
What can be causing this problem?