I am trying to write a Python script which seeks to replace zero values within a csv file, parsing it with a set of rules:
- The first function seeks to take an average of rows with matching values in Cluster, Group, and Date
- Second function parses it again, this time grouping by Cluster, Group, and Period
- Third function instead takes a row by row approach with respect to Area Code and Group, whereby wherever a zero value is found in Values, the script looks for the first non-zero value within that group before it, and the first non-zero after it, also by date, and takes an average of the two.
Ideally, the third function would do something like this:
Initial
Date | Value |
---|---|
45443 | 10 |
45444 | 0 |
45445 | 0 |
45446 | 15 |
First Parse
Date | Value |
---|---|
45443 | 10 |
45444 | 12.5 |
45445 | 0 |
45446 | 15 |
Second parse
Date | Value |
---|---|
45443 | 10 |
45444 | 12.5 |
45445 | 13.75 |
45446 | 15 |
However, while the initial functions seem to fill their purpose, the third one does not seem to fix anything on each parse, as the logging would suggest.
Worth noting: The first two functions are to only run once, while the third function is to loop until everything is “cleaned up”
I have attached the code in question here:
import pandas as pd
import numpy as np
import logging
# Sample data creation
np.random.seed(0)
data = {
'Area_Code': ['AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA', 'AA'],
'Date': [pd.Timestamp('2023-01-01').toordinal() + i for i in range(10)],
'Group': ['G1', 'G1', 'G1', 'G1', 'G1', 'G2', 'G2', 'G2', 'G2', 'G2'],
'PERIOD_NAME': ['P1', 'P1', 'P1', 'P1', 'P1', 'P2', 'P2', 'P2', 'P2', 'P2'],
'CLUSTER': ['C1', 'C1', 'C1', 'C1', 'C1', 'C2', 'C2', 'C2', 'C2', 'C2'],
'Value': [100, 0, 0, 200, 0, 0, 300, 0, 0, 400]
}
df = pd.DataFrame(data)
# Function to replace zero values by looking at the date above and the date below
def replace_zeros_with_neighbors(df):
logging.info("Starting to replace remaining zeros by looking at neighbors.")
zeros_fixed = 0
for index, row in df.iterrows():
if row['Value'] == 0:
area_code = row['Area_Code']
group = row['Group']
date = row['Date']
prev_row = df[(df['Area_Code'] == area_code) &
(df['Group'] == group) &
(df['Date'] < date) &
(df['Value'] != 0)].sort_values(by='Date').tail(1)
next_row = df[(df['Area_Code'] == area_code) &
(df['Group'] == group) &
(df['Date'] > date) &
(df['Value'] != 0)].sort_values(by='Date').head(1)
if not prev_row.empty and not next_row.empty:
avg_value = (prev_row['Value'].values[0] + next_row['Value'].values[0]) / 2
df.at[index, 'Value'] = avg_value
zeros_fixed += 1
elif not prev_row.empty:
df.at[index, 'Value'] = prev_row['Value'].values[0]
zeros_fixed += 1
elif not next_row.empty:
df.at[index, 'Value'] = next_row['Value'].values[0]
zeros_fixed += 1
logging.info(f"Completed neighbor-based zero replacement. Fixed {zeros_fixed} zeros.")
return df, zeros_fixed
# Function to iteratively replace zero values until there are no zeros left
def iterative_replace_zeros(df):
logging.info("Starting iterative zero replacement.")
df, zeros_fixed = replace_zeros_with_neighbors(df)
total_zeros_fixed = zeros_fixed
remaining_zeros = (df['Value'] == 0).sum()
logging.info(f"Initial pass: Fixed {zeros_fixed} zeros. {remaining_zeros} zeros remaining.")
while remaining_zeros > 0:
df, zeros_fixed = replace_zeros_with_neighbors(df)
total_zeros_fixed += zeros_fixed
remaining_zeros = (df['Value'] == 0).sum()
logging.info(f"Fixed {zeros_fixed} zeros in this loop. {remaining_zeros} zeros remaining.")
logging.info(f"Completed iterative zero replacement. Total zeros fixed: {total_zeros_fixed}.")
return df
# Example usage
logging.basicConfig(level=logging.INFO)
df = iterative_replace_zeros(df)
print(df)
RMS-DRS is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
7