I am doing a small project (while trying to educate myself in OCR and data analysis), and I am facing a problem I cannot solve.
I am plotting speed against time, but because my OCR results are inconsistent, the graph contains random outlier points.
def remove_outliers(
    df: pd.DataFrame,
    *,
    sh_speed_threshold: float = 6000,
    sh_altitude_threshold: float = 100,
    ss_speed_threshold: float = 30000,
    ss_altitude_threshold: float = 200,
) -> pd.DataFrame:
    """
    Blank out values that exceed a fixed per-column upper limit.

    The DataFrame is modified in place and also returned, so both
    ``remove_outliers(df)`` and ``df = remove_outliers(df)`` work.
    Only values strictly greater than the limit are replaced; rows are
    never removed.

    Args:
        df (pd.DataFrame): Frame containing the columns
            'superheavy_speed', 'superheavy_altitude', 'starship_speed'
            and 'starship_altitude'.
        sh_speed_threshold (float): Upper limit for 'superheavy_speed'.
        sh_altitude_threshold (float): Upper limit for 'superheavy_altitude'.
        ss_speed_threshold (float): Upper limit for 'starship_speed'.
        ss_altitude_threshold (float): Upper limit for 'starship_altitude'.
    Returns:
        pd.DataFrame: The same DataFrame object, with offending values
        set to None (stored as NaN in numeric columns).
    """
    upper_limits = {
        'superheavy_speed': sh_speed_threshold,
        'superheavy_altitude': sh_altitude_threshold,
        'starship_speed': ss_speed_threshold,
        'starship_altitude': ss_altitude_threshold,
    }
    for column, limit in upper_limits.items():
        # Comparisons against missing values are False, so existing gaps
        # are left untouched.
        df.loc[df[column] > limit, column] = None
    return df
def delete_outliers_within_window(df: pd.DataFrame, window_size: int = 5) -> pd.DataFrame:
    """
    Drop rows whose value lies outside a rolling median +/- 2 std band.

    The columns 'superheavy_speed', 'superheavy_altitude',
    'starship_speed' and 'starship_altitude' are processed one after
    another; a row is removed as soon as any of them is flagged. The
    input frame is not modified; a filtered frame is returned, so the
    caller must use the return value.

    Only rows that are demonstrably outside the band are removed. A row
    is kept when its value is missing or when the rolling statistics
    cannot be computed for it. The rolling window is centred and uses
    ``min_periods=1``, so rows near the start and end of the frame (and
    frames shorter than ``window_size``) are still evaluated on the
    observations that are available instead of being dropped wholesale.

    Args:
        df (pd.DataFrame): The DataFrame to clean.
        window_size (int): Number of rows in the centred rolling window.
    Returns:
        pd.DataFrame: A frame without the rows flagged as outliers.
    """
    for column in ['superheavy_speed', 'superheavy_altitude', 'starship_speed', 'starship_altitude']:
        values = df[column]
        window = values.rolling(window=window_size, center=True, min_periods=1)
        centre = window.median()
        spread = window.std()
        # Any comparison involving NaN is False, so missing values and
        # rows with an undefined band are never flagged here.
        is_outlier = (values < centre - 2 * spread) | (values > centre + 2 * spread)
        df = df[~is_outlier]
    return df
# Load the raw records from the JSON file.
with open(json_path, "r") as f:
    data = json.load(f)

pd.set_option('display.max_columns', None)  # Display all columns when a frame is printed

# validate_json_structure returns a (result, entry) pair; entry is printed
# when result is falsy.
# NOTE(review): processing continues even after a failed validation —
# confirm that this best-effort behaviour is intended.
result, entry = validate_json_structure(data)
if not result:
    print("Invalid JSON structure.", entry)

df = pd.DataFrame(data)
df.drop(columns=["time"], inplace=True)  # Drop the "time" column

# Split "superheavy" and "starship" columns into separate "speed" and
# "altitude" columns with progress bar
for column in tqdm(["superheavy", "starship"], desc="Separating columns"):
    df[[f"{column}_speed", f"{column}_altitude"]] = df[column].apply(pd.Series)

# Drop the original "superheavy" and "starship" columns
df.drop(columns=["superheavy", "starship"], inplace=True)

# Sort the DataFrame by real time so the rolling window below sees the
# samples in chronological order
df.sort_values(by="real_time", inplace=True)

# Remove obvious outliers (values above the fixed thresholds)
df = remove_outliers(df)

# delete_outliers_within_window returns a filtered frame and does not
# modify its argument, so the result has to be assigned back; calling it
# without using the return value leaves df unchanged.
# NOTE(review): a 10000-row window behaves almost like a global statistic —
# confirm this size is intended.
df = delete_outliers_within_window(df, window_size=10000)

# Set all Superheavy's data to None after 7 minutes
seven_minutes = 7 * 60  # 7 minutes in seconds
df.loc[df['real_time'] > seven_minutes, ['superheavy_speed', 'superheavy_altitude']] = None
Said graph
Is there any way to remove these outliers using pandas or another library? I’ve tried using rolling windows, but that hasn’t helped much.
New contributor
sanitaravel is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
0