I’ve been trying to merge multiple financial statements without sacrificing any line items: I want to combine all the items, together with the years and each year’s values, into one larger statement that covers all of these items and years. I’ve tried a few approaches, but I’m very much a rookie at coding. I exported the dataframes (BALANCE SHEETS) for illustration purposes: https://1drv.ms/f/c/175d836e30c91f0a/Ep80w4-k929ElYJ_NIcFa4sB8Bwm3JgZYTCDz_b0XKx3BQ?e=MXHRuI
This is what I’ve tried so far:
import pandas as pd
from filing_scraper import scrape_and_process
# Scrape the filings. The result is looked up by statement header further
# down, each entry being treated as a list of DataFrames.
categories_dict = scrape_and_process()

# Statement types that should be merged.
headers = [
    "Consolidated Balance Sheets",
    "Consolidated Statements of Operations",
    "Consolidated Statements of Cash Flows",
]
def try_merge(ls_dfs, key_col):
    """Outer-merge DataFrames on ``key_col`` and coalesce same-named columns.

    Every frame's value columns are temporarily tagged with the frame's
    position so that the outer merge keeps them apart.  Afterwards all tagged
    columns that came from the same original label are folded back into a
    single column, taking for each row the first non-null value in frame
    order.

    Args:
        ls_dfs: Sequence of DataFrames, each containing ``key_col``.  The
            sequence and the frames in it are left untouched.
        key_col: Label of the column holding the line-item keys to join on.

    Returns:
        A new DataFrame with ``key_col`` first, followed by one column per
        distinct value label, ordered by first appearance across ``ls_dfs``.
        An empty ``ls_dfs`` yields an empty frame with only ``key_col``.

    Raises:
        KeyError: If any frame lacks ``key_col``.

    Note:
        Keys repeated inside a single frame are joined pairwise by
        ``pd.merge`` and therefore multiply rows; de-duplicate beforehand if
        that is not wanted.
    """
    if not ls_dfs:
        return pd.DataFrame(columns=[key_col])

    # Original label -> tagged column names, in the order frames supply them.
    # Tracking this explicitly (instead of parsing suffixes back out of the
    # merged column names) also covers labels missing from the first frame
    # and labels that are not strings.
    variants_by_label = {}
    prepared = []
    for idx, frame in enumerate(ls_dfs):
        if key_col not in frame.columns:
            raise KeyError(
                f"DataFrame at position {idx} has no {key_col!r} column"
            )
        value_cols = [col for col in frame.columns if col != key_col]
        renames = {col: f"{col}_{idx}" for col in value_cols}
        for label, tagged in renames.items():
            variants_by_label.setdefault(label, []).append(tagged)
        # Selecting and renaming build a new frame, so the caller's data is
        # never modified.
        prepared.append(frame[[key_col, *value_cols]].rename(columns=renames))

    merged = prepared[0]
    for other in prepared[1:]:
        merged = pd.merge(merged, other, on=key_col, how="outer")

    # Fold each label's tagged columns back together; earlier frames win
    # wherever they hold a value.
    combined = {key_col: merged[key_col]}
    for label, variants in variants_by_label.items():
        series = merged[variants[0]]
        for tagged in variants[1:]:
            series = series.combine_first(merged[tagged])
        combined[label] = series
    return pd.DataFrame(combined)
# Merge the DataFrames of every requested statement type, keyed by header.
merged_dfs = {}
for header in headers:
    # Skip statement types the scraper did not return.
    if header not in categories_dict:
        continue
    dataframes = categories_dict[header]
    # Nothing to merge for an empty list.
    if not dataframes:
        continue
    refined_data = try_merge(dataframes, "Keys")
    # Drop rows, then columns, that are entirely empty.
    refined_data = refined_data.dropna(axis=0, how="all").dropna(axis=1, how="all")
    merged_dfs[header] = refined_data

for header, df in merged_dfs.items():
    # Line break after the title so the table starts on its own line.
    print(f"Merged DataFrame for {header}:\n", df)
Francisco Padron is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.