I have some code in Python which, very briefly explained:
-
Reads .CSV files and converts them to .xlsx files, which are paired by date-suffixes. So, there are two files for each date, one called “FLIS YYYY-MM-DD” and one called “SEI2 YYYY-MM-DD”
-
The pairs of .xlsx files are compared to find any differences between the pairs.
-
Data is contained in 5 columns
-
An example from a pair of files, after conversion etc., could look like this
File 1:File2:
-
-
The code checks if there are any differences between the files. It checks values in row 3 in file 1 against row 4 in file 2 – which is for some reason just the way i could get it to work. If any differences are found between the files, the code tells me so.
-
In the example above, the values in D3 + D4 in file 1 add up to the value in D4 in file 2. I need the code to add the values in row 3, with the values in row 4, as it is the sum of these values (A3 + A4, B3 + B4 etc.) that is important to me.
The current code looks like this (although changed a little for the purposes of posting here):
import pandas as pd
import os
import re
from datetime import date
from xlsxwriter.workbook import Workbook
# Path
directory = "#The path of the files"
# Expression to find date suffix in CSV for reading files
pattern = r'(d{4}-d{2}-d{2})'
# Dict of ID's
number_to_name = {
# List removed to shorten post
}
# Function for translating ID
def translate_number(number):
return number_to_name.get(number, f"Unknown_{number}")
# Function to compare pairs of Excel files
def compare_excel_files(file1, file2, skiprows1=None, skiprows2=None):
# Read the Excel files
df1 = pd.read_excel(file1, skiprows=skiprows1)
df2 = pd.read_excel(file2, skiprows=skiprows2)
# Compare their dataframes
differences = df1.compare(df2)
if not differences.empty:
# Display which file pairs contain differences
filename1 = os.path.basename(file1)
filename2 = os.path.basename(file2)
print(f"Differences found between {filename1} and {filename2} :")
print(differences)
print() # Add newline between each for clarity
return True
return False
# Function to find missing IDs in an Excel sheet
def find_missing_IDs(file_path):
# Read the Excel file
df = pd.read_excel(file_path)
# Find unique ID names in the Excel sheet
present_IDs = set(df.iloc[-1, :])
# Find missing IDs
missing_IDs = set(number_to_name.values()) - present_IDs
return missing_IDs
# Flag to track if there are differences in the files
differences_found = False
# Go through each file in the directory
for filename in os.listdir(directory):
if filename.endswith(".csv"):
# Find date from CSV filename
match = re.search(pattern, filename)
if match:
date_suffix = match.group(1)
# Skip files if filename does not match the desired format
else:
print(f"Skipping file {filename} as it does not have a valid date suffix.")
continue
# Read data from CSV file
df = pd.read_csv(os.path.join(directory, filename), header=None)
# Split data in the first column out to the necessary number of columns
max_commas = df[0].str.split(',').transform(len).max()
df[[f'name_{x}' for x in range(max_commas)]] = df[0].str.split(',', expand=True)
# Remove the original unnamed column
df.drop(0, axis=1, inplace=True)
# Remove last column (name_0 - column only shows report time)
df = df[df.columns.drop(list(df.filter(regex='name_0')))]
# Find SOR number
ids_row = df.iloc[0]
# Translate SOR number to ID name
translated_ids_row = ids_row.apply(translate_number)
# Create new dataframe with row for ID names
translated_df = pd.DataFrame([translated_ids_row.tolist()], columns=df.columns)
# Insert new dataframe onto the original dataframe (new row at the bottom with ID name)
df = pd.concat([df, translated_df], ignore_index=True)
# Save the new dataframe as an Excel file
try:
# Pull filename without path name
filename_only = os.path.basename(filename)
output_name = f"{filename_only.split('_')[2].split('-')[0]} {date_suffix}.xlsx"
output_path = os.path.join(directory, output_name)
df.to_excel(output_path, index=None)
# Check if there is a corresponding file to compare with (date pair), and if there are differences, set differences_found to True
if "FLIS" in filename:
sei2_filename = f"SEI2 {date_suffix}.xlsx"
sei2_file = os.path.join(directory, sei2_filename)
if os.path.exists(sei2_file):
if compare_excel_files(output_path, sei2_file, skiprows1=[2], skiprows2=[3]):
differences_found = True
elif "SEI2" in filename:
flis_filename = f"FLIS {date_suffix}.xlsx"
flis_file = os.path.join(directory, flis_filename)
if os.path.exists(flis_file):
if compare_excel_files(output_path, flis_file, skiprows1=[3], skiprows2=[2]):
differences_found = True
# Print message if there is an error when trying to save
except Exception as e:
print(f"There was an error when I tried to save: {e}")
# If there are no differences in date-paired files
if not differences_found:
print("No differences found in any of the files.")
print()
# Function to ask the user to show missing IDs
def ask_show_missing_IDs():
while True:
answer = input("Would you like a list of which IDs did not appear in the data on the different days? (Yes/No): ").strip().lower()
print()
if answer in ["yes", "no"]:
return answer == "yes"
else:
print("Please answer with 'Yes' or 'No'.")
# Function to extract the date from the filename
def extract_date(filename):
# Remove any path and file type name, and then split by space and '-' to get the date
return filename.split()[-1].split('.')[0]
# Ask the user if they want to see missing IDs
show_missing_IDs = ask_show_missing_IDs()
# Show missing IDs if the user wants it
if show_missing_IDs:
missing_IDs_found = False
for filename in os.listdir(directory):
if filename.endswith(".xlsx") and filename.startswith("FLIS"):
output_path = os.path.join(directory, filename)
missing_IDs = find_missing_IDs(output_path)
if missing_IDs:
if not missing_IDs_found:
print("Here are the missing IDs for each of the following dates:")
missing_IDs_found = True
# Sort the IDs in alphabetical order before printing
sorted_missing_IDs = sorted(missing_IDs)
# Only print the date from the filename without "FLIS"
print(f"{extract_date(filename)}:")
for ID in sorted_missing_IDs:
print(ID)
print()
if not missing_IDs_found:
print("No missing IDs found in any of the files.")
# Function to ask the user to delete the files
def ask_delete_files():
while True:
answer = input("Should the CSV and Excel files be automatically deleted? (Yes/No): ").strip().lower()
print()
if answer in ["yes", "no"]:
return answer == "yes"
else:
print("Please answer with 'Yes' or 'No'.")
# Ask the user if they want to delete the files after processing
delete_files = ask_delete_files()
# Delete the files if the user wants it
if delete_files:
for filename in os.listdir(directory):
if filename.endswith((".csv", ".xlsx")):
os.remove(os.path.join(directory, filename))
print("The files have been deleted.")
print()
# Wait for user input before window closes
input("Press Enter to exit")
I have tried a variety of solutions from ChatGPT and Copilot, as i am no way good at coding – but nothing seems to work. Some of the solutions can sum the values and print it in the output, but i can not get it to add the summed values, which i need for the comparison of files to pass.
Other solutions are welcome as well