I'm having trouble converting the tables in a PDF to Excel; I want to save each table to a separate file. The first table I get is perfectly fine, but the second table starts from its 4th row and only gives me the data from there on. I have also added an identifier to detect the table. Here is the code for better understanding:
import tabula
import pandas as pd
import os
def extract_and_save_tables(input_filename, **read_pdf_options):
    """Extract every table from a PDF and save each one as its own Excel file.

    Tables are written as ``Table_<n>.xlsx`` (numbered from 1) into a
    ``from pdf`` directory located next to the input PDF; the directory is
    created if it does not exist. Extraction and save failures are reported
    with ``print`` and are not raised to the caller.

    Args:
        input_filename: Path to the PDF file to read.
        **read_pdf_options: Optional keyword arguments forwarded to
            ``tabula.read_pdf``. They override the defaults used here
            (``pages='all'``, ``multiple_tables=True`` and headless Java).
            NOTE(review): when a table comes back with its first rows
            missing, tabula's automatic area detection or header handling
            is the usual suspect; options such as ``lattice=True``,
            ``stream=True``, ``guess=False``, ``area=...`` or
            ``pandas_options={'header': None}`` are worth trying -- confirm
            against the tabula-py documentation for the installed version.

    Returns:
        None.
    """
    # Define the output directory
    project_dir = os.path.dirname(os.path.abspath(input_filename))
    output_dir = os.path.join(project_dir, "from pdf")

    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        print(f'Creating directory: {output_dir}')
        # exist_ok guards against the directory appearing between the
        # existence check above and this call.
        os.makedirs(output_dir, exist_ok=True)

    # Defaults match the original hard-coded call; caller options win.
    options = {
        'pages': 'all',
        'multiple_tables': True,
        'java_options': '-Djava.awt.headless=true',
    }
    options.update(read_pdf_options)

    # Extract tables from the PDF
    try:
        tables = tabula.read_pdf(input_filename, **options)
    except Exception as e:
        # Deliberately broad: any extraction failure is reported, not raised.
        print(f'Error extracting tables: {e}')
        return

    if not tables:
        print('No tables found in the PDF.')
        return

    # Save each table to a separate Excel file
    for number, table in enumerate(tables, start=1):
        output_file = os.path.join(output_dir, f'Table_{number}.xlsx')
        try:
            # Only wrap when the item is not already a DataFrame.
            df = table if isinstance(table, pd.DataFrame) else pd.DataFrame(table)
            df.to_excel(output_file, index=False)
        except Exception as e:
            # One failing table must not prevent the others from being saved.
            print(f'Error saving Table_{number}: {e}')
        else:
            print(f'Table_{number} saved to "{output_file}".')
# Path of the PDF to process. The raw string keeps the Windows backslashes
# literal; without the separators the path would be drive-relative and would
# not point at the intended file.
input_filename = r'D:\Projects\project1\Press Release Q2_2023.pdf'
# Extract every table and write each one to "<pdf folder>\from pdf".
extract_and_save_tables(input_filename)
I'm getting this as the result:
But the data in the PDF actually looks like this: