I’m working on a script that will add employee names to a table in a PDF by first extracting that table, modifying it, and reentering it into the PDF. My script is as follows:
import camelot
import fitz
import pandas as pd
def extract_tables_from_pdf(pdf_path):
try:
tables = camelot.read_pdf(pdf_path, pages='all')
print(f"Total tables extracted: {len(tables)}")
pdf_document = fitz.open(pdf_path)
header_coordinates = {}
for i, table in enumerate(tables):
print(f"--- Table {i + 1} ---")
table.to_csv(f'table_{i + 1}.csv')
page_num = table.page
page = pdf_document.load_page(page_num - 1)
header_texts = ["Print", "Sign", "Company"]
for text in header_texts:
text_instances = page.search_for(text)
for inst in text_instances:
print(f"Header '{text}' found at page {page_num}, coordinates: {inst}")
header_coordinates[text] = inst
pdf_document.close()
return tables, header_coordinates
except Exception as e:
print(f"An error occurred: {str(e)}")
return None, None
def modify_and_insert_table(pdf_path, tables, header_coordinates, output_pdf_path):
try:
pdf_document = fitz.open(pdf_path)
for i, table in enumerate(tables):
df = table.df
df_no_headers = df.iloc[1:].copy()
new_data = pd.Series(["Employee Name", "Employee Name", "Company Name"], index=[0, 1, 2])
df_no_headers.iloc[0, 0:3] = new_data
table_text = df_no_headers.to_string(index=False, header=False)
page_num = table.page
page = pdf_document.load_page(page_num - 1)
for col_idx, header in enumerate(["Print", "Sign", "Company"]):
if header in header_coordinates:
inst = header_coordinates[header]
rect = fitz.Rect(inst.x0, inst.y1, inst.x1, inst.y1 + 20)
text = new_data[col_idx]
page.insert_textbox(rect, text, fontsize=12, fontname="helvetica", color=(0, 0, 0), align=0)
pdf_document.save(output_pdf_path)
pdf_document.close()
print(f"Modified PDF saved to {output_pdf_path}")
except Exception as e:
print(f"An error occurred: {str(e)}")
if __name__ == "__main__":
pdf_path = "/path/to/Adobe.pdf"
output_pdf_path = "/path/to/Signed Adobe.pdf"
tables, header_coordinates = extract_tables_from_pdf(pdf_path)
if tables and header_coordinates:
modify_and_insert_table(pdf_path, tables, header_coordinates, output_pdf_path)
I noticed that new_data = pd.Series(["Employee Name", "Employee Name", "Company Name"], index=[0, 1, 2])
didn’t work, so I tried new_data = pd.Series(["Employee", "Employee", "Company"], index=[0, 1, 2])
and that appears to have worked fine.
Right before page.insert_textbox(rect, text, fontsize=12, fontname="helvetica", color=(0, 0, 0), align=0)
, I put print(text)
and am able to see Employee Name and Company Name, but when I inspect the Signed Adobe.pdf, the text is not there. I’m not sure why it would work with a single string containing no spaces, and not with a string containing spaces. Any assistance is greatly appreciated!