I have been trying to extract text from PDF files to automate a significant and tedious part of my job using Python. With the help of ChatGPT, I have written multiple lines of code. However, I am encountering a problem that neither I nor ChatGPT can solve. Some of the PDFs contain comments in the form of text boxes, comment notes, highlights, and overlining—essentially all kinds of comments.
The program I have written works well and extracts all the desired content, exporting it to an Excel file. However, it also extracts the text within these comments and includes it in the same Excel file, making it slightly unreadable.
My question is: Is there any way to extract text from PDFs without also extracting the text from the comments?
Below is the code that ChatGPT repeatedly suggested. However, when implemented, the program does skip the comments — but it also skips any text underneath them, and as a result omits critical information from the extraction.
# Collect the bounding rectangle of every annotation on the page so that
# words falling inside them can be filtered out later.
annotations = [annot.rect for annot in (page.annots() or [])]
def extract_words_from_box(page, rect, annotations):
    """Return the words of *page* lying inside *rect*, excluding words that
    overlap any annotation rectangle.

    Parameters:
        page: fitz.Page to read words from.
        rect: fitz.Rect target region; a word is kept when its bounding box
            intersects it.
        annotations: list of fitz.Rect annotation rectangles; words that
            intersect any of them are dropped.

    Returns:
        list of PyMuPDF word tuples (x0, y0, x1, y1, text, block_no,
        line_no, word_no).
    """
    words_in_box = []
    for word in page.get_text("words"):
        # Build the word's rectangle once instead of twice per word
        # (the original constructed it in two separate list passes).
        word_rect = fitz.Rect(word[:4])
        if not rect.intersects(word_rect):
            continue
        # NOTE(review): intersection-based exclusion also removes normal page
        # text that sits *under* markup annotations such as highlights or
        # strike-outs. If that underlying text must be kept, filter the
        # `annotations` list by annotation type (e.g. keep only FreeText /
        # Text note rectangles) before calling this function — TODO confirm.
        if any(word_rect.intersects(annot_rect) for annot_rect in annotations):
            continue
        words_in_box.append(word)
    return words_in_box
def print_text_in_boxes(pdf_path):
    """Extract purchase-order line items from the PDF at *pdf_path*.

    Reads the PO number (first page only) and, per page, the destination and
    the item rows found inside fixed coordinate boxes, skipping words covered
    by annotations.

    Parameters:
        pdf_path: path to the PDF file to process.

    Returns:
        list of dicts, one per extracted item. Errors are printed and a
        partial (possibly empty) list is returned rather than raising.
    """
    material_box_def = (40, 85, 70, 705)  # Adjusted the dimensions
    po_number_box_def = (330, 40, 570, 115)  # PO number box dimensions
    destination_box_def = (40, 200, 330, 300)  # Destination box dimensions
    other_boxes_definitions = {
        'product_code': (72, -2, 131, 15),  # Adjusted to avoid concatenation
        'due_date': (217, -2, 270, 15),
        'qty': (286, -2, 314, 15),
        'net_price': (350, -2, 399, 15),
        'material_revision_box': (80, 15, 192, 80)  # Added box for Material Revision
    }
    po_number = None  # Initialize PO number as None
    destination = "Unknown"  # Default destination as "Unknown"
    extracted_data = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for page_num, page in enumerate(doc):
            # Get annotation rectangles so annotated words can be excluded.
            annotations = [annot.rect for annot in (page.annots() or [])]
            # Extract PO number from the first page only
            if page_num == 0:
                po_number_rect = fitz.Rect(*po_number_box_def)
                words_in_po_number_box = extract_words_from_box(page, po_number_rect, annotations)
                # A PO number is a 10-digit numeric token.
                po_numbers = [word[4] for word in words_in_po_number_box
                              if word[4].isdigit() and len(word[4]) == 10]
                if po_numbers:
                    po_number = po_numbers[0]
            # NOTE(review): destination/material extraction is assumed to run
            # for every page (original indentation was lost) — confirm.
            destination_rect = fitz.Rect(*destination_box_def)
            words_in_destination_box = extract_words_from_box(page, destination_rect, annotations)
            destination_text = ' '.join(word[4] for word in words_in_destination_box)
            destination = determine_destination(destination_text)
            material_rect = fitz.Rect(*material_box_def)
            words_in_material_box = extract_words_from_box(page, material_rect, annotations)
            # Item numbers are 5-digit numeric tokens inside the material box.
            item_numbers = [word for word in words_in_material_box
                            if word[4].isdigit() and len(word[4]) == 5]
            for item_number in item_numbers:
                item_number_rect = fitz.Rect(item_number[:4])
                item_info = extract_item_info(page, item_number_rect, other_boxes_definitions, po_number, destination, annotations)
                # Separate product code and name
                if 'product_code' in item_info:
                    description_parts = item_info['product_code'].split(' ', 1)
                    item_info['product_code'] = description_parts[0]
                    # Collapsed redundant else-branch: when no name part
                    # exists, fall back to the placeholder text.
                    item_info['product_name'] = (description_parts[1]
                                                 if len(description_parts) == 2
                                                 else 'No text found')
                # Convert due date to Finnish format
                if 'due_date' in item_info:
                    item_info['due_date'] = convert_date_format(item_info['due_date'])
                extracted_data.append(item_info)  # Add extracted item info to the data list
    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
    finally:
        # Fix resource leak: the document was never closed in the original.
        if doc is not None:
            doc.close()
    return extracted_data
Matias Turpeinen is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.