I have been trying to extract text from PDF files to automate a significant and tedious part of my job using Python. With the help of ChatGPT, I have written multiple lines of code. However, I am encountering a problem that neither I nor ChatGPT can solve. Some of the PDFs contain comments in the form of text boxes, comment notes, highlights, and overlining—essentially all kinds of comments.
The program I have written works well and extracts all the desired content, exporting it to an Excel file. However, it also extracts the text within these comments and includes it in the same Excel file, making it slightly unreadable.
My question is: Is there any way to extract text from PDFs without also extracting the text from the comments?
Below is the code that ChatGPT repeatedly suggested. However, when implemented, the program does skip the comments — but it also skips any text underneath them, and as a result omits critical information from the extraction.
# Collect the bounding rectangle of every annotation on the page so that
# words falling inside them can be filtered out later.
annotations = [annot.rect for annot in (page.annots() or [])]
def extract_words_from_box(page, rect, annotations):
    """Return the words of *page* lying inside *rect*, excluding words that
    overlap any annotation rectangle.

    Parameters:
        page: fitz.Page to read words from.
        rect: fitz.Rect target region; a word is kept when its bounding box
            intersects it.
        annotations: list of fitz.Rect annotation rectangles; words that
            intersect any of them are dropped.

    Returns:
        list of PyMuPDF word tuples (x0, y0, x1, y1, text, block_no,
        line_no, word_no).
    """
    words_in_box = []
    for word in page.get_text("words"):
        # Build the word's rectangle once instead of twice per word
        # (the original constructed it in two separate list passes).
        word_rect = fitz.Rect(word[:4])
        if not rect.intersects(word_rect):
            continue
        # NOTE(review): intersection-based exclusion also removes normal page
        # text that sits *under* markup annotations such as highlights or
        # strike-outs. If that underlying text must be kept, filter the
        # `annotations` list by annotation type (e.g. keep only FreeText /
        # Text note rectangles) before calling this function — TODO confirm.
        if any(word_rect.intersects(annot_rect) for annot_rect in annotations):
            continue
        words_in_box.append(word)
    return words_in_box
def print_text_in_boxes(pdf_path):
    """Extract purchase-order line items from the PDF at *pdf_path*.

    Reads the PO number (first page only) and, per page, the destination and
    the item rows found inside fixed coordinate boxes, skipping words covered
    by annotations.

    Parameters:
        pdf_path: path to the PDF file to process.

    Returns:
        list of dicts, one per extracted item. Errors are printed and a
        partial (possibly empty) list is returned rather than raising.
    """
    material_box_def = (40, 85, 70, 705)  # Adjusted the dimensions
    po_number_box_def = (330, 40, 570, 115)  # PO number box dimensions
    destination_box_def = (40, 200, 330, 300)  # Destination box dimensions
    other_boxes_definitions = {
        'product_code': (72, -2, 131, 15),  # Adjusted to avoid concatenation
        'due_date': (217, -2, 270, 15),
        'qty': (286, -2, 314, 15),
        'net_price': (350, -2, 399, 15),
        'material_revision_box': (80, 15, 192, 80)  # Added box for Material Revision
    }
    po_number = None  # Initialize PO number as None
    destination = "Unknown"  # Default destination as "Unknown"
    extracted_data = []
    doc = None
    try:
        doc = fitz.open(pdf_path)
        for page_num, page in enumerate(doc):
            # Get annotation rectangles so annotated words can be excluded.
            annotations = [annot.rect for annot in (page.annots() or [])]
            # Extract PO number from the first page only
            if page_num == 0:
                po_number_rect = fitz.Rect(*po_number_box_def)
                words_in_po_number_box = extract_words_from_box(page, po_number_rect, annotations)
                # A PO number is a 10-digit numeric token.
                po_numbers = [word[4] for word in words_in_po_number_box
                              if word[4].isdigit() and len(word[4]) == 10]
                if po_numbers:
                    po_number = po_numbers[0]
            # NOTE(review): destination/material extraction is assumed to run
            # for every page (original indentation was lost) — confirm.
            destination_rect = fitz.Rect(*destination_box_def)
            words_in_destination_box = extract_words_from_box(page, destination_rect, annotations)
            destination_text = ' '.join(word[4] for word in words_in_destination_box)
            destination = determine_destination(destination_text)
            material_rect = fitz.Rect(*material_box_def)
            words_in_material_box = extract_words_from_box(page, material_rect, annotations)
            # Item numbers are 5-digit numeric tokens inside the material box.
            item_numbers = [word for word in words_in_material_box
                            if word[4].isdigit() and len(word[4]) == 5]
            for item_number in item_numbers:
                item_number_rect = fitz.Rect(item_number[:4])
                item_info = extract_item_info(page, item_number_rect, other_boxes_definitions, po_number, destination, annotations)
                # Separate product code and name
                if 'product_code' in item_info:
                    description_parts = item_info['product_code'].split(' ', 1)
                    item_info['product_code'] = description_parts[0]
                    # Collapsed redundant else-branch: when no name part
                    # exists, fall back to the placeholder text.
                    item_info['product_name'] = (description_parts[1]
                                                 if len(description_parts) == 2
                                                 else 'No text found')
                # Convert due date to Finnish format
                if 'due_date' in item_info:
                    item_info['due_date'] = convert_date_format(item_info['due_date'])
                extracted_data.append(item_info)  # Add extracted item info to the data list
    except Exception as e:
        print(f"Error processing {pdf_path}: {str(e)}")
    finally:
        # Fix resource leak: the document was never closed in the original.
        if doc is not None:
            doc.close()
    return extracted_data
Matias Turpeinen is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.