Thiết kế website giá rẻ

Question

I’m trying to get this code to work. It essentially highlights specific text on another PDF, but I want the percentage of text it highlights to be returned to an Excel cell. I tried running it through the command prompt, but it wouldn’t work either. I call the code like this:

I am extracting text from one PDF and then trying to highlight all the possible percentages with the corresponding lender for my research evidence. I would like to automate this process. Here is the code I am using:

Function PythonEORnote(directory As String, note As String, lenderName As String) As String
    Dim pythonExePath As String
    Dim pythonScriptPath As String
    Dim cmd As String
    Dim wsh As Object
    Dim exec As Object

    ' Define the paths to the Python executable and script
    pythonExePath = """C:UsersPhillDesktopFact find DocumentsPython macrospython2pythonProject3.venvScriptspythonw.exe"""
    pythonScriptPath = """C:UsersPhillDesktopFact find DocumentsPython macrospython2pythonProject3.venvPostnote3.py"""

    ' Create the command string with the arguments
    cmd = pythonExePath & " " & pythonScriptPath & " """ & directory & """ """ & note & """ """ & lenderName & """"

    ' Run the command and capture the output
    Set wsh = VBA.CreateObject("WScript.Shell")
    Set exec = wsh.exec(cmd)
    Do While exec.status = 0
        DoEvents
    Loop

End Function

import fitz  # PyMuPDF
import sys
import os
import glob
import re
import xlwings as xw

def add_annotation_and_highlight(doc, note, highlight_texts, debug_file):
    try:
        page = doc[0]  # Assuming we're always working with the first page
    except IndexError:
        print("PDF file does not contain any pages.", file=debug_file)
        sys.exit(1)

    # Add the text annotation
    page.add_text_annot((800, 50), note)
    print(f"Note added: '{note}'", file=debug_file)

    # Highlight the specified texts and track which combinations were highlighted
    highlighted_percentages = set()
    for highlight_text in highlight_texts:
        text_instances = page.search_for(highlight_text)
        if text_instances:
            for inst in text_instances:
                page.add_highlight_annot(inst)
                print(f"Highlighted text: '{highlight_text}' at {inst}", file=debug_file)
                # Extract and add the percentage part of the highlighted text
                percentage_match = re.search(r'(d+.d+)%', highlight_text)
                if percentage_match:
                    highlighted_percentages.add(percentage_match.group(1))
    return highlighted_percentages

def save_backup_and_restore_original(doc, original_path, debug_file):
    # Define the path for the backup copy, replacing ".pdf" with "_backup.pdf"
    backup_copy_path = original_path.replace(".pdf", "_backup.pdf")

    try:
        # Attempt to save a backup copy of the document
        doc.save(backup_copy_path, encryption=fitz.PDF_ENCRYPT_KEEP)
        print(f"Backup document saved successfully as {backup_copy_path}.", file=debug_file)
    except Exception as e:
        # If saving fails, print the error and raise an exception
        print(f"Failed to save backup document: {e}", file=debug_file)
        raise

    # Close the document before attempting to delete the original file
    doc.close()

    try:
        # Now that the document is closed, delete the original document
        os.remove(original_path)
        print(f"Original document deleted successfully: {original_path}", file=debug_file)
    except Exception as e:
        # If deletion fails, print the error and raise an exception
        print(f"Failed to delete original document: {e}", file=debug_file)
        raise

    try:
        # Rename the backup file to the original name
        os.rename(backup_copy_path, original_path)
        print(f"Backup document renamed to original name successfully: {original_path}", file=debug_file)
    except Exception as e:
        # If renaming fails, print the error and raise an exception
        print(f"Failed to rename backup document to original name: {e}", file=debug_file)
        raise

def find_latest_pdf(directory, keywords, debug_file):
    pdf_files = []
    for keyword in keywords:
        pdf_files.extend(glob.glob(os.path.join(directory, f"*{keyword.lower()}*.pdf")))

    # Filter the files to make sure they match the keywords case-insensitively
    matching_files = [f for f in pdf_files if
                      any(keyword.lower() in os.path.basename(f).lower() for keyword in keywords)]

    if not matching_files:
        print("No matching PDF files found.", file=debug_file)
        return None

    latest_pdf = max(matching_files, key=os.path.getmtime)
    print(f"Latest PDF found: {latest_pdf}", file=debug_file)
    return latest_pdf

def extract_text_from_pdf(pdf_path, debug_file):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            text += page.get_text()
            print(f"Extracted text from page {page_num} of {pdf_path}:n{text}", file=debug_file)
        doc.close()
    except Exception as e:
        print(f"Failed to extract text from PDF: {pdf_path}, error: {e}", file=debug_file)
    return text

def extract_text_from_pdf_alternative(pdf_path, debug_file):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc[page_num]
            text += str(page.get_text("dict")["blocks"])
            print(f"Alternative method extracted text from page {page_num} of {pdf_path}:n{text}", file=debug_file)
        doc.close()
    except Exception as e:
        print(f"Alternative method failed to extract text from PDF: {pdf_path}, error: {e}", file=debug_file)
    return text

def extract_percentages_from_pdfs(directory, keywords, debug_file):
    percentages = set()
    percentage_pattern = re.compile(r'(d+.d+)%')
    pdf_files = []

    for keyword in keywords:
        pdf_files.extend(glob.glob(os.path.join(directory, f"*{keyword.lower()}*.pdf")))

    print(f"KFI files found: {pdf_files}", file=debug_file)

    for pdf_file in pdf_files:
        try:
            # Try the primary method
            text = extract_text_from_pdf(pdf_file, debug_file)
            if not text:
                # If the primary method fails, try the alternative method
                text = extract_text_from_pdf_alternative(pdf_file, debug_file)

            matches = percentage_pattern.findall(text)
            for match in matches:
                # Convert percentage to two decimal places
                percentage = "{:.2f}".format(float(match))
                percentages.add(percentage)
        except Exception as e:
            print(f"Failed to process the PDF: {pdf_file}, error: {e}", file=debug_file)

    print(f"Percentages found: {percentages}", file=debug_file)
    return list(percentages)

def get_possible_lender_names(lender_name):
    lender_variations = {
        "Nationwide": ["Nationwide", "Nationwide B.Soc"],
        "NatWest": ["NatWest", "NatWest Intermediary"],
        "Santander": ["Santander"],
        "Barclays": ["Barclays"],
        "TSB": ["TSB", "TSB - Intermediaries"],
        "Co-operative": ["Co-operative", "Co-operative Bank"],
        "Bank of Ireland": ["Bank of Ireland", "BOI"],
        "Clydesdale Bank": ["Clydesdale", "Clydesdale Bank PLC"],
        "Newcastle": ["Newcastle BS", "Newcastle"],
        "Leeds": ["Leeds B Society", "Leeds B Society"],
        "West One Loans": ["West One Loans", "West One Loans"],
        "Nottingham": ["Nottingham", "Nottingham"],
        "Marsden": ["Marsden", "Marsden"],
        "Aldermore": ["Aldermore", "Aldermore"],
        "Kent Reliance": ["Kent Reliance", "Kent Reliance"],
        "Furness BS": ["Furness BS", "Furness B.Soc"],
        "BM Solutions": ["BM Solutions"],
        "Principality": ["Principality"],
        "Post Office": ["Post Office"],
        "Godiva": ["Coventry BS-Godiva"],
        "Kensington": ["Kensington"],
        "The Mortgage Works": ["The Mortgage Works"],
        "TMW": ["The Mortgage Works"],
        "Virgin": ["Virgin", "Virgin Money"],
        "HSBC": ["HSBC", "HSBC Intermediary"],
        "Halifax": ["Halifax"],
        "Coventry": ["Coventry"],
        "Accord": ["Accord", "Accord Mortgages"]
        # Add more lenders and their variations here as needed
    }
    return lender_variations.get(lender_name, [lender_name])

def write_to_excel(workbook_name, sheet_name, start_cell, values):
    app = xw.apps.active
    book = app.books[workbook_name]
    sheet = book.sheets[sheet_name]

    # Write each value to a new column starting from the start_cell
    col = start_cell[0]  # Starting column (e.g., 'B')
    row = start_cell[1]  # Starting row (e.g., 18)
    for i, value in enumerate(values):
        cell = f"{chr(ord(col) + i)}{row}"  # Calculate the cell to write to
        sheet.range(cell).value = value

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: python script.py <DIRECTORY> <NOTE> <LENDER_NAME>")
        sys.exit(1)

    directory = sys.argv[1]
    note = sys.argv[2]
    lender_name = sys.argv[3]

    debug_file_path = os.path.expanduser("~/Desktop/python_debug.txt")
    debug_file = open(debug_file_path, "w")

    print(f"Directory: {directory}", file=debug_file)
    print(f"Note: {note}", file=debug_file)
    print(f"Lender Name: {lender_name}", file=debug_file)

    if not os.path.isdir(directory):
        print(f"Directory does not exist: {directory}", file=debug_file)
        debug_file.close()
        sys.exit(1)

    # Keywords to search for
    eor_keywords = ["EOR", "Evidence", "Research"]
    kfi_keywords = ["ESIS", "KFI", "illustration", "key facts"]

    # Find the latest EOR PDF
    latest_pdf_path = find_latest_pdf(directory, eor_keywords, debug_file)

    if latest_pdf_path is None:
        print("No matching PDF files found to annotate.", file=debug_file)
        debug_file.close()
        sys.exit(1)

    # Extract percentages from KFI PDFs
    percentages = extract_percentages_from_pdfs(directory, kfi_keywords, debug_file)

    doc = None
    try:
        doc = fitz.open(latest_pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc[page_num]
            text += page.get_text()

        possible_lender_names = get_possible_lender_names(lender_name)

        highlight_texts = []
        for possible_name in possible_lender_names:
            for percentage in percentages:
                highlight_texts.append(f"{possible_name} {percentage}")

        # Print highlight texts
        print(f"Highlight texts: {highlight_texts}", file=debug_file)

        # Always add the note, even if no text is highlighted
        highlighted_percentages = add_annotation_and_highlight(doc, note, highlight_texts, debug_file)
        if highlighted_percentages:
            print(f"Highlighted percentages: {highlighted_percentages}", file=debug_file)
            # Write the highlighted percentages to the Excel file
            write_to_excel("Myworksheet - work in progress.xlsb", "WRK Helper", ('B', 18), list(highlighted_percentages))
            save_backup_and_restore_original(doc, latest_pdf_path, debug_file)
        else:
            print("No text was highlighted in the document, but note was added.", file=debug_file)
            save_backup_and_restore_original(doc, latest_pdf_path, debug_file)  # Ensure the document is saved even if no text was highlighted

    except Exception as e:
        print(f"Failed to process the PDF: {e}", file=debug_file)
    finally:
        if doc is not None:
            try:
                doc.close()
            except Exception as close_error:
                print(f"Error closing the document: {close_error}", file=debug_file)

        debug_file.close()

Thiết kế website giá rẻ

Danh mục

Writing from python to excel