I’m trying to get this code to work. It essentially highlights specific text on another PDF, but I want the percentage of text it highlights to be returned to an Excel cell. I tried running it through the command prompt, but it wouldn’t work either. I call the code like this:
I am extracting text from one PDF and then trying to highlight all the possible percentages with the corresponding lender for my research evidence. I would like to automate this process. Here is the code I am using:
Function PythonEORnote(directory As String, note As String, lenderName As String) As String
Dim pythonExePath As String
Dim pythonScriptPath As String
Dim cmd As String
Dim wsh As Object
Dim exec As Object
' Define the paths to the Python executable and script
pythonExePath = """C:UsersPhillDesktopFact find DocumentsPython macrospython2pythonProject3.venvScriptspythonw.exe"""
pythonScriptPath = """C:UsersPhillDesktopFact find DocumentsPython macrospython2pythonProject3.venvPostnote3.py"""
' Create the command string with the arguments
cmd = pythonExePath & " " & pythonScriptPath & " """ & directory & """ """ & note & """ """ & lenderName & """"
' Run the command and capture the output
Set wsh = VBA.CreateObject("WScript.Shell")
Set exec = wsh.exec(cmd)
Do While exec.status = 0
DoEvents
Loop
End Function
import fitz # PyMuPDF
import sys
import os
import glob
import re
import xlwings as xw
def add_annotation_and_highlight(doc, note, highlight_texts, debug_file):
try:
page = doc[0] # Assuming we're always working with the first page
except IndexError:
print("PDF file does not contain any pages.", file=debug_file)
sys.exit(1)
# Add the text annotation
page.add_text_annot((800, 50), note)
print(f"Note added: '{note}'", file=debug_file)
# Highlight the specified texts and track which combinations were highlighted
highlighted_percentages = set()
for highlight_text in highlight_texts:
text_instances = page.search_for(highlight_text)
if text_instances:
for inst in text_instances:
page.add_highlight_annot(inst)
print(f"Highlighted text: '{highlight_text}' at {inst}", file=debug_file)
# Extract and add the percentage part of the highlighted text
percentage_match = re.search(r'(d+.d+)%', highlight_text)
if percentage_match:
highlighted_percentages.add(percentage_match.group(1))
return highlighted_percentages
def save_backup_and_restore_original(doc, original_path, debug_file):
# Define the path for the backup copy, replacing ".pdf" with "_backup.pdf"
backup_copy_path = original_path.replace(".pdf", "_backup.pdf")
try:
# Attempt to save a backup copy of the document
doc.save(backup_copy_path, encryption=fitz.PDF_ENCRYPT_KEEP)
print(f"Backup document saved successfully as {backup_copy_path}.", file=debug_file)
except Exception as e:
# If saving fails, print the error and raise an exception
print(f"Failed to save backup document: {e}", file=debug_file)
raise
# Close the document before attempting to delete the original file
doc.close()
try:
# Now that the document is closed, delete the original document
os.remove(original_path)
print(f"Original document deleted successfully: {original_path}", file=debug_file)
except Exception as e:
# If deletion fails, print the error and raise an exception
print(f"Failed to delete original document: {e}", file=debug_file)
raise
try:
# Rename the backup file to the original name
os.rename(backup_copy_path, original_path)
print(f"Backup document renamed to original name successfully: {original_path}", file=debug_file)
except Exception as e:
# If renaming fails, print the error and raise an exception
print(f"Failed to rename backup document to original name: {e}", file=debug_file)
raise
def find_latest_pdf(directory, keywords, debug_file):
pdf_files = []
for keyword in keywords:
pdf_files.extend(glob.glob(os.path.join(directory, f"*{keyword.lower()}*.pdf")))
# Filter the files to make sure they match the keywords case-insensitively
matching_files = [f for f in pdf_files if
any(keyword.lower() in os.path.basename(f).lower() for keyword in keywords)]
if not matching_files:
print("No matching PDF files found.", file=debug_file)
return None
latest_pdf = max(matching_files, key=os.path.getmtime)
print(f"Latest PDF found: {latest_pdf}", file=debug_file)
return latest_pdf
def extract_text_from_pdf(pdf_path, debug_file):
text = ""
try:
doc = fitz.open(pdf_path)
for page_num in range(len(doc)):
page = doc[page_num]
text += page.get_text()
print(f"Extracted text from page {page_num} of {pdf_path}:n{text}", file=debug_file)
doc.close()
except Exception as e:
print(f"Failed to extract text from PDF: {pdf_path}, error: {e}", file=debug_file)
return text
def extract_text_from_pdf_alternative(pdf_path, debug_file):
text = ""
try:
doc = fitz.open(pdf_path)
for page_num in range(len(doc)):
page = doc[page_num]
text += str(page.get_text("dict")["blocks"])
print(f"Alternative method extracted text from page {page_num} of {pdf_path}:n{text}", file=debug_file)
doc.close()
except Exception as e:
print(f"Alternative method failed to extract text from PDF: {pdf_path}, error: {e}", file=debug_file)
return text
def extract_percentages_from_pdfs(directory, keywords, debug_file):
percentages = set()
percentage_pattern = re.compile(r'(d+.d+)%')
pdf_files = []
for keyword in keywords:
pdf_files.extend(glob.glob(os.path.join(directory, f"*{keyword.lower()}*.pdf")))
print(f"KFI files found: {pdf_files}", file=debug_file)
for pdf_file in pdf_files:
try:
# Try the primary method
text = extract_text_from_pdf(pdf_file, debug_file)
if not text:
# If the primary method fails, try the alternative method
text = extract_text_from_pdf_alternative(pdf_file, debug_file)
matches = percentage_pattern.findall(text)
for match in matches:
# Convert percentage to two decimal places
percentage = "{:.2f}".format(float(match))
percentages.add(percentage)
except Exception as e:
print(f"Failed to process the PDF: {pdf_file}, error: {e}", file=debug_file)
print(f"Percentages found: {percentages}", file=debug_file)
return list(percentages)
def get_possible_lender_names(lender_name):
lender_variations = {
"Nationwide": ["Nationwide", "Nationwide B.Soc"],
"NatWest": ["NatWest", "NatWest Intermediary"],
"Santander": ["Santander"],
"Barclays": ["Barclays"],
"TSB": ["TSB", "TSB - Intermediaries"],
"Co-operative": ["Co-operative", "Co-operative Bank"],
"Bank of Ireland": ["Bank of Ireland", "BOI"],
"Clydesdale Bank": ["Clydesdale", "Clydesdale Bank PLC"],
"Newcastle": ["Newcastle BS", "Newcastle"],
"Leeds": ["Leeds B Society", "Leeds B Society"],
"West One Loans": ["West One Loans", "West One Loans"],
"Nottingham": ["Nottingham", "Nottingham"],
"Marsden": ["Marsden", "Marsden"],
"Aldermore": ["Aldermore", "Aldermore"],
"Kent Reliance": ["Kent Reliance", "Kent Reliance"],
"Furness BS": ["Furness BS", "Furness B.Soc"],
"BM Solutions": ["BM Solutions"],
"Principality": ["Principality"],
"Post Office": ["Post Office"],
"Godiva": ["Coventry BS-Godiva"],
"Kensington": ["Kensington"],
"The Mortgage Works": ["The Mortgage Works"],
"TMW": ["The Mortgage Works"],
"Virgin": ["Virgin", "Virgin Money"],
"HSBC": ["HSBC", "HSBC Intermediary"],
"Halifax": ["Halifax"],
"Coventry": ["Coventry"],
"Accord": ["Accord", "Accord Mortgages"]
# Add more lenders and their variations here as needed
}
return lender_variations.get(lender_name, [lender_name])
def write_to_excel(workbook_name, sheet_name, start_cell, values):
app = xw.apps.active
book = app.books[workbook_name]
sheet = book.sheets[sheet_name]
# Write each value to a new column starting from the start_cell
col = start_cell[0] # Starting column (e.g., 'B')
row = start_cell[1] # Starting row (e.g., 18)
for i, value in enumerate(values):
cell = f"{chr(ord(col) + i)}{row}" # Calculate the cell to write to
sheet.range(cell).value = value
if __name__ == "__main__":
if len(sys.argv) != 4:
print("Usage: python script.py <DIRECTORY> <NOTE> <LENDER_NAME>")
sys.exit(1)
directory = sys.argv[1]
note = sys.argv[2]
lender_name = sys.argv[3]
debug_file_path = os.path.expanduser("~/Desktop/python_debug.txt")
debug_file = open(debug_file_path, "w")
print(f"Directory: {directory}", file=debug_file)
print(f"Note: {note}", file=debug_file)
print(f"Lender Name: {lender_name}", file=debug_file)
if not os.path.isdir(directory):
print(f"Directory does not exist: {directory}", file=debug_file)
debug_file.close()
sys.exit(1)
# Keywords to search for
eor_keywords = ["EOR", "Evidence", "Research"]
kfi_keywords = ["ESIS", "KFI", "illustration", "key facts"]
# Find the latest EOR PDF
latest_pdf_path = find_latest_pdf(directory, eor_keywords, debug_file)
if latest_pdf_path is None:
print("No matching PDF files found to annotate.", file=debug_file)
debug_file.close()
sys.exit(1)
# Extract percentages from KFI PDFs
percentages = extract_percentages_from_pdfs(directory, kfi_keywords, debug_file)
doc = None
try:
doc = fitz.open(latest_pdf_path)
text = ""
for page_num in range(len(doc)):
page = doc[page_num]
text += page.get_text()
possible_lender_names = get_possible_lender_names(lender_name)
highlight_texts = []
for possible_name in possible_lender_names:
for percentage in percentages:
highlight_texts.append(f"{possible_name} {percentage}")
# Print highlight texts
print(f"Highlight texts: {highlight_texts}", file=debug_file)
# Always add the note, even if no text is highlighted
highlighted_percentages = add_annotation_and_highlight(doc, note, highlight_texts, debug_file)
if highlighted_percentages:
print(f"Highlighted percentages: {highlighted_percentages}", file=debug_file)
# Write the highlighted percentages to the Excel file
write_to_excel("Myworksheet - work in progress.xlsb", "WRK Helper", ('B', 18), list(highlighted_percentages))
save_backup_and_restore_original(doc, latest_pdf_path, debug_file)
else:
print("No text was highlighted in the document, but note was added.", file=debug_file)
save_backup_and_restore_original(doc, latest_pdf_path, debug_file) # Ensure the document is saved even if no text was highlighted
except Exception as e:
print(f"Failed to process the PDF: {e}", file=debug_file)
finally:
if doc is not None:
try:
doc.close()
except Exception as close_error:
print(f"Error closing the document: {close_error}", file=debug_file)
debug_file.close()