I am trying to delete all footnotes/endnotes in a pdf. They all have a font size of six, while the rest of the text has a font size of twelve.
Example text:
“The Industrial Revolution marked a significant turning point in human history, ushering in mass production and urbanization.”19 (“19” would be in font size six).
Text after running code:
“The Industrial Revolution marked a significant turning point in human history, ushering in mass production and urbanization.”
This is the code I have so far, but is not working.
import fitz # PyMuPDF
def remove_small_numbers(pdf_path, output_path):
doc = fitz.open(pdf_path)
for page_num in range(len(doc)):
page = doc.load_page(page_num)
blocks = page.get_text("dict")["blocks"]
for b in blocks:
if "lines" in b:
for line in b["lines"]:
for span in line["spans"]:
if span["size"] == 6 and span["text"].strip().isdigit():
rect = fitz.Rect(span["bbox"]) # Get the bounding box of the span
page.add_rect_annot(rect) # Add a transparent annotation to cover the text
# Remove all annotations (which cover the small numbers)
for annot in page.annots():
page.delete_annot(annot)
doc.save(output_path)
doc.close()
# Example usage
input_pdf = "input.pdf"
output_pdf = "output.pdf"
remove_small_numbers(input_pdf, output_pdf)