def extract_text_from_pdf(pdf_path):
text = ""
with open(pdf_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
num_pages = len(pdf_reader.pages)
for page_num in range(num_pages):
page = pdf_reader.pages[page_num]
page_text = page.extract_text()
lines = page_text.split('n')
# Identify the spacing between lines
line_spacing = []
for i in range(len(lines) - 1):
line_spacing.append(len(lines[i + 1]) - len(lines[i].rstrip()))
# Create a new page with line spacing information
new_page = PyPDF2.PageObject.createBlankPage(width=page.mediaBox.getWidth(), height=page.mediaBox.getHeight())
new_text = ""
for i, line in enumerate(lines):
new_text += line + "n"
if i < len(line_spacing):
new_text += f"[Line Spacing: {line_spacing[i]}]n"
new_page.mergePage(page)
new_page.mergeTxtLine(new_text, 0, page.mediaBox.getHeight())
text += new_text
# Remove the specific text
text = text.replace("Global LNG Monthly Forecast May 13, 2024n© 2024 by S&P Global Inc.", "")
# Write the modified PDF to a new file (if needed)
output_pdf = BytesIO()
pdf_writer = PyPDF2.PdfWriter(output_pdf)
pdf_writer.addPage(new_page)
pdf_writer.write(output_pdf)
return text, output_pdf.getvalue()
# Example usage
pdf_path = "Global LNG Monthly Forecast.pdf"
output_file = "Global LNG Monthly Forecast.txt"
text, modified_pdf = extract_text_from_pdf(pdf_path)
with open(output_file, 'w', encoding='utf-8') as file:
file.write(text)
print(f'Text saved to {output_file}')
DeprecationError Traceback (most recent call last)
Cell In[24], line 41
39 # Example usage
40 pdf_path = “Global LNG Monthly Forecast.pdf”
—> 41 text, modified_pdf = extract_text_from_pdf(pdf_path)
42 print(text)
Cell In[24], line 18
15 line_spacing.append(len(lines[i + 1]) – len(lines[i].rstrip()))
17 # Create a new page with line spacing information
—> 18 new_page = PyPDF2.PageObject.createBlankPage(None, page.mediabox.width, page.mediabox.height)
19 new_text = “”
20 for i, line in enumerate(lines):
File c:Userslydia_chuAppDataLocalProgramsPythonPython312Libsite-packagesPyPDF2_page.py:461, in PageObject.createBlankPage(pdf, width, height)
450 @staticmethod
451 def createBlankPage(
452 pdf: Optional[Any] = None, # PdfReader
453 width: Union[float, Decimal, None] = None,
454 height: Union[float, Decimal, None] = None,
455 ) -> “PageObject”: # pragma: no cover
456 “””
457 .. deprecated:: 1.28.0
458
…
File c:Userslydia_chuAppDataLocalProgramsPythonPython312Libsite-packagesPyPDF2_utils.py:351, in deprecation(msg)
350 def deprecation(msg: str) -> None:
–> 351 raise DeprecationError(msg)
DeprecationError: createBlankPage is deprecated and was removed in PyPDF2 3.0.0. Use create_blank_page instead.