Can someone please explain to me what is wrong with this code?
Code summary: It applies OCR to extract text from an image in a PDF file.
<code>import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import fitz
import pymupdf
import pandas as pd
import os
import io
import re
# Path to the PDF file
folder = "C:\Users\Yasser\Desktop\Test2"
dir_list = os.listdir(folder)
extracted_text = []
for file_name in dir_list:
if file_name.endswith(".pdf"):
# Open the PDF file
pdf_document = fitz.open(os.path.join(folder, file_name))
page = pdf_document[0]
image_list = page.get_images(full=True)
for img_index, img in enumerate(image_list):
xref = img[0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
image = Image.open(io.BytesIO(image_bytes))
text = pytesseract.image_to_string(image)
extracted_text.append(text)
# Print the extracted text
for page_num, text in enumerate(extracted_text, start=1):
print(f"Page {page_num}:n{text}n")
</code>
<code>import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import fitz
import pymupdf
import pandas as pd
import os
import io
import re
# Path to the PDF file
folder = "C:\Users\Yasser\Desktop\Test2"
dir_list = os.listdir(folder)
extracted_text = []
for file_name in dir_list:
if file_name.endswith(".pdf"):
# Open the PDF file
pdf_document = fitz.open(os.path.join(folder, file_name))
page = pdf_document[0]
image_list = page.get_images(full=True)
for img_index, img in enumerate(image_list):
xref = img[0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
image = Image.open(io.BytesIO(image_bytes))
text = pytesseract.image_to_string(image)
extracted_text.append(text)
# Print the extracted text
for page_num, text in enumerate(extracted_text, start=1):
print(f"Page {page_num}:n{text}n")
</code>
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import fitz
import pymupdf
import pandas as pd
import os
import io
import re
# Path to the PDF file
folder = "C:\Users\Yasser\Desktop\Test2"
dir_list = os.listdir(folder)
extracted_text = []
for file_name in dir_list:
if file_name.endswith(".pdf"):
# Open the PDF file
pdf_document = fitz.open(os.path.join(folder, file_name))
page = pdf_document[0]
image_list = page.get_images(full=True)
for img_index, img in enumerate(image_list):
xref = img[0]
base_image = pdf_document.extract_image(xref)
image_bytes = base_image["image"]
image = Image.open(io.BytesIO(image_bytes))
text = pytesseract.image_to_string(image)
extracted_text.append(text)
# Print the extracted text
for page_num, text in enumerate(extracted_text, start=1):
print(f"Page {page_num}:n{text}n")
Don’t mind the import list I was testing a lot of ways, and if you have any recommendation for OCR I’m all ears