We relaunched a website that has hundreds of PDFs in different languages with hard-printed QR codes.
After the relaunch we ran into the problem that some QR codes no longer link to the right URLs.
To reduce the workload, my idea was to download all PDFs, scan each one for QR codes, extract the URL from each QR code, and send a request to check whether that URL returns status code 200 (and end up with a table of all documents and links).
But my code can't find the QR codes.
Any help to fix this?
import os
import requests
import fitz # PyMuPDF
import cv2
from pyzbar.pyzbar import decode
import numpy as np
import csv
import pdfplumber
# Download a PDF to save_path, skipping files that already exist
def download_pdf(url, save_path):
    if os.path.exists(save_path):
        print(f"{save_path} already exists, skipping download.")
        return True
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {url}")
        return True
    else:
        print(f"Failed to download {url}, status code: {response.status_code}")
        return False
# Convert PDF pages to images with PyMuPDF (renders at the default 72 dpi)
def pdf_to_images(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap()
        # Reshape the raw pixel buffer into an RGB(A) numpy array
        if pix.alpha:
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 4)
        else:
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
        images.append(img)
    return images
# Extract QR codes from an image and return the decoded URLs
def extract_qr_codes(image):
    qr_codes = decode(image)
    return [qr.data.decode('utf-8') for qr in qr_codes]
# Check the URL and return its HTTP status code
def check_url(url):
    try:
        response = requests.head(url, allow_redirects=True)
        return response.status_code
    except requests.RequestException as e:
        print(f"Error checking URL {url}: {e}")
        return 'Error'
# Alternative image extraction with pdfplumber
def extract_images_and_decode_qr(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            im_list = page.images
            for im in im_list:
                bbox = (im['x0'], im['top'], im['x1'], im['bottom'])
                cropped_image = page.crop(bbox).to_image(resolution=300)
                image = np.array(cropped_image.original)
                # The underlying PIL image is RGB, so convert with COLOR_RGB2GRAY
                image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
                qr_codes = decode(image)
                for qr in qr_codes:
                    print("Found QR code:", qr.data.decode())
# Main function: download, scan, and check each PDF
def process_pdfs(url_list):
    results = []
    for url in url_list:
        pdf_filename = os.path.basename(url)
        pdf_path = os.path.join('downloads', pdf_filename)
        if not os.path.exists('downloads'):
            os.makedirs('downloads')
        if download_pdf(url, pdf_path):
            print(f"Processing {pdf_path}...")
            images = pdf_to_images(pdf_path)
            for img in images:
                qr_urls = extract_qr_codes(img)
                for qr_url in qr_urls:
                    status_code = check_url(qr_url)
                    results.append((pdf_filename, qr_url, status_code))
            # Fall back to pdfplumber if no QR URLs were found for this PDF
            if not any(result[1] for result in results if result[0] == pdf_filename):
                extract_images_and_decode_qr(pdf_path)
    if results:
        with open('results.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            # Header matches the three values stored per result row
            writer.writerow(['PDF Name', 'QR Code URL', 'Status Code'])
            for result in results:
                writer.writerow(result)
    else:
        print("No results to write to CSV.")
# Test call with URLs
if __name__ == "__main__":
    url_list = [
        "https:.....pdf"
    ]
    process_pdfs(url_list)
I have tried different code variants and approaches.
Expecting: basic QR code detection first of all, and then the status code of the URL behind each QR code.
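For reference, here is a minimal sketch of the detection step in isolation. It assumes (not confirmed) that the default 72 dpi render from get_pixmap() may simply be too small for pyzbar, so it renders one page at 3x scale and decodes a grayscale array; "sample.pdf" is a placeholder file name.

import fitz  # PyMuPDF
import numpy as np
import cv2
from pyzbar.pyzbar import decode

# Render the first page at 3x scale (~216 dpi) instead of the 72 dpi default
doc = fitz.open("sample.pdf")  # placeholder: any local PDF with a QR code
page = doc[0]
pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
# Reshape the raw pixel buffer into a (height, width, channels) array
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
# Convert to grayscale before decoding (PyMuPDF pixmaps are RGB or RGBA)
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY if pix.n == 3 else cv2.COLOR_RGBA2GRAY)
for qr in decode(gray):
    print("Found QR code:", qr.data.decode('utf-8'))

If this sketch finds codes on a page where the full script does not, the render resolution in pdf_to_images() would be the first thing to look at.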