We relaunched a website that has hundreds of PDFs in different languages with hard-printed QR codes.
After the relaunch we ran into the problem that some QR codes no longer link to the right URLs.
To reduce the workload, my idea was to download all PDFs, scan each one for QR codes, extract the URL from each QR code, and send a request to check whether that URL returns status code 200 (and end up with a table of all documents and links).
But my code can't find the QR codes.
Any help to fix this?
import os
import requests
import fitz # PyMuPDF
import cv2
from pyzbar.pyzbar import decode
import numpy as np
import csv
import pdfplumber
# Download a PDF to save_path, skipping files that already exist
def download_pdf(url, save_path):
    if os.path.exists(save_path):
        print(f"{save_path} already exists, skipping download.")
        return True
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {url}")
        return True
    else:
        print(f"Failed to download {url}, status code: {response.status_code}")
        return False
# Convert PDF pages to images with PyMuPDF (renders at the default 72 dpi)
def pdf_to_images(pdf_path):
    doc = fitz.open(pdf_path)
    images = []
    for page in doc:
        pix = page.get_pixmap()
        # Reshape the raw pixel buffer into an RGB(A) numpy array
        if pix.alpha:
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 4)
        else:
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, 3)
        images.append(img)
    return images
# Extract QR codes from an image and return the decoded URLs
def extract_qr_codes(image):
    qr_codes = decode(image)
    return [qr.data.decode('utf-8') for qr in qr_codes]
# Check the URL and return its HTTP status code
def check_url(url):
    try:
        response = requests.head(url, allow_redirects=True)
        return response.status_code
    except requests.RequestException as e:
        print(f"Error checking URL {url}: {e}")
        return 'Error'
# Alternative image extraction with pdfplumber
def extract_images_and_decode_qr(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            im_list = page.images
            for im in im_list:
                bbox = (im['x0'], im['top'], im['x1'], im['bottom'])
                cropped_image = page.crop(bbox).to_image(resolution=300)
                image = np.array(cropped_image.original)
                # The underlying PIL image is RGB, so convert with COLOR_RGB2GRAY
                image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
                qr_codes = decode(image)
                for qr in qr_codes:
                    print("Found QR code:", qr.data.decode())
# Main function: download, scan, and check each PDF
def process_pdfs(url_list):
    results = []
    for url in url_list:
        pdf_filename = os.path.basename(url)
        pdf_path = os.path.join('downloads', pdf_filename)
        if not os.path.exists('downloads'):
            os.makedirs('downloads')
        if download_pdf(url, pdf_path):
            print(f"Processing {pdf_path}...")
            images = pdf_to_images(pdf_path)
            for img in images:
                qr_urls = extract_qr_codes(img)
                for qr_url in qr_urls:
                    status_code = check_url(qr_url)
                    results.append((pdf_filename, qr_url, status_code))
            # Fall back to pdfplumber if no QR URLs were found for this PDF
            if not any(result[1] for result in results if result[0] == pdf_filename):
                extract_images_and_decode_qr(pdf_path)
    if results:
        with open('results.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            # Header matches the three values stored per result row
            writer.writerow(['PDF Name', 'QR Code URL', 'Status Code'])
            for result in results:
                writer.writerow(result)
    else:
        print("No results to write to CSV.")
# Test call with URLs
if __name__ == "__main__":
    url_list = [
        "https:.....pdf"
    ]
    process_pdfs(url_list)
I have tried different code variants and approaches.
Expecting: basic QR code detection first of all, and then the status code of the URL behind each QR code.
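For reference, here is a minimal sketch of the detection step in isolation. It assumes (not confirmed) that the default 72 dpi render from get_pixmap() may simply be too small for pyzbar, so it renders one page at 3x scale and decodes a grayscale array; "sample.pdf" is a placeholder file name.

import fitz  # PyMuPDF
import numpy as np
import cv2
from pyzbar.pyzbar import decode

# Render the first page at 3x scale (~216 dpi) instead of the 72 dpi default
doc = fitz.open("sample.pdf")  # placeholder: any local PDF with a QR code
page = doc[0]
pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
# Reshape the raw pixel buffer into a (height, width, channels) array
img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
# Convert to grayscale before decoding (PyMuPDF pixmaps are RGB or RGBA)
gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY if pix.n == 3 else cv2.COLOR_RGBA2GRAY)
for qr in decode(gray):
    print("Found QR code:", qr.data.decode('utf-8'))

If this sketch finds codes on a page where the full script does not, the render resolution in pdf_to_images() would be the first thing to look at.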