Thiết kế website giá rẻ

Question

enter image description here
I am using pdfplumber, a Python library, to extract data from a PDF, but I can't get the images out of one particular table column.

I am using the packages below to get the images:
import fitz # PyMuPDF
from PIL import Image

So I need to use both pdfplumber and the packages above to place the images in the Excel rows exactly as they appear in the PDF.

from flask import Flask, request, send_file, jsonify
import pdfplumber
import pandas as pd
import os
from flask_cors import CORS
import io
import fitz  # PyMuPDF
from PIL import Image
from openpyxl import Workbook
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.utils.dataframe import dataframe_to_rows

# Flask application instance; the route decorators below register on it.
app = Flask(__name__)
# Apply flask_cors to the whole app, using the extension's default settings.
CORS(app)


@app.route('/')
def home():
    """Return a JSON welcome message for the API root."""
    greeting = {'message': 'Welcome to the PDF to Excel converter API!'}
    return jsonify(greeting)


def _table_rows_for_page(page, include_header):
    """Return worksheet rows for all tables found on one pdfplumber page.

    Every table on the page is loaded into a DataFrame and the frames are
    concatenated in extraction order. When ``include_header`` is true the
    rows are preceded by the combined frame's column labels; because the
    tables are loaded without column names these are pandas' default
    integer labels, not the table's own first row.

    Returns an empty list when the page has no tables.
    """
    frames = [pd.DataFrame(table) for table in page.extract_tables()]
    if not frames:
        return []
    combined_df = pd.concat(frames, ignore_index=True)
    return list(dataframe_to_rows(combined_df, index=False, header=include_header))


def _add_page_images(wb, pdf_file, page_num, output_dir):
    """Save every image on a page to disk and embed each on its own sheet.

    ``pdf_file`` is an already-open PyMuPDF document and ``page_num`` is
    1-based. Image files are written to ``output_dir`` and must remain on
    disk until the workbook has been saved.
    """
    page_mupdf = pdf_file[page_num - 1]  # PyMuPDF page index is 0-based
    for img_index, img in enumerate(page_mupdf.get_images(full=True), start=1):
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_ext = base_image["ext"]
        image = Image.open(io.BytesIO(base_image["image"]))

        image_filename = f'{output_dir}/page_{page_num}_img_{img_index}.{image_ext}'
        image.save(image_filename)

        ws_images = wb.create_sheet(title=f"Page_{page_num}_Image_{img_index}")
        ws_images.add_image(ExcelImage(image_filename), 'A1')


@app.route('/upload', methods=['POST'])
def upload_file():
    """Convert an uploaded PDF into an Excel workbook and return it.

    Expects a multipart POST with the PDF under the ``file`` field. The
    tables of every page are appended to a "Data" sheet, and each embedded
    image is placed at cell A1 of its own "Page_<n>_Image_<m>" sheet.

    Returns:
        The generated ``.xlsx`` file as an attachment on success;
        a JSON ``{'error': ...}`` body with status 400 when no usable file
        was supplied, or status 500 when the conversion fails.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    file = request.files['file']
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400

    # The filename is client-controlled: keep only its final path component
    # so the upload cannot be written outside the 'uploads' directory.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({'error': 'No selected file'}), 400

    # Save the uploaded PDF file
    os.makedirs('uploads', exist_ok=True)
    pdf_path = os.path.join('uploads', safe_name)
    file.save(pdf_path)

    # Swap only the extension; a plain str.replace('.pdf', ...) leaves names
    # such as 'x.PDF' unchanged, so the workbook would overwrite the upload.
    excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'

    try:
        wb = Workbook()
        ws_data = wb.active
        ws_data.title = "Data"

        output_dir = 'images'
        os.makedirs(output_dir, exist_ok=True)

        # Open the PyMuPDF document once for all pages and close it when
        # done, instead of re-opening (and leaking) it on every page.
        with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as pdf_file:
            for page_num, page in enumerate(pdf.pages, start=1):
                # The header row is emitted for the first page only.
                for row in _table_rows_for_page(page, include_header=(page_num == 1)):
                    ws_data.append(row)
                _add_page_images(wb, pdf_file, page_num, output_dir)

        # Save the Excel workbook
        wb.save(excel_path)
        return send_file(excel_path, as_attachment=True)

    except Exception as e:
        # Top-level request boundary: report any conversion failure as JSON.
        return jsonify({'error': str(e)}), 500


if __name__ == '__main__':
    # Make sure the upload directory is present before serving requests.
    uploads_dir = 'uploads'
    if not os.path.exists(uploads_dir):
        os.makedirs(uploads_dir)

    # NOTE(review): debug mode combined with host 0.0.0.0 makes the
    # development server (and its debugger) reachable from other machines;
    # confirm this is only ever run in a trusted environment.
    run_options = {'debug': True, 'port': 2000, 'host': '0.0.0.0'}
    app.run(**run_options)

Thiết kế website giá rẻ

Danh mục

Extracting PDF data into Excel