enter image description here
I am using pdfplumber, a Python library, to extract data from a PDF, but I can't get the images out of one particular table column.
I am using the packages below to get the images:
import fitz # PyMuPDF
from PIL import Image
So I need to use both pdfplumber and the packages above to place the images in the Excel rows exactly as they appear in the PDF.
from flask import Flask, request, send_file, jsonify
import pdfplumber
import pandas as pd
import os
from flask_cors import CORS
import io
import fitz # PyMuPDF
from PIL import Image
from openpyxl import Workbook
from openpyxl.drawing.image import Image as ExcelImage
from openpyxl.utils.dataframe import dataframe_to_rows
# Create the Flask application object that the route decorators below attach to
app = Flask(__name__)
# Wrap the app with flask-cors, using its default settings
CORS(app)
@app.route('/')
def home():
    """Return a JSON welcome message for the API root URL."""
    welcome = {'message': 'Welcome to the PDF to Excel converter API!'}
    return jsonify(welcome)
def _safe_upload_name(raw_name):
    """Return only the final path component of a client-supplied file name.

    The name comes from the request and must not be trusted: without this,
    a name such as ``../../app.py`` would be written outside ``uploads``.
    """
    # Clients may send Windows-style paths; normalise separators so that
    # basename() strips the directories on every platform.
    return os.path.basename(raw_name.replace('\\', '/'))


def _page_table_rows(page, include_header):
    """Return the worksheet rows for every table found on a pdfplumber page.

    All tables on the page are stacked into one DataFrame. When
    *include_header* is true the DataFrame's column labels are emitted as
    the first row. An empty list is returned for a page without tables.
    """
    frames = [pd.DataFrame(table) for table in page.extract_tables()]
    if not frames:
        return []
    combined_df = pd.concat(frames, ignore_index=True)
    return list(dataframe_to_rows(combined_df, index=False, header=include_header))


def _add_page_images(wb, pdf_file, page_num, output_dir):
    """Save each image of one PDF page and add it to its own worksheet.

    *pdf_file* is the open PyMuPDF document and *page_num* is 1-based.
    Every image is written to *output_dir* and then anchored at cell A1 of
    a new sheet named ``Page_<page>_Image_<n>``.
    """
    page_mupdf = pdf_file[page_num - 1]  # PyMuPDF page index is 0-based
    for img_index, img in enumerate(page_mupdf.get_images(full=True), start=1):
        xref = img[0]
        base_image = pdf_file.extract_image(xref)
        image_ext = base_image["ext"]
        image = Image.open(io.BytesIO(base_image["image"]))

        image_filename = f'{output_dir}/page_{page_num}_img_{img_index}.{image_ext}'
        image.save(image_filename)

        ws_images = wb.create_sheet(title=f"Page_{page_num}_Image_{img_index}")
        ws_images.add_image(ExcelImage(image_filename), 'A1')


@app.route('/upload', methods=['POST'])
def upload_file():
    """Convert an uploaded PDF into an Excel workbook and return it.

    Expects a multipart POST with the PDF in the ``file`` field. Tables
    from every page are appended to the ``Data`` sheet; every embedded
    image is placed on a separate sheet.

    Returns:
        The generated ``.xlsx`` file as an attachment, or a JSON error
        body with status 400 (bad request) or 500 (conversion failure).
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    # Covers both an empty string and a missing (None) file name
    if not file.filename:
        return jsonify({'error': 'No selected file'}), 400

    filename = _safe_upload_name(file.filename)
    if filename in ('', '.', '..'):
        return jsonify({'error': 'Invalid file name'}), 400

    # Save the uploaded PDF file
    os.makedirs('uploads', exist_ok=True)
    pdf_path = os.path.join('uploads', filename)
    file.save(pdf_path)

    # Swap only the real extension; str.replace would also rewrite '.pdf'
    # in the middle of a name and would miss '.PDF' entirely.
    excel_path = os.path.splitext(pdf_path)[0] + '.xlsx'
    output_dir = 'images'

    try:
        os.makedirs(output_dir, exist_ok=True)

        wb = Workbook()
        ws_data = wb.active
        ws_data.title = "Data"

        # Open the PDF once with each library and close both handles even
        # if extraction fails part-way through.
        with pdfplumber.open(pdf_path) as pdf, fitz.open(pdf_path) as pdf_file:
            for page_num, page in enumerate(pdf.pages, start=1):
                # Headers are emitted only for the first page
                for row in _page_table_rows(page, include_header=(page_num == 1)):
                    ws_data.append(row)
                _add_page_images(wb, pdf_file, page_num, output_dir)

        wb.save(excel_path)
        # Flask resolves relative paths against the application root, not
        # the working directory the workbook was saved in, so pass an
        # absolute path.
        return send_file(os.path.abspath(excel_path), as_attachment=True)
    except Exception as e:
        # Top-level request boundary: record the traceback, report the error
        app.logger.exception("Failed to convert %s", pdf_path)
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # Make sure the upload directory exists before serving requests
    os.makedirs('uploads', exist_ok=True)
    # The server listens on all interfaces, so the interactive debugger
    # (which allows arbitrary code execution) must not be on by default.
    # Set FLASK_DEBUG=1 to enable it for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG') == '1'
    app.run(debug=debug_enabled, port=2000, host='0.0.0.0')
New contributor
dheeraj gakkampudi is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
3