Many of the pictures in my PPT are grouped (combined) shapes, so I need to export each group as a single combined image and insert it into the Word document. Would converting the PPT to PDF first solve this problem? Could an experienced developer suggest some ideas?
I have searched a lot, but I can only export the pictures one by one, and the order of the pictures extracted from a single slide is jumbled. I don't know why: the order should be left to right, top to bottom, but that is not what I get when extracting.
from pptx import Presentation
from docx import Document
from pptx.shapes.picture import Picture
from docx.shared import Inches
import re
import os
def _shapes_in_reading_order(shapes):
    """Yield non-group shapes top-to-bottom, then left-to-right.

    Group shapes are expanded recursively so that pictures, text boxes and
    tables nested inside a group are not skipped. Ordering is decided among
    the members of one shape collection at a time (a slide or a group).
    """
    # Imported locally so this block does not depend on a new top-of-file
    # import; pptx is already a dependency of this file.
    from pptx.shapes.group import GroupShape

    def position(shape):
        # top/left may be None when a shape carries no explicit position;
        # treat that as the origin so sorting never compares None with int.
        return (shape.top or 0, shape.left or 0)

    # sorted() is stable, so shapes at the same position keep their
    # original (z-order) sequence.
    for shape in sorted(shapes, key=position):
        if isinstance(shape, GroupShape):
            yield from _shapes_in_reading_order(shape.shapes)
        else:
            yield shape


def ppt_to_docx(ppt_path, docx_path):
    """Copy the text, tables and pictures of a PPTX file into a new DOCX file.

    Args:
        ppt_path: Path of the presentation to read.
        docx_path: Path the generated Word document is saved to.

    Every slide is walked in top-to-bottom, left-to-right order, descending
    into group shapes. Text-frame paragraphs become Word paragraphs, tables
    are copied with ``process_table`` and pictures are inserted 3.5 inches
    wide. Pictures go through a temporary ``temp_img_<n>.jpg`` file in the
    current working directory, which is removed again even if inserting the
    picture fails.
    """
    # Load the requested presentation and start an empty Word document.
    ppt = Presentation(ppt_path)
    doc = Document()
    index = 1
    for slide in ppt.slides:
        for shape in _shapes_in_reading_order(slide.shapes):
            if shape.has_text_frame:
                # Add each paragraph after stripping XML-incompatible characters.
                for paragraph in shape.text_frame.paragraphs:
                    doc.add_paragraph(clean_text_for_xml(paragraph.text))
            elif shape.has_table:
                process_table(shape.table, doc)
            elif isinstance(shape, Picture):
                temp_img_path = f'temp_img_{index}.jpg'
                try:
                    # Write the raw image bytes to a temporary file ...
                    with open(temp_img_path, 'wb') as f:
                        f.write(shape.image.blob)
                    # ... and insert that file into the Word document.
                    doc.add_picture(temp_img_path, width=Inches(3.5))
                    index += 1
                    print(f'Image saved as: {temp_img_path}')
                finally:
                    # Always clean up, including when add_picture raises.
                    if os.path.exists(temp_img_path):
                        os.remove(temp_img_path)
    # Save the Word document.
    doc.save(docx_path)
def clean_text_for_xml(text):
    """Return *text* with control characters that XML cannot hold removed.

    Strips U+0000-U+0008, U+000B, U+000C (form feed), U+000E-U+001F and
    U+007F. Tab, line feed and carriage return are kept.

    The earlier version removed the letter "f" instead of the form-feed
    character, which silently corrupted ordinary words.
    """
    return re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
def process_table(table, doc):
    """Append a Word table to *doc* mirroring the text of a PPT *table*.

    The new table has as many rows and columns as the source table, uses
    the 'Light Grid' style, and receives each source cell's text after it
    has been passed through ``clean_text_for_xml``.
    """
    row_count = len(table.rows)
    column_count = len(table.columns)
    word_table = doc.add_table(rows=row_count, cols=column_count)
    word_table.style = 'Light Grid'

    # Copy the text cell by cell, keeping the same row/column coordinates.
    for row_idx, source_row in enumerate(table.rows):
        for col_idx, source_cell in enumerate(source_row.cells):
            word_table.cell(row_idx, col_idx).text = clean_text_for_xml(source_cell.text)
# Presentation file chosen by the user.
ppt_path = 'static/测试报告.pptx'
# Destination path of the generated Word document.
docx_path = 'static/demo1.docx'
# Run the conversion.
ppt_to_docx(ppt_path=ppt_path, docx_path=docx_path)
user23640279 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
2