I wrote a py file that can extract all the images from a docx and save them in one folder. I found that it is effective for embedded images, but cannot recognize floating images.
I found that the problem is with this statement: for inline_sthape in doc.line_sthapes, it can only recognize embedded images and cannot recognize floating images.
What should I do?
import os
from docx import Document
from docx.shared import RGBColor
doc = Document('sample.docx')
#img num, tag
img_address_list = []
i = 0
for paragraph in doc.paragraphs:
for run in paragraph.runs:
run_xml = run._r
if '<w:drawing>' in run_xml.xml:
paragraph.add_run('<here_is_a_img!>') #add a tag
img_address_list.append(i)
i += 1
doc.save('here_is_a_img.docx')
img_num = len(img_address_list)
print(img_num)
#extract and save
i = 0
for inline_shape in doc.inline_shapes: #cannot recognize floating images
print(i) #without entering this loop, this code block is invalid
#<docx.shape.InlineShape object turn <docx.parts.image.ImagePart object
blip = inline_shape._inline.graphic.graphicData.pic.blipFill.blip
rID = blip.embed
image_part = doc.part.related_parts[rID]
image_name = 'img' + str(i) + '.jpg'
fr = open(image_name, "wb")
fr.write(image_part._blob)
fr.close()
i += 1