I'm currently developing a resume extractor and I'm in a testing phase with several modules. Right now, I want to draw the rectangles surrounding the main parts of a resume. Thanks to pytesseract, I have every box that the layout analysis + OCR can detect, but I can't manage to draw only the important ones. I tried computing the area of the full page and ignoring any rectangle bigger than the page, but it's not working.
# Hierarchy level that pytesseract's image_to_data reports for the entry
# covering the whole page (see the Tesseract TSV output description).
PAGE_LEVEL = 1


def _area(box):
    """Return the area of an ``(x, y, w, h)`` box."""
    return box[2] * box[3]


def _contains(outer, inner):
    """Return True if box ``inner`` lies entirely within box ``outer``.

    Both boxes are ``(x, y, w, h)`` tuples with the origin at the top-left
    corner; touching edges count as contained.
    """
    ox, oy, ow, oh = outer
    ix, iy, iw, ih = inner
    return (
        ox <= ix
        and oy <= iy
        and ix + iw <= ox + ow
        and iy + ih <= oy + oh
    )


def _outermost_boxes(boxes, max_area):
    """Return the boxes that are not nested inside another box.

    Args:
        boxes: iterable of ``(x, y, w, h)`` tuples.
        max_area: boxes whose area is ``max_area`` or more are discarded
            before the containment test, so a page-sized box is neither
            drawn nor allowed to hide everything it encloses.

    Returns:
        A list of the surviving boxes, in the order they appear in ``boxes``.
    """
    # Collapse exact duplicates first: two identical boxes would otherwise
    # each be "contained" in the other and both would disappear.
    candidates = [
        box for box in dict.fromkeys(boxes) if _area(box) < max_area
    ]
    return [
        box
        for box in candidates
        if not any(
            _contains(other, box) for other in candidates if other != box
        )
    ]


for page in pages:
    # Convert the page into the BGR array layout that OpenCV works with.
    img = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
    img = cv2.resize(img, None, fx=0.4, fy=0.35)
    taille_max = img.shape[0] * img.shape[1]  # page area in pixels
    print(taille_max)

    d = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Build the list of rectangles, skipping the page-level entry that
    # encloses the whole image.
    rect = [
        (left, top, width, height)
        for level, left, top, width, height in zip(
            d['level'], d['left'], d['top'], d['width'], d['height']
        )
        if level != PAGE_LEVEL
    ]
    print(rect)

    # Sort the rectangles by area, smallest first.
    rect.sort(key=_area)
    print(rect)

    # Keep only the rectangles that are not included in another one.
    rectangles_a_tracer = _outermost_boxes(rect, taille_max)

    # Draw the remaining rectangles.
    for x, y, w, h in rectangles_a_tracer:
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 1)
    cv2.imshow('img', img)
    cv2.waitKey(0)
New contributor
Anis-Samy LAAMRI is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.