I see that a similar question has been asked (“A Rectangle must have a non-negative height”), but I can’t figure out how to apply its answer to my case. My error is also slightly different: it complains about a non-negative width rather than height. Anyway:
I’ve adapted code I found here (https://github.com/jorisschellekens/borb-examples#53-extracting-text-using-regular-expressions) and saved it as C:\Users\Erik\Desktop\Strings_uit_pdf\stringsuitpdf.py.
I also have a PDF in the same folder: C:\Users\Erik\Desktop\Strings_uit_pdf\Jim.pdf.
It’s rather large: https://drive.google.com/file/d/19ykr26QNVb8aK6hPdh0pikkEVRScSCBX/view?usp=sharing.
This is the code:
#!chapter_005/src/snippet_006.py
import typing
from borb.pdf import Document
from borb.pdf import PDF
from borb.toolkit import RegularExpressionTextExtraction
# File name of the PDF to search; a relative path, so it is resolved against
# the current working directory when the script opens it.
teonderzoekenpdf: str = "Jim.pdf"
# Pattern handed to RegularExpressionTextExtraction as the text to look for.
zoekstring: str = "Cited by"
def main():
    """Search the PDF for ``zoekstring`` and print every match with its page key.

    Opens the file named by the module-level ``teonderzoekenpdf``, loads it
    with ``PDF.loads`` while a ``RegularExpressionTextExtraction`` listener
    (built from ``zoekstring``) collects matches, then prints one line per
    match for *every* page entry the listener reports -- not only entry ``0``.

    The "Stap N" prints are progress markers that show how far the run got.

    Raises:
        AssertionError: if ``PDF.loads`` returned ``None``.
        Any exception raised by ``open`` or ``PDF.loads`` propagates unchanged.

    NOTE(review): this function does not work around errors raised inside
    ``PDF.loads`` itself (for example while the listener processes a page).
    """
    # read the Document
    # fmt: off
    print("Stap 2")
    doc: typing.Optional[Document] = None
    extractor: RegularExpressionTextExtraction = RegularExpressionTextExtraction(zoekstring)
    with open(teonderzoekenpdf, "rb") as in_file_handle:
        print("Stap 3")
        # The listener is fed while the document is being parsed, so all
        # matching happens inside this call.
        doc = PDF.loads(in_file_handle, [extractor])
        print("Stap 3,5")
    # fmt: on

    # check whether we have read a Document
    print("Stap 4")
    assert doc is not None
    print("Stap 5")

    # print matching groups for every page, not just the first one
    # NOTE(review): get_matches() is expected to map a page number to the
    # list of matches on that page; the keys are presumably zero-based
    # (the original code read entry 0 as the first page) -- confirm.
    matches_per_page = extractor.get_matches()
    for page_nr, page_matches in sorted(matches_per_page.items()):
        for i, m in enumerate(page_matches):
            print("Stap 6")
            print("page %d: %d %s" % (page_nr, i, m.group(0)))
            # m.get_bounding_boxes() can be iterated here if the position of
            # each match on the page is needed as well.
# Script entry point: emit the first progress marker, then run the search.
# Nothing below executes when this file is imported as a module.
if __name__ == "__main__":
    print("Stap 1")
    main()
This is the output:
PS C:UsersErikDesktopStrings_uit_pdf> c:; cd 'c:UsersErikDesktopStrings_uit_pdf'; & 'c:Python312python.exe' 'c:UsersErik.vscodeextensionsms-python.debugpy-2024.14.0-win32-x64bundledlibsdebugpyadapter/../..debugpylauncher' '51988' '--' 'c:UsersErikDesktopStrings_uit_pdfstringsuitpdf.py'
Stap 1
Stap 2
Stap 3
Traceback (most recent call last):
File "c:UsersErikDesktopStrings_uit_pdfstringsuitpdf.py", line 56, in <module>
main()
File "c:UsersErikDesktopStrings_uit_pdfstringsuitpdf.py", line 29, in main
doc = PDF.loads(in_file_handle, [l])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbpdfpdf.py", line 85, in loads
document: Document = ReadAnyObjectTransformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadreferencexref_transformer.py", line 305, in transform
trailer = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadobjectdictionary_transformer.py", line 69, in transform
v = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadreferencereference_transformer.py", line 169, in transform
transformed_referenced_object = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadpageroot_dictionary_transformer.py", line 112, in transform
transformed_root_dictionary = t.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadobjectdictionary_transformer.py", line 69, in transform
v = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadreferencereference_transformer.py", line 169, in transform
transformed_referenced_object = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadobjectdictionary_transformer.py", line 69, in transform
v = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadobjectarray_transformer.py", line 69, in transform
object_to_transform[i] = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadreferencereference_transformer.py", line 169, in transform
transformed_referenced_object = self.get_root_transformer().transform(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadany_object_transformer.py", line 113, in transform
return super().transform(
^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadtransformer.py", line 149, in transform
out = h.transform(
^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbioreadpagepage_dictionary_transformer.py", line 129, in transform
CanvasStreamProcessor(page_out, canvas, []).read(
File "c:Python312Libsite-packagesborbpdfcanvascanvas_stream_processor.py", line 277, in read
raise e
File "c:Python312Libsite-packagesborbpdfcanvascanvas_stream_processor.py", line 271, in read
operator.invoke(self, operands, event_listeners)
File "c:Python312Libsite-packagesborbpdfcanvasoperatortextshow_text_with_glyph_positioning.py", line 84, in invoke
l._event_occurred(tri)
File "c:Python312Libsite-packagesborbtoolkittextregular_expression_text_extraction.py", line 326, in _event_occurred
self._render_text(event)
File "c:Python312Libsite-packagesborbtoolkittextregular_expression_text_extraction.py", line 338, in _render_text
for e in text_render_info.split_on_glyphs():
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "c:Python312Libsite-packagesborbpdfcanvaseventchunk_of_text_render_event.py", line 177, in split_on_glyphs
e._baseline_bounding_box = Rectangle(
^^^^^^^^^^
File "c:Python312Libsite-packagesborbpdfcanvasgeometryrectangle.py", line 30, in __init__
assert width >= 0, "A Rectangle must have a non-negative width."
^^^^^^^^^^
AssertionError: A Rectangle must have a non-negative width.
I’m only looking for text, so I don’t see what the rectangle has to do with anything… But there are some boxes in the document, so maybe that’s where the error comes from?
Anyway, I would love to be able to get a list of every occurrence of my search string, ideally together with the page number of each occurrence.
If anyone knows what I could do to get this to work, that would be great!
Thanks in advance!
2