I find the documentation for Textract really hard to understand. I'm baffled as to why I'm struggling, given that I was able to create a document classifier using the Comprehend batch API with this code:
import time
import boto3
from botocore.exceptions import ClientError

textract = boto3.client('textract', region_name="us-east-2")
comprehend = boto3.client('comprehend', region_name="us-east-2")

def extract_text_from_pdf(bucket_name, document_name):
    print(f"Extracting text from {document_name}")
    try:
        # Start the asynchronous text detection job
        response = textract.start_document_text_detection(
            DocumentLocation={'S3Object': {'Bucket': bucket_name, 'Name': document_name}}
        )
        job_id = response['JobId']
        print(f"Started Textract job {job_id}")

        # Poll until the job finishes
        while True:
            response = textract.get_document_text_detection(JobId=job_id)
            status = response['JobStatus']
            print(f"Textract job status: {status}")
            if status in ['SUCCEEDED', 'FAILED']:
                break
            time.sleep(5)

        if status == 'SUCCEEDED':
            # Walk every page of results and collect the LINE blocks
            text = ""
            pagination_token = None
            while True:
                if pagination_token:
                    response = textract.get_document_text_detection(JobId=job_id, NextToken=pagination_token)
                else:
                    response = textract.get_document_text_detection(JobId=job_id)
                for item in response['Blocks']:
                    if item['BlockType'] == 'LINE':
                        text += item['Text'] + "\n"
                if 'NextToken' in response:
                    pagination_token = response['NextToken']
                else:
                    break
            print(f"Extracted {len(text)} characters of text")
            return text
        else:
            print(f"Textract job failed for {document_name}")
            return None
    except Exception as e:
        print(f"Error in text extraction: {str(e)}")
        return None
def start_batch_classification_job(input_s3_uri, output_s3_uri, data_access_role_arn, classifier_arn):
    try:
        response = comprehend.start_document_classification_job(
            InputDataConfig={
                'S3Uri': input_s3_uri,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': output_s3_uri
            },
            DataAccessRoleArn=data_access_role_arn,
            DocumentClassifierArn=classifier_arn
        )
        return response['JobId']
    except ClientError as e:
        print(f"Error starting batch classification job: {e}")
        return None
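For context, I chain these together roughly like this (the bucket names, role ARN, and classifier ARN below are placeholders, not my real values):

text = extract_text_from_pdf("my-input-bucket", "claims/some-claim.pdf")
# ...write the extracted text back to S3 as one file per document, then:
job_id = start_batch_classification_job(
    input_s3_uri="s3://my-input-bucket/extracted-text/",
    output_s3_uri="s3://my-output-bucket/classification-results/",
    data_access_role_arn="arn:aws:iam::111122223333:role/ComprehendDataAccessRole",
    classifier_arn="arn:aws:comprehend:us-east-2:111122223333:document-classifier/my-classifier",
)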
However, I've tried to use start_document_analysis and get_document_analysis in exactly the same way with the Queries feature, and the program only returns results from the first page unless I specify the page number each query should be answered from, which makes the feature useless for my use case. I know I must be doing something wrong here, but I'm not sure what else I'm supposed to do. I also don't understand why I'd have to set up Amazon SNS or SQS when I was able to retrieve all the text from multi-page PDFs with start/get_document_text_detection just fine without them.
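For reference, this is roughly the shape of the analysis call I'm making. It's a simplified sketch that reuses a couple of my queries; the bucket and key names are placeholders, and the polling and NextToken handling mirror extract_text_from_pdf above:

# Simplified sketch of my start_document_analysis call (bucket/key
# are placeholders). Without an explicit 'Pages' list on each query,
# I only get answers back for the first page.
response = textract.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': 'my-bucket', 'Name': 'my-doc.pdf'}},
    FeatureTypes=['QUERIES'],
    QueriesConfig={'Queries': [
        {'Text': 'What is the claim number?', 'Alias': 'CLAIM_NUMBER'},
        {'Text': 'What date was the claimant unable to work?', 'Alias': 'AOD'},
    ]}
)
job_id = response['JobId']
# ...then poll get_document_analysis(JobId=job_id) and page through
# NextToken exactly as in extract_text_from_pdf above.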
I tried using textractcaller instead of calling Textract directly, but I'm getting the same results: I have to specify the page number for each query.
import textractcaller as tc
import trp.trp2 as t2
import boto3

textract = boto3.client('textract', region_name="us-east-2")

q1 = tc.Query(text="What reports were used to decide the claim?", alias="REPORTS", pages=["4"])
q2 = tc.Query(text="What date was the claimant unable to work?", alias="AOD", pages=["4"])
q3 = tc.Query(text="What is the claimant name?", alias="NAME", pages=["1"])
q4 = tc.Query(text="What is the notice date?", alias="NOTICE_DATE", pages=["1"])
q5 = tc.Query(text="What is the claim number?", alias="CLAIM_NUMBER", pages=["1"])
q6 = tc.Query(text="What is the claimant address?", alias="ADDRESS", pages=["1"])
q7 = tc.Query(text="What conditions does the claimant have?", alias="CONDITIONS", pages=["4"])

textract_json = tc.call_textract(
    input_document="s3://unseen-docs-classifier-test/batch-processing-classified-docs-4/001-SSDI Initial Medical Denial/SSDI Initial Denial.pdf",
    queries_config=tc.QueriesConfig(queries=[q1, q2, q3, q4, q5, q6, q7]),
    features=[tc.Textract_Features.QUERIES],
    force_async_api=True,
    boto3_textract_client=textract)

t_doc: t2.TDocument = t2.TDocumentSchema().load(textract_json)  # type: ignore

# Print query results and full page content
print("Query Results and Page Content:")
for i, page in enumerate(t_doc.pages, start=1):
    print(f"\nPage {i} (ID: {page.id}):")

    # Print query results
    query_answers = t_doc.get_query_answers(page=page)
    print("Query Answers:")
    for item in query_answers:
        print(f"  Query: {item[0]}, Alias: {item[1]}, Answer: {item[2]}")

    # Print all text content on the page
    print("Page Content:")
    page_text = page.text
    print(page_text)
    print("-" * 80)  # Separator between pages

# Print some statistics
print("\nStatistics:")
print(f"Total pages processed: {len(t_doc.pages)}")
print(f"Pages with query results: {sum(1 for page in t_doc.pages if t_doc.get_query_answers(page=page))}")
I just want Textract to search through all pages for the highest-confidence answer to each query and return it. What am I missing here?