I find the documentation for Textract really hard to understand. I'm baffled as to why I'm struggling, given that I was able to create a document classifier using the Comprehend batch API with this code:
import time
import boto3
from botocore.exceptions import ClientError

textract = boto3.client('textract', region_name="us-east-2")
comprehend = boto3.client('comprehend', region_name="us-east-2")

def extract_text_from_pdf(bucket_name, document_name):
    print(f"Extracting text from {document_name}")
    try:
        # Start the asynchronous text detection job
        response = textract.start_document_text_detection(
            DocumentLocation={'S3Object': {'Bucket': bucket_name, 'Name': document_name}}
        )
        job_id = response['JobId']
        print(f"Started Textract job {job_id}")

        # Poll until the job finishes
        while True:
            response = textract.get_document_text_detection(JobId=job_id)
            status = response['JobStatus']
            print(f"Textract job status: {status}")
            if status in ['SUCCEEDED', 'FAILED']:
                break
            time.sleep(5)

        if status == 'SUCCEEDED':
            # Walk every page of results and collect the LINE blocks
            text = ""
            pagination_token = None
            while True:
                if pagination_token:
                    response = textract.get_document_text_detection(JobId=job_id, NextToken=pagination_token)
                else:
                    response = textract.get_document_text_detection(JobId=job_id)
                for item in response['Blocks']:
                    if item['BlockType'] == 'LINE':
                        text += item['Text'] + "\n"
                if 'NextToken' in response:
                    pagination_token = response['NextToken']
                else:
                    break
            print(f"Extracted {len(text)} characters of text")
            return text
        else:
            print(f"Textract job failed for {document_name}")
            return None
    except Exception as e:
        print(f"Error in text extraction: {str(e)}")
        return None
def start_batch_classification_job(input_s3_uri, output_s3_uri, data_access_role_arn, classifier_arn):
    try:
        response = comprehend.start_document_classification_job(
            InputDataConfig={
                'S3Uri': input_s3_uri,
                'InputFormat': 'ONE_DOC_PER_FILE'
            },
            OutputDataConfig={
                'S3Uri': output_s3_uri
            },
            DataAccessRoleArn=data_access_role_arn,
            DocumentClassifierArn=classifier_arn
        )
        return response['JobId']
    except ClientError as e:
        print(f"Error starting batch classification job: {e}")
        return None
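For context, I chain these together roughly like this (the bucket names, role ARN, and classifier ARN below are placeholders, not my real values):

text = extract_text_from_pdf("my-input-bucket", "claims/some-claim.pdf")
# ...write the extracted text back to S3 as one file per document, then:
job_id = start_batch_classification_job(
    input_s3_uri="s3://my-input-bucket/extracted-text/",
    output_s3_uri="s3://my-output-bucket/classification-results/",
    data_access_role_arn="arn:aws:iam::111122223333:role/ComprehendDataAccessRole",
    classifier_arn="arn:aws:comprehend:us-east-2:111122223333:document-classifier/my-classifier",
)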
However, I've tried to use start_document_analysis and get_document_analysis in exactly the same way with the Queries feature, and the program only returns results from the first page unless I specify the page number each query should be answered from, which makes the feature useless for my use case. I know I must be doing something wrong here, but I'm not sure what else I'm supposed to do. I also don't understand why I'd have to set up Amazon SNS or SQS when I was able to retrieve all the text from multi-page PDFs with start/get_document_text_detection just fine without them.
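For reference, this is roughly the shape of the analysis call I'm making. It's a simplified sketch that reuses a couple of my queries; the bucket and key names are placeholders, and the polling and NextToken handling mirror extract_text_from_pdf above:

# Simplified sketch of my start_document_analysis call (bucket/key
# are placeholders). Without an explicit 'Pages' list on each query,
# I only get answers back for the first page.
response = textract.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': 'my-bucket', 'Name': 'my-doc.pdf'}},
    FeatureTypes=['QUERIES'],
    QueriesConfig={'Queries': [
        {'Text': 'What is the claim number?', 'Alias': 'CLAIM_NUMBER'},
        {'Text': 'What date was the claimant unable to work?', 'Alias': 'AOD'},
    ]}
)
job_id = response['JobId']
# ...then poll get_document_analysis(JobId=job_id) and page through
# NextToken exactly as in extract_text_from_pdf above.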
I tried using textractcaller instead of calling Textract directly, but I'm getting the same results: I have to specify the page number for each query.
import textractcaller as tc
import trp.trp2 as t2
import boto3

textract = boto3.client('textract', region_name="us-east-2")

q1 = tc.Query(text="What reports were used to decide the claim?", alias="REPORTS", pages=["4"])
q2 = tc.Query(text="What date was the claimant unable to work?", alias="AOD", pages=["4"])
q3 = tc.Query(text="What is the claimant name?", alias="NAME", pages=["1"])
q4 = tc.Query(text="What is the notice date?", alias="NOTICE_DATE", pages=["1"])
q5 = tc.Query(text="What is the claim number?", alias="CLAIM_NUMBER", pages=["1"])
q6 = tc.Query(text="What is the claimant address?", alias="ADDRESS", pages=["1"])
q7 = tc.Query(text="What conditions does the claimant have?", alias="CONDITIONS", pages=["4"])

textract_json = tc.call_textract(
    input_document="s3://unseen-docs-classifier-test/batch-processing-classified-docs-4/001-SSDI Initial Medical Denial/SSDI Initial Denial.pdf",
    queries_config=tc.QueriesConfig(queries=[q1, q2, q3, q4, q5, q6, q7]),
    features=[tc.Textract_Features.QUERIES],
    force_async_api=True,
    boto3_textract_client=textract)

t_doc: t2.TDocument = t2.TDocumentSchema().load(textract_json)  # type: ignore

# Print query results and full page content
print("Query Results and Page Content:")
for i, page in enumerate(t_doc.pages, start=1):
    print(f"\nPage {i} (ID: {page.id}):")

    # Print query results
    query_answers = t_doc.get_query_answers(page=page)
    print("Query Answers:")
    for item in query_answers:
        print(f"  Query: {item[0]}, Alias: {item[1]}, Answer: {item[2]}")

    # Print all text content on the page
    print("Page Content:")
    page_text = page.text
    print(page_text)
    print("-" * 80)  # Separator between pages

# Print some statistics
print("\nStatistics:")
print(f"Total pages processed: {len(t_doc.pages)}")
print(f"Pages with query results: {sum(1 for page in t_doc.pages if t_doc.get_query_answers(page=page))}")
I just want Textract to search through all pages for the highest-confidence answer to each query and return it. What am I missing here?