I’m trying to extract tables from a multipage pdf using AWS Textract. The code appears to extract tables when there’s no NextToken passed and when removing the while statement, but when added, the data is not being received correctly. Any assistance will be appreciated.
Here’s some of the Node js code;
<code> let paginationToken = null;
let finished = false;
async function getJobResults(JobId, NextToken = null) {
while (!finished) {
let response;
if (paginationToken === null) {
let command = new GetDocumentAnalysisCommand({ JobId: JobId });
response = await AWS.send(command).catch((err) =>
console.log(err)
);
} else {
let command = new GetDocumentAnalysisCommand({
JobId: JobId, // required,
NextToken: paginationToken,
});
response = await AWS.send(command).catch((err) =>
console.log(err)
);
}
if (response.NextToken) {
paginationToken = response.NextToken;
} else {
finished = true;
}
}
return response;
}
let Time = 0;
const getJob = async () => {
const { Messages } = await sqsClient
.send(
new ReceiveMessageCommand({
QueueUrl: sqsQueueUrl,
MaxNumberOfMessages: 1,
})
)
.catch((err) => console.log(err));
if (Messages) {
console.log(`Message[0]: ${Messages[0].Body}`);
await sqsClient
.send(
new DeleteMessageCommand({
QueueUrl: sqsQueueUrl,
ReceiptHandle: Messages[0].ReceiptHandle,
})
)
.catch((err) => console.log(err));
if (
JSON.parse(JSON.parse(Messages[0].Body).Message).Status ===
JobStatus.SUCCEEDED
) {
return await getJobResults(JobID); // Is the issue with the return statement? JobID is being received correctly.
} else {
const tick = 5000;
Time += tick;
console.log(
`Waited ${Time/ 1000} seconds. No messages yet.`
);
setTimeout(getJob, tick);
return;
}
}
return await getJob();
};
console.log("HELLO");
let resultData = await getJob();
// standard example functions
async function getTableCsvResults() {
// Get the text blocks
const blocks = resultData.Blocks;
const blocksMap = {};
const tableBlocks = [];
for (const block of blocks) {
blocksMap[block.Id] = block;
if (block.BlockType === "TABLE") {
tableBlocks.push(block);
}
}
if (tableBlocks.length <= 0) {
return "<b> NO Table FOUND </b>";
}
let csv = "";
for (let index = 0; index < tableBlocks.length; index++) {
let tablesData = generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
// console.log(tablesData); // not being received
if (
tablesData &&
tablesData
) {
csv += generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
}
}
return csv;
}
</code>
<code> let paginationToken = null;
let finished = false;
async function getJobResults(JobId, NextToken = null) {
while (!finished) {
let response;
if (paginationToken === null) {
let command = new GetDocumentAnalysisCommand({ JobId: JobId });
response = await AWS.send(command).catch((err) =>
console.log(err)
);
} else {
let command = new GetDocumentAnalysisCommand({
JobId: JobId, // required,
NextToken: paginationToken,
});
response = await AWS.send(command).catch((err) =>
console.log(err)
);
}
if (response.NextToken) {
paginationToken = response.NextToken;
} else {
finished = true;
}
}
return response;
}
let Time = 0;
const getJob = async () => {
const { Messages } = await sqsClient
.send(
new ReceiveMessageCommand({
QueueUrl: sqsQueueUrl,
MaxNumberOfMessages: 1,
})
)
.catch((err) => console.log(err));
if (Messages) {
console.log(`Message[0]: ${Messages[0].Body}`);
await sqsClient
.send(
new DeleteMessageCommand({
QueueUrl: sqsQueueUrl,
ReceiptHandle: Messages[0].ReceiptHandle,
})
)
.catch((err) => console.log(err));
if (
JSON.parse(JSON.parse(Messages[0].Body).Message).Status ===
JobStatus.SUCCEEDED
) {
return await getJobResults(JobID); // Is the issue with the return statement? JobID is being received correctly.
} else {
const tick = 5000;
Time += tick;
console.log(
`Waited ${Time/ 1000} seconds. No messages yet.`
);
setTimeout(getJob, tick);
return;
}
}
return await getJob();
};
console.log("HELLO");
let resultData = await getJob();
// standard example functions
async function getTableCsvResults() {
// Get the text blocks
const blocks = resultData.Blocks;
const blocksMap = {};
const tableBlocks = [];
for (const block of blocks) {
blocksMap[block.Id] = block;
if (block.BlockType === "TABLE") {
tableBlocks.push(block);
}
}
if (tableBlocks.length <= 0) {
return "<b> NO Table FOUND </b>";
}
let csv = "";
for (let index = 0; index < tableBlocks.length; index++) {
let tablesData = generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
// console.log(tablesData); // not being received
if (
tablesData &&
tablesData
) {
csv += generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
}
}
return csv;
}
</code>
let paginationToken = null;
let finished = false;
async function getJobResults(JobId, NextToken = null) {
while (!finished) {
let response;
if (paginationToken === null) {
let command = new GetDocumentAnalysisCommand({ JobId: JobId });
response = await AWS.send(command).catch((err) =>
console.log(err)
);
} else {
let command = new GetDocumentAnalysisCommand({
JobId: JobId, // required,
NextToken: paginationToken,
});
response = await AWS.send(command).catch((err) =>
console.log(err)
);
}
if (response.NextToken) {
paginationToken = response.NextToken;
} else {
finished = true;
}
}
return response;
}
let Time = 0;
const getJob = async () => {
const { Messages } = await sqsClient
.send(
new ReceiveMessageCommand({
QueueUrl: sqsQueueUrl,
MaxNumberOfMessages: 1,
})
)
.catch((err) => console.log(err));
if (Messages) {
console.log(`Message[0]: ${Messages[0].Body}`);
await sqsClient
.send(
new DeleteMessageCommand({
QueueUrl: sqsQueueUrl,
ReceiptHandle: Messages[0].ReceiptHandle,
})
)
.catch((err) => console.log(err));
if (
JSON.parse(JSON.parse(Messages[0].Body).Message).Status ===
JobStatus.SUCCEEDED
) {
return await getJobResults(JobID); // Is the issue with the return statement? JobID is being received correctly.
} else {
const tick = 5000;
Time += tick;
console.log(
`Waited ${Time/ 1000} seconds. No messages yet.`
);
setTimeout(getJob, tick);
return;
}
}
return await getJob();
};
console.log("HELLO");
let resultData = await getJob();
// standard example functions
async function getTableCsvResults() {
// Get the text blocks
const blocks = resultData.Blocks;
const blocksMap = {};
const tableBlocks = [];
for (const block of blocks) {
blocksMap[block.Id] = block;
if (block.BlockType === "TABLE") {
tableBlocks.push(block);
}
}
if (tableBlocks.length <= 0) {
return "<b> NO Table FOUND </b>";
}
let csv = "";
for (let index = 0; index < tableBlocks.length; index++) {
let tablesData = generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
// console.log(tablesData); // not being received
if (
tablesData &&
tablesData
) {
csv += generateTableCsv(
tableBlocks[index],
blocksMap,
index + 1
);
}
}
return csv;
}