I’m using the MuPDF.js library to extract annotations (specifically highlights) from a PDF document. While I can detect the annotations and their types, I’m struggling to retrieve the highlighted text. Here’s my current code:
import * as fs from "fs";
import * as mupdfjs from "mupdf/mupdfjs";
async function extractAnnotations() {
try {
// Read the PDF file as a buffer
let fileData = fs.readFileSync("Example.pdf");
// Open the document using mupdfjs.PDFDocument.openDocument
let document = await mupdfjs.PDFDocument.openDocument(
fileData,
"application/pdf"
);
// Loop through all pages of the document
let pageCount = document.countPages(); // Get total number of pages
let i = 0;
while (i < pageCount) {
const page = new mupdfjs.PDFPage(document, i);
const annots = page.getAnnotations();
annots.forEach((annot, index) => {
console.log(`Annotation ${index + 1}:`);
console.log(` Has Rect: ${annot.hasRect()}`);
console.log(` Contents: ${annot.getContents()}`);
});
i++;
}
} catch (err) {
console.error("Error extracting annotations:", err);
}
}
extractAnnotations();
Accessing Annotation Details:
annot.hasRect() returns false for highlight annotations.
annot.getContents() return nothing
Checked Documentation:
The MuPDF.js documentation doesn’t provide specific examples for extracting text associated with annotations like highlights.
Joe is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
0