I’m using the pdf.js library to extract text from PDF files, but the extracted text isn’t formatted correctly: some lines appear out of order and end up at the end of the output. The PDF file usually contains a resume, and since different resumes can have varying layouts and word structures, how can I segment the parsed text into different sections such as introduction, education, and experience?
Here is my code for parsing the PDF into text:
import React, { useState, useRef } from "react";
import * as pdfjs from "pdfjs-dist";
import { WorkerMessageHandler } from "pdfjs-dist/build/pdf.worker.min.mjs";
function PDFParser() {
const [extractedText, setExtractedText] = useState("");
const [pdfSrc, setPdfSrc] = useState(null);
const [selectedFileName, setSelectedFileName] = useState("");
const fileInputRef = useRef(null);
const handleFileChange = async (event) => {
const selectedFile = event.target.files[0];
if (!selectedFile) {
return;
}
const fileReader = new FileReader();
fileReader.onload = async () => {
const arrayBuffer = fileReader.result;
try {
pdfjs.GlobalWorkerOptions.workerSrc = "pdf.worker.min.mjs";
const pdf = await pdfjs.getDocument({ data: arrayBuffer }).promise;
const numPages = pdf.numPages;
let extractedText = "";
for (let i = 1; i <= numPages; i++) {
const page = await pdf.getPage(i);
const pageText = await page.getTextContent();
// Map over text items and join them with a newline character
const pageLines = pageText.items.map((item) => item.str).join("n");
// Append the lines from this page to the extracted text
if (extractedText !== "") {
extractedText += "n";
}
extractedText += pageLines;
}
setExtractedText(extractedText);
setPdfSrc(URL.createObjectURL(selectedFile));
setSelectedFileName(selectedFile.name);
} catch (error) {
console.error("Error parsing PDF:", error);
}
};
setExtractedText("");
fileReader.readAsArrayBuffer(selectedFile);
};
return (
<div>
<input
type="file"
onChange={handleFileChange}
accept=".pdf"
ref={fileInputRef}
style={{ display: "none" }}
/>
<button className="UploadButton" onClick={openFileDialog}>
Upload PDF
</button>
<div className="ScrollableContainer">
{extractedText && (
<HTMLContent text={extractedText}/>
)}
</div>
</div>
);
}
I have tried to convert it into HTML, but pdfjs-dist does not let me convert it into HTML correctly.
Can someone suggest other ways to parse the text, or a library that would help me do it?