Hi, can anyone help me figure out how to use https://pkg.go.dev/github.com/pdfcpu/[email protected] to extract human-readable text from a PDF (i.e. scrape its contents)?
FYI, I am running this on AWS Lambda.
Here is my code snippet:
<code>package models
import (
"fmt"
"log"
"path/filepath"
"strings"
"github.com/cavaliergopher/grab/v3"
"github.com/pdfcpu/pdfcpu/pkg/api"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model"
"github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types"
)
// ScrapePDF downloads and extracts text from a PDF at a given URL
func ScrapePDF(pdfURL string) (string, error) {
// Download the PDF
client := grab.NewClient()
req, err := grab.NewRequest("/tmp", pdfURL)
if err != nil {
return "", fmt.Errorf("failed to create request: %v", err)
}
req.Filename = filepath.Join("/tmp", "scraped.pdf") // Use /tmp directory
log.Printf("Downloading %v...", req.URL())
resp := client.Do(req)
if err := resp.Err(); err != nil {
log.Printf("failed to download PDF: %v", err)
return "", fmt.Errorf("failed to download PDF: %v", err)
}
log.Printf("Downloaded to %v", resp.Filename)
// Extract text from the PDF
extractedText, err := ExtractTextFromPDF(resp.Filename)
if err != nil {
return "", fmt.Errorf("failed to extract text from PDF: %v", err)
}
return extractedText, nil
}
// ExtractTextFromPDF extracts text from a PDF file using pdfcpu
func ExtractTextFromPDF(pdfPath string) (string, error) {
log.Printf("Starting to extract text from PDF: %s", pdfPath)
ctx, err := api.ReadContextFile(pdfPath)
if err != nil {
return "", fmt.Errorf("error reading PDF context: %v", err)
}
var allText strings.Builder
for i := 1; i <= ctx.PageCount; i++ {
log.Printf("Processing page %d of %d", i, ctx.PageCount)
pageDict, _, _, err := ctx.PageDict(i, true)
if err != nil {
return "", fmt.Errorf("error getting page dictionary for page %d: %v", i, err)
}
content, err := extractTextFromPage(ctx, pageDict)
if err != nil {
return "", fmt.Errorf("error extracting text from page %d: %v", i, err)
}
allText.WriteString(content)
allText.WriteString("n")
}
log.Println("Finished extracting text from PDF")
return allText.String(), nil
}
// extractTextFromPage extracts text content from a PDF page content stream
func extractTextFromPage(ctx *model.Context, pageDict types.Dict) (string, error) {
// Extract the content stream of the page
log.Println("Extracting content stream from page")
log.Printf("This is content: %s", pageDict.String())
contentStreams, err := ctx.PageContent(pageDict)
if err != nil {
return "", fmt.Errorf("error getting page content: %v", err)
}
// contentStreams is the page's raw content stream ([]byte); note it is a
// sequence of PDF operators, not plain readable text.
return string(contentStreams), nil
}
</code>
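From what I can tell, the content stream returned above is a sequence of raw PDF operators (BT, Tf, Tj, TJ, ET, ...), not readable text, which is probably why my output looks garbled. As a rough experiment I also tried pulling the literal string operands used by Tj/TJ out of the stream with a regular expression (with "regexp" added to the imports). This is only a crude sketch of that idea, not a real text extractor: it ignores hex strings, most escape sequences, and font encodings/CID fonts.
<code>// textShowOp matches literal string operands, e.g. (Hello) Tj or [(Hel) -2 (lo)] TJ.
// Crude heuristic: does not handle hex strings, nested escapes, or font encodings.
var textShowOp = regexp.MustCompile(`\(((?:[^()\\]|\\.)*)\)`)

// roughTextFromContentStream pulls literal strings out of a page content stream.
// Only meant to illustrate what the raw stream contains; expect garbage for
// PDFs that use subsetted or CID fonts.
func roughTextFromContentStream(content []byte) string {
	var b strings.Builder
	for _, m := range textShowOp.FindAllSubmatch(content, -1) {
		b.Write(m[1])
		b.WriteByte(' ')
	}
	return b.String()
}
</code>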
I am trying to scrape a PDF and get its text content.
Can someone help me figure out what is incorrect in this code, or suggest another library or approach for extracting the text?
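Alternatively, if pdfcpu is not the right tool for plain-text extraction, I have been considering a dedicated reader such as github.com/ledongthuc/pdf instead. A minimal sketch of what I have in mind (assuming that package's pdf.Open / GetPlainText API) would be:
<code>package models

import (
	"bytes"
	"fmt"

	"github.com/ledongthuc/pdf"
)

// ExtractPlainText returns the text content of a PDF file using
// github.com/ledongthuc/pdf instead of pdfcpu.
func ExtractPlainText(pdfPath string) (string, error) {
	f, r, err := pdf.Open(pdfPath)
	if err != nil {
		return "", fmt.Errorf("failed to open PDF: %v", err)
	}
	defer f.Close()

	// GetPlainText concatenates the text of all pages into one reader.
	textReader, err := r.GetPlainText()
	if err != nil {
		return "", fmt.Errorf("failed to extract text: %v", err)
	}

	var buf bytes.Buffer
	if _, err := buf.ReadFrom(textReader); err != nil {
		return "", fmt.Errorf("failed to read extracted text: %v", err)
	}
	return buf.String(), nil
}
</code>
Would that be a reasonable approach on Lambda, or is there a way to get readable text out of pdfcpu directly?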