I am using the Google Cloud Vision API (OCR) to detect text in PDF files using the PHP API library. The OCR works perfectly, and I have saved the complete set of JSON output files (e.g. output-1-to-2.json) with the full OCR data (which contain the positional details, confidence, full text, etc.).
Here is a sample JSON output (file: output-1-to-2.json) for a simple two-page PDF containing the word ‘April’ on page 1 and ‘May’ on page 2 (as images):
{
"inputConfig":{
"gcsSource":{
"uri":"gs://my-ocr-bucket/php8723/sample.pdf"
},
"mimeType":"application/pdf"
},
"responses":[
{
"fullTextAnnotation":{
"pages":[
{
"property":{
"detectedLanguages":[
{
"languageCode":"en",
"confidence":1
}
]
},
"width":595,
"height":841,
"blocks":[
{
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.059453033
},
{
"x":0.0789916,
"y":0.060642093
}
]
},
"paragraphs":[
{
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.059453033
},
{
"x":0.0789916,
"y":0.060642093
}
]
},
"words":[
{
"property":{
"detectedLanguages":[
{
"languageCode":"en",
"confidence":1
}
]
},
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.049940545
},
{
"x":0.11596639,
"y":0.059453033
},
{
"x":0.0789916,
"y":0.060642093
}
]
},
"symbols":[
{
"text":"A",
"confidence":0.98833746
},
{
"text":"p",
"confidence":0.9870904
},
{
"text":"r",
"confidence":0.99477327
},
{
"text":"i",
"confidence":0.9951743
},
{
"property":{
"detectedBreak":{
"type":"LINE_BREAK"
}
},
"text":"l",
"confidence":0.98942703
}
],
"confidence":0.9909605
}
],
"confidence":0.9909605
}
],
"blockType":"TEXT",
"confidence":0.9909605
}
],
"confidence":0.9909605
}
],
"text":"April"
},
"context":{
"uri":"gs://my-ocr-bucket/php8723/sample.pdf",
"pageNumber":1
}
},
{
"fullTextAnnotation":{
"pages":[
{
"width":595,
"height":841,
"blocks":[
{
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.05469679
},
{
"x":0.11092437,
"y":0.05588585
},
{
"x":0.11092437,
"y":0.065398335
},
{
"x":0.07731093,
"y":0.064209275
}
]
},
"paragraphs":[
{
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.05469679
},
{
"x":0.11092437,
"y":0.05588585
},
{
"x":0.11092437,
"y":0.065398335
},
{
"x":0.07731093,
"y":0.064209275
}
]
},
"words":[
{
"boundingBox":{
"normalizedVertices":[
{
"x":0.0789916,
"y":0.05469679
},
{
"x":0.11092437,
"y":0.05588585
},
{
"x":0.11092437,
"y":0.065398335
},
{
"x":0.07731093,
"y":0.064209275
}
]
},
"symbols":[
{
"text":"M",
"confidence":0.98251665
},
{
"text":"a",
"confidence":0.9763874
},
{
"property":{
"detectedBreak":{
"type":"LINE_BREAK"
}
},
"text":"y",
"confidence":0.9850642
}
],
"confidence":0.98132277
}
],
"confidence":0.98132277
}
],
"blockType":"TEXT",
"confidence":0.98132277
}
],
"confidence":0.98132277
}
],
"text":"May"
},
"context":{
"uri":"gs://my-ocr-bucket/php8723/sample.pdf",
"pageNumber":2
}
}
]
}
Now I am stuck on embedding this OCR data (the JSON files) in the PDF to make the PDF searchable. That means I need to edit the PDF and add the OCR data to it as a text layer.
My question: is there any tool in PHP that can insert the JSON-formatted OCR data generated by Google Vision into a PDF file and make the PDF searchable?
While doing some R&D I came across the Python-based solutions gcv2hocr and hocr-tools, which claim to do the above job. But these seem to be platform dependent (Linux only) and are also complicated to use. So can this be done with PHP or any other simple-to-use, platform-independent tool?
Thank you for reading this far.