I want to use PDFBox to get the texts from the pdf with their bounding boxes. I’ve managed to get together a code from here that does almost this, but as you can see the bounding boxes I get (drawn with blue) are not the correct size as if I were to select the texts. Where could the calculation go wrong in my code?
class CustomPDFTextStripper : PDFTextStripper() {
@Throws(IOException::class)
override fun writeString(text: String, textPositions: List<TextPosition>) {
// Initialize bounding box coordinates
val wordSeparator = wordSeparator
val word: MutableList<TextPosition> = ArrayList()
// Get the page height to correctly adjust the Y-coordinates
val page = document.getPage(0)
val pageHeight = page.mediaBox.height
for (text in textPositions) {
val thisChar = text.unicode
if (thisChar != null && thisChar.isNotEmpty()) {
if (thisChar != wordSeparator) {
word.add(text)
} else if (word.isNotEmpty()) {
printWord(word, pageHeight)
word.clear()
}
}
}
if (word.isNotEmpty()) {
printWord(word, pageHeight)
}
}
@Throws(IOException::class)
fun printWord(word: List<TextPosition>, pageHeight: Float) {
if (word.isEmpty()) return
// Create a bounding box for the word
var boundingBox: Rectangle2D? = null
for (text in word) {
val box = Rectangle2D.Float(text.xDirAdj, pageHeight - text.yDirAdj - text.heightDir, text.widthDirAdj, text.heightDir)
if (boundingBox == null) {
boundingBox = box
} else {
boundingBox.add(box)
}
}
// Draw the bounding box
val page = document.getPage(0) // Assumes drawing on the first page
PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true, true).use { contentStream ->
contentStream.setStrokingColor(Color.BLUE)
contentStream.setLineWidth(1f)
if (boundingBox != null) {
contentStream.addRect(boundingBox.x.toFloat(), boundingBox.y.toFloat()+boundingBox.height.toFloat(), boundingBox.width.toFloat(), boundingBox.height.toFloat())
contentStream.stroke()
}
}
// Print word and bounding box details
val builder = StringBuilder()
for (text in word) {
builder.append(text.unicode)
}
println("${builder.toString()} [(X=${boundingBox!!.x}, Y=${boundingBox.y}) height=${boundingBox.height} width=${boundingBox.width}]")
}
}
The result: