I am trying to retain the tags from a tagged PDF document. I tried one approach, but I am having trouble
getting the correct bounding boxes of tables and lists with the following code. Can anyone please help with this?
<code> public void Process(TaggedJsonPart taggedJsonPart) throws IOException {
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
addToMap(theseMarkedContents, markedContent);
}
}
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
Map<PDPage, PDPageContentStream> visualizations = new HashMap<>();
showStructure(document, root, markedContents, visualizations, 0, null, null);
}
</code>
<code> public void Process(TaggedJsonPart taggedJsonPart) throws IOException {
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
for (PDPage page : document.getPages()) {
PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
extractor.processPage(page);
Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
markedContents.put(page, theseMarkedContents);
for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
addToMap(theseMarkedContents, markedContent);
}
}
PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
Map<PDPage, PDPageContentStream> visualizations = new HashMap<>();
showStructure(document, root, markedContents, visualizations, 0, null, null);
}
</code>
/**
 * Collects the marked content of every page of {@code document} and then walks the
 * document's structure tree via {@code showStructure}, starting at depth 0.
 *
 * <p>If the document has no structure tree root (an untagged PDF) there is nothing to
 * walk, so the method returns without doing any work instead of failing with a
 * {@link NullPointerException} inside {@code showStructure}.
 *
 * @param taggedJsonPart not read by this method
 * @throws IOException if a page's content stream cannot be processed or the structure
 *                     walk fails
 */
public void Process(TaggedJsonPart taggedJsonPart) throws IOException {
    PDStructureNode root = document.getDocumentCatalog().getStructureTreeRoot();
    if (root == null) {
        // Untagged document: no structure tree, hence no tags to extract.
        return;
    }

    // Per page: MCID -> marked content, filled by addToMap for each top-level marked content.
    Map<PDPage, Map<Integer, PDMarkedContent>> markedContents = new HashMap<>();
    for (PDPage page : document.getPages()) {
        PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
        extractor.processPage(page);
        Map<Integer, PDMarkedContent> theseMarkedContents = new HashMap<>();
        markedContents.put(page, theseMarkedContents);
        for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
            addToMap(theseMarkedContents, markedContent);
        }
    }

    Map<PDPage, PDPageContentStream> visualizations = new HashMap<>();
    showStructure(document, root, markedContents, visualizations, 0, null, null);
}
<code>Map<PDPage, Rectangle2D> showStructure(PDDocument document, PDStructureNode node,
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents, Map<PDPage, PDPageContentStream> visualizations,
Integer _depth, TaggedPdfContent pg, List<TaggedPdfContent> pages) throws IOException {
Map<PDPage, Rectangle2D> boxes = null;
String structType = null;
PDPage page = null;
// page = null;
if (node instanceof PDStructureElement) {
PDStructureElement element = (PDStructureElement) node;
structType = element.getStructureType();
page = element.getPage();
}
Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);
int indexHere = index;
if ("Document".equalsIgnoreCase(structType) || "Sect".equalsIgnoreCase(structType)
|| "Part".equalsIgnoreCase(structType)) {
index = 0;
} else {
indexHere = index++;
}
for (Object object : node.getKids()) {
if (object instanceof COSArray) {
for (COSBase base : (COSArray) object) {
if (base instanceof COSDictionary) {
boxes = union(boxes, showStructure(document, PDStructureNode.create((COSDictionary) base),
markedContents, visualizations, null, pg, pages));
} else if (base instanceof COSNumber) {
boxes = union(boxes, page, showContent(((COSNumber) base).intValue(), theseMarkedContents));
} else {
// System.out.printf("?%sn", base);
}
}
} else if (object instanceof PDStructureNode) {
if (object instanceof PDStructureElement) {
// isHavingPdStrEle =true;
_depth++;
page = ((PDStructureElement) object).getPage();
if (((PDStructureElement) object).getStandardStructureType().equals("Note")) {
Map<PDPage, Rectangle2D> nboxes = union(null, showStructure(document, (PDStructureNode) object,
markedContents, visualizations, _depth, pg, pages));
this.insetNodeEleInfo(nboxes, indexHere, structType, new ArrayList<String>(), null);
continue;
} else if (((PDStructureElement) object).getStandardStructureType().equals("Figure")) {
}
}
boxes = union(boxes, showStructure(document, (PDStructureNode) object, markedContents, visualizations,
_depth, pg, pages));
_depth--;
} else if (object instanceof Integer) {
if (page == null) {
page = ((PDStructureElement) node).getPage();
}
boxes = union(boxes, page, showContent((Integer) object, theseMarkedContents));
} else if (object instanceof PDMarkedContentReference) {
PDPage mcr_page = ((PDMarkedContentReference) object).getPage();
boxes = union(boxes, mcr_page, showContent((Integer) ((PDMarkedContentReference) object).getMCID(),
markedContents.get(mcr_page)));
} else {
System.out.printf("?%sn", object);
}
}
if (structType != null && !structType.equalsIgnoreCase("Sect") && !structType.equalsIgnoreCase("Part")
&& !structType.equalsIgnoreCase("Document")) {
if (structType.equals("Figure")) {
// insetFigureNodeEleInfo(((PDStructureElement)node),indexHere);
} else {
List<String> _childIds = this._ChildrenObjIdDepthWiseMap.get(_depth);
if (_childIds == null) {
_childIds = new ArrayList<String>();
}
String ObjIDWithEleType = this.insetNodeEleInfo(boxes, indexHere, structType, _childIds, null);
_childIds = new ArrayList<String>();
this._ChildrenObjIdDepthWiseMap.put(_depth, new ArrayList<String>());
List<String> _prevchildIds = this._ChildrenObjIdDepthWiseMap.get(_depth - 1);
if (_prevchildIds == null) {
_prevchildIds = new ArrayList<String>();
this._ChildrenObjIdDepthWiseMap.put(_depth - 1, _prevchildIds);
}
_prevchildIds.add(ObjIDWithEleType);
}
}
return boxes;
}
</code>
<code>Map<PDPage, Rectangle2D> showStructure(PDDocument document, PDStructureNode node,
Map<PDPage, Map<Integer, PDMarkedContent>> markedContents, Map<PDPage, PDPageContentStream> visualizations,
Integer _depth, TaggedPdfContent pg, List<TaggedPdfContent> pages) throws IOException {
Map<PDPage, Rectangle2D> boxes = null;
String structType = null;
PDPage page = null;
// page = null;
if (node instanceof PDStructureElement) {
PDStructureElement element = (PDStructureElement) node;
structType = element.getStructureType();
page = element.getPage();
}
Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);
int indexHere = index;
if ("Document".equalsIgnoreCase(structType) || "Sect".equalsIgnoreCase(structType)
|| "Part".equalsIgnoreCase(structType)) {
index = 0;
} else {
indexHere = index++;
}
for (Object object : node.getKids()) {
if (object instanceof COSArray) {
for (COSBase base : (COSArray) object) {
if (base instanceof COSDictionary) {
boxes = union(boxes, showStructure(document, PDStructureNode.create((COSDictionary) base),
markedContents, visualizations, null, pg, pages));
} else if (base instanceof COSNumber) {
boxes = union(boxes, page, showContent(((COSNumber) base).intValue(), theseMarkedContents));
} else {
// System.out.printf("?%sn", base);
}
}
} else if (object instanceof PDStructureNode) {
if (object instanceof PDStructureElement) {
// isHavingPdStrEle =true;
_depth++;
page = ((PDStructureElement) object).getPage();
if (((PDStructureElement) object).getStandardStructureType().equals("Note")) {
Map<PDPage, Rectangle2D> nboxes = union(null, showStructure(document, (PDStructureNode) object,
markedContents, visualizations, _depth, pg, pages));
this.insetNodeEleInfo(nboxes, indexHere, structType, new ArrayList<String>(), null);
continue;
} else if (((PDStructureElement) object).getStandardStructureType().equals("Figure")) {
}
}
boxes = union(boxes, showStructure(document, (PDStructureNode) object, markedContents, visualizations,
_depth, pg, pages));
_depth--;
} else if (object instanceof Integer) {
if (page == null) {
page = ((PDStructureElement) node).getPage();
}
boxes = union(boxes, page, showContent((Integer) object, theseMarkedContents));
} else if (object instanceof PDMarkedContentReference) {
PDPage mcr_page = ((PDMarkedContentReference) object).getPage();
boxes = union(boxes, mcr_page, showContent((Integer) ((PDMarkedContentReference) object).getMCID(),
markedContents.get(mcr_page)));
} else {
System.out.printf("?%sn", object);
}
}
if (structType != null && !structType.equalsIgnoreCase("Sect") && !structType.equalsIgnoreCase("Part")
&& !structType.equalsIgnoreCase("Document")) {
if (structType.equals("Figure")) {
// insetFigureNodeEleInfo(((PDStructureElement)node),indexHere);
} else {
List<String> _childIds = this._ChildrenObjIdDepthWiseMap.get(_depth);
if (_childIds == null) {
_childIds = new ArrayList<String>();
}
String ObjIDWithEleType = this.insetNodeEleInfo(boxes, indexHere, structType, _childIds, null);
_childIds = new ArrayList<String>();
this._ChildrenObjIdDepthWiseMap.put(_depth, new ArrayList<String>());
List<String> _prevchildIds = this._ChildrenObjIdDepthWiseMap.get(_depth - 1);
if (_prevchildIds == null) {
_prevchildIds = new ArrayList<String>();
this._ChildrenObjIdDepthWiseMap.put(_depth - 1, _prevchildIds);
}
_prevchildIds.add(ObjIDWithEleType);
}
}
return boxes;
}
</code>
/**
 * Recursively walks {@code node} and its kids, accumulating per page the union of the
 * bounding boxes of all marked content reachable from the node, and reports every
 * structure element other than Document/Sect/Part/Figure through
 * {@code insetNodeEleInfo}.
 *
 * <p>Nesting is tracked by depth: the id returned by {@code insetNodeEleInfo} for an
 * element is appended to the id list stored for {@code depth - 1} in
 * {@code _ChildrenObjIdDepthWiseMap}, and the list stored for the element's own depth
 * is handed to {@code insetNodeEleInfo} and then reset.
 *
 * @param document       passed through to recursive calls
 * @param node           structure node to process
 * @param markedContents per page, MCID to marked content
 * @param visualizations passed through to recursive calls
 * @param _depth         nesting depth of {@code node}; {@code null} is treated as 0
 * @param pg             passed through to recursive calls
 * @param pages          passed through to recursive calls
 * @return per-page bounding boxes of the node's content, or {@code null} if nothing
 *         was accumulated
 * @throws IOException if reading marked content fails
 */
Map<PDPage, Rectangle2D> showStructure(PDDocument document, PDStructureNode node,
        Map<PDPage, Map<Integer, PDMarkedContent>> markedContents, Map<PDPage, PDPageContentStream> visualizations,
        Integer _depth, TaggedPdfContent pg, List<TaggedPdfContent> pages) throws IOException {
    // Work on an immutable primitive so the depth of this node cannot drift while
    // iterating over its kids, and so a null depth cannot cause an unboxing failure.
    final int depth = _depth != null ? _depth : 0;

    Map<PDPage, Rectangle2D> boxes = null;
    String structType = null;
    PDPage page = null;
    if (node instanceof PDStructureElement) {
        PDStructureElement element = (PDStructureElement) node;
        structType = element.getStructureType();
        page = element.getPage();
    }
    final boolean container = "Document".equalsIgnoreCase(structType) || "Sect".equalsIgnoreCase(structType)
            || "Part".equalsIgnoreCase(structType);

    // Integer MCID kids always refer to the page of THIS element, so both the page and
    // its marked-content map stay fixed for the whole loop below.
    final Map<Integer, PDMarkedContent> theseMarkedContents = markedContents.get(page);

    final int indexHere = index;
    if (container) {
        index = 0;
    } else {
        index++;
    }

    for (Object object : node.getKids()) {
        if (object instanceof COSArray) {
            for (COSBase base : (COSArray) object) {
                if (base instanceof COSDictionary) {
                    boxes = union(boxes, showStructure(document, PDStructureNode.create((COSDictionary) base),
                            markedContents, visualizations, depth + 1, pg, pages));
                } else if (base instanceof COSNumber) {
                    boxes = union(boxes, page, showContent(((COSNumber) base).intValue(), theseMarkedContents));
                }
            }
        } else if (object instanceof PDStructureNode) {
            PDStructureNode child = (PDStructureNode) object;
            boolean childIsElement = child instanceof PDStructureElement;
            // Only structure elements open a new nesting level.
            int childDepth = childIsElement ? depth + 1 : depth;
            Map<PDPage, Rectangle2D> childBoxes = showStructure(document, child, markedContents, visualizations,
                    childDepth, pg, pages);
            if (childIsElement && "Note".equals(((PDStructureElement) child).getStandardStructureType())) {
                // A Note's content is reported separately and deliberately kept out of
                // this node's own bounding box.
                // NOTE(review): this reports the note boxes under the PARENT's structure
                // type and index, exactly as the previous code did - confirm intended.
                this.insetNodeEleInfo(union(null, childBoxes), indexHere, structType, new ArrayList<String>(),
                        null);
            } else {
                boxes = union(boxes, childBoxes);
            }
        } else if (object instanceof Integer) {
            boxes = union(boxes, page, showContent((Integer) object, theseMarkedContents));
        } else if (object instanceof PDMarkedContentReference) {
            PDMarkedContentReference reference = (PDMarkedContentReference) object;
            PDPage mcrPage = reference.getPage();
            if (mcrPage == null) {
                // The reference's own page entry is optional; without it the content
                // lives on the page of the enclosing structure element.
                mcrPage = page;
            }
            boxes = union(boxes, mcrPage, showContent(reference.getMCID(), markedContents.get(mcrPage)));
        } else {
            System.out.printf("?%s%n", object);
        }
    }

    if (structType != null && !container && !structType.equals("Figure")) {
        List<String> childIds = this._ChildrenObjIdDepthWiseMap.get(depth);
        if (childIds == null) {
            childIds = new ArrayList<String>();
        }
        String objIdWithEleType = this.insetNodeEleInfo(boxes, indexHere, structType, childIds, null);

        // The ids collected at this depth are now consumed; start a fresh list for the
        // next sibling and register this element with its parent level.
        this._ChildrenObjIdDepthWiseMap.put(depth, new ArrayList<String>());
        List<String> parentChildIds = this._ChildrenObjIdDepthWiseMap.get(depth - 1);
        if (parentChildIds == null) {
            parentChildIds = new ArrayList<String>();
            this._ChildrenObjIdDepthWiseMap.put(depth - 1, parentChildIds);
        }
        parentChildIds.add(objIdWithEleType);
    }
    return boxes;
}
<code>Map<PDPage, Rectangle2D> union(Map<PDPage, Rectangle2D> map, PDPage page, Rectangle2D rectangle) {
if (map == null)
map = new HashMap<>();
map.put(page, union(map.get(page), rectangle));
return map;
}
</code>
<code>Map<PDPage, Rectangle2D> union(Map<PDPage, Rectangle2D> map, PDPage page, Rectangle2D rectangle) {
if (map == null)
map = new HashMap<>();
map.put(page, union(map.get(page), rectangle));
return map;
}
</code>
/**
 * Merges {@code rectangle} into the entry stored for {@code page}.
 *
 * <p>The entry's current value and {@code rectangle} are combined by the two-argument
 * rectangle {@code union} overload and the result is stored back under {@code page}.
 *
 * @param map       map to update; a new {@link HashMap} is created when {@code null}
 * @param page      key whose entry is updated
 * @param rectangle rectangle to merge into the entry
 * @return the updated map (the given instance, or the newly created one)
 */
Map<PDPage, Rectangle2D> union(Map<PDPage, Rectangle2D> map, PDPage page, Rectangle2D rectangle) {
    Map<PDPage, Rectangle2D> target = map;
    if (target == null) {
        target = new HashMap<>();
    }
    Rectangle2D merged = union(target.get(page), rectangle);
    target.put(page, merged);
    return target;
}
<code>Rectangle2D showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) throws IOException {
Rectangle2D box = null;
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
List<Object> contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
StringBuilder textContent = new StringBuilder();
for (Object object : contents) {
if (object instanceof TextPosition) {
TextPosition textPosition = (TextPosition) object;
textContent.append(textPosition.getUnicode());
int[] codes = textPosition.getCharacterCodes();
if (codes.length != 1) {
System.out.printf("<!-- text position with unexpected number of codes: %d -->", codes.length);
} else {
box = union(box,
calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(), codes[0])
.getBounds2D());
}
} else if (object instanceof PDMarkedContent) {
PDMarkedContent thisMarkedContent = (PDMarkedContent) object;
box = union(box, showContent(thisMarkedContent.getMCID(), theseMarkedContents));
} else {
textContent.append("?" + object);
}
}
return box;
}
</code>
<code>Rectangle2D showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) throws IOException {
Rectangle2D box = null;
PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
List<Object> contents = markedContent != null ? markedContent.getContents() : Collections.emptyList();
StringBuilder textContent = new StringBuilder();
for (Object object : contents) {
if (object instanceof TextPosition) {
TextPosition textPosition = (TextPosition) object;
textContent.append(textPosition.getUnicode());
int[] codes = textPosition.getCharacterCodes();
if (codes.length != 1) {
System.out.printf("<!-- text position with unexpected number of codes: %d -->", codes.length);
} else {
box = union(box,
calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(), codes[0])
.getBounds2D());
}
} else if (object instanceof PDMarkedContent) {
PDMarkedContent thisMarkedContent = (PDMarkedContent) object;
box = union(box, showContent(thisMarkedContent.getMCID(), theseMarkedContents));
} else {
textContent.append("?" + object);
}
}
return box;
}
</code>
/**
 * Computes the bounding box of the marked content registered under {@code mcid}.
 *
 * <p>The box is the union of the glyph bounds of every {@code TextPosition} in the
 * marked content plus, recursively, the boxes of nested marked content (looked up again
 * by their own MCID in {@code theseMarkedContents}). Content objects of any other kind
 * do not contribute. Text positions that do not carry exactly one character code are
 * skipped with a diagnostic on standard output, as are glyphs for which
 * {@code calculateGlyphBounds} cannot determine bounds.
 *
 * @param mcid                marked-content identifier to look up
 * @param theseMarkedContents MCID to marked content for the relevant page; may be {@code null}
 * @return the bounding box, or {@code null} if the MCID is unknown or contributes no glyph bounds
 * @throws IOException if glyph bounds cannot be read from a font
 */
Rectangle2D showContent(int mcid, Map<Integer, PDMarkedContent> theseMarkedContents) throws IOException {
    PDMarkedContent markedContent = theseMarkedContents != null ? theseMarkedContents.get(mcid) : null;
    if (markedContent == null) {
        return null;
    }

    Rectangle2D box = null;
    for (Object object : markedContent.getContents()) {
        if (object instanceof TextPosition) {
            TextPosition textPosition = (TextPosition) object;
            int[] codes = textPosition.getCharacterCodes();
            if (codes.length != 1) {
                System.out.printf("<!-- text position with unexpected number of codes: %d -->", codes.length);
                continue;
            }
            Shape glyphBounds = calculateGlyphBounds(textPosition.getTextMatrix(), textPosition.getFont(),
                    codes[0]);
            // calculateGlyphBounds returns null when no glyph path is available;
            // such glyphs simply do not contribute to the box.
            if (glyphBounds != null) {
                box = union(box, glyphBounds.getBounds2D());
            }
        } else if (object instanceof PDMarkedContent) {
            PDMarkedContent nested = (PDMarkedContent) object;
            box = union(box, showContent(nested.getMCID(), theseMarkedContents));
        }
    }
    return box;
}
<code>private Shape calculateGlyphBounds(Matrix textRenderingMatrix, PDFont font, int code) throws IOException {
GeneralPath path = null;
AffineTransform at = textRenderingMatrix.createAffineTransform();
at.concatenate(font.getFontMatrix().createAffineTransform());
if (font instanceof PDType3Font) {
// It is difficult to calculate the real individual glyph bounds for type 3
// fonts
// because these are not vector fonts, the content stream could contain almost
// anything
// that is found in page content streams.
PDType3Font t3Font = (PDType3Font) font;
PDType3CharProc charProc = t3Font.getCharProc(code);
if (charProc != null) {
BoundingBox fontBBox = t3Font.getBoundingBox();
PDRectangle glyphBBox = charProc.getGlyphBBox();
if (glyphBBox != null) {
// PDFBOX-3850: glyph bbox could be larger than the font bbox
glyphBBox.setLowerLeftX(Math.max(fontBBox.getLowerLeftX(), glyphBBox.getLowerLeftX()));
glyphBBox.setLowerLeftY(Math.max(fontBBox.getLowerLeftY(), glyphBBox.getLowerLeftY()));
glyphBBox.setUpperRightX(Math.min(fontBBox.getUpperRightX(), glyphBBox.getUpperRightX()));
glyphBBox.setUpperRightY(Math.min(fontBBox.getUpperRightY(), glyphBBox.getUpperRightY()));
path = glyphBBox.toGeneralPath();
}
}
} else if (font instanceof PDVectorFont) {
PDVectorFont vectorFont = (PDVectorFont) font;
path = vectorFont.getPath(code);
if (font instanceof PDTrueTypeFont) {
PDTrueTypeFont ttFont = (PDTrueTypeFont) font;
int unitsPerEm = ttFont.getTrueTypeFont().getHeader().getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
if (font instanceof PDType0Font) {
PDType0Font t0font = (PDType0Font) font;
if (t0font.getDescendantFont() instanceof PDCIDFontType2) {
int unitsPerEm = ((PDCIDFontType2) t0font.getDescendantFont()).getTrueTypeFont().getHeader()
.getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
}
} else if (font instanceof PDSimpleFont) {
PDSimpleFont simpleFont = (PDSimpleFont) font;
// these two lines do not always work, e.g. for the TT fonts in file 032431.pdf
// which is why PDVectorFont is tried first.
String name = simpleFont.getEncoding().getName(code);
path = simpleFont.getPath(name);
} else {
// shouldn't happen, please open issue in JIRA
System.out.println("Unknown font class: " + font.getClass());
}
if (path == null) {
return null;
}
return at.createTransformedShape(path.getBounds2D());
}
</code>
<code>private Shape calculateGlyphBounds(Matrix textRenderingMatrix, PDFont font, int code) throws IOException {
GeneralPath path = null;
AffineTransform at = textRenderingMatrix.createAffineTransform();
at.concatenate(font.getFontMatrix().createAffineTransform());
if (font instanceof PDType3Font) {
// It is difficult to calculate the real individual glyph bounds for type 3
// fonts
// because these are not vector fonts, the content stream could contain almost
// anything
// that is found in page content streams.
PDType3Font t3Font = (PDType3Font) font;
PDType3CharProc charProc = t3Font.getCharProc(code);
if (charProc != null) {
BoundingBox fontBBox = t3Font.getBoundingBox();
PDRectangle glyphBBox = charProc.getGlyphBBox();
if (glyphBBox != null) {
// PDFBOX-3850: glyph bbox could be larger than the font bbox
glyphBBox.setLowerLeftX(Math.max(fontBBox.getLowerLeftX(), glyphBBox.getLowerLeftX()));
glyphBBox.setLowerLeftY(Math.max(fontBBox.getLowerLeftY(), glyphBBox.getLowerLeftY()));
glyphBBox.setUpperRightX(Math.min(fontBBox.getUpperRightX(), glyphBBox.getUpperRightX()));
glyphBBox.setUpperRightY(Math.min(fontBBox.getUpperRightY(), glyphBBox.getUpperRightY()));
path = glyphBBox.toGeneralPath();
}
}
} else if (font instanceof PDVectorFont) {
PDVectorFont vectorFont = (PDVectorFont) font;
path = vectorFont.getPath(code);
if (font instanceof PDTrueTypeFont) {
PDTrueTypeFont ttFont = (PDTrueTypeFont) font;
int unitsPerEm = ttFont.getTrueTypeFont().getHeader().getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
if (font instanceof PDType0Font) {
PDType0Font t0font = (PDType0Font) font;
if (t0font.getDescendantFont() instanceof PDCIDFontType2) {
int unitsPerEm = ((PDCIDFontType2) t0font.getDescendantFont()).getTrueTypeFont().getHeader()
.getUnitsPerEm();
at.scale(1000d / unitsPerEm, 1000d / unitsPerEm);
}
}
} else if (font instanceof PDSimpleFont) {
PDSimpleFont simpleFont = (PDSimpleFont) font;
// these two lines do not always work, e.g. for the TT fonts in file 032431.pdf
// which is why PDVectorFont is tried first.
String name = simpleFont.getEncoding().getName(code);
path = simpleFont.getPath(name);
} else {
// shouldn't happen, please open issue in JIRA
System.out.println("Unknown font class: " + font.getClass());
}
if (path == null) {
return null;
}
return at.createTransformedShape(path.getBounds2D());
}
</code>
/**
 * Computes the bounds of a single glyph, transformed by the text rendering matrix
 * concatenated with the font matrix.
 *
 * <p>The glyph outline is taken from the Type 3 glyph bounding box (clipped to the font
 * bounding box), from the vector font's glyph path, or from the simple font's path for
 * the encoded glyph name, depending on the font class. For TrueType-based fonts the
 * transform is additionally scaled by {@code 1000 / unitsPerEm}.
 *
 * @param textRenderingMatrix matrix positioning the glyph
 * @param font                font the glyph belongs to
 * @param code                character code of the glyph
 * @return the transformed bounds of the glyph, or {@code null} if no outline is available
 * @throws IOException if the font data cannot be read
 */
private Shape calculateGlyphBounds(Matrix textRenderingMatrix, PDFont font, int code) throws IOException {
    AffineTransform glyphTransform = textRenderingMatrix.createAffineTransform();
    glyphTransform.concatenate(font.getFontMatrix().createAffineTransform());

    GeneralPath outline = null;
    if (font instanceof PDType3Font) {
        // Real glyph bounds are hard to determine for Type 3 fonts: their glyphs are
        // content streams that may contain almost anything, so the declared glyph
        // bounding box is used instead.
        PDType3Font type3Font = (PDType3Font) font;
        PDType3CharProc glyphProc = type3Font.getCharProc(code);
        if (glyphProc != null) {
            BoundingBox fontBox = type3Font.getBoundingBox();
            PDRectangle glyphBox = glyphProc.getGlyphBBox();
            if (glyphBox != null) {
                // PDFBOX-3850: the glyph bbox may exceed the font bbox, so clip it.
                glyphBox.setLowerLeftX(Math.max(fontBox.getLowerLeftX(), glyphBox.getLowerLeftX()));
                glyphBox.setLowerLeftY(Math.max(fontBox.getLowerLeftY(), glyphBox.getLowerLeftY()));
                glyphBox.setUpperRightX(Math.min(fontBox.getUpperRightX(), glyphBox.getUpperRightX()));
                glyphBox.setUpperRightY(Math.min(fontBox.getUpperRightY(), glyphBox.getUpperRightY()));
                outline = glyphBox.toGeneralPath();
            }
        }
    } else if (font instanceof PDVectorFont) {
        outline = ((PDVectorFont) font).getPath(code);
        if (font instanceof PDTrueTypeFont) {
            int unitsPerEm = ((PDTrueTypeFont) font).getTrueTypeFont().getHeader().getUnitsPerEm();
            double emScale = 1000d / unitsPerEm;
            glyphTransform.scale(emScale, emScale);
        } else if (font instanceof PDType0Font) {
            Object descendant = ((PDType0Font) font).getDescendantFont();
            if (descendant instanceof PDCIDFontType2) {
                int unitsPerEm = ((PDCIDFontType2) descendant).getTrueTypeFont().getHeader().getUnitsPerEm();
                double emScale = 1000d / unitsPerEm;
                glyphTransform.scale(emScale, emScale);
            }
        }
    } else if (font instanceof PDSimpleFont) {
        // Name-based lookup does not always work (e.g. the TT fonts in file
        // 032431.pdf), which is why the PDVectorFont branch is tried first.
        PDSimpleFont simpleFont = (PDSimpleFont) font;
        String glyphName = simpleFont.getEncoding().getName(code);
        outline = simpleFont.getPath(glyphName);
    } else {
        // shouldn't happen, please open issue in JIRA
        System.out.println("Unknown font class: " + font.getClass());
    }

    return outline == null ? null : glyphTransform.createTransformedShape(outline.getBounds2D());
}
So, I have used the above code to read the tags from a tagged PDF and store them as JSON. However, the bounding boxes of tables and lists are not correct, while the bounding boxes of paragraphs and headers come out correctly with the above logic. Can anyone please help? Thanks in advance.