What I mean the format of classical chinese document?
The paragraph is composed of lines, the first line is at right most, the second line is at left of the first line, etc.
The line is composed of characters, the first character is at top most, the second character is at below of the first character, etc.
I have file LR-10709-24-25.pdf in format of classical chinese document, for some reason I need the locations of texts for analysis.
- Apply the program(see below) to history-2-3.pdf which is in ordinary english like format, got correct result:
- Apply the same program to LR-10709-24-25.pdf, got the very wrong result:
- I guess it is about coordinate, current transform matrix, textMatrix, TextRenderInfo, but I need help to understand these things by this problem.
Here is my program
using iText.Kernel.Colors;
using iText.Kernel.Geom;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using System.Text;
string srcFileName = "LR-10709-24-25.pdf";
string destFileName = "LR-10709-24-25-enclose.pdf";
//string srcFileName = "history-2-3.pdf";
//string destFileName = "history-2-3-enclose.pdf";
PdfDocument pdfDoc = new PdfDocument(new PdfReader(srcFileName), new PdfWriter(destFileName));
StringBuilder sb = new StringBuilder();
for (int i = 0; i < pdfDoc.GetNumberOfPages(); i++)
{
SimplePositionalTextEventListener listener = new SimplePositionalTextEventListener();
new PdfCanvasProcessor(listener).ProcessPageContent(pdfDoc.GetPage(i + 1));
List<SimpleTextWithRectangle> result = listener.GetResultantTextWithPosition();
int R = 0, G = 0, B = 0;
foreach (SimpleTextWithRectangle textWithRectangle in result)
{
R += 40; R = R % 256;
G += 20; G = G % 256;
B += 80; B = B % 256;
PdfCanvas canvas = new PdfCanvas(pdfDoc.GetPage(i + 1));
canvas.SetStrokeColor(new DeviceRgb(R, G, B));
var rect = textWithRectangle.GetRectangle();
canvas.Rectangle(rect);
canvas.Stroke();
}
}
pdfDoc.Close();
Console.WriteLine("Press any key to continue!");
Console.ReadKey();
class SimpleTextWithRectangle
{
private Rectangle rectangle;
private string text;
public SimpleTextWithRectangle(Rectangle rectangle, String text)
{
this.rectangle = rectangle;
this.text = text;
}
public Rectangle GetRectangle()
{
return rectangle;
}
public string GetText()
{
return text;
}
}
class SimplePositionalTextEventListener : IEventListener
{
private List<SimpleTextWithRectangle> textWithRectangleList = new List<SimpleTextWithRectangle>();
private void renderText(TextRenderInfo renderInfo)
{
if (renderInfo.GetText().Trim().Length == 0)
return;
LineSegment ascent = renderInfo.GetAscentLine();
LineSegment descent = renderInfo.GetDescentLine();
float initX = descent.GetStartPoint().Get(0);
float initY = descent.GetStartPoint().Get(1);
float endX = ascent.GetEndPoint().Get(0);
float endY = ascent.GetEndPoint().Get(1);
Rectangle rectangle = new Rectangle(initX, initY, endX - initX, endY - initY);
SimpleTextWithRectangle textWithRectangle = new SimpleTextWithRectangle(rectangle, renderInfo.GetText());
textWithRectangleList.Add(textWithRectangle);
}
public List<SimpleTextWithRectangle> GetResultantTextWithPosition()
{
return textWithRectangleList;
}
public void EventOccurred(IEventData data, EventType type)
{
renderText((TextRenderInfo)data);
}
public ICollection<EventType> GetSupportedEvents()
{
return new List<EventType> { EventType.RENDER_TEXT };
}
}
and two pdf files
history-2-3.pdf
LR-10709-24-25.pdf