我正在尝试读取PDF文档中的表格,但遇到了问题。
如果我以常规方式打开PDF,它显示为:
item[tab]item[tab]item[tab]item[tab]item
item[tab]item[tab]item[tab]item[tab]item
item[tab]item[tab]item[tab]item[tab]item
我使用以下方法转换PDF:
// Extracts the text of every page of the PDF at SRC into one string.
// Each page is wrapped in INFO_START_PAGE / INFO_END_PAGE marker lines and
// every extracted text line is emitted as its own output line.
StringBuilder result = new StringBuilder();
PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC));
try
{
    for (int i = 1; i <= pdfDoc.GetNumberOfPages(); i++)
    {
        result.AppendLine("INFO_START_PAGE");
        string output = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i));
        /* Note (from the author): GetTextFromPage was modified to output [tab]
           instead of a regular space for big gaps. */
        // Normalize Windows line endings, then split directly on '\n'. Splitting via a
        // substitute character would also break lines at that character if it occurs in the text.
        foreach (string data in output.Replace("\r\n", "\n").Split('\n'))
        {
            result.AppendLine(data.Trim().Replace(" ", "[tab]"));
        }
        result.AppendLine("INFO_END_PAGE");
    }
}
finally
{
    // Close the document even when extraction throws.
    pdfDoc.Close();
}
return result.ToString();
但当我使用PDF到文本的转换来读取这些信息时,在某些情况下输出为:
item[tab]item[tab]item[tab]item[tab]item
item[tab]item[tab]item[tab]
item[tab]item
item[tab]item[tab]item[tab]item[tab]item
有没有解决此问题的方法?
您的示例文件被提取为:
Artikelnr. Omschrijving Aantal
Per stuk Kosten
VERHUUR L. GELEVERDE ARBEID PDC 8 € 43,70 € 349,60
VERHUUR O. GELEVERDE ARBEID PDC 3 € 60,95 € 182,85
VERHUUR L.L. GELEVERDE ARBEID EM 24
€ 32,20 € 772,80
正如对该问题的评论所猜想的那样,确实存在一个小的垂直偏移:在所有行中,前三列都位于相同的垂直位置,而后两列的垂直位置略有不同:
Row First columns y Last columns y
Heading row 536 535.893
First row 516 516.229
Second row 495 495.478
Third row 475 474.788
尤其可以看出,被文本提取拆断的行,正是那些两组y位置的整数部分不同的行(536与535、475与474),而整数部分相同的行(516与516、495与495)则没有被拆断。
原因在于,类TextChunkLocationDefaultImp(默认情况下用于存储文本块位置,并提供比较这些位置的方法)把文本块的y位置以整数形式存储在变量private readonly int distPerpendicular中(实际上它做了抽象,因此也适用于非水平书写的文本),并且在测试方法SameLine中要求两个distPerpendicular值相等。
namespace iText.Kernel.Pdf.Canvas.Parser.Listener {
internal class TextChunkLocationDefaultImp : ITextChunkLocation {
...
/// <summary>Perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system).
/// </summary>
/// <remarks>
/// Perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system).
/// We round to the nearest integer to handle the fuzziness of comparing floats.
/// </remarks>
private readonly int distPerpendicular;
...
/// <summary>Checks whether this location is on the same line as another location.</summary>
/// <param name="as">the location to compare to</param>
/// <returns>true if this location is on the same line as the other</returns>
public virtual bool SameLine(ITextChunkLocation @as) {
...
float distPerpendicularDiff = DistPerpendicular() - @as.DistPerpendicular();
// Identical integer perpendicular distances mean the chunks share a line;
// any other case is decided by the code elided below.
if (distPerpendicularDiff == 0) {
return true;
}
...
}
...
}
}
(实际上,SameLine在后面的代码中还会在所比较的文本块之一长度为零时允许较小的偏差。显然,零长度的文本块有时用于变音符号,而这些符号有时会绘制在不同的高度。不过这对您的示例文件无关紧要。)
如上所述,问题是由TextChunkLocationDefaultImp.SameLine的行为引起的,因此我们必须改变这种行为。不过,我们通常不想更改iText类本身的代码。
幸运的是,LocationTextExtractionStrategy具有一个构造函数,允许注入ITextChunkLocationStrategy的实现,即用于创建ITextChunkLocation实例的工厂对象。
因此,为了完成我们的任务,我们必须编写一个不那么严格的ITextChunkLocation替代实现,以及一个生成该ITextChunkLocation实现实例的ITextChunkLocationStrategy实现。
不幸的是,TextChunkLocationDefaultImp是iText内部的(internal)类,并且拥有众多私有变量。因此,我们不能简单地从它派生出自己的实现,而必须将其整体复制粘贴,并在该副本上进行修改。
从而,
/// <summary>
/// Chunk-location factory that hands out <see cref="TextChunkLocationLaxImp"/> instances.
/// Intended to be passed to the LocationTextExtractionStrategy constructor that accepts
/// an ITextChunkLocationStrategy.
/// </summary>
class LaxTextChunkLocationStrategy : LocationTextExtractionStrategy.ITextChunkLocationStrategy
{
    /// <summary>Creates the factory; it carries no state.</summary>
    public LaxTextChunkLocationStrategy()
    {
    }

    /// <summary>Builds the location object for one text chunk.</summary>
    /// <param name="renderInfo">render information of the chunk; supplies the single-space width</param>
    /// <param name="baseline">baseline segment of the chunk; supplies start and end point</param>
    /// <returns>a new <see cref="TextChunkLocationLaxImp"/> describing the chunk</returns>
    public virtual ITextChunkLocation CreateLocation(TextRenderInfo renderInfo, LineSegment baseline)
    {
        var chunkStart = baseline.GetStartPoint();
        var chunkEnd = baseline.GetEndPoint();
        var spaceWidth = renderInfo.GetSingleSpaceWidth();
        return new TextChunkLocationLaxImp(chunkStart, chunkEnd, spaceWidth);
    }
}
/// <summary>
/// Text chunk location whose <see cref="SameLine"/> test tolerates a small vertical
/// deviation between chunks instead of requiring identical perpendicular distances.
/// </summary>
class TextChunkLocationLaxImp : ITextChunkLocation
{
    /// <summary>
    /// Largest difference of the integer perpendicular distances at which two chunks
    /// are still considered to be on the same line.
    /// </summary>
    private const int SAME_LINE_ALLOWED_VERTICAL_DEVIATION = 1;

    /// <summary>
    /// Largest difference of the perpendicular distances tolerated when one of the
    /// compared chunks has zero length.
    /// </summary>
    private const float DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION = 2;

    /// <summary>Start point of the chunk.</summary>
    private readonly Vector startLocation;

    /// <summary>End point of the chunk.</summary>
    private readonly Vector endLocation;

    /// <summary>Normalized direction from start to end; (1, 0, 0) for zero-length chunks.</summary>
    private readonly Vector orientationVector;

    /// <summary>Angle of the orientation vector (atan2) multiplied by 1000 and truncated to an integer.</summary>
    private readonly int orientationMagnitude;

    /// <summary>
    /// Perpendicular distance to the orientation vector, truncated to an integer
    /// (third component of the cross product computed in the constructor).
    /// </summary>
    private readonly int distPerpendicular;

    /// <summary>Dot product of the orientation vector with the start point.</summary>
    private readonly float distParallelStart;

    /// <summary>Dot product of the orientation vector with the end point.</summary>
    private readonly float distParallelEnd;

    /// <summary>Width of a single space character, as supplied by the caller.</summary>
    private readonly float charSpaceWidth;

    /// <summary>Creates the location and precomputes orientation and distances.</summary>
    /// <param name="startLocation">start point of the chunk</param>
    /// <param name="endLocation">end point of the chunk</param>
    /// <param name="charSpaceWidth">width of a single space character</param>
    public TextChunkLocationLaxImp(Vector startLocation, Vector endLocation, float charSpaceWidth)
    {
        this.startLocation = startLocation;
        this.endLocation = endLocation;
        this.charSpaceWidth = charSpaceWidth;

        Vector oVector = endLocation.Subtract(startLocation);
        if (oVector.Length() == 0)
        {
            // A zero-length chunk has no direction of its own; fall back to the x axis.
            oVector = new Vector(1, 0, 0);
        }
        orientationVector = oVector.Normalize();
        orientationMagnitude = (int)(Math.Atan2(orientationVector.Get(Vector.I2), orientationVector.Get(Vector.I1)) * 1000);

        Vector origin = new Vector(0, 0, 1);
        distPerpendicular = (int)(startLocation.Subtract(origin)).Cross(orientationVector).Get(Vector.I3);
        distParallelStart = orientationVector.Dot(startLocation);
        distParallelEnd = orientationVector.Dot(endLocation);
    }

    /// <returns>the integer orientation value computed in the constructor</returns>
    public virtual int OrientationMagnitude()
    {
        return orientationMagnitude;
    }

    /// <returns>the integer perpendicular distance computed in the constructor</returns>
    public virtual int DistPerpendicular()
    {
        return distPerpendicular;
    }

    /// <returns>the position of the chunk start along the orientation vector</returns>
    public virtual float DistParallelStart()
    {
        return distParallelStart;
    }

    /// <returns>the position of the chunk end along the orientation vector</returns>
    public virtual float DistParallelEnd()
    {
        return distParallelEnd;
    }

    /// <returns>the start point of the chunk</returns>
    public virtual Vector GetStartLocation()
    {
        return startLocation;
    }

    /// <returns>the end point of the chunk</returns>
    public virtual Vector GetEndLocation()
    {
        return endLocation;
    }

    /// <returns>the width of a single space character</returns>
    public virtual float GetCharSpaceWidth()
    {
        return charSpaceWidth;
    }

    /// <summary>
    /// Checks whether this location is on the same line as another one. Both must have
    /// the same orientation; their perpendicular distances may differ by at most
    /// <see cref="SAME_LINE_ALLOWED_VERTICAL_DEVIATION"/>, or by at most
    /// <see cref="DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION"/> if one of the two
    /// chunks has zero length.
    /// </summary>
    /// <param name="as">the location to compare to</param>
    /// <returns>true if this location is on the same line as the other</returns>
    public virtual bool SameLine(ITextChunkLocation @as)
    {
        if (OrientationMagnitude() != @as.OrientationMagnitude())
        {
            return false;
        }

        int deviation = Math.Abs(DistPerpendicular() - @as.DistPerpendicular());
        if (deviation <= SAME_LINE_ALLOWED_VERTICAL_DEVIATION)
        {
            return true;
        }
        if (deviation > DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION)
        {
            // Too far apart even for the zero-length exception; skip building the segments.
            return false;
        }

        LineSegment mySegment = new LineSegment(startLocation, endLocation);
        LineSegment otherSegment = new LineSegment(@as.GetStartLocation(), @as.GetEndLocation());
        return mySegment.GetLength() == 0 || otherSegment.GetLength() == 0;
    }

    /// <summary>
    /// Computes the gap between the end of another chunk and the start of this one,
    /// measured along the orientation vector.
    /// </summary>
    /// <param name="other">the chunk whose end is the reference point</param>
    /// <returns>this chunk's parallel start minus the other chunk's parallel end</returns>
    public virtual float DistanceFromEndOf(ITextChunkLocation other)
    {
        return DistParallelStart() - other.DistParallelEnd();
    }

    /// <summary>
    /// Decides whether a word boundary lies between the previous chunk and this one.
    /// </summary>
    /// <param name="previous">the chunk preceding this one</param>
    /// <returns>
    /// false if either chunk has identical start and end points or if the chunks intersect;
    /// otherwise true when the gap exceeds half the single-space width
    /// </returns>
    public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
    {
        if (startLocation.Equals(endLocation) || previous.GetEndLocation().Equals(previous.GetStartLocation()))
        {
            return false;
        }

        float dist = DistanceFromEndOf(previous);
        if (dist < 0)
        {
            // Retry in the opposite direction before concluding that the chunks overlap.
            dist = previous.DistanceFromEndOf(this);
            // The chunks intersect, so no space has to be added.
            if (dist < 0)
            {
                return false;
            }
        }
        return dist > GetCharSpaceWidth() / 2.0f;
    }

    /// <summary>
    /// Checks whether a mark location lies within the first-coordinate extent of a base
    /// location and within the allowed diacritical vertical deviation of it.
    /// </summary>
    /// <param name="baseLocation">the location expected to contain the mark</param>
    /// <param name="markLocation">the location of the mark</param>
    /// <returns>true if the mark is contained as described</returns>
    internal static bool ContainsMark(ITextChunkLocation baseLocation, ITextChunkLocation markLocation)
    {
        return baseLocation.GetStartLocation().Get(Vector.I1) <= markLocation.GetStartLocation().Get(Vector.I1) &&
            baseLocation.GetEndLocation().Get(Vector.I1) >= markLocation.GetEndLocation().Get(Vector.I1) &&
            Math.Abs(baseLocation.DistPerpendicular() - markLocation.DistPerpendicular()) <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
    }
}
现在,让您的代码使用这些类,替换
string output = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i));
为
LocationTextExtractionStrategy laxStrategy = new LocationTextExtractionStrategy(new LaxTextChunkLocationStrategy());
string output = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(i), laxStrategy);
文本提取结果变为
Artikelnr. Omschrijving Aantal Per stuk Kosten
VERHUUR L. GELEVERDE ARBEID PDC 8 € 43,70 € 349,60
VERHUUR O. GELEVERDE ARBEID PDC 3 € 60,95 € 182,85
VERHUUR L.L. GELEVERDE ARBEID EM 24 € 32,20 € 772,80
如所期望的。
您在评论中问道:
请问您如何通过pdf知道行的确切位置?
我使用iText RUPS检查了页面:
在屏幕截图中选择的流的内容中,我发现:
q
...
q
1 0 0 1 60 536 cm
BT
8 0 0 8 0 0 Tm
/F3 1 Tf
(Artikelnr) Tj
8 0 0 8 31.84 0 Tm
(.) Tj
ET
Q
Q
q
...
q
1 0 0 1 147 536 cm
BT
8 0 0 8 0 0 Tm
/F3 1 Tf
(Omschrijving) Tj
ET
Q
Q
q
...
q
1 0 0 1 370 536 cm
BT
8 0 0 8 0 0 Tm
/F3 1 Tf
(Aantal) Tj
ET
Q
Q
q
...
q
1 0 0 1 433.404 535.893 cm
BT
8 0 0 8 0 0 Tm
/F3 1 Tf
(Per stuk) Tj
ET
Q
Q
q
...
q
1 0 0 1 504.878 535.893 cm
BT
8 0 0 8 0 0 Tm
/F3 1 Tf
(Kosten) Tj
ET
Q
Q
您会看到,前三个标题之前的指令是
1 0 0 1 XXX 536 cm
而最后两个标题之前的指令是
1 0 0 1 XXX 535.893 cm
由于文本矩阵始终被设置为8 0 0 8 XXX 0 Tm,即沿y轴没有平移分量,因此上述cm指令所设置的坐标系决定了文本分别在y位置536或535.893处绘制。
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句