使用 PoDoFo 库从 PDF 的数组运算符 TJ 中提取文本 [英] Extract text from array TJ in PDF operator using PoDoFo lib

查看:632
本文介绍了使用 PoDoFo 库从 PDF 的数组运算符 TJ 中提取文本的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我正在尝试使用 PoDoFo 库从 PDF 文件中提取文本。它对 Tj 运算符有效,但对(数组形式的)TJ 运算符无效。我在<a href="http://libpodofo.sourcearchive.com/documentation/0.8.4/dir_385d570d5b330000523c59e421af4f6c.html" rel="nofollow">这里</a>找到了这段代码(我做了少量修改):

 const char*      pszToken = NULL;
    PdfVariant       var;
    EPdfContentsType eType;

    PdfContentsTokenizer tokenizer( pPage );

    double dCurPosX     = 0.0;
    double dCurPosY     = 0.0;
    double dCurFontSize = 0.0;
    bool   bTextBlock   = false;
    PdfFont* pCurFont   = NULL;

    std::stack<PdfVariant> stack;



while( tokenizer.ReadNext( eType, pszToken, var ) )
{

    if( eType == ePdfContentsType_Keyword )
    {
        // support 'l' and 'm' tokens

        _RPT1(_CRT_WARN, " %s\n", pszToken);

        if( strcmp( pszToken, "l" ) == 0 ||
            strcmp( pszToken, "m" ) == 0 )
        {
            dCurPosX = stack.top().GetReal();
            stack.pop();
            dCurPosY = stack.top().GetReal();
            stack.pop();
        }
        else if (strcmp(pszToken, "Td") == 0)
        {
            dCurPosY = stack.top().GetReal();
            stack.pop();
            dCurPosX = stack.top().GetReal();
            stack.pop();
        }
        else if (strcmp(pszToken, "Tm") == 0)
        {
            dCurPosY = stack.top().GetReal();
            stack.pop();
            dCurPosX = stack.top().GetReal();
            stack.pop();
        }
        else if( strcmp( pszToken, "BT" ) == 0 )
        {
            bTextBlock   = true;
            // BT does not reset font
            // dCurFontSize = 0.0;
            // pCurFont     = NULL;
        }
        else if( strcmp( pszToken, "ET" ) == 0 )
        {
            if( !bTextBlock )
                fprintf( stderr, "WARNING: Found ET without BT!\n" );
        }

        if( bTextBlock )
        {
            if( strcmp( pszToken, "Tf" ) == 0 )
            {
                dCurFontSize = stack.top().GetReal();
                stack.pop();
                PdfName fontName = stack.top().GetName();
                PdfObject* pFont = pPage->GetFromResources( PdfName("Font"), fontName );
                if( !pFont )
                {
                    PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "Cannot create font!" );
                }

                pCurFont = pDocument->GetFont( pFont );
                if( !pCurFont )
                {
                    fprintf( stderr, "WARNING: Unable to create font for object %i %i R\n",
                        pFont->Reference().ObjectNumber(),
                        pFont->Reference().GenerationNumber() );
                }
            }
            else if( strcmp( pszToken, "Tj" ) == 0 ||
                strcmp( pszToken, "'" ) == 0 )
            {
                AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                stack.pop();
            }
            else if( strcmp( pszToken, "\"" ) == 0 )
            {
                AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                stack.pop();
                stack.pop(); // remove char spacing from stack
                stack.pop(); // remove word spacing from stack
            }
            else if( strcmp( pszToken, "TJ" ) == 0 )
            {
                PdfArray array = stack.top().GetArray();
                stack.pop();

                for( int i=0; i<static_cast<int>(array.GetSize()); i++ )
                {
                    _RPT1(_CRT_WARN, " variant: %s", array[i].GetDataTypeString());
                    if(array[i].IsHexString()) {
                        if(!pCurFont) {
                            _RPT1(_CRT_WARN, " : Could not Get font!!%d\n", i);
                        }
                        else {
                            if(!pCurFont->GetEncoding()) {
                                _RPT1(_CRT_WARN, ": could not get encoding\n",0);
                            } else {
                                PdfString s = array[i].GetString();
                                _RPT1(_CRT_WARN, " : valid :%s   ", s.IsValid()?"yes":"not");
                                _RPT1(_CRT_WARN, " ;hex :%s   ", s.IsHex()?"yes":"not");
                                _RPT1(_CRT_WARN, " ;unicode: %s   ", s.IsUnicode()?"yes":"not");

                                PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode(s,pCurFont);
                                const char* szText = unicode.GetStringUtf8().c_str();
                                _RPT1(_CRT_WARN, " : %s\n", strlen(szText)> 0? szText: "nothing");

                            }

                        }
                    }
                    else if(array[i].IsNumber()) {
                        _RPT1(_CRT_WARN, " : %d\n", array[i].GetNumber());
                    }

                    if( array[i].IsString() )//|| array[i].IsHexString())
                        AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
                }
            }
        }
    }
    else if ( eType == ePdfContentsType_Variant )
    {
        stack.push( var );

        _RPT1(_CRT_WARN, " variant: %s\n", var.GetDataTypeString());
    }
    else
    {
        // Impossible; type must be keyword or variant
        PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
    }
}

对于这段代码,我得到如下输出:

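    BT
 variant: Name
 variant: Real
 Tf
 variant: Number
 variant: Number
 variant: Number
 rg
 variant: Real
 variant: Number
 variant: Number
 variant: Number
 variant: Real
 variant: Real
 Tm
 variant: Array
 TJ
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -7
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -19
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 ET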

PDF 的流对象如下所示(很抱歉,我不能把 PDF 文件提供给你):

 
q
Q
q
Q
q
q
q
1 0 0 1 37.68 785.28 cm
91.92 0 0 31.44 0 0 cm
/Img1 Do
Q
Q
q
q
1 0 0 1 431.28 780.24 cm
42.72 0 0 7.2 0 0 cm
/Img2 Do
Q
Q
q
BT
/F1 8.88 Tf
0 0 0 rg
0.9998 0 0 1 377.28 704.4 Tm
[<0026>-7<004F>-15<004C>-15<0048>-11<0051>-11<0057>-19<0058>-11<004F>-15<0058>-11<004C>] TJ
ET
Q
q
1 0 0 1 0 0 cm
0.4799 w
0 0 0 RG
377.28 703.44 m
415.2 703.44 l
S
Q
q
BT
/F1 8.16 Tf
0 0 0 rg
0.9998 0 0 1 377.28 687.36 Tm
[<0030>9<0027>-13<002C>-16<0003>1<0026>-13<0032>13<0031>-13<0036>-9<0037>-6<0035>-13<0038>-13<0026>-13<0037>-6<0003>1<0037>-6<0035>-13<0024>-9<0031>-13<0036>-9<0003>1<0028>-9<003B>-9<0033>-9<0028>-9<0035>-13<0037>-6<0003>1<0036>-9<0035>-13<002F>] TJ
ET

PDF 文件可以在这里或这里找到


解决方案

1. 对最初问题的回答。该问题的核心代码部分如下:

else if( strcmp( pszToken, "TJ" ) == 0 )
{
    PdfArray array = stack.top().GetArray();
    stack.pop();

    for( int i=0; i<static_cast<int>(array.GetSize()); i++ )
    {
        if( array[i].IsString() )
            AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
        }
    }
}

当时的问题是:


  

我注意到 array[i].IsString() 永远不会为 true。这是从 TJ 运算符获取文本的正确方法吗?


简短的回答:

在 PoDoFo 中,PdfVariant 里的十六进制字符串是通过 IsHexString() 而不是 IsString() 来识别的。因此,你必须同时检查这两种字符串形式:

 if( array[i].IsString() || array[i].IsHexString() )

详细的回答:

PDF 中的字符串有两种基本形式:


  

字符串对象应以下列两种方式之一书写:


  
  

      
  • 作为括在圆括号 ( ) 中的字面字符序列(使用 LEFT PARENTHESIS (28h) 和 RIGHT PARENTHESIS (29h));见 7.3.4.2 "字面字符串"。


  •   
  • 作为括在尖括号 < > 中的十六进制数据(使用 LESS-THAN SIGN (3Ch) 和 GREATER-THAN SIGN (3Eh));见 7.3.4.3 "十六进制字符串"。


  •   

  
  

(ISO 32000-1 第 7.3.4 节)


PoDoFo 用 PdfString 类来表示这两种形式;在解析的上下文中,它通常被包装在 PdfVariant 中,或者更具体地说,包装在 PdfObject 中。

不过,在判断其中所含对象的类型时,PdfVariant 会区分字面字符串和十六进制字符串:

/** \returns true if this variant is a string (i.e. GetDataType() == ePdfDataType_String)
 */
inline bool IsString() const { return GetDataType() == ePdfDataType_String; }

/** \returns true if this variant is a hex-string (i.e. GetDataType() == ePdfDataType_HexString)
 */
inline bool IsHexString() const { return GetDataType() == ePdfDataType_HexString; }


  

PdfVariant.h


包装在 PdfVariant 中的 PdfString 的类型是在包装时确定的:

PdfVariant::PdfVariant( const PdfString & rsString )
{
    Init();
    Clear();

    m_eDataType  = rsString.IsHex() ? ePdfDataType_HexString : ePdfDataType_String;
    m_Data.pData = new PdfString( rsString );
}


  

PdfVariant.cpp


就你的 TJ 参数数组的各个元素而言,相关字符串是作为十六进制字符串读入的。

因此,在你的代码中必须同时考虑 IsHexString() 和 IsString():

 if( array[i].IsString() || array[i].IsHexString() )

2. 此后,在代码修改为使用 IsHexString() 进行检查之后,问题集中在

PdfString s = array[i].GetString();
_RPT1(_CRT_WARN, " : valid :%s   ", s.IsValid()?"yes":"not");
_RPT1(_CRT_WARN, " ;hex :%s   ", s.IsHex()?"yes":"not");
_RPT1(_CRT_WARN, " ;unicode: %s   ", s.IsUnicode()?"yes":"not");

PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode(s,pCurFont);
const char* szText = unicode.GetStringUtf8().c_str();
_RPT1(_CRT_WARN, " : %s\n", strlen(szText)> 0? szText: "nothing");

以及(如评论中所述)下面这个问题:


  

s.GetLength() 返回 2,而 unicode.GetLength() 返回 0,是转换没有成功吗?


对示例文档 Document2.pdf 的分析表明,该文档确实包含文本提取所需的信息。该文档中唯一以十六进制编码方式使用的字体是 /F1,其字体字典中确实包含适当的 /ToUnicode 映射,可用于可靠的文本提取。

但遗憾的是,PoDoFo 似乎尚未正确实现在解析时使用该映射。我没有看到它在任何地方读取 /ToUnicode 映射,以便让其中包含的信息可用于文本解析。看起来 PoDoFo 无法正确解析使用 Type0(即复合)字体的文档中的文本。

I am trying to extract text from a PDF file using the PoDoFo library. It works for the Tj operator but fails for the (array) TJ operator. I've found this piece of code (with my small modifications) here:

 // Walks the content stream of pPage token by token. Operands are pushed on
 // `stack` until the keyword that consumes them arrives; the text-showing
 // operators hand their string operand to AddTextElement().
 const char*      pszToken = NULL;
    PdfVariant       var;
    EPdfContentsType eType;

    PdfContentsTokenizer tokenizer( pPage );

    // Last position taken from l/m/Td/Tm operands; passed to AddTextElement().
    double dCurPosX     = 0.0;
    double dCurPosY     = 0.0;
    // Written by the Tf branch; not read anywhere in this snippet.
    double dCurFontSize = 0.0;
    // Set on BT. NOTE(review): nothing in this snippet sets it back to false.
    bool   bTextBlock   = false;
    PdfFont* pCurFont   = NULL;

    // Operand stack: every variant token is pushed, keywords pop what they use.
    std::stack<PdfVariant> stack;



while( tokenizer.ReadNext( eType, pszToken, var ) )
{

    if( eType == ePdfContentsType_Keyword )
    {
        // support 'l' and 'm' tokens

        // Debug trace of every keyword encountered.
        _RPT1(_CRT_WARN, " %s\n", pszToken);

        // NOTE(review): here the topmost operand (the one pushed last) is
        // stored as X, whereas the Td/Tm branches below store the topmost
        // operand as Y - confirm which order is intended.
        if( strcmp( pszToken, "l" ) == 0 || 
            strcmp( pszToken, "m" ) == 0 )
        {
            dCurPosX = stack.top().GetReal();
            stack.pop();
            dCurPosY = stack.top().GetReal();
            stack.pop();
        }
        else if (strcmp(pszToken, "Td") == 0)
        {
            dCurPosY = stack.top().GetReal();
            stack.pop();
            dCurPosX = stack.top().GetReal();
            stack.pop();
        }
        else if (strcmp(pszToken, "Tm") == 0)
        {
            // Only the two topmost operands are consumed. The streams quoted
            // with this question pass six operands to Tm
            // (e.g. "0.9998 0 0 1 377.28 704.4 Tm"), so the remaining four
            // stay on the stack.
            dCurPosY = stack.top().GetReal();
            stack.pop();
            dCurPosX = stack.top().GetReal(); 
            stack.pop();
        }
        else if( strcmp( pszToken, "BT" ) == 0 ) 
        {
            bTextBlock   = true;     
            // BT does not reset font
            // dCurFontSize = 0.0;
            // pCurFont     = NULL;
        }
        else if( strcmp( pszToken, "ET" ) == 0 ) 
        {
            // Only warns about an ET seen before any BT; bTextBlock is left as is.
            if( !bTextBlock ) 
                fprintf( stderr, "WARNING: Found ET without BT!\n" );
        }

        // Separate `if` (not part of the chain above): font selection and the
        // text-showing operators are only handled once a BT has been seen.
        if( bTextBlock ) 
        {
            if( strcmp( pszToken, "Tf" ) == 0 ) 
            {
                // Operand order on the stack: font size on top, font name below it.
                dCurFontSize = stack.top().GetReal();
                stack.pop();
                // NOTE(review): the font name is read via top() but never popped.
                PdfName fontName = stack.top().GetName();
                PdfObject* pFont = pPage->GetFromResources( PdfName("Font"), fontName );
                if( !pFont ) 
                {
                    PODOFO_RAISE_ERROR_INFO( ePdfError_InvalidHandle, "Cannot create font!" );
                }

                pCurFont = pDocument->GetFont( pFont );
                if( !pCurFont ) 
                {
                    fprintf( stderr, "WARNING: Unable to create font for object %i %i R\n",
                        pFont->Reference().ObjectNumber(),
                        pFont->Reference().GenerationNumber() );
                }
            }
            else if( strcmp( pszToken, "Tj" ) == 0 ||
                strcmp( pszToken, "'" ) == 0 ) 
            {
                // Single string operand on top of the stack.
                AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                stack.pop();
            }
            else if( strcmp( pszToken, "\"" ) == 0 )
            {
                // String operand on top, followed by two spacing operands that are discarded.
                AddTextElement( dCurPosX, dCurPosY, pCurFont, stack.top().GetString() );
                stack.pop();
                stack.pop(); // remove char spacing from stack
                stack.pop(); // remove word spacing from stack
            }
            else if( strcmp( pszToken, "TJ" ) == 0 ) 
            {
                // The single operand is an array; its elements are inspected one by one.
                PdfArray array = stack.top().GetArray();
                stack.pop();

                for( int i=0; i<static_cast<int>(array.GetSize()); i++ ) 
                {
                    _RPT1(_CRT_WARN, " variant: %s", array[i].GetDataTypeString());
                    // Hex-string elements are only traced in this branch (validity
                    // flags plus the result of converting through the font
                    // encoding); nothing here forwards them to AddTextElement().
                    if(array[i].IsHexString()) {
                        if(!pCurFont) {
                            _RPT1(_CRT_WARN, " : Could not Get font!!%d\n", i);
                        }
                        else {
                            if(!pCurFont->GetEncoding()) {
                                _RPT1(_CRT_WARN, ": could not get encoding\n",0);
                            } else {
                                PdfString s = array[i].GetString();
                                _RPT1(_CRT_WARN, " : valid :%s   ", s.IsValid()?"yes":"not");
                                _RPT1(_CRT_WARN, " ;hex :%s   ", s.IsHex()?"yes":"not");
                                _RPT1(_CRT_WARN, " ;unicode: %s   ", s.IsUnicode()?"yes":"not");

                                PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode(s,pCurFont);
                                // NOTE(review): szText is only valid while the string
                                // returned by GetStringUtf8() is alive - confirm it
                                // returns a reference rather than a temporary.
                                const char* szText = unicode.GetStringUtf8().c_str();
                                // Prints "nothing" when the converted text is empty.
                                _RPT1(_CRT_WARN, " : %s\n", strlen(szText)> 0? szText: "nothing");

                            }

                        }
                    }
                    else if(array[i].IsNumber()) {
                        // NOTE(review): %d assumes GetNumber() yields an int-sized
                        // value - confirm against its return type.
                        _RPT1(_CRT_WARN, " : %d\n", array[i].GetNumber());
                    }

                    // Only elements for which IsString() is true are forwarded; the
                    // IsHexString() alternative is commented out, so hex-string
                    // elements never reach AddTextElement().
                    if( array[i].IsString() )//|| array[i].IsHexString())
                        AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
                }
            }
        }
    }
    else if ( eType == ePdfContentsType_Variant )
    {
        // Operand: keep it until the keyword that uses it shows up.
        stack.push( var );

        _RPT1(_CRT_WARN, " variant: %s\n", var.GetDataTypeString());
    }
    else
    {
        // Impossible; type must be keyword or variant
        PODOFO_RAISE_ERROR( ePdfError_InternalLogic );
    }
}

and for the code I get this output:

    BT
 variant: Name
 variant: Real
 Tf
 variant: Number
 variant: Number
 variant: Number
 rg
 variant: Real
 variant: Number
 variant: Number
 variant: Number
 variant: Real
 variant: Real
 Tm
 variant: Array
 TJ
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -7
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -19
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -15
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 variant: Number : -11
 variant: HexString : valid :yes    ;hex :yes    ;unicode: not    : nothing
 ET

The PDF stream object looks like this (I'm sorry, but I'm not allowed to give you the PDF file):

    q
Q
q
Q
q
q
q
1 0 0 1 37.68 785.28 cm
91.92 0 0 31.44 0 0 cm
/Img1 Do
Q
Q
q
q
1 0 0 1 431.28 780.24 cm
42.72 0 0 7.2 0 0 cm
/Img2 Do
Q
Q
q
BT
/F1 8.88 Tf
0 0 0 rg
0.9998 0 0 1 377.28 704.4 Tm
[<0026>-7<004F>-15<004C>-15<0048>-11<0051>-11<0057>-19<0058>-11<004F>-15<0058>-11<004C>] TJ
ET
Q
q
1 0 0 1 0 0 cm
0.4799 w
0 0 0 RG
377.28 703.44 m
415.2 703.44 l
S
Q
q
BT
/F1 8.16 Tf
0 0 0 rg
0.9998 0 0 1 377.28 687.36 Tm
[<0030>9<0027>-13<002C>-16<0003>1<0026>-13<0032>13<0031>-13<0036>-9<0037>-6<0035>-13<0038>-13<0026>-13<0037>-6<0003>1<0037>-6<0035>-13<0024>-9<0031>-13<0036>-9<0003>1<0028>-9<003B>-9<0033>-9<0028>-9<0035>-13<0037>-6<0003>1<0036>-9<0035>-13<002F>] TJ
ET

The PDF file can be found here or here.

解决方案

1. The answer to the original question for which the central code part was this:

// TJ branch as posted in the original question: pops the array operand and
// forwards only those elements for which IsString() is true.
else if( strcmp( pszToken, "TJ" ) == 0 ) 
{
    PdfArray array = stack.top().GetArray();
    stack.pop();

    for( int i=0; i<static_cast<int>(array.GetSize()); i++ ) 
    {
        if( array[i].IsString() )
            AddTextElement( dCurPosX, dCurPosY, pCurFont, array[i].GetString() );
        }   // closes the for loop (the if above has no braces of its own)
    }       // closes the else-if body
}           // NOTE(review): no matching opening brace inside this excerpt - presumably closes an enclosing scope

and the question was:

I've noticed that array[i].IsString() never gets to be true. Is this the right way to get the text from a TJ operator?

The short answer:

Hexadecimal strings in PoDoFo PdfVariants are recognized by IsHexString() instead of IsString(). Thus, you have to test for both string flavors:

if( array[i].IsString() || array[i].IsHexString() )

The long answer:

There are two basic flavors of strings in PDF:

String objects shall be written in one of the following two ways:

  • As a sequence of literal characters enclosed in parentheses ( ) (using LEFT PARENTHESIS (28h) and RIGHT PARENTHESIS (29h)); see 7.3.4.2, "Literal Strings."

  • As hexadecimal data enclosed in angle brackets < > (using LESS-THAN SIGN (3Ch) and GREATER-THAN SIGN (3Eh)); see 7.3.4.3, "Hexadecimal Strings."

(section 7.3.4 in ISO 32000-1)

PoDoFo models both using the PdfString class which in the context of parsing often is wrapped inside a PdfVariant or even more specifically in a PdfObject.

When determining the type of the object contained in it, though, the PdfVariant differentiates between literal strings and hexadecimal strings:

/** Tests whether this variant holds a literal (non-hex) string.
 *
 *  The check compares GetDataType() against ePdfDataType_String only; a
 *  variant tagged ePdfDataType_HexString is tested by IsHexString() instead.
 *
 *  \returns true if this variant is a string (i.e. GetDataType() == ePdfDataType_String)
 */
inline bool IsString() const { return GetDataType() == ePdfDataType_String; }

/** Tests whether this variant holds a hexadecimal string.
 *
 *  The check compares GetDataType() against ePdfDataType_HexString only.
 *
 *  \returns true if this variant is a hex-string (i.e. GetDataType() == ePdfDataType_HexString)
 */
inline bool IsHexString() const { return GetDataType() == ePdfDataType_HexString; }

(PdfVariant.h)

The type of the PdfString inside a PdfVariant is determined when wrapped:

/** Constructs a variant that wraps a PdfString.
 *
 *  The data type tag is chosen from rsString.IsHex(): hex strings are tagged
 *  ePdfDataType_HexString, all other strings ePdfDataType_String.
 *
 *  \param rsString the string to wrap; a copy allocated with new is stored in m_Data.pData
 */
PdfVariant::PdfVariant( const PdfString & rsString )
{
    Init();
    Clear();

    // This tag is what IsString() / IsHexString() later compare against.
    m_eDataType  = rsString.IsHex() ? ePdfDataType_HexString : ePdfDataType_String;
    // NOTE(review): where this allocation is released is not visible in this excerpt.
    m_Data.pData = new PdfString( rsString );
}

(PdfVariant.cpp)

In case of your TJ argument array components, the strings in question are read as hexadecimal strings.

In your code, therefore, you have to consider both IsHexString() and IsString():

if( array[i].IsString() || array[i].IsHexString() )

2. Thereafter, and after the code was revised to check using IsHexString(), the question centered on

// Diagnostic excerpt from the revised TJ loop: traces three flags of the
// array element's string, converts it through the current font's encoding
// and traces the resulting UTF-8 text.
PdfString s = array[i].GetString();
_RPT1(_CRT_WARN, " : valid :%s   ", s.IsValid()?"yes":"not");
_RPT1(_CRT_WARN, " ;hex :%s   ", s.IsHex()?"yes":"not");
_RPT1(_CRT_WARN, " ;unicode: %s   ", s.IsUnicode()?"yes":"not");

PdfString unicode = pCurFont->GetEncoding()->ConvertToUnicode(s,pCurFont);
// NOTE(review): szText is only valid while the string returned by
// GetStringUtf8() is alive - confirm it returns a reference rather than a temporary.
const char* szText = unicode.GetStringUtf8().c_str();
// Prints "nothing" when the converted text is empty.
_RPT1(_CRT_WARN, " : %s\n", strlen(szText)> 0? szText: "nothing");

and the problem (as stated in comments) that

the s.GetLength() returns 2 and unicode.GetLength() returns 0, the conversion didn't work?

An analysis of the example document Document2.pdf shows that the document in question does contain the information required for text extraction. The only font present in that document which is used with hexadecimal encoding is /F1, and its font dictionary does contain an appropriate /ToUnicode map for reliable text extraction.

Unfortunately, though, PoDoFo does not yet seem to have properly implemented the use of that map for parsing purposes. I do not see it retrieving the /ToUnicode map anywhere to make the contained information available for text parsing. It looks like PoDoFo cannot be used to properly parse the text of documents using Type0 (aka composite) fonts.

这篇关于使用 PoDoFo 库从 PDF 的数组运算符 TJ 中提取文本的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆