解析波斯PDF文件为TXT和图像 [英] Parse a persian pdf file to txt and its images

查看:183
本文介绍了解析波斯PDF文件为TXT和图像的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我用这段代码转换英文 PDF 时一切正常,但用它处理波斯语文件时,输出中没有任何波斯语字符!我该如何把 Unicode PDF 解析成一个文本文件和一个包含图像文件的文件夹?

 使用系统;
使用System.IO;
使用iTextSharp.text.pdf;
使用System.Text.RegularEx pressions;

命名空间Spider.Utils
{
    ///<总结>
    ///解析PDF文件,并​​从中提取出文字。
    ///< /总结>
    公共类PDFParser
    {
        /// BT =开头的文本对象运营商
        /// ET =结束文本对象运营商
        /// Td的移动到下一行的开始
        /// 5 TS =标
        /// -5 TS =标

        #地区的字段

        #地区_numberOfCharsToKeep
        ///<总结>
        ///字符数保持,提取文本的时候。
        ///< /总结>
        私有静态诠释_numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #地区ExtractText
        ///<总结>
        ///提取从PDF文件的文本。
        ///< /总结>
        ///< PARAM NAME =inFileName>的完整路径,PDF文件和LT; /参数>
        ///< PARAM NAME =outFileName>输出文件的名称和LT; /参数>
        ///<返回>将提取的文字< /回报>
        公共BOOL ExtractText(字符串inFileName,串outFileName)
        {
            的StreamWriter不过outFile = NULL;
            尝试
            {
                //创建一个阅读器,用于给定的PDF文件
                PdfReader读卡器=新PdfReader(inFileName);
                //不过outFile = File.CreateText(outFileName);
                不过outFile =新的StreamWriter(outFileName,假的,System.Text.Encoding.UTF8);

                Console.Write(处理);

                INT totalLen = 68;
                浮动charUnit =((浮点)totalLen)/(浮点)reader.NumberOfPages;
                INT totalWritten = 0;
                浮curUnit = 0;

                对于(INT页= 1;网页< = reader.NumberOfPages;网页++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(页))+);

                    //编写的进展情况。
                    如果(charUnit> = 1.0F)
                    {
                        的for(int i = 0; I<(INT)charUnit;我++)
                        {
                            Console.Write(#);
                            totalWritten ++;
                        }
                    }
                    其他
                    {
                        curUnit + = charUnit;
                        如果(curUnit> = 1.0F)
                        {
                            的for(int i = 0; I<(INT)curUnit;我++)
                            {
                                Console.Write(#);
                                totalWritten ++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                如果(totalWritten< totalLen)
                {
                    的for(int i = 0;我≤(totalLen  -  totalWritten);我++)
                    {
                        Console.Write(#);
                    }
                }
                返回true;
            }
            抓住
            {
                返回false;
            }
            最后
            {
                如果(!不过outFile = NULL)outFile.Close();
            }
        }
        #endregion

        #地区ExtractTextFromPDFBytes
        ///<总结>
        ///此方法处理pssed的Adobe(文本)对象的uncom $ P $
        ///并提取文本。
        ///< /总结>
        ///< PARAM NAME =输入> uncom pressed< /参数>
        ///<返回>< /回报>
        公共字符串ExtractTextFromPDFBytes(byte []的输入)
        {
            如果(输入== NULL || input.Length == 0)返回;

            尝试
            {
                字符串resultString =;

                //下垂的表现,如果我们是我们目前的文本对象中
                布尔inTextObject = FALSE;

                //信号表示,如果下一个字符是文字
                // 例如。 '\\'获得'\'字或'\('得到'('
                布尔nextLiteral = FALSE;

                //()括号嵌套层次。文本内出现()
                INT bracketDepth = 0;

                //保存previous字符得到提取号码等:
                的char [] previousCharacters =新的char [_numberOfCharsToKeep]
                对于(INT J = 0; J< _numberOfCharsToKeep; J ++)previousCharacters [J] ='';


                的for(int i = 0; I< input.Length;我++)
                {
                    炭C =(char)的输入[I]
                    如果(输入[I] == 213)
                        C ='.ToCharArray()[0];

                    如果(inTextObject)
                    {
                        //位置的文字
                        如果(bracketDepth == 0)
                        {
                            如果(CheckToken(新的String [] {TD,Td的},previousCharacters))
                            {
                                resultString + =\ñ\ r;
                            }
                            其他
                            {
                                如果(CheckToken(新的String [] {',T *,\},previousCharacters))
                                {
                                    resultString + =\ N的;
                                }
                                其他
                                {
                                    如果(CheckToken(新的String [] {TJ},previousCharacters))
                                    {
                                        resultString + =;
                                    }
                                }
                            }
                        }

                        //文本对象的结束,也进入到一个新行。
                        如果(bracketDepth == 0安培;&安培;
                            CheckToken(新的String [] {ET},previousCharacters))
                        {

                            inTextObject = FALSE;
                            resultString + =;
                        }
                        其他
                        {
                            //开始输出文本
                            如果((三=='(')及及(bracketDepth == 0)&安培;&安培;!(nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            其他
                            {
                                //停止输出文本
                                如果((三==')')及&安培; (bracketDepth == 1)及;&安培; (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                其他
                                {
                                    //只是一个普通的文本字符:
                                    如果(bracketDepth == 1)
                                    {
                                        //只有无论什么打印出一个字符。
                                        //不跨preT。
                                        如果(C =='\\'和;&安培;!nextLiteral)
                                        {
                                            resultString + = c.ToString();
                                            nextLiteral = TRUE;
                                        }
                                        其他
                                        {
                                            如果(((c取代; ='')及及(℃下='〜'))||
                                                ((c取代; = 128)及及(℃下255)))
                                            {
                                                resultString + = c.ToString();
                                            }

                                            nextLiteral = FALSE;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    //存储近期字符
                    //当我们要回去了检查
                    对于(INT J = 0; J< _numberOfCharsToKeep  -  1; J ++)
                    {
                        previousCharacters [J] = previousCharacters [J + 1];
                    }
                    previousCharacters [_numberOfCharsToKeep  -  1] = C;

                    //启动文本对象
                    如果(inTextObject&安培;!&安培; CheckToken(新的String [] {BT},previousCharacters))
                    {
                        inTextObject = TRUE;
                    }
                }

                返回CleanupContent(resultString);
            }
            抓住
            {
                返回 ;
            }
        }

        私人字符串CleanupContent(文本字符串)
        {
            字符串[]模式= {@\\\(@\\\),@\\ 226,@\\ 222,@\\ 223,@\\ 224, @\\ 340,@\\ 342,@\\ 344,@\\ 300,@\\ 302,@\\ 304,@\\ 351,@ \\ 350,@\\ 352,@\\ 353,@\\ 311,@\\ 310,@\\ 312,@\\ 313,@ \\ 362,@\\ 364,@\\ 366,@\\ 322,@\\ 324,@\\ 326,@\\ 354,@\ \ 356,@\\ 357,@\\ 314,@\\ 316,@\\ 317,@\\ 347,@\\ 307,@\\ 371,@\\ 373,@\\ 374,@\\ 331,@\\ 333,@\\ 334,@\\ 256,@\\ 231 @\\ 253,@\\ 273,@\\ 251,@\\ 221};
            字符串[]替换= {(,), - ,',\,\,A,A,A,A,A ,A,E,E,E,E,E,E,E,E,O,O,O O,O,O,I,我,我,我,我,我,C,C,U,U ,U,U,U,U,®,™,«,»,©,'};

            的for(int i = 0; I< patterns.Length;我++)
            {
                字符串regExPattern =图案[我]
                正则表达式的regex ​​=新的正则表达式(regExPattern,RegexOptions.IgnoreCase);
                文= regex.Replace(文字,替换[I]);
            }

            返回文本;
        }

        #endregion

        #地区CheckToken
        ///<总结>
        ///检查某2字符标记只是来了(如:BT)
        ///< /总结>
        ///< PARAM NAME =记号>在搜索令牌LT; /参数>
        ///< PARAM NAME =最近>在最近的字符数组< /参数>
        ///<返回>< /回报>
        私人布尔CheckToken(字符串[]令牌的char []近期)
        {
            的foreach(在令牌字符串标记)
            {
                如果((近期[_numberOfCharsToKeep  -  3] ==令牌[0])及和放大器;
                    (近期[_numberOfCharsToKeep  -  2] ==令牌[1])及和放大器;
                    ((近期[_numberOfCharsToKeep  -  1] =='')||
                    (近期[_numberOfCharsToKeep  -  1] == 0X0D)||
                    (近期[_numberOfCharsToKeep  -  1] ==的0x0A))及和放大器;
                    ((近期[_numberOfCharsToKeep  -  4] =='')||
                    (近期[_numberOfCharsToKeep  -  4] == 0X0D)||
                    (近期[_numberOfCharsToKeep  -  4] ==的0x0A))
                    )
                {
                    返回true;
                }
            }
            返回false;
        }
        #endregion
    }
}
 

解决方案

你不使用相对较新的解析类,有什么特别的原因吗?我不懂波斯语,但我在 Google 上找到的第一个波斯语 PDF 可以正常处理,而且代码少得多:

  PdfReader读卡器=新PdfReader(pdfPath);
PdfReaderContentParser分析器=新PdfReaderContentParser(读卡器);
StringBuilder的SB =新的StringBuilder();
的for(int i = 1; I< = reader.NumberOfPages;我++){
  ITextExtractionStrategy策略= parser.ProcessContent(
    我,新SimpleTextExtractionStrategy()
  );
  sb.Append(strategy.GetResultantText());
}
 

最近修复了不少 bug,所以我用的是最新的 iTextSharp SVN 构建版本。另外,你的问题标题提到了解析图像,但你的代码并没有处理图像,因此上面的示例不会提取任何图像。

I used this code to convert an English PDF and it works perfectly, but when I use it on a Persian file, the output contains no Persian characters! How can I parse a Unicode PDF into a text file and a folder containing its image files?

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts the text of every page of a PDF file and writes it, UTF-8 encoded,
        /// to an output file, while printing a 68-character progress bar to the console.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>
        /// <c>true</c> when all pages were processed; <c>false</c> when any exception
        /// occurred (the exception itself is swallowed).
        /// </returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            // Width of the console progress bar, in '#' characters.
            const int totalLen = 68;

            PdfReader reader = null;
            StreamWriter outFile = null;
            try
            {
                // Open the PDF first so that no output file is created when the input is unreadable.
                reader = new PdfReader(inFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int pageCount = reader.NumberOfPages;
                int totalWritten = 0;

                for (int page = 1; page <= pageCount; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Advance the bar to the share of pages completed so far. Computing the
                    // target from the page number keeps fractional progress instead of
                    // discarding it, and reaches exactly totalLen on the last page.
                    int target = (int)((long)page * totalLen / pageCount);
                    while (totalWritten < target)
                    {
                        Console.Write("#");
                        totalWritten++;
                    }
                }

                // Complete the bar when the loop did not fill it (e.g. a document without pages).
                while (totalWritten < totalLen)
                {
                    Console.Write("#");
                    totalWritten++;
                }

                return true;
            }
            catch
            {
                // Deliberate best-effort contract: every failure is reported as false.
                return false;
            }
            finally
            {
                // Release both handles even if closing the writer fails.
                try
                {
                    if (outFile != null) outFile.Close();
                }
                finally
                {
                    if (reader != null) reader.Close();
                }
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// Scans the bytes of an uncompressed PDF content stream and collects the
        /// characters found between '(' and ')' inside BT ... ET text objects.
        /// </summary>
        /// <param name="input">the uncompressed content stream; may be null or empty.</param>
        /// <returns>
        /// The collected text after <c>CleanupContent</c> has been applied, or an empty
        /// string when the input is null/empty or any exception occurs while scanning.
        /// </returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                // A builder avoids re-copying the whole result for every appended character.
                System.Text.StringBuilder result = new System.Text.StringBuilder();

                // Operator groups looked for in the recent-character window; allocated
                // once here instead of once per input byte.
                string[] moveTokens = new string[] { "TD", "Td" };
                string[] newLineTokens = new string[] { "'", "T*", "\"" };
                string[] showTokens = new string[] { "Tj" };
                string[] endTextTokens = new string[] { "ET" };
                string[] beginTextTokens = new string[] { "BT" };

                // True while the scan position is between a BT and the matching ET.
                bool inTextObject = false;

                // True when the previous character was an unescaped backslash, so the
                // current character is taken literally (e.g. "\(" or "\\").
                bool nextLiteral = false;

                // 1 while inside a "( ... )" string, 0 otherwise.
                int bracketDepth = 0;

                // Sliding window over the most recent characters, newest last.
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++)
                {
                    previousCharacters[j] = ' ';
                }

                for (int i = 0; i < input.Length; i++)
                {
                    // Bytes are widened one-to-one to chars; byte 213 is emitted as an apostrophe.
                    char c = input[i] == 213 ? '\'' : (char)input[i];

                    if (inTextObject)
                    {
                        // Outside a string: translate positioning/show operators that have
                        // just been completed into line breaks or spaces.
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(moveTokens, previousCharacters))
                            {
                                // NOTE(review): "\n\r" looks like a reversed CR/LF pair; kept
                                // unchanged so existing output stays byte-identical.
                                result.Append("\n\r");
                            }
                            else if (CheckToken(newLineTokens, previousCharacters))
                            {
                                result.Append("\n");
                            }
                            else if (CheckToken(showTokens, previousCharacters))
                            {
                                result.Append(" ");
                            }
                        }

                        if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                        {
                            // End of the text object.
                            inTextObject = false;
                            result.Append(" ");
                        }
                        else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                        {
                            // Start of a string: begin outputting text.
                            bracketDepth = 1;
                        }
                        else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                        {
                            // End of the string: stop outputting text.
                            bracketDepth = 0;
                        }
                        else if (bracketDepth == 1)
                        {
                            if (c == '\\' && !nextLiteral)
                            {
                                // Keep the backslash in the output; CleanupContent resolves
                                // the escape sequences afterwards.
                                result.Append(c);
                                nextLiteral = true;
                            }
                            else
                            {
                                // Emit only ' '..'~' and 128..254; everything else is dropped.
                                if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                                {
                                    result.Append(c);
                                }

                                nextLiteral = false;
                            }
                        }
                    }

                    // Slide the window left by one and store the current character last.
                    Array.Copy(previousCharacters, 1, previousCharacters, 0, _numberOfCharsToKeep - 1);
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object (checked after c is stored, so BT is seen
                    // as soon as its trailing whitespace has been read).
                    if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(result.ToString());
            }
            catch
            {
                // Deliberate best-effort contract: any failure yields an empty string.
                return "";
            }
        }

        // Escape sequences that the extractor leaves in the text, paired with the
        // character each one is replaced by. Rows are applied in order. The octal
        // codes appear to follow Windows-1252 (TODO confirm against real input).
        private static readonly string[,] _escapeReplacements =
        {
            { @"\(", "(" }, { @"\)", ")" },
            { @"\226", "-" }, { @"\222", "'" }, { @"\223", "\"" }, { @"\224", "\"" },
            { @"\340", "à" }, { @"\342", "â" }, { @"\344", "ä" },
            { @"\300", "À" }, { @"\302", "Â" }, { @"\304", "Ä" },
            { @"\351", "é" }, { @"\350", "è" }, { @"\352", "ê" }, { @"\353", "ë" },
            { @"\311", "É" }, { @"\310", "È" }, { @"\312", "Ê" }, { @"\313", "Ë" },
            { @"\362", "ò" }, { @"\364", "ô" }, { @"\366", "ö" },
            { @"\322", "Ò" }, { @"\324", "Ô" }, { @"\326", "Ö" },
            { @"\354", "ì" }, { @"\356", "î" }, { @"\357", "ï" },
            { @"\314", "Ì" }, { @"\316", "Î" }, { @"\317", "Ï" },
            { @"\347", "ç" }, { @"\307", "Ç" },
            { @"\371", "ù" }, { @"\373", "û" }, { @"\374", "ü" },
            { @"\331", "Ù" }, { @"\333", "Û" }, { @"\334", "Ü" },
            { @"\256", "®" }, { @"\231", "™" }, { @"\253", "«" }, { @"\273", "»" },
            { @"\251", "©" }, { @"\221", "'" }
        };

        /// <summary>
        /// Replaces the escaped parentheses and octal escape sequences listed in
        /// <c>_escapeReplacements</c> with their plain characters.
        /// </summary>
        /// <param name="text">the raw extracted text.</param>
        /// <returns>the text with all known escape sequences replaced; null or empty input is returned as is.</returns>
        private string CleanupContent(string text)
        {
            if (string.IsNullOrEmpty(text)) return text;

            // Every sequence is a fixed literal without letters, so an ordinal
            // string replacement gives the same result as the case-insensitive
            // regular expressions used before, without building 46 Regex objects
            // on each call.
            for (int i = 0; i < _escapeReplacements.GetLength(0); i++)
            {
                text = text.Replace(_escapeReplacements[i, 0], _escapeReplacements[i, 1]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Checks whether one of the given operator tokens has just come along, i.e.
        /// the recent-character window ends with: whitespace, the token, whitespace
        /// (e.g. " BT\n"). Whitespace here means space, CR or LF.
        /// </summary>
        /// <param name="tokens">the searched tokens; each may be one or more characters long.</param>
        /// <param name="recent">the recent character array, newest character last.</param>
        /// <returns><c>true</c> if any token matches at the end of the window; otherwise <c>false</c>.</returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            if (tokens == null || recent == null || recent.Length == 0) return false;

            // The newest character must be the whitespace that terminates the token.
            int last = recent.Length - 1;
            if (!IsTokenSeparator(recent[last])) return false;

            foreach (string token in tokens)
            {
                if (string.IsNullOrEmpty(token)) continue;

                // The token occupies [start, last). Deriving the position from the
                // token length (rather than assuming two characters) keeps one-character
                // operators such as ' and " from indexing past the end of the token.
                int start = last - token.Length;
                if (start < 1 || !IsTokenSeparator(recent[start - 1])) continue;

                bool matches = true;
                for (int k = 0; k < token.Length; k++)
                {
                    if (recent[start + k] != token[k])
                    {
                        matches = false;
                        break;
                    }
                }

                if (matches) return true;
            }

            return false;
        }

        /// <summary>
        /// Tells whether a character is one of the separators accepted around a token:
        /// space, carriage return or line feed.
        /// </summary>
        private static bool IsTokenSeparator(char c)
        {
            return c == ' ' || c == 0x0d || c == 0x0a;
        }
        #endregion
    }
}

解决方案

Is there a specific reason you're not using the relatively new parsing classes? I don't know the Persian language, but the first Persian PDF I found on Google works, and it's much less code:

// Requires: using System.Text; using iTextSharp.text.pdf; using iTextSharp.text.pdf.parser;
PdfReader reader = new PdfReader(pdfPath);
StringBuilder sb = new StringBuilder();
try {
  PdfReaderContentParser parser = new PdfReaderContentParser(reader);
  for (int i = 1; i <= reader.NumberOfPages; i++) {
    // Pages are 1-based; a fresh strategy is used for every page.
    ITextExtractionStrategy strategy = parser.ProcessContent(
      i, new SimpleTextExtractionStrategy()
    );
    sb.Append(strategy.GetResultantText());
  }
}
finally {
  // Release the underlying file even when extraction fails.
  reader.Close();
}

A number of bugs have been fixed recently, so I'm using the latest iTextSharp SVN build. Also your question title includes parsing images, but your code doesn't, so the example above is not extracting any images.

这篇关于解析波斯PDF文件为TXT和图像的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆