使用iText 7，导出Flate编码图像的正确方法是什么? [英] Using iText 7, what's the proper way to export a Flate encoded image?

查看：52 发布时间：2021/2/9 19:54:51 c# pdf itext7

本文介绍了使用iText 7，导出Flate编码图像的正确方法是什么?的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！

问题描述

我正在尝试创建代码以使用iText版本7.19导出PDF中的图像.我在Flate编码的图像上遇到了一些问题.我以微软免费书中的所有Flate编码的图像为例(请参阅

I am trying to create code to export out the images within a PDF using iText Version 7.19. I'm having some issues with Flate encoded images. All the Flate encoded images from the Microsoft free book I'm using as an example (see Moving to Microsoft Visual Studio 2010) always coming out pink and depending upon how I try to copy the bytes they can come out distorted.

如果我尝试一次复制所有图像字节(请参见下面的代码中的SaveFlateEncodedImage2方法)，它们会像这样失真地出现:

If I attempt to copy all the image bytes at once (see the SaveFlateEncodedImage2 method in the code below), they come out distorted like this one:

如果我尝试逐行复制它们(请参见下面的代码中的SaveFlateEncodedImage方法)，则它们是粉红色的

If I attempt to copy them row by row (see the SaveFlateEncodedImage method in the code below), they are pink like this one

这是我用来导出它们的代码:

Here is the code that I'm using to export them:

using iText.Kernel;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Filters;
using System;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Runtime.InteropServices;

namespace ITextPdfStuff
{
    public class MyPdfImageExtractor
    {
        private readonly string _pdfFileName;

        public MyPdfImageExtractor(string pdfFileName)
        {
            _pdfFileName = pdfFileName;
        }

        public void ExtractToDirectory(string directoryName)
        {
            using (var reader = new PdfReader(_pdfFileName))
            {
                // Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
                reader.SetUnethicalReading(true);

                using (var pdfDoc = new PdfDocument(reader))
                {
                    ExtractImagesOnAllPages(pdfDoc, directoryName);
                }
            }
        }


        private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
        {
            Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");

            // Extract objects https://itextpdf.com/en/resources/examples/itext-7/extracting-objects-pdf
            for (int objNumber = 1; objNumber <= pdfDoc.GetNumberOfPdfObjects(); objNumber++)
            {

                PdfObject currentObject = pdfDoc.GetPdfObject(objNumber);

                if (currentObject != null && currentObject.IsStream())
                {
                    try
                    {                 
                        ExtractImagesOneImage(currentObject as PdfStream, Path.Combine(directoryName, $"image{objNumber}.png"));
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine($"Object number {objNumber} is NOT an image! -- error: {ex.Message}");
                    }
                }
            }
        }

        private void ExtractImagesOneImage(PdfStream someStream, string fileName)
        {
            var pdfDict = (PdfDictionary)someStream;
            string subType = pdfDict.Get(PdfName.Subtype)?.ToString() ?? string.Empty;

            bool isImage = subType == "/Image";

            if (isImage == false)
                return;

            bool decoded = false;


            string filter = pdfDict.Get(PdfName.Filter).ToString();

            if (filter == "/FlateDecode")
            {
                SaveFlateEncodedImage(fileName, pdfDict, someStream.GetBytes(false));
            }
            else
            {
                byte[] imgData;

                try
                {
                    imgData = someStream.GetBytes(decoded);
                }
                catch (PdfException ex)
                {
                    imgData = someStream.GetBytes(!decoded);
                }

                SaveNormalImage(fileName, imgData);
            }

        }

        private void SaveNormalImage(string fileName, byte[] imgData)
        {
            using (var memStream = new System.IO.MemoryStream(imgData))
            using (var image = System.Drawing.Image.FromStream(memStream))
            {
                image.Save(fileName, ImageFormat.Png);
                Console.WriteLine($"{Path.GetFileName(fileName)}");
            }
        }

        private void SaveFlateEncodedImage(string fileName, PdfDictionary pdfDict, byte[] imgData)
        {
            int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
            int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
            int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());

            // Example that helped: https://stackoverflow.com/a/8517377/97803
            PixelFormat pixelFormat;
            switch (bpp)
            {
                case 1:
                    pixelFormat = PixelFormat.Format1bppIndexed;
                    break;
                case 8:
                    pixelFormat = PixelFormat.Format8bppIndexed;
                    break;
                case 24:
                    pixelFormat = PixelFormat.Format24bppRgb;
                    break;
                default:
                    throw new Exception("Unknown pixel format " + bpp);
            }

            // .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
            // Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
            imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
            //  byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);

            // Copy the image one row at a time
            using (var bmp = new Bitmap(width, height, pixelFormat))
            {
                BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);

                int length = (int)Math.Ceiling(width * bpp / 8.0);
                for (int i = 0; i < height; i++)
                {
                    int offset = i * length;
                    int scanOffset = i * bmpData.Stride;
                    Marshal.Copy(imgData, offset, new IntPtr(bmpData.Scan0.ToInt64() + scanOffset), length);
                }

                bmp.UnlockBits(bmpData);
                bmp.Save(fileName, ImageFormat.Png);
            }

            Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
        }


        /// <summary>This method distorts the image badly</summary>
        private void SaveFlateEncodedImage2(string fileName, PdfDictionary pdfDict, byte[] imgData)
        {
            int width = int.Parse(pdfDict.Get(PdfName.Width).ToString());
            int height = int.Parse(pdfDict.Get(PdfName.Height).ToString());
            int bpp = int.Parse(pdfDict.Get(PdfName.BitsPerComponent).ToString());

            // Example that helped: https://stackoverflow.com/a/8517377/97803
            PixelFormat pixelFormat;
            switch (bpp)
            {
                case 1:
                    pixelFormat = PixelFormat.Format1bppIndexed;
                    break;
                case 8:
                    pixelFormat = PixelFormat.Format8bppIndexed;
                    break;
                case 24:
                    pixelFormat = PixelFormat.Format24bppRgb;
                    break;
                default:
                    throw new Exception("Unknown pixel format " + bpp);
            }

            // .NET docs https://api.itextpdf.com/iText7/dotnet/7.1.9/classi_text_1_1_kernel_1_1_pdf_1_1_filters_1_1_flate_decode_strict_filter.html
            // Java docs have more detail: https://api.itextpdf.com/iText7/java/7.1.7/com/itextpdf/kernel/pdf/filters/FlateDecodeFilter.html
            imgData = FlateDecodeStrictFilter.FlateDecode(imgData, true);
            // byte[] streamBytes = FlateDecodeStrictFilter.DecodePredictor(imgData, pdfDict);

            // Copy the entire image in one go
            using (var bmp = new Bitmap(width, height, pixelFormat))
            {
                BitmapData bmpData = bmp.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.WriteOnly, pixelFormat);
                Marshal.Copy(imgData, 0, bmpData.Scan0, imgData.Length);
                bmp.UnlockBits(bmpData);
                bmp.Save(fileName, ImageFormat.Png);
            }

            Console.WriteLine($"FlateDecode! {Path.GetFileName(fileName)}");
        }
    }
}

可以在.NET Core控制台应用程序中实例化并像这样调用代码:

The code can be instantiated and called like this from within a .NET Core console application:

  string existingFileName = @"c:\temp\ReallyLongBook1.pdf";
  var imageExtractor = new MyPdfImageExtractor(existingFileName);
  imageExtractor.ExtractToDirectory(@"c:\temp\images");

我正在通过此代码运行以下免费的Microsoft书籍: 移动到Microsoft Visual Studio 2010

I'm running the following free Microsoft book through this code: Moving to Microsoft Visual Studio 2010

有问题的图片在第10页上，它是黑白的(不是粉红色的).

The image in question is on page 10 and it's black and white (not pink).

我不是PDF专家，而且我一直在研究此代码两天，现在挑选了一些示例以尝试将其组合在一起.任何帮助我摆脱粉红色图像的帮助，将不胜感激.

I'm no PDF expert and I've been banging on this code for a couple of days now picking apart a number of examples to try to piece this together. Any help that would get me past my pink images, would be greatly appreciated.

-------更新2020年2月4日------

-------Update Feb 4, 2020------

此处是MKL建议的更改之后的修订版本.他的变化所提取的图像比我的更多，并产生了我上面提到的书中出现的适当外观的图像:

Here is the revised version after MKL's suggested changes. His change extracted more images than mine and produced proper looking images that appear in the book I mentioned above:

using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using iText.Kernel.Pdf.Xobject;
using System;
using System.Collections.Generic;
using System.IO;

namespace ITextPdfStuff
{
    public class MyPdfImageExtractor
    {
        private readonly string _pdfFileName;

        public MyPdfImageExtractor(string pdfFileName)
        {
            _pdfFileName = pdfFileName;
        }

        public void ExtractToDirectory(string directoryName)
        {
            using (var reader = new PdfReader(_pdfFileName))
            {
                // Avoid iText.Kernel.Crypto.BadPasswordException: https://stackoverflow.com/a/48065052/97803
                reader.SetUnethicalReading(true);

                using (var pdfDoc = new PdfDocument(reader))
                {
                    ExtractImagesOnAllPages(pdfDoc, directoryName);
                }
            }
        }

        private void ExtractImagesOnAllPages(PdfDocument pdfDoc, string directoryName)
        {
            Console.WriteLine($"Number of pdf {pdfDoc.GetNumberOfPdfObjects()} objects");

            IEventListener strategy = new ImageRenderListener(Path.Combine(directoryName, @"image{0}.{1}"));
            PdfCanvasProcessor parser = new PdfCanvasProcessor(strategy);
            for (var i = 1; i <= pdfDoc.GetNumberOfPages(); i++)
            {
                parser.ProcessPageContent(pdfDoc.GetPage(i));
            }
        }
    }


    public class ImageRenderListener : IEventListener
    {
        public ImageRenderListener(string format)
        {
            this.format = format;
        }

        public void EventOccurred(IEventData data, EventType type)
        {
            if (data is ImageRenderInfo imageData)
            {
                try
                {
                    PdfImageXObject imageObject = imageData.GetImage();
                    if (imageObject == null)
                    {
                        Console.WriteLine("Image could not be read.");
                    }
                    else
                    {
                        File.WriteAllBytes(string.Format(format, index++, imageObject.IdentifyImageFileExtension()), imageObject.GetImageBytes());
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("Image could not be read: {0}.", ex.Message);
                }
            }
        }

        public ICollection<EventType> GetSupportedEvents()
        {
            return null;
        }

        string format;
        int index = 0;
    }
}

使用iText 7，导出Flate编码图像的正确方法是什么? [英] Using iText 7, what's the proper way to export a Flate encoded image?

问题描述

推荐答案

相关文章

C#/.NET最新文章

热门教程

热门工具

登录关闭

使用iText 7，导出Flate编码图像的正确方法是什么? [英] Using iText 7, what&#39;s the proper way to export a Flate encoded image?

问题描述

推荐答案

相关文章

C#/.NET最新文章

热门教程

热门工具

登录 关闭

使用iText 7，导出Flate编码图像的正确方法是什么? [英] Using iText 7, what's the proper way to export a Flate encoded image?

登录关闭