突出显示现有PDF中的单词 [英] Highlight words inside existing PDF
本文介绍了突出显示现有PDF中的单词的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
在给定我已经提取的特定坐标的情况下,我需要突出显示现有PDF中的一组单词。 我正在使用Apache的pdfbox(最新版本2.0.8)。 有一个示例文件可以用于此目的(pdfbox网站内的AddAnnotations.java),但我认为此示例是用较旧的Java版本编译的,因为以下导入不起作用:
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;
有人能帮我吗?使用此库突出显示单词的最简单方法是什么?
推荐答案
下面是高亮显示文档中所有单词的代码。修改此脚本可以轻松地仅突出显示一组特定的单词。请注意,这只是一个测试版本,还需要进一步处理以换行结尾的单词,以及位于负角度旋转(-90/-180度)的横向/纵向PDF页面中的单词。此脚本也还可以进一步优化。
此脚本是使用Apache PDFBox 2.0.8构建的。
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
/**
 * Highlights every token of an existing PDF by adding one highlight
 * text-markup annotation per token.
 *
 * <p>The class extends {@link PDFTextStripper} so that its overridden
 * {@link #writeString(String, List)} can record, for each string handed to it
 * during {@code writeText}, the token text and a small array of geometry data.
 * {@link #main(String[])} then turns each recorded entry into an annotation.
 *
 * <p>Known limitations (inherited from the original test script): only page
 * rotations of 0 and 90 degrees are treated specially; every other rotation
 * falls through to the unrotated formula.
 *
 * <p>The collected data lives in static fields that are re-created by every
 * constructor call, so only one extraction at a time is supported.
 */
public class TestAnnotatePDF extends PDFTextStripper
{
    // Slot layout of every double[] stored in `coordinates`.
    private static final int MIN_X = 0;     // getX() of the first character
    private static final int MIN_Y = 1;     // getY() of the first character
    private static final int MAX_X = 2;     // getEndX() of the last character
    private static final int MAX_Y = 3;     // getY() of the last character
    private static final int PAGE = 4;      // getCurrentPageNo() at extraction time
    private static final int HEIGHT = 5;    // largest getHeight() among the characters
    private static final int WIDTH = 6;     // largest getWidth() among the characters
    private static final int ROTATION = 7;  // getRotation() of the last character

    /** Geometry data for each token; index-aligned with {@link #tokenStream}. */
    static List<double[]> coordinates;

    /** Text of each token; index-aligned with {@link #coordinates}. */
    static ArrayList<String> tokenStream;

    /**
     * Creates the stripper and resets the static collections, discarding
     * whatever a previous instance collected.
     *
     * @throws IOException if the superclass constructor fails
     */
    public TestAnnotatePDF() throws IOException
    {
        coordinates = new ArrayList<>();
        tokenStream = new ArrayList<>();
    }

    /**
     * Loads a PDF, highlights every extracted token and saves the result.
     *
     * @param args optional: {@code args[0]} is the input path (default
     *             {@code "MyDocument"}), {@code args[1]} is the output path
     *             (default {@code "MyDocument_final.pdf"})
     * @throws IOException declared for compatibility; I/O failures are
     *                     reported on standard error instead of propagating
     */
    public static void main(String[] args) throws IOException
    {
        File input = new File(args != null && args.length > 0 ? args[0] : "MyDocument");
        File output = new File(args != null && args.length > 1 ? args[1] : "MyDocument_final.pdf");

        // try-with-resources guarantees the document and the sink writer are
        // closed even when extraction, annotation or saving fails.
        try (PDDocument document = PDDocument.load(input);
             Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream()))
        {
            PDFTextStripper stripper = new TestAnnotatePDF();
            int numberOfPages = document.getDocumentCatalog().getPages().getCount();

            // writeText drives the overridden writeString, which fills the
            // static collections; the text written to `dummy` is discarded.
            stripper.writeText(document, dummy);

            // Print collected information
            System.out.println(tokenStream);
            System.out.println(tokenStream.size());
            System.out.println(coordinates.size());

            // Colour used for every highlight
            PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);

            for (int pageIndex = 0; pageIndex < numberOfPages; pageIndex++)
            {
                PDPage page = document.getPage(pageIndex);
                List<PDAnnotation> annotations = page.getAnnotations();
                double pageHeight = page.getMediaBox().getHeight();
                double pageWidth = page.getMediaBox().getWidth();

                for (int i = 0; i < coordinates.size(); i++)
                {
                    double[] word = coordinates.get(i);
                    // Recorded page numbers are compared against pageIndex + 1.
                    if ((int) word[PAGE] != pageIndex + 1)
                    {
                        continue;
                    }
                    annotations.add(buildHighlight(word, tokenStream.get(i), pageHeight, pageWidth, red));
                }
            }

            // Saving the document in a new file
            document.save(output);
        }
        catch (IOException e)
        {
            System.err.println(e);
        }
    }

    /**
     * Builds the highlight annotation for one recorded token.
     *
     * @param word       geometry array produced by {@link #writeString}
     * @param token      token text, stored as the annotation contents
     * @param pageHeight media-box height of the page holding the token
     * @param pageWidth  media-box width of the page holding the token
     * @param color      highlight colour
     * @return the configured annotation, not yet attached to any page
     */
    private static PDAnnotationTextMarkup buildHighlight(double[] word, String token,
            double pageHeight, double pageWidth, PDColor color)
    {
        double height = word[HEIGHT];
        double minX;
        double maxX;
        double minY;
        double maxY;

        if ((int) word[ROTATION] == 90)
        {
            // Page rotated by 90 degrees: the x/y roles are swapped.
            // NOTE(review): WIDTH holds the widest single character, not the
            // width of the whole token - confirm this is the intended extent.
            double width = (pageHeight * word[WIDTH]) / pageWidth;
            maxX = word[MIN_Y];
            minX = word[MIN_Y] - height;
            minY = word[MIN_X];
            maxY = word[MIN_X] + width;
        }
        else
        {
            // TODO: rotations of -90/-180 degrees are not handled separately.
            // The recorded y values are subtracted from the page height to
            // obtain the rectangle's y coordinates.
            minX = word[MIN_X];
            maxX = word[MAX_X];
            minY = pageHeight - word[MIN_Y];
            maxY = pageHeight - word[MAX_Y] + height;
        }

        PDAnnotationTextMarkup txtMark =
                new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
        txtMark.setColor(color);
        txtMark.setConstantOpacity(0.3f); // constant opacity of 0.3

        PDRectangle position = new PDRectangle();
        position.setLowerLeftX((float) minX);
        position.setLowerLeftY((float) minY);
        position.setUpperRightX((float) maxX);
        // NOTE(review): height is added on top of maxY here as well (original
        // behaviour, kept unchanged) - confirm the extra padding is wanted.
        position.setUpperRightY((float) ((float) maxY + height));
        txtMark.setRectangle(position);

        // Quad points: top edge (x1,y1)-(x2,y2), then bottom edge
        // (x3,y3)-(x4,y4); both edges are shifted down by 2 units.
        float[] quads = new float[8];
        quads[0] = position.getLowerLeftX();      // x1
        quads[1] = position.getUpperRightY() - 2; // y1
        quads[2] = position.getUpperRightX();     // x2
        quads[3] = quads[1];                      // y2
        quads[4] = quads[0];                      // x3
        quads[5] = position.getLowerLeftY() - 2;  // y3
        quads[6] = quads[2];                      // x4
        quads[7] = quads[5];                      // y4
        txtMark.setQuadPoints(quads);
        txtMark.setContents(token);
        return txtMark;
    }

    /**
     * Records the text and geometry of one string instead of writing it out.
     *
     * @param string        the string as passed by the caller (unused; the
     *                      token is rebuilt from the individual positions)
     * @param textPositions positions of the string's characters; an empty
     *                      list is ignored
     * @throws IOException never thrown here; declared by the overridden method
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException
    {
        // Nothing to locate: avoid recording an all-zero rectangle.
        if (textPositions.isEmpty())
        {
            return;
        }

        StringBuilder token = new StringBuilder();
        double height = 0;
        double width = 0;
        int rotation = 0;
        for (TextPosition text : textPositions)
        {
            rotation = text.getRotation();
            if (text.getHeight() > height)
            {
                height = text.getHeight();
            }
            if (text.getWidth() > width)
            {
                width = text.getWidth();
            }
            token.append(text.getUnicode());
        }

        // The first character supplies the start point, the last one the end point.
        TextPosition first = textPositions.get(0);
        TextPosition last = textPositions.get(textPositions.size() - 1);
        double minX = first.getX();
        double minY = first.getY();
        double maxX = last.getEndX();
        double maxY = last.getY();

        tokenStream.add(token.toString());
        double[] wordCoordinates = {minX, minY, maxX, maxY, this.getCurrentPageNo(), height, width, rotation};
        coordinates.add(wordCoordinates);
    }
}
这篇关于突出显示现有PDF中的单词的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文