天天看點

利用pdfbox讀取pdf檔案内容和圖檔

     最近用pdfbox讀取pdf檔案中的内容和圖檔,可以擷取每一頁的内容和圖檔,但有個問題是沒法擷取圖檔在頁面的位置。源碼如下:

package com.util;

import java.awt.image.BufferedImage;

import java.io.BufferedInputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.InputStream;

import java.io.StringWriter;

import java.text.SimpleDateFormat;

import java.util.Calendar;

import java.util.Iterator;

import java.util.List;

import java.util.Map;

import java.util.Set;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdfparser.PDFParser;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.pdmodel.PDDocumentCatalog;

import org.apache.pdfbox.pdmodel.PDPage;

import org.apache.pdfbox.pdmodel.PDResources;

import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

import org.apache.pdfbox.util.PDFTextStripper;

public class PdfBoxUtil {

try {

InputStream inputStream = new BufferedInputStream(new FileInputStream(new File("D:/android/a.pdf")));

//PDFParser parser = new PDFParser( inputStream );

//parser.parse();

PDDocument pdfDocument = PDDocument.load(inputStream); 

//PDDocument pdfDocument = parser.getPDDocument(); 

StringWriter writer = new StringWriter();

PDFTextStripper stripper = new PDFTextStripper();

stripper.writeText(pdfDocument, writer);

String contents = writer.getBuffer().toString();

PDDocumentCatalog cata = pdfDocument.getDocumentCatalog();

List pages = cata.getAllPages();

int count = 1;

for (int i = 0; i < pages.size(); i++) {

PDPage page = (PDPage) pages.get(i); 

if (null != page) { 

//本頁面文字内容

StringWriter sw = new StringWriter(); 

PDFTextStripper pst = new PDFTextStripper();

pst.setStartPage(i+1);

  pst.setEndPage(i+1);

pst.writeText(pdfDocument, sw); 

String content = sw.getBuffer().toString();

System.out.println(content);

PDResources res  = page.findResources() ; 

// 擷取頁面圖檔資訊

Map  imgs = res.getImages(); 

if (null != imgs) {

Set keySet = imgs.keySet();

Iterator it = keySet.iterator();

while (it.hasNext()) {

Object obj = it.next();

PDXObjectImage img = (PDXObjectImage) imgs.get(obj);

img.write2file("D:/" + count);

count++;

}

}

}

}

} catch (Exception e) {

// TODO 自動生成 catch 塊

e.printStackTrace();

}

}

/**
 * Formats the instant held by a calendar using the pattern {@code yyyy-MM-dd}.
 *
 * @param calendar the calendar to format; may be {@code null}
 * @return the formatted date, or {@code null} when {@code calendar} is {@code null}
 * @throws Exception if formatting fails; the exception propagates unchanged
 */
public static String dateFormat(Calendar calendar) throws Exception {
    if (calendar == null) {
        return null;
    }

    SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
    String formatted = formatter.format(calendar.getTime());

    // Never hand back null for a non-null calendar; fall back to the empty string.
    if (formatted == null) {
        return "";
    }
    return formatted;
}

}

最後實在沒辦法,只好將每一頁的内容轉換成圖檔。将上面源碼中原本以紅色标示的部分(即 for 循環内的 if (null != page) { … } 代碼塊)替換成下面的代碼,就可以将每一頁輸出成 PNG 圖檔。

if (null != page) { 

BufferedImage img1 =  page.convertToImage(); 

File file = new File("D:/"+i+".PNG");

ImageIO.write(img1, "PNG", file);

}

環保網