word文档转pdf，支持.doc和.docx

2023-03-18 21:50:36

word文档转pdf需要兼容.doc和.docx两种文档格式。其中.docx通过poi直接就可以转成pdf，.doc则无法这样实现。上网查询了很多资料，大体思路是一致的，即先将.doc文档转成html，再将html转成pdf。但具体实现的时候，却发现很多方法都不完善：要么转换出的html标签不闭合，无法转pdf；要么转pdf时中文不显示。在下将这些方法汇总之后，整理出一套亲测可用的代码，现附上，如下：

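maven依赖（注意：下面的代码还用到了commons-lang、commons-lang3、commons-collections、slf4j以及com.itextpdf的itextpdf 5.x，如果你的项目中没有这些依赖，请自行添加）：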

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi</artifactId>

<version>3.14</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi-scratchpad</artifactId>

<version>3.14</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi-ooxml</artifactId>

<version>3.14</version>

</dependency>

<dependency>

<groupId>fr.opensagres.xdocreport</groupId>

<artifactId>xdocreport</artifactId>

<version>1.0.6</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>poi-ooxml-schemas</artifactId>

<version>3.14</version>

</dependency>

<dependency>

<groupId>org.apache.poi</groupId>

<artifactId>ooxml-schemas</artifactId>

<version>1.3</version>

</dependency>

<!-- <dependency>

<groupId>com.lowagie</groupId>

<artifactId>itext</artifactId>

<version>2.0.8</version>

</dependency> -->

<dependency>

<groupId>org.xhtmlrenderer</groupId>

<artifactId>core-renderer</artifactId>

<version>R8</version>

</dependency>

<dependency>

<groupId>org.jsoup</groupId>

<artifactId>jsoup</artifactId>

<version>1.11.3</version>

</dependency>

代码：

package cn.test.util.utils;

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import java.util.ArrayList;

import java.util.List;

import java.util.Map;

import javax.xml.parsers.DocumentBuilder;

import javax.xml.parsers.DocumentBuilderFactory;

import javax.xml.parsers.ParserConfigurationException;

import javax.xml.transform.OutputKeys;

import javax.xml.transform.Transformer;

import javax.xml.transform.TransformerConfigurationException;

import javax.xml.transform.TransformerException;

import javax.xml.transform.TransformerFactory;

import javax.xml.transform.dom.DOMSource;

import javax.xml.transform.stream.StreamResult;

import org.apache.commons.collections.MapUtils;

import org.apache.commons.lang.StringUtils;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.converter.PicturesManager;

import org.apache.poi.hwpf.converter.WordToHtmlConverter;

import org.apache.poi.hwpf.usermodel.Picture;

import org.apache.poi.hwpf.usermodel.PictureType;

import org.apache.poi.xwpf.converter.pdf.PdfConverter;

import org.apache.poi.xwpf.converter.pdf.PdfOptions;

import org.apache.poi.xwpf.usermodel.XWPFDocument;

import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import org.apache.poi.xwpf.usermodel.XWPFRun;

import org.apache.poi.xwpf.usermodel.XWPFTable;

import org.apache.poi.xwpf.usermodel.XWPFTableCell;

import org.apache.poi.xwpf.usermodel.XWPFTableRow;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Entities;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.w3c.dom.Document;

import org.xhtmlrenderer.pdf.ITextFontResolver;

import org.xhtmlrenderer.pdf.ITextRenderer;

import com.itextpdf.text.DocumentException;

import com.itextpdf.text.pdf.PdfCopy;

import com.itextpdf.text.pdf.PdfImportedPage;

import com.itextpdf.text.pdf.PdfReader;

import com.lowagie.text.pdf.BaseFont;

public class Word2PDFUtils {

private static final Logger logger = LoggerFactory

.getLogger(Word2PDFUtils.class);

/**
 * Demo entry point: converts a hard-coded sample Word file to PDF and
 * prints the stack trace if the conversion fails.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String sampleWordPath = "D://Test/test.doc";
    final String samplePdfPath = "D:/Test/test.pdf";
    try {
        word2PDF(sampleWordPath, samplePdfPath);
    } catch (Exception conversionFailure) {
        conversionFailure.printStackTrace();
    }
}

/**
 * Path-based convenience overload: wraps both paths in {@link File}
 * objects and delegates to {@code word2PDF(File, File)}.
 *
 * @param wordFilePath path of the source Word document
 * @param pdfFilePath  path of the PDF to produce
 * @return the result of the file-based overload, or {@code null} when
 *         either path is blank
 * @throws Exception if the delegated conversion fails
 */
public static File word2PDF(String wordFilePath, String pdfFilePath)
        throws Exception {
    boolean bothPathsGiven = !StringUtils.isBlank(pdfFilePath)
            && !StringUtils.isBlank(wordFilePath);
    if (bothPathsGiven) {
        File source = new File(wordFilePath);
        File destination = new File(pdfFilePath);
        return word2PDF(source, destination);
    }
    logger.info("word2PDF 文件路径为空，wordFilePath={}，pdfFilePath={}",
            wordFilePath, pdfFilePath);
    return null;
}

/**
 * Converts a {@code .doc} or {@code .docx} file to PDF and writes the first
 * page of the result to {@code pdfFile}.
 *
 * <p>{@code .doc} files are converted to HTML first and then to PDF;
 * {@code .docx} files are converted directly. The intermediate HTML and
 * full-length PDF are written under {@code html/} and {@code pdf/}
 * sub-directories next to {@code pdfFile} and are deleted afterwards,
 * including when the conversion throws.
 *
 * @param wordFile the source Word document
 * @param pdfFile  the PDF file to produce
 * @return {@code pdfFile}, or {@code null} when an argument is
 *         {@code null} or the file name does not end in
 *         {@code .doc}/{@code .docx}
 * @throws Exception if any conversion step fails
 */
public static File word2PDF(File wordFile, File pdfFile) throws Exception {
    if (null == wordFile || null == pdfFile) {
        logger.info("word2PDF 文件对象为空，wordFile={}，pdfFile={}", wordFile,
                pdfFile);
        return null;
    }
    String wordName = wordFile.getName();
    boolean isDoc = wordName.endsWith(".doc");
    if (!isDoc && !wordName.endsWith(".docx")) {
        // Not a Word document.
        logger.info("不是word文档格式，文件路径={}", wordFile.getAbsolutePath());
        return null;
    }
    // getAbsoluteFile() yields a parent even for a bare relative name such
    // as "out.pdf", where getParentFile() alone would return null.
    File pdfParentFile = pdfFile.getAbsoluteFile().getParentFile();
    if (!pdfParentFile.exists()) {
        pdfParentFile.mkdirs();
    }
    String absolutePath = pdfParentFile.getAbsolutePath();
    // Strip only the extension so "a.b.doc" keeps its base name "a.b".
    String baseName = wordName.substring(0, wordName.lastIndexOf('.'));
    File tempPdfFile = new File(absolutePath + "/pdf/" + baseName + ".pdf");
    // The .docx branch opens the temp file directly, so its directory must
    // exist up front.
    File tempPdfDir = tempPdfFile.getParentFile();
    if (!tempPdfDir.exists()) {
        tempPdfDir.mkdirs();
    }
    try {
        if (isDoc) {
            // .doc: convert to HTML, normalise to XHTML, then render to PDF.
            File htmlFile = new File(absolutePath + "/html/" + baseName
                    + ".html");
            try {
                wordDocToHtml(wordFile, htmlFile);
                convertHtmlToPdf(htmlFile, tempPdfFile);
            } finally {
                // Remove the intermediate HTML file.
                boolean delete = htmlFile.delete();
                logger.info("删除htmlFile路径path={}，结果={}",
                        htmlFile.getAbsolutePath(), delete);
            }
        } else {
            // .docx: convert straight to PDF. Both streams are closed
            // before the temp file is read back and deleted.
            try (InputStream in = new FileInputStream(wordFile);
                    OutputStream out = new FileOutputStream(tempPdfFile)) {
                wordConverterToPdf(in, out, null);
            }
        }
        // Extract the first page only.
        splitPDFFile(tempPdfFile.getAbsolutePath(),
                pdfFile.getAbsolutePath(), 1, 2);
    } finally {
        // Remove the temporary full-length PDF.
        boolean delete = tempPdfFile.delete();
        logger.info("删除tempPdfFile路径path={}，结果={}",
                tempPdfFile.getAbsolutePath(), delete);
    }
    return pdfFile;
}

/**
 * Converts a .docx stream to PDF without explicit {@link PdfOptions};
 * delegates to the four-argument overload with {@code null} options.
 *
 * @param source stream of the .docx document
 * @param target stream that receives the PDF
 * @param params placeholder replacements; may be {@code null}
 * @throws Exception if the delegated conversion fails
 */
private static void wordConverterToPdf(InputStream source,
        OutputStream target, Map<String, String> params) throws Exception {
    final PdfOptions noOptions = null;
    wordConverterToPdf(source, target, noOptions, params);
}

/**
 * Loads a .docx document from {@code source}, applies the placeholder
 * replacements in {@code params} to body paragraphs and table cells, and
 * writes the document to {@code target} as PDF.
 *
 * @param source  stream of the .docx document
 * @param target  stream that receives the PDF
 * @param options converter options passed through to {@link PdfConverter}
 * @param params  placeholder replacements; table cells are only visited
 *                when this map is non-empty
 * @throws Exception if loading or converting the document fails
 */
private static void wordConverterToPdf(InputStream source,
        OutputStream target, PdfOptions options, Map<String, String> params)
        throws Exception {
    XWPFDocument document = new XWPFDocument(source);
    paragraphReplace(document.getParagraphs(), params);

    // Walk the tables only when there is something to replace.
    boolean hasReplacements = MapUtils.isNotEmpty(params);
    if (hasReplacements) {
        for (XWPFTable tbl : document.getTables()) {
            for (XWPFTableRow tblRow : tbl.getRows()) {
                for (XWPFTableCell tblCell : tblRow.getTableCells()) {
                    paragraphReplace(tblCell.getParagraphs(), params);
                }
            }
        }
    }

    PdfConverter pdfConverter = PdfConverter.getInstance();
    pdfConverter.convert(document, target, options);
}

/**
 * Replaces the text of every run whose first text element exactly equals a
 * key of {@code params} with the mapped value.
 *
 * <p>The run text is read at index 0, the same index the replacement is
 * written to. The previous code used {@code XWPFRun.getTextPosition()} as
 * the index, but that is the run's vertical baseline offset (-1 when
 * unset), not a text index.
 *
 * @param paragraphs paragraphs to scan; {@code null} is treated as empty
 * @param params     placeholder-to-value map; nothing happens when it is
 *                   {@code null} or empty
 */
private static void paragraphReplace(List<XWPFParagraph> paragraphs,
        Map<String, String> params) {
    if (paragraphs == null || MapUtils.isEmpty(params)) {
        return;
    }
    for (XWPFParagraph p : paragraphs) {
        for (XWPFRun r : p.getRuns()) {
            String content = r.getText(0);
            if (StringUtils.isNotEmpty(content)
                    && params.containsKey(content)) {
                r.setText(params.get(content), 0);
            }
        }
    }
}

/**
 * Converts a binary {@code .doc} file to an HTML file.
 *
 * <p>Embedded pictures are written to {@code temp/wordimage/} below the
 * HTML file's directory, and the generated markup references them by that
 * path. The HTML is written as UTF-8, which is the charset
 * {@code convertHtmlToPdf} reads it with.
 *
 * <p>Failures are logged and not rethrown, so callers must not assume that
 * {@code htmlFile} exists afterwards.
 *
 * @param wordFile the source {@code .doc} file; no-op when {@code null}
 * @param htmlFile the HTML file to write; no-op when {@code null}
 */
private static void wordDocToHtml(File wordFile, File htmlFile) {
    if (null == wordFile || null == htmlFile) {
        return;
    }
    // getAbsoluteFile() yields a parent even for a bare relative name.
    File parentFile = htmlFile.getAbsoluteFile().getParentFile();
    if (!parentFile.exists()) {
        parentFile.mkdirs();
    }
    final String imagepath = parentFile.getAbsolutePath()
            + "/temp/wordimage/";
    final File imageDir = new File(imagepath);
    try (InputStream in = new FileInputStream(wordFile)) {
        // Load the Word document from the input stream.
        HWPFDocument wordDocument = new HWPFDocument(in);
        // Empty DOM document that the converter fills with HTML nodes.
        Document dom = DocumentBuilderFactory.newInstance()
                .newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                dom);
        // Save each picture the converter meets and return the path that
        // the generated <img> element should reference.
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content,
                    PictureType pictureType, String suggestedName,
                    float widthInches, float heightInches) {
                if (!imageDir.exists()) {
                    imageDir.mkdirs();
                }
                File file = new File(imagepath + suggestedName);
                try (OutputStream os = new FileOutputStream(file)) {
                    os.write(content);
                } catch (IOException e) {
                    logger.error("保存word图片失败，path={}",
                            file.getAbsolutePath(), e);
                }
                return imagepath + suggestedName;
            }
        });
        // Run the conversion into the DOM.
        wordToHtmlConverter.processDocument(wordDocument);
        // Also save every picture from the document's picture table.
        List<?> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null && !pics.isEmpty()) {
            if (!imageDir.exists()) {
                imageDir.mkdirs();
            }
            for (Object item : pics) {
                Picture pic = (Picture) item;
                String picPath = imagepath + pic.suggestFullFileName();
                try (OutputStream picOut = new FileOutputStream(picPath)) {
                    pic.writeImageContent(picOut);
                } catch (FileNotFoundException e) {
                    logger.error("保存word图片失败，path={}", picPath, e);
                }
            }
        }
        // Serialise the converter's DOM as HTML.
        DOMSource domSource = new DOMSource(
                wordToHtmlConverter.getDocument());
        Transformer serializer = TransformerFactory.newInstance()
                .newTransformer();
        // The DOM holds Java strings, so the output charset is independent
        // of how the .doc was encoded. UTF-8 matches the reader.
        serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        try (OutputStream out = new FileOutputStream(htmlFile)) {
            serializer.transform(domSource, new StreamResult(out));
        }
    } catch (IOException | TransformerException
            | ParserConfigurationException e) {
        logger.error("word转html失败，wordFile={}",
                wordFile.getAbsolutePath(), e);
    }
}

/**
 * Path-based convenience overload: wraps both paths in {@link File}
 * objects and delegates to {@code wordDocToHtml(File, File)}. Does nothing
 * when either path is blank.
 *
 * @param wordFilePath path of the source {@code .doc} file
 * @param htmlFilePath path of the HTML file to write
 */
private static void wordDocToHtml(String wordFilePath, String htmlFilePath) {
    boolean pathMissing = org.apache.commons.lang3.StringUtils.isAnyBlank(
            wordFilePath, htmlFilePath);
    if (!pathMissing) {
        File source = new File(wordFilePath);
        File destination = new File(htmlFilePath);
        wordDocToHtml(source, destination);
    }
}

/**
 * Renders an HTML file to PDF with Chinese font support.
 *
 * <p>The HTML is first re-serialised by jsoup as XHTML so that unclosed
 * tags (for example {@code <meta>}) do not break the renderer. The font
 * file {@code simsun.ttc} is loaded from the classpath.
 *
 * @param htmlFile the HTML file to read (parsed as UTF-8)
 * @param pdfFile  the PDF file to write
 * @return {@code true} on success, {@code false} when either argument is
 *         {@code null}
 * @throws Exception if parsing, font loading or rendering fails, including
 *                   a {@link FileNotFoundException} when
 *                   {@code simsun.ttc} is not on the classpath
 */
private static boolean convertHtmlToPdf(File htmlFile, File pdfFile)
        throws Exception {
    if (null == htmlFile || null == pdfFile) {
        logger.info("html转pdf时，有file为空，htmlFile={}，pdfFile={}", htmlFile,
                pdfFile);
        return false;
    }
    // getAbsoluteFile() yields a parent even for bare relative names.
    String absoluteFilePath = htmlFile.getAbsoluteFile().getParentFile()
            .getAbsolutePath();
    File pdfDir = pdfFile.getAbsoluteFile().getParentFile();
    if (!pdfDir.exists()) {
        pdfDir.mkdirs();
    }
    // Normalise the markup to well-formed XHTML.
    org.jsoup.nodes.Document parse = Jsoup.parse(htmlFile, "utf-8");
    parse.outputSettings()
            .syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml);
    String html = parse.html();
    // The font family must be the name registered by simsun.ttc, otherwise
    // Chinese text is missing from the PDF. If your generated HTML uses a
    // different font name, replace that name instead.
    html = html.replace("font-family:宋体", "font-family: SimSun");

    // Resolve the font file before rendering so a missing resource fails
    // with a clear message instead of a NullPointerException.
    java.net.URL fontUrl = Word2PDFUtils.class.getClassLoader()
            .getResource("simsun.ttc");
    if (fontUrl == null) {
        throw new FileNotFoundException("classpath下未找到字体文件simsun.ttc");
    }
    // URL.getPath() is still URL-encoded, so a directory containing spaces
    // or non-ASCII characters would not resolve. Going through the URI
    // decodes it. Non-file URLs keep the previous behaviour.
    String path = "file".equals(fontUrl.getProtocol())
            ? new File(fontUrl.toURI()).getAbsolutePath()
            : fontUrl.getPath();
    logger.info(path);

    ITextRenderer renderer = new ITextRenderer();
    renderer.setDocumentFromString(html);
    // Register the Chinese font.
    ITextFontResolver fontResolver = renderer.getFontResolver();
    fontResolver.addFont(path, BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
    // Base URL used to resolve relative image paths.
    // NOTE(review): wordDocToHtml saves pictures under temp/wordimage, not
    // temp/htmlimage — confirm which directory is intended.
    renderer.getSharedContext().setBaseURL(
            "file:" + absoluteFilePath + "/temp/htmlimage");
    renderer.layout();
    // Open the output only once layout has succeeded, and always close it.
    try (OutputStream os = new FileOutputStream(pdfFile)) {
        renderer.createPDF(os);
        os.flush();
    }
    return true;
}

/**
 * Path-based convenience overload: wraps both paths in {@link File}
 * objects and delegates to {@code convertHtmlToPdf(File, File)}.
 *
 * @param inputFile  path of the HTML file to read
 * @param outputFile path of the PDF file to write
 * @return the result of the file-based overload, or {@code false} when
 *         either path is blank
 * @throws Exception if the delegated conversion fails
 */
private static boolean convertHtmlToPdf(String inputFile, String outputFile)
        throws Exception {
    boolean pathMissing = org.apache.commons.lang3.StringUtils.isAnyBlank(
            inputFile, outputFile);
    if (!pathMissing) {
        File source = new File(inputFile);
        File destination = new File(outputFile);
        return convertHtmlToPdf(source, destination);
    }
    logger.info("html转pdf是，路径为空，inputFile={}，outputFile={}", inputFile,
            outputFile);
    return false;
}

/**
 * Copies the pages {@code from} (inclusive) to {@code end} (exclusive) of
 * one PDF into a new PDF file. Page numbers are 1-based.
 *
 * <p>Passing {@code end == 0} copies through the last page. Out-of-range
 * bounds are clamped to the document. When the resulting range is empty,
 * nothing is written.
 *
 * <p>Failures are logged and not rethrown, so callers must not assume that
 * {@code savepath} exists afterwards.
 *
 * @param respdfFile path of the source PDF
 * @param savepath   path of the PDF to create
 * @param from       first page to copy (1-based, inclusive)
 * @param end        page after the last page to copy, or {@code 0} for
 *                   "through the last page"
 */
private static void splitPDFFile(String respdfFile, String savepath,
        int from, int end) {
    PdfReader reader = null;
    com.itextpdf.text.Document document = null;
    try {
        reader = new PdfReader(respdfFile);
        int n = reader.getNumberOfPages();
        // end is exclusive, so "through the last page" is n + 1. The old
        // value n silently dropped the final page.
        if (end == 0 || end > n + 1) {
            end = n + 1;
        }
        if (from < 1) {
            from = 1;
        }
        if (from >= end) {
            // Closing a document with no pages fails, so stop before
            // creating the output.
            logger.info("拆分pdf的页码范围为空，respdfFile={}，from={}，end={}",
                    respdfFile, from, end);
            return;
        }
        document = new com.itextpdf.text.Document(reader.getPageSize(1));
        PdfCopy copy = new PdfCopy(document, new FileOutputStream(savepath));
        document.open();
        for (int j = from; j < end; j++) {
            document.newPage();
            PdfImportedPage page = copy.getImportedPage(reader, j);
            copy.addPage(page);
        }
    } catch (IOException e) {
        logger.error("拆分pdf失败，respdfFile={}，savepath={}", respdfFile,
                savepath, e);
    } catch (DocumentException e) {
        logger.error("拆分pdf失败，respdfFile={}，savepath={}", respdfFile,
                savepath, e);
    } finally {
        // Close the document first, while the reader is still open, then
        // release the reader.
        if (document != null && document.isOpen()) {
            document.close();
        }
        if (reader != null) {
            reader.close();
        }
    }
}

}

其中.doc文档转html的方法与网上其他做法一样，只是转完html之后需要用jsoup再转一遍xhtml，使标签严谨化，然后再转pdf，并且转pdf时要加入中文字体支持。

如果报没有搜索到方法的异常，可能是jar包版本的问题，就将<!-- <dependency>

<groupId>com.lowagie</groupId>

<artifactId>itext</artifactId>

<version>2.0.8</version>

</dependency> -->依赖放开试试。我开始的时候遇见过这个异常，后来随着导入的依赖增多，这个依赖注释掉也不会有这个异常了，可能是其他依赖里已经间接引入了2.0.8版本的itext的jar包，但不确定你的其他依赖里是否存在，故在此说明。

另外附上文件simsun.ttc百度云下载地址：

链接：https://pan.baidu.com/s/1iH4iqJB2X_0gB7T4_CClzA

提取码：7rmn

itext poi word文档转pdf

上一篇: webview展示doc docx pdf，excels

下一篇: linux openoffice doc转pdf,Linux Openoffice转换Office为pdf

继续阅读