這裡我們使用WordExtractor讀取Word文檔,WordExtractor來自於Apache的POI類庫項目,官方下載位址:https://poi.apache.org/download.html
import java.io.FileInputStream;
import org.textmining.text.extraction.WordExtractor;
/**
 * Minimal example that extracts the text of a Word document with the
 * {@code WordExtractor} imported at the top of this file and prints it to
 * standard output.
 */
public class WordTest {

    /**
     * Entry point: runs {@link #readByText()} on a fresh instance.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the document cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        // The only reader method this class defines is readByText().
        new WordTest().readByText();
    }

    /**
     * Opens the hard-coded document, extracts its text and prints it.
     *
     * @throws Exception if the file cannot be opened or the extraction fails
     */
    public void readByText() throws Exception {
        FileInputStream in = new FileInputStream("C://test.doc");
        try {
            WordExtractor extractor = new WordExtractor();
            String str = extractor.extractText(in);
            System.out.println(str);
        } finally {
            // Release the file handle even when extraction throws.
            in.close();
        }
    }
}
/**
 * Example test that reads a Word document through a {@code WordExtractor}
 * built from an input stream and prints its text, header/footer, metadata,
 * paragraphs and summary properties to standard output.
 *
 * <p>NOTE(review): the constructor and accessors used here look like those of
 * Apache POI's HWPF {@code WordExtractor}; confirm against the imports of the
 * file this class actually lives in.
 */
public class HwpfTest {

    /**
     * Reads {@code D:\test.doc} and prints everything the extractor exposes.
     * The input stream is closed when the method finishes, whether it
     * completes normally or throws.
     *
     * @throws Exception if the file cannot be opened or the document cannot be parsed
     */
    @SuppressWarnings("deprecation")
    @Test
    public void testReadByExtractor() throws Exception {
        InputStream is = new FileInputStream("D:\\test.doc");
        try {
            WordExtractor extractor = new WordExtractor(is);
            // Print all of the text of the Word document.
            System.out.println(extractor.getText());
            System.out.println(extractor.getTextFromPieces());
            // Print the header content.
            System.out.println("頁眉:" + extractor.getHeaderText());
            // Print the footer content.
            System.out.println("頁腳:" + extractor.getFooterText());
            // Print the document's metadata (per the original note: author,
            // modification time and similar properties).
            System.out.println(extractor.getMetadataTextExtractor().getText());
            // Print the text of each paragraph, numbered from 1.
            String[] paraTexts = extractor.getParagraphText();
            for (int i = 0; i < paraTexts.length; i++) {
                System.out.println("Paragraph " + (i + 1) + " : " + paraTexts[i]);
            }
            // Print selected fields of the summary information.
            printInfo(extractor.getSummaryInformation());
            // Print selected fields of the document summary information.
            this.printInfo(extractor.getDocSummaryInformation());
        } finally {
            // Always release the file handle, including when a call above throws.
            this.closeStream(is);
        }
    }

    /**
     * Prints selected fields of a SummaryInformation, one per line.
     *
     * @param info the summary information to print
     */
    private void printInfo(SummaryInformation info) {
        // Author
        System.out.println(info.getAuthor());
        // Character count
        System.out.println(info.getCharCount());
        // Page count
        System.out.println(info.getPageCount());
        // Title
        System.out.println(info.getTitle());
        // Subject
        System.out.println(info.getSubject());
    }

    /**
     * Prints selected fields of a DocumentSummaryInformation, one per line.
     *
     * @param info the document summary information to print
     */
    private void printInfo(DocumentSummaryInformation info) {
        // Category
        System.out.println(info.getCategory());
        // Company
        System.out.println(info.getCompany());
    }

    /**
     * Closes the given input stream; does nothing when it is {@code null}.
     * An {@link IOException} raised by the close is printed, not rethrown.
     *
     * @param is the stream to close, may be {@code null}
     */
    private void closeStream(InputStream is) {
        if (is != null) {
            try {
                is.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}