天天看點

Java讀取doc文檔

這裡我們使用WordExtractor讀取Word文檔,WordExtractor來自於Apache的POI類庫項目,官方下載位址:https://poi.apache.org/download.html

import java.io.FileInputStream;
  
import org.textmining.text.extraction.WordExtractor;
  
/**
 * Minimal demo that prints the text of a Word (.doc) file to standard output.
 */
public class WordTest {
 /**
  * Entry point: runs the text-extraction demo.
  *
  * @param args command-line arguments (unused)
  * @throws Exception if opening the file or extracting its text fails
  */
 public static void main(String args[]) throws Exception {
  // readByText() is the only reader method this class defines.
  new WordTest().readByText();
 }

 /**
  * Opens the hard-coded document, extracts its text with a WordExtractor,
  * and prints the result to standard output.
  *
  * @throws Exception if opening the file or extracting its text fails
  */
 public void readByText() throws Exception {
  // No trailing whitespace in the path: a stray space makes the file name invalid
  // or non-matching on most platforms.
  FileInputStream in = new FileInputStream("C://test.doc");
  try {
   WordExtractor extractor = new WordExtractor();
   String str = extractor.extractText(in);
   System.out.println(str);
  } finally {
   // Release the file handle even when extraction throws.
   in.close();
  }
 }
}
           
/**
 * Demo test that reads a Word (.doc) file through a WordExtractor and prints its
 * text, header/footer text, metadata, paragraphs, and summary properties.
 *
 * NOTE(review): the WordExtractor used here takes the stream in its constructor and
 * exposes getText()/getParagraphText(); this looks like Apache POI's HWPF extractor
 * rather than the org.textmining one — confirm the imports for this class.
 */
public class HwpfTest {

  /**
   * Reads the hard-coded document and prints everything the extractor exposes.
   * The input stream is closed whether or not extraction succeeds.
   *
   * @throws Exception if the file cannot be opened or the extractor fails
   */
  @SuppressWarnings("deprecation")
  @Test
  public void testReadByExtractor() throws Exception {
    InputStream is = new FileInputStream("D:\\test.doc");
    try {
      WordExtractor extractor = new WordExtractor(is);
      // Print all of the document's text.
      System.out.println(extractor.getText());
      System.out.println(extractor.getTextFromPieces());
      // Print the header text.
      System.out.println("頁眉:" + extractor.getHeaderText());
      // Print the footer text.
      System.out.println("頁腳:" + extractor.getFooterText());
      // Print the document's metadata (e.g. author, modification time).
      System.out.println(extractor.getMetadataTextExtractor().getText());
      // Print the text of each paragraph, numbered from 1.
      String paraTexts[] = extractor.getParagraphText();
      for (int i = 0; i < paraTexts.length; i++) {
        System.out.println("Paragraph " + (i + 1) + " : " + paraTexts[i]);
      }
      // Print selected summary properties.
      printInfo(extractor.getSummaryInformation());
      // Print selected document-summary properties.
      this.printInfo(extractor.getDocSummaryInformation());
    } finally {
      // Runs on both success and failure so the file handle is never leaked.
      this.closeStream(is);
    }
  }

  /**
   * Prints fields of a SummaryInformation, one per line: author, character count,
   * page count, title, subject.
   *
   * @param info the summary information to print
   */
  private void printInfo(SummaryInformation info) {
    // Author
    System.out.println(info.getAuthor());
    // Character count
    System.out.println(info.getCharCount());
    // Page count
    System.out.println(info.getPageCount());
    // Title
    System.out.println(info.getTitle());
    // Subject
    System.out.println(info.getSubject());
  }

  /**
   * Prints fields of a DocumentSummaryInformation, one per line: category, company.
   *
   * @param info the document summary information to print
   */
  private void printInfo(DocumentSummaryInformation info) {
    // Category
    System.out.println(info.getCategory());
    // Company
    System.out.println(info.getCompany());
  }

  /**
   * Closes the input stream if it is non-null. A failure to close is printed via
   * printStackTrace() and not rethrown.
   *
   * @param is the stream to close; may be null
   */
  private void closeStream(InputStream is) {
    if (is != null) {
      try {
        is.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }

}