天天看點

java chm檔案解析_Java 解析chm檔案實戰(原創)

需求:java解析chm檔案,并将内容插入資料庫和redis.

Java解析chm檔案,網上除了github上有個家夥隻言片語了一下,沒有啥資料參考,包括chm4j這東西,沒啥介紹,本着服務大衆的精神,整理了下流程, 時間倉促,錯誤之處在所難免,望指正.

第一步:下載chm4j.jar以及它所依賴的本地庫(chm4j.dll或libchm4j.so)

第二步:建立java工程,建一個解析ParseChm類,建一個解析測試類,類似:

ParseChm類:

//下面的包,請導入chm4j.jar,并且把chm4j.dll拷貝到jre的lib目錄内,linux或mac請拷貝libchm4j.so即可,因為chm4j.jar依賴于用c++編寫的本地庫

package cn.lswe.baseframe.utils;

import java.io.File;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.OutputStream;

import org.chm4j.*;

import cn.lswe.baseframe.validator.Conf;

/**
 * Extracts every entry of a CHM archive into a directory on disk using chm4j.
 *
 * <p>The archive to read ({@code Conf.ChmSOurce}) and the directory to write into
 * ({@code Conf.dir}) are taken from the project's {@code Conf} class.
 */
public class ParseChm {

    /** Size in bytes of the buffer used when copying one entry to disk. */
    private static final int BUFFER_SIZE = 8192;

    /**
     * Opens the configured CHM file and extracts all of its entries.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        try {
            ChmFile cFile = new ChmFile(Conf.ChmSOurce);
            String dir = Conf.dir;
            ChmEntry.Attribute attributes = ChmEntry.Attribute.ALL;
            for (ChmEntry entry : cFile.entries(attributes)) {
                listChmEntry(dir, entry, attributes);
            }
        } catch (IOException ex) {
            System.out.println("Error : " + ex.getMessage());
        }
    }

    /**
     * Writes one entry below {@code output}; directory entries are created and then
     * walked recursively, any other entry is copied to a file.
     *
     * @param output     root directory that receives the extracted files
     * @param entry      the entry to extract
     * @param attributes attribute filter passed on when listing the children of a directory
     * @throws IOException if a directory cannot be created or the entry's path resolves
     *                     outside {@code output}
     */
    private static void listChmEntry(String output, ChmEntry entry, ChmEntry.Attribute attributes)
            throws IOException {
        printEntry(entry);

        File dest = new File(output, entry.getPath());
        // Entry paths come from the archive itself, so refuse anything such as "../x"
        // that would be written outside the output directory.
        requireInside(new File(output), dest);

        if (entry.hasAttribute(ChmEntry.Attribute.DIRECTORY)) {
            ensureDirectory(dest);
            for (ChmEntry child : entry.entries(attributes)) {
                listChmEntry(output, child, attributes);
            }
            return;
        }

        // A file entry may be visited before its directory has been created.
        File parent = dest.getParentFile();
        if (parent != null) {
            ensureDirectory(parent);
        }

        try (InputStream in = entry.getInputStream();
                OutputStream out = new FileOutputStream(dest)) {
            byte[] data = new byte[BUFFER_SIZE];
            int nbRead;
            while ((nbRead = in.read(data)) != -1) {
                out.write(data, 0, nbRead);
            }
        } catch (IOException ex) {
            // Best effort: a single unreadable entry must not abort the whole extraction,
            // but the message has to say which entry was lost.
            System.err.println("failed to extract " + entry.getPath() + " : " + ex.getMessage());
        }
    }

    /** Creates {@code dir} (and missing parents) unless it already exists as a directory. */
    private static void ensureDirectory(File dir) throws IOException {
        if (!dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("failed to create directory : " + dir);
        }
    }

    /** Throws if the canonical path of {@code target} is not {@code root} or below it. */
    private static void requireInside(File root, File target) throws IOException {
        String rootPath = root.getCanonicalPath();
        String targetPath = target.getCanonicalPath();
        String prefix = rootPath.endsWith(File.separator) ? rootPath : rootPath + File.separator;
        if (!targetPath.equals(rootPath) && !targetPath.startsWith(prefix)) {
            throw new IOException("entry escapes output directory : " + target);
        }
    }

    /** Prints the entry followed by its attributes as a comma-separated list in parentheses. */
    private static void printEntry(ChmEntry entry) {
        StringBuilder sb = new StringBuilder("Extract entry " + entry + "(");
        boolean first = true;
        for (ChmEntry.Attribute attribute : entry.getAttributes()) {
            if (!first) {
                sb.append(", ");
            }
            first = false;
            sb.append(attribute);
        }
        sb.append(")");
        System.out.println(sb);
    }

}

這樣就得到了若幹中轉html檔案(兩萬多個)。注意:chm檔案的格式相當複雜,所以才采用「先解壓成html,再解析html」這種處理方法。事實上,我接下來要處理的word、freemind檔案,也統統是這樣處理的。

測試類:

package cn.lswe.baseframe.spider;

import java.io.FileNotFoundException;

import java.util.LinkedList;

import java.util.List;

import cn.lswe.baseframe.utils.FileHelper;

import cn.lswe.baseframe.validator.Conf;

import us.codecraft.webmagic.Page;

import us.codecraft.webmagic.Site;

import us.codecraft.webmagic.Spider;

import us.codecraft.webmagic.pipeline.ConsolePipeline;

import us.codecraft.webmagic.processor.PageProcessor;

public class ParseDisease implements PageProcessor {

private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

@Override

public void process(Page page) {

Listlinks=new LinkedList();

links.add(Conf.diseaseDataSource);

page.addTargetRequests(links);

List tdList = page.getHtml().xpath("table").xpath("td").all();

//這裡利用了webmagic爬蟲架構,可以參照這個連結做:

int j=0;

int k=0;

for(int i=0;i

//在這裡,可以篩選,處理你的内容了

//插入jedis,插入資料庫都ok

j=i+1;

k=i+2;

String td1=tdList.get(i);

String td2=tdList.get(i);//第二列要拆分為一個數組,它是第一列的下一級分類

String td3=tdList.get(i);//如果第一列是字母,第三列和第二列一一對應,如果第一列是漢字,第三列和第二列第二行開始一一對應,其編碼是”B”+第一行第三列+本行第三列

}

System.out.println(tdList);

}

@Override

public Site getSite() {

return site;

}

@SuppressWarnings("deprecation")

public static void testSpider() {

// Conf.diseaseDataSource嗅探的起點,比如為了速度,請把所有資源檔案部署

//到localhost

Spider.create(new ParseDisease())

.addUrl(Conf.diseaseDataSource)

.pipeline(new  ConsolePipeline()).thread(5).run();

//開啟5個線程抓取

}

}.

調用方法:

/**
 * Request handler for {@code /test/spider} that starts the disease-page spider.
 *
 * <p>{@code testSpider()} is declared on {@code ParseDisease}; {@code ParseChm} has no such
 * method, so the call is made on {@code ParseDisease}.
 */
@ResponseBody
@RequestMapping("/test/spider")
public void spider() {
    ParseDisease.testSpider();
}

java chm檔案解析_Java 解析chm檔案實戰(原創)