需求:使用Java解析chm檔案,並將內容插入資料庫和Redis。
用Java解析chm檔案,網上除了GitHub上有個傢伙隻言片語地提了一下之外,沒有什麼資料可以參考,包括chm4j這個函式庫也沒什麼介紹。本著服務大眾的精神,整理了一下流程。時間倉促,錯誤之處在所難免,望指正。
第一步:下載chm4j.jar以及依賴
第二步:建立java工程,建一個解析ParseChm類,建一個解析測試類,類似:
ParseChm類:
//下面的包,請導入chm4j.jar,並且把chm4j.dll拷貝到jre的lib目錄內;linux或mac請拷貝libchm4j.so即可,因為chm4j.jar依賴於C++原生函式庫
package cn.lswe.baseframe.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.chm4j.*;
import cn.lswe.baseframe.validator.Conf;
/**
 * Extracts the entries of a CHM archive into a directory on disk.
 *
 * <p>The archive location is read from {@code Conf.ChmSOurce} and the output directory from
 * {@code Conf.dir}. Directory entries are recreated on disk and walked recursively; every other
 * entry is copied to a file of the same relative path.
 */
public class ParseChm {

    /** Size in bytes of the buffer used when copying entry content to disk. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Extracts every entry of the configured CHM file into the configured directory.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        try {
            ChmFile cFile = new ChmFile(Conf.ChmSOurce);
            String dir = Conf.dir;
            ChmEntry.Attribute attributes = ChmEntry.Attribute.ALL;
            for (ChmEntry entry : cFile.entries(attributes)) {
                listChmEntry(dir, entry, attributes);
            }
        } catch (IOException ex) {
            System.out.println("Error : " + ex.getMessage());
        }
    }

    /**
     * Writes one entry below {@code output}, recursing into it when it is a directory.
     *
     * <p>A directory that cannot be created (or that would lie outside {@code output}) aborts the
     * extraction. A failure on a single file is reported and skipped so that one bad entry does
     * not stop the remaining ones from being extracted.
     *
     * @param output     directory that receives the extracted tree; must not be {@code null}
     * @param entry      entry to extract
     * @param attributes attribute filter passed on when listing child entries
     * @throws IOException if a directory entry cannot be created or escapes {@code output}
     */
    private static void listChmEntry(String output, ChmEntry entry, ChmEntry.Attribute attributes)
            throws IOException {
        printEntry(entry);
        File dest = new File(output, entry.getPath());
        if (entry.hasAttribute(ChmEntry.Attribute.DIRECTORY)) {
            ensureInside(output, dest);
            createDirectory(dest);
            for (ChmEntry child : entry.entries(attributes)) {
                listChmEntry(output, child, attributes);
            }
        } else {
            try {
                ensureInside(output, dest);
                // The archive may list a file before (or without) its parent directory.
                File parent = dest.getParentFile();
                if (parent != null) {
                    createDirectory(parent);
                }
                copyEntry(entry, dest);
            } catch (IOException ex) {
                // Best effort per file: report which entry failed and carry on with the rest.
                System.out.println("failed to extract " + dest + " : " + ex.getMessage());
            }
        }
    }

    /**
     * Rejects destinations that resolve outside the output directory, e.g. through {@code ..}
     * segments in an entry path.
     *
     * @throws IOException if {@code dest} is not {@code output} itself or located below it
     */
    private static void ensureInside(String output, File dest) throws IOException {
        String basePath = new File(output).getCanonicalPath();
        String destPath = dest.getCanonicalPath();
        String prefix = basePath.endsWith(File.separator) ? basePath : basePath + File.separator;
        if (!destPath.equals(basePath) && !destPath.startsWith(prefix)) {
            throw new IOException("entry escapes output directory : " + dest);
        }
    }

    /**
     * Creates {@code dir} (and any missing ancestors) unless it already exists.
     *
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static void createDirectory(File dir) throws IOException {
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("failed to create directory : " + dir);
        }
    }

    /**
     * Copies the content of {@code entry} into the file {@code dest}; both streams are closed
     * when the copy ends, whether it succeeds or fails.
     *
     * @throws IOException if the entry cannot be read or the file cannot be written
     */
    private static void copyEntry(ChmEntry entry, File dest) throws IOException {
        try (InputStream in = entry.getInputStream();
                OutputStream out = new FileOutputStream(dest)) {
            byte[] data = new byte[BUFFER_SIZE];
            int nbRead;
            // Stops on 0 as well as -1, exactly like the original loop, so a stream that signals
            // its end with 0 cannot spin forever.
            while ((nbRead = in.read(data)) > 0) {
                out.write(data, 0, nbRead);
            }
        }
    }

    /** Prints the entry followed by the comma-separated list of its attributes. */
    private static void printEntry(ChmEntry entry) {
        StringBuilder sb = new StringBuilder("Extract entry " + entry + "(");
        String separator = "";
        for (ChmEntry.Attribute attribute : entry.getAttributes()) {
            sb.append(separator).append(attribute);
            separator = ", ";
        }
        sb.append(")");
        System.out.println(sb.toString());
    }
}
這就得到了若干中轉html檔案(兩萬多個)。注意,chm檔案的格式相當複雜,決定了只能採用這樣的處理方法;事實上我接下來處理的word、freemind檔案,統統都是這樣處理的。
測試類:
package cn.lswe.baseframe.spider;
import java.io.FileNotFoundException;
import java.util.LinkedList;
import java.util.List;
import cn.lswe.baseframe.utils.FileHelper;
import cn.lswe.baseframe.validator.Conf;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
public class ParseDisease implements PageProcessor {
private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);
@Override
public void process(Page page) {
Listlinks=new LinkedList();
links.add(Conf.diseaseDataSource);
page.addTargetRequests(links);
List tdList = page.getHtml().xpath("table").xpath("td").all();
//這裡利用了webmagic爬蟲架構,可以參照這個連結做:
int j=0;
int k=0;
for(int i=0;i
//在這裡,可以篩選,處理你的内容了
//插入jedis,插入資料庫都ok
j=i+1;
k=i+2;
String td1=tdList.get(i);
String td2=tdList.get(i);//第二列要拆分為一個數組,它是第一列的下一級分類
String td3=tdList.get(i);//如果第一列是字母,第三列和第二列一一對應,如果第一列是漢字,第三列和第二列第二行開始一一對應,其編碼是”B”+第一行第三列+本行第三列
}
System.out.println(tdList);
}
@Override
public Site getSite() {
return site;
}
@SuppressWarnings("deprecation")
public static void testSpider() {
// Conf.diseaseDataSource嗅探的起點,比如為了速度,請把所有資源檔案部署
//到localhost
Spider.create(new ParseDisease())
.addUrl(Conf.diseaseDataSource)
.pipeline(new ConsolePipeline()).thread(5).run();
//開啟5個線程抓取
}
}.
調用方法:
/**
 * Handler mapped to {@code /test/spider} that starts the disease spider.
 */
@ResponseBody
@RequestMapping("/test/spider")
public void spider() {
    // testSpider() is declared on ParseDisease; ParseChm only has main().
    ParseDisease.testSpider();
}
