天天看點

jsoup解析HTML,爬取小說實例

1. Java 的 File.separator:與平台相關的路徑分隔符(斜杠)

2. jsoup 解析標籤:Element 的 text() 方法可直接取出起始標籤與結束標籤之間的文本

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small jsoup demo: downloads one chapter page of a novel, prints the
 * "previous chapter" / "next chapter" links, and appends the chapter text to
 * a local file, one space-separated fragment per line.
 */
public class Test {

	/** Directory the chapter text is saved into. */
	private static final String BOOK_DIR = "F:" + File.separator + "book";

	/** File the chapter text is appended to. */
	private static final String BOOK_FILE = BOOK_DIR + File.separator + "text.txt";

	/**
	 * Fetches the chapter page, prints its navigation links and appends the
	 * text of the element with id {@code content} to {@link #BOOK_FILE}.
	 *
	 * @param args unused
	 * @throws Exception if the page cannot be fetched or parsed
	 */
	public static void main(String[] args) throws Exception {
		Document doc = Jsoup.connect("http://www.biquge5.com/2_2975/1388243.html").get();

		// Print the absolute URLs of the previous/next chapter links.
		Elements links = doc.select("a[href]");
		for (Element link : links) {
			String label = link.text();
			if (label.equals("上一章") || label.equals("下一章")) {
				System.out.println(link.attr("abs:href").trim() + "---" + label);
			}
		}

		// getElementById returns null when the page has no such element;
		// bail out instead of failing with a NullPointerException.
		Element content = doc.getElementById("content");
		if (content == null) {
			System.err.println("No element with id \"content\" found; nothing to save.");
			return;
		}

		// Create the target directory once, not once per fragment.
		File dir = new File(BOOK_DIR);
		if (!dir.exists()) {
			dir.mkdirs();
			System.out.println("小說" + BOOK_DIR + "目錄下");
		}

		// Open the file a single time in append mode. try-with-resources
		// guarantees the stream is closed even when a write fails.
		String[] sentences = content.text().split(" ");
		try (FileOutputStream os = new FileOutputStream(new File(BOOK_FILE), true)) {
			for (String sen : sentences) {
				String line = sen.trim() + "\r\n";
				// Explicit charset: the platform default varies between
				// machines and may not be able to encode the text.
				os.write(line.getBytes(StandardCharsets.UTF_8));
			}
		} catch (IOException e) {
			// Best effort, as before: report the failure and finish.
			// Writing stops at the first I/O error. This also covers the
			// FileNotFoundException thrown when the file cannot be opened.
			e.printStackTrace();
		}
	}
}