天天看點

java爬蟲爬取百度圖檔

    java 爬蟲實作按照關鍵詞爬取圖檔,並下載到指定目錄下。百度圖檔以瀑布流方式顯示,但pn參數決定了頁面第一張圖檔為第pn張,且每頁顯示三十張,因此可以通過pn=0,pn=30,pn=60...來實作圖檔的不斷加載,而不用模擬滾輪效果去以瀑布流加載新的圖檔。

    用到了jsoup的jar包,後來發現jsoup解析不了百度的源代碼,所以僅僅用它取了個document;jar包都放在源碼下載裡了

源碼下載:點選打開連結

java爬蟲爬取百度圖檔

DownBaiduPicture.java

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * 爬取百度圖檔
 * 可設定	<關鍵詞>    <頁數(30張/頁)>    <縮略/原 圖>    <分辨率(縮略圖設定無效)>
 * 原圖有一定幾率下載下傳失敗(程式本身魯棒性不強而且可能源站點有下載下傳限制)
 * 縮略圖不會
 * @author M
 *
 */
public class DownBaiduPicture {
	static int BUFFERSIZE = 819200;
	static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";
	static String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
	static String pnUrl = "&pn=";
	static String connect = "&;
	static String widthUrl = "&width=";
	static String heightUrl = "&height=";
	private String key = "美女";
	private int pn = 0;
	private int width = 0;
	private int height = 0;
	private String file = null;
	private int flag = 0;
	/**
	 * 初始化下載下傳路徑
	 * @param str
	 */
	public DownBaiduPicture(String str){
		file = str;
	}
	/**
	 * 設定下載下傳圖檔參數
	 * @param word  <關鍵詞>
	 * @param page	<頁數>
	 * @param flg	<0為縮略圖/1為原圖>
	 * @param wid	<分辨率寬>
	 * @param hei	<分辨率高>
	 */
	public void setPicture(String word, int page, int flg, int wid, int hei){
		key = word;
		pn = page;
		width = wid;
		height = hei;
		flag = flg;
	}
	/**
	 * 預設分辨率
	 * @param word
	 * @param page
	 * @param flg
	 */
	public void setPicture(String word, int page, int flg){
		this.setPicture(word, page, flg, 0, 0);
	}
	/**
	 * 下載下傳圖檔
	 * @param srcUrl		<圖檔源位址>
	 * @param outputFile	<輸出檔案路徑名>
	 * @throws IOException	<檔案異常>
	 */
	public void downloadEach(String srcUrl, String outputFile) throws IOException{
		System.out.println(srcUrl+"\t"+"start");
		URL url = new URL(srcUrl);
		URLConnection uc = url.openConnection();
		if(flag == 0){
			HttpsURLConnection hus = (HttpsURLConnection)uc;
			hus.setDoOutput(true);
			hus.setRequestProperty("User-Agent", UserAgent);
			hus.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
			hus.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
			hus.setRequestProperty("Connection", "keep-alive");
			BufferedInputStream bis = null;
			BufferedOutputStream bos = null;
			try {
				bis = new BufferedInputStream(hus.getInputStream());
				bos = new BufferedOutputStream(new FileOutputStream(outputFile));
				byte[] temp = new byte[BUFFERSIZE];
				int count = 0;
				while((count = bis.read(temp)) != -1){
					bos.write(temp, 0, count);
					bos.flush();
				}
				System.out.println(srcUrl+"\t"+"end");
			}catch (IOException e) {
				System.out.println(srcUrl+"\t"+"error");
				errorFileDel(outputFile);
			}finally {
				bos.close();
				bis.close();
			}
			return;
		}
		HttpURLConnection huc = (HttpURLConnection)uc;
		huc.setDoOutput(true);
		huc.setRequestProperty("User-Agent", UserAgent);
		huc.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
		huc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
		huc.setRequestProperty("Connection", "keep-alive");

		BufferedInputStream bis = null;
		BufferedOutputStream bos = null;
		try {
			bis = new BufferedInputStream(huc.getInputStream());
			bos = new BufferedOutputStream(new FileOutputStream(outputFile));
			byte[] temp = new byte[BUFFERSIZE];
			int count = 0;
			while((count = bis.read(temp)) != -1){
				bos.write(temp, 0, count);
				bos.flush();
			}
			System.out.println(srcUrl+"\t"+"end");
		}catch (IOException e) {
			System.out.println(srcUrl+"\t"+"error");
			errorFileDel(outputFile);
		}finally {
			bos.close();
			bis.close();
		}
	}
	/**
	 * 解析頁面的圖檔連結
	 * @throws IOException
	 */
	public void downLoad() throws IOException{
		for(int i = 0; i < pn; i ++){
			String urlRes = baseUrl+key+pnUrl+(i*30)+connect+widthUrl;
			urlRes += width == 0? "": width;
			urlRes += height == 0? heightUrl : heightUrl + height;
			
			System.out.println(urlRes);
			Document document = null;
			document = Jsoup.connect(new String(urlRes.getBytes("utf-8")))
							.userAgent(UserAgent)
							.get();
			String str = document.toString();
			String reg = flag == 0? "thumbURL\":\"https://.+?\"" : "objURL\":\"http://.+?\"" ;
			Pattern pattern = Pattern.compile(reg);
			Matcher matcher = pattern.matcher(str);
			String pathname = file+"/"+key+"/"+i;
			new File(pathname).mkdirs();
			int count = 0;
			while(matcher.find()){
				count++;
				int start = flag == 0? 11 : 9;
				String findUrl = matcher.group().substring(start, matcher.group().length()-1);
				String opn;
				int index;
				if((index = findUrl.lastIndexOf("."))!=-1&&
						(findUrl.substring(index).equals(".png")||
						 findUrl.substring(index).equals(".PNG")||
						 findUrl.substring(index).equals(".jif")||
						 findUrl.substring(index).equals(".GIF"))){
					opn = count + findUrl.substring(index);
				}
				else{
					opn = count + ".jpg";
				}
				try {
					downloadEach(findUrl, pathname+"/"+opn);
				} catch (Exception e) {
					System.out.println(findUrl+"\terror");
					continue;
				}
			}
		}
	}
	/**
	 * 隻在下載下傳原圖時起作用,
	 * 删除出現錯誤的圖檔(仍然杜絕不了圖檔格式損壞問題)
	 * @param outputFile <錯誤檔案路徑>
	 */
	public static void errorFileDel(String outputFile){
		File errorFile = new File(outputFile);
		if(errorFile.exists()){
			errorFile.delete();
		}
	}
}
           

Test.java

import java.io.IOException;
public class Test {

	/**
	 * Demo entry point: downloads three pages of original images for the
	 * keyword "桌面" at 1366x768 into the E://test directory.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		final String outputRoot = "E://test";
		final DownBaiduPicture downloader = new DownBaiduPicture(outputRoot);
		downloader.setPicture("桌面", 3, 1, 1366, 768);

		try {
			downloader.downLoad();
		} catch (IOException networkFailure) {
			// Any I/O failure from the crawl is reported as a connectivity problem.
			System.out.println("網絡不通!");
		}
	}
}