Java crawler for scraping Baidu Images

    A Java crawler that fetches images matching a keyword and saves them to a specified directory. Baidu Images displays its results as an infinite-scroll waterfall, but the pn parameter makes the returned page start at the pn-th image, and each page contains thirty images. Requesting pn=0, pn=30, pn=60, ... therefore loads more and more images directly, without simulating scrolling to trigger the waterfall loading.
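
    For illustration, here is a minimal sketch of the pn-based paging (the keyword and page count are placeholders; the full crawler below builds its request URLs the same way):

import java.net.URLEncoder;

// Sketch: print one request URL per page of 30 results, using the same base URL
// as the crawler below. The keyword "壁纸" and the page count are illustrative.
public class PnUrlDemo {
	public static void main(String[] args) throws Exception {
		String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
		String word = URLEncoder.encode("壁纸", "UTF-8");
		for (int page = 0; page < 3; page++) {
			// pn is the index of the first image on the page: 0, 30, 60, ...
			System.out.println(baseUrl + word + "&pn=" + (page * 30));
		}
	}
}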

    The jsoup jar is used here, but it turned out that jsoup cannot parse Baidu's page source into useful elements... it is only used to fetch the document. Everything is included in the source code download.
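
    Because the image URLs sit inside script data in the returned HTML rather than in plain <img> elements, the workaround used below is to treat the fetched document as one big string and pull out the thumbURL / objURL fields with a regular expression. A minimal sketch of that idea (the keyword is a placeholder):

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;

// Sketch: fetch the raw result page with jsoup and extract thumbnail URLs by regex.
// The full class below does the same thing, adding paging, original-image support and downloading.
public class ThumbUrlDemo {
	public static void main(String[] args) throws Exception {
		String url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&word="
				+ java.net.URLEncoder.encode("壁纸", "UTF-8") + "&pn=0";
		String html = Jsoup.connect(url)
				.userAgent("Mozilla/5.0")
				.get()
				.toString();                      // whole document as a string
		Matcher m = Pattern.compile("thumbURL\":\"(https://.+?)\"").matcher(html);
		while (m.find()) {
			System.out.println(m.group(1));       // one thumbnail URL per match
		}
	}
}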

Source code download: click to open the link

DownBaiduPicture.java

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Crawls Baidu Images.
 * Configurable: <keyword>  <number of pages (30 images/page)>  <thumbnail / original image>  <resolution (ignored for thumbnails)>
 * Original images fail to download with some probability (the program itself is not very robust,
 * and the source site may also restrict downloads); thumbnails do not.
 * @author M
 *
 */
public class DownBaiduPicture {
	static int BUFFERSIZE = 819200;
	static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";
	static String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
	static String pnUrl = "&pn=";
	static String connect = "&";
	static String widthUrl = "&width=";
	static String heightUrl = "&height=";
	private String key = "美女";
	private int pn = 0;
	private int width = 0;
	private int height = 0;
	private String file = null;
	private int flag = 0;
	/**
	 * Initializes the download directory.
	 * @param str
	 */
	public DownBaiduPicture(String str){
		file = str;
	}
	/**
	 * Sets the image download parameters.
	 * @param word  <keyword>
	 * @param page	<number of pages>
	 * @param flg	<0 for thumbnail / 1 for original image>
	 * @param wid	<resolution width>
	 * @param hei	<resolution height>
	 */
	public void setPicture(String word, int page, int flg, int wid, int hei){
		key = word;
		pn = page;
		width = wid;
		height = hei;
		flag = flg;
	}
	/**
	 * Same as above, with the resolution left unspecified.
	 * @param word
	 * @param page
	 * @param flg
	 */
	public void setPicture(String word, int page, int flg){
		this.setPicture(word, page, flg, 0, 0);
	}
	/**
	 * Downloads a single image.
	 * @param srcUrl		<source URL of the image>
	 * @param outputFile	<output file path>
	 * @throws IOException	<file/IO error>
	 */
	public void downloadEach(String srcUrl, String outputFile) throws IOException{
		System.out.println(srcUrl+"\t"+"start");
		URL url = new URL(srcUrl);
		URLConnection uc = url.openConnection();
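		// Thumbnails (thumbURL) are served over https, originals (objURL) over http,
		// so the connection is cast to HttpsURLConnection or HttpURLConnection accordingly.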
		if(flag == 0){
			HttpsURLConnection hus = (HttpsURLConnection)uc;
			hus.setDoOutput(true);
			hus.setRequestProperty("User-Agent", UserAgent);
			hus.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
			hus.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
			hus.setRequestProperty("Connection", "keep-alive");
			BufferedInputStream bis = null;
			BufferedOutputStream bos = null;
			try {
				bis = new BufferedInputStream(hus.getInputStream());
				bos = new BufferedOutputStream(new FileOutputStream(outputFile));
				byte[] temp = new byte[BUFFERSIZE];
				int count = 0;
				while((count = bis.read(temp)) != -1){
					bos.write(temp, 0, count);
					bos.flush();
				}
				System.out.println(srcUrl+"\t"+"end");
			}catch (IOException e) {
				System.out.println(srcUrl+"\t"+"error");
				errorFileDel(outputFile);
			}finally {
				// Guard against NPE when the connection failed before the streams were opened
				if(bos != null) bos.close();
				if(bis != null) bis.close();
			}
			return;
		}
		HttpURLConnection huc = (HttpURLConnection)uc;
		huc.setDoOutput(true);
		huc.setRequestProperty("User-Agent", UserAgent);
		huc.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
		huc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
		huc.setRequestProperty("Connection", "keep-alive");

		BufferedInputStream bis = null;
		BufferedOutputStream bos = null;
		try {
			bis = new BufferedInputStream(huc.getInputStream());
			bos = new BufferedOutputStream(new FileOutputStream(outputFile));
			byte[] temp = new byte[BUFFERSIZE];
			int count = 0;
			while((count = bis.read(temp)) != -1){
				bos.write(temp, 0, count);
				bos.flush();
			}
			System.out.println(srcUrl+"\t"+"end");
		}catch (IOException e) {
			System.out.println(srcUrl+"\t"+"error");
			errorFileDel(outputFile);
		}finally {
			// Guard against NPE when the connection failed before the streams were opened
			if(bos != null) bos.close();
			if(bis != null) bis.close();
		}
	}
	/**
	 * Parses the image links on each result page and downloads them.
	 * @throws IOException
	 */
	public void downLoad() throws IOException{
		for(int i = 0; i < pn; i ++){
			String urlRes = baseUrl+URLEncoder.encode(key, "UTF-8")+pnUrl+(i*30)+connect+widthUrl;
			urlRes += width == 0? "": width;
			urlRes += height == 0? heightUrl : heightUrl + height;
			
			System.out.println(urlRes);
			Document document = null;
			document = Jsoup.connect(urlRes)
							.userAgent(UserAgent)
							.get();
			String str = document.toString();
			String reg = flag == 0? "thumbURL\":\"https://.+?\"" : "objURL\":\"http://.+?\"" ;
			Pattern pattern = Pattern.compile(reg);
			Matcher matcher = pattern.matcher(str);
			String pathname = file+"/"+key+"/"+i;
			new File(pathname).mkdirs();
			int count = 0;
			while(matcher.find()){
				count++;
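				// Skip the literal prefix of the match: "thumbURL":" is 11 characters,
				// "objURL":" is 9; the trailing quote is stripped below.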
				int start = flag == 0? 11 : 9;
				String findUrl = matcher.group().substring(start, matcher.group().length()-1);
				String opn;
				int index;
				if((index = findUrl.lastIndexOf("."))!=-1&&
						(findUrl.substring(index).equals(".png")||
						 findUrl.substring(index).equals(".PNG")||
						 findUrl.substring(index).equals(".gif")||
						 findUrl.substring(index).equals(".GIF"))){
					opn = count + findUrl.substring(index);
				}
				else{
					opn = count + ".jpg";
				}
				try {
					downloadEach(findUrl, pathname+"/"+opn);
				} catch (Exception e) {
					System.out.println(findUrl+"\terror");
					continue;
				}
			}
		}
	}
	/**
	 * Only takes effect when downloading original images:
	 * deletes a file whose download failed (this still cannot rule out corrupted image data).
	 * @param outputFile <path of the failed file>
	 */
	public static void errorFileDel(String outputFile){
		File errorFile = new File(outputFile);
		if(errorFile.exists()){
			errorFile.delete();
		}
	}
}

Test.java

import java.io.IOException;
public class Test {


	public static void main(String[] args) {
		String basepath = "E://test";
		DownBaiduPicture dbp = new DownBaiduPicture(basepath);
		dbp.setPicture("壁纸", 3 , 1 , 1366 , 768);
		try {
			dbp.downLoad();
		} catch (IOException e) {
			System.out.println("Network unreachable!");
		}
	}
}
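
    As a possible improvement (not part of the original source), downloadEach could use try-with-resources and explicit timeouts so streams are always closed and a stalled connection cannot hang the crawler. A minimal sketch, with illustrative class and method names:

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

// Sketch only: a more defensive single-image download.
public class SafeDownload {
	public static void download(String srcUrl, String outputFile) throws IOException {
		HttpURLConnection conn = (HttpURLConnection) new URL(srcUrl).openConnection();
		conn.setRequestProperty("User-Agent", "Mozilla/5.0");
		conn.setConnectTimeout(5000);   // give up on unreachable hosts after 5 s
		conn.setReadTimeout(10000);     // give up on stalled transfers after 10 s
		try (InputStream in = conn.getInputStream()) {
			// The stream is closed automatically even if the copy fails.
			Files.copy(in, Paths.get(outputFile), StandardCopyOption.REPLACE_EXISTING);
		} finally {
			conn.disconnect();
		}
	}
}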