Java crawler for scraping Baidu Images

    A Java crawler that fetches images matching a keyword and saves them to a specified directory. Baidu Images displays its results as an infinite-scroll waterfall, but the pn parameter makes the returned page start at the pn-th image, and each page contains thirty images. Requesting pn=0, pn=30, pn=60, ... therefore loads more and more images directly, without simulating scrolling to trigger the waterfall loading.
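
    For illustration, here is a minimal sketch of the pn-based paging (the keyword and page count are placeholders; the full crawler below builds its request URLs the same way):

import java.net.URLEncoder;

// Sketch: print one request URL per page of 30 results, using the same base URL
// as the crawler below. The keyword "壁纸" and the page count are illustrative.
public class PnUrlDemo {
	public static void main(String[] args) throws Exception {
		String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
		String word = URLEncoder.encode("壁纸", "UTF-8");
		for (int page = 0; page < 3; page++) {
			// pn is the index of the first image on the page: 0, 30, 60, ...
			System.out.println(baseUrl + word + "&pn=" + (page * 30));
		}
	}
}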

    The jsoup jar is used here, but it turned out that jsoup cannot parse Baidu's page source into useful elements... it is only used to fetch the document. Everything is included in the source code download.
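
    Because the image URLs sit inside script data in the returned HTML rather than in plain <img> elements, the workaround used below is to treat the fetched document as one big string and pull out the thumbURL / objURL fields with a regular expression. A minimal sketch of that idea (the keyword is a placeholder):

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;

// Sketch: fetch the raw result page with jsoup and extract thumbnail URLs by regex.
// The full class below does the same thing, adding paging, original-image support and downloading.
public class ThumbUrlDemo {
	public static void main(String[] args) throws Exception {
		String url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&word="
				+ java.net.URLEncoder.encode("壁纸", "UTF-8") + "&pn=0";
		String html = Jsoup.connect(url)
				.userAgent("Mozilla/5.0")
				.get()
				.toString();                      // whole document as a string
		Matcher m = Pattern.compile("thumbURL\":\"(https://.+?)\"").matcher(html);
		while (m.find()) {
			System.out.println(m.group(1));       // one thumbnail URL per match
		}
	}
}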

Source code download: click to open the link

DownBaiduPicture.java

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Crawls Baidu Images.
 * Configurable: <keyword>  <number of pages (30 images/page)>  <thumbnail / original image>  <resolution (ignored for thumbnails)>
 * Original images fail to download with some probability (the program itself is not very robust,
 * and the source site may also restrict downloads); thumbnails do not.
 * @author M
 *
 */
public class DownBaiduPicture {
	static int BUFFERSIZE = 819200;
	static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";
	static String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
	static String pnUrl = "&pn=";
	static String connect = "&";
	static String widthUrl = "&width=";
	static String heightUrl = "&height=";
	private String key = "美女";
	private int pn = 0;
	private int width = 0;
	private int height = 0;
	private String file = null;
	private int flag = 0;
	/**
	 * Initializes the download directory.
	 * @param str
	 */
	public DownBaiduPicture(String str){
		file = str;
	}
	/**
	 * Sets the image download parameters.
	 * @param word  <keyword>
	 * @param page	<number of pages>
	 * @param flg	<0 for thumbnail / 1 for original image>
	 * @param wid	<resolution width>
	 * @param hei	<resolution height>
	 */
	public void setPicture(String word, int page, int flg, int wid, int hei){
		key = word;
		pn = page;
		width = wid;
		height = hei;
		flag = flg;
	}
	/**
	 * Same as above, with the resolution left unspecified.
	 * @param word
	 * @param page
	 * @param flg
	 */
	public void setPicture(String word, int page, int flg){
		this.setPicture(word, page, flg, 0, 0);
	}
	/**
	 * Downloads a single image.
	 * @param srcUrl		<source URL of the image>
	 * @param outputFile	<output file path>
	 * @throws IOException	<file/IO error>
	 */
	public void downloadEach(String srcUrl, String outputFile) throws IOException{
		System.out.println(srcUrl+"\t"+"start");
		URL url = new URL(srcUrl);
		URLConnection uc = url.openConnection();
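		// Thumbnails (thumbURL) are served over https, originals (objURL) over http,
		// so the connection is cast to HttpsURLConnection or HttpURLConnection accordingly.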
		if(flag == 0){
			HttpsURLConnection hus = (HttpsURLConnection)uc;
			hus.setDoOutput(true);
			hus.setRequestProperty("User-Agent", UserAgent);
			hus.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
			hus.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
			hus.setRequestProperty("Connection", "keep-alive");
			BufferedInputStream bis = null;
			BufferedOutputStream bos = null;
			try {
				bis = new BufferedInputStream(hus.getInputStream());
				bos = new BufferedOutputStream(new FileOutputStream(outputFile));
				byte[] temp = new byte[BUFFERSIZE];
				int count = 0;
				while((count = bis.read(temp)) != -1){
					bos.write(temp, 0, count);
					bos.flush();
				}
				System.out.println(srcUrl+"\t"+"end");
			}catch (IOException e) {
				System.out.println(srcUrl+"\t"+"error");
				errorFileDel(outputFile);
			}finally {
				// Guard against NPE when the connection failed before the streams were opened
				if(bos != null) bos.close();
				if(bis != null) bis.close();
			}
			return;
		}
		HttpURLConnection huc = (HttpURLConnection)uc;
		huc.setDoOutput(true);
		huc.setRequestProperty("User-Agent", UserAgent);
		huc.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
		huc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
		huc.setRequestProperty("Connection", "keep-alive");

		BufferedInputStream bis = null;
		BufferedOutputStream bos = null;
		try {
			bis = new BufferedInputStream(huc.getInputStream());
			bos = new BufferedOutputStream(new FileOutputStream(outputFile));
			byte[] temp = new byte[BUFFERSIZE];
			int count = 0;
			while((count = bis.read(temp)) != -1){
				bos.write(temp, 0, count);
				bos.flush();
			}
			System.out.println(srcUrl+"\t"+"end");
		}catch (IOException e) {
			System.out.println(srcUrl+"\t"+"error");
			errorFileDel(outputFile);
		}finally {
			// Guard against NPE when the connection failed before the streams were opened
			if(bos != null) bos.close();
			if(bis != null) bis.close();
		}
	}
	/**
	 * Parses the image links on each result page and downloads them.
	 * @throws IOException
	 */
	public void downLoad() throws IOException{
		for(int i = 0; i < pn; i ++){
			String urlRes = baseUrl+URLEncoder.encode(key, "UTF-8")+pnUrl+(i*30)+connect+widthUrl;
			urlRes += width == 0? "": width;
			urlRes += height == 0? heightUrl : heightUrl + height;
			
			System.out.println(urlRes);
			Document document = null;
			document = Jsoup.connect(urlRes)
							.userAgent(UserAgent)
							.get();
			String str = document.toString();
			String reg = flag == 0? "thumbURL\":\"https://.+?\"" : "objURL\":\"http://.+?\"" ;
			Pattern pattern = Pattern.compile(reg);
			Matcher matcher = pattern.matcher(str);
			String pathname = file+"/"+key+"/"+i;
			new File(pathname).mkdirs();
			int count = 0;
			while(matcher.find()){
				count++;
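				// Skip the literal prefix of the match: "thumbURL":" is 11 characters,
				// "objURL":" is 9; the trailing quote is stripped below.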
				int start = flag == 0? 11 : 9;
				String findUrl = matcher.group().substring(start, matcher.group().length()-1);
				String opn;
				int index;
				if((index = findUrl.lastIndexOf("."))!=-1&&
						(findUrl.substring(index).equals(".png")||
						 findUrl.substring(index).equals(".PNG")||
						 findUrl.substring(index).equals(".gif")||
						 findUrl.substring(index).equals(".GIF"))){
					opn = count + findUrl.substring(index);
				}
				else{
					opn = count + ".jpg";
				}
				try {
					downloadEach(findUrl, pathname+"/"+opn);
				} catch (Exception e) {
					System.out.println(findUrl+"\terror");
					continue;
				}
			}
		}
	}
	/**
	 * Only takes effect when downloading original images:
	 * deletes a file whose download failed (this still cannot rule out corrupted image data).
	 * @param outputFile <path of the failed file>
	 */
	public static void errorFileDel(String outputFile){
		File errorFile = new File(outputFile);
		if(errorFile.exists()){
			errorFile.delete();
		}
	}
}

Test.java

import java.io.IOException;
public class Test {


	public static void main(String[] args) {
		String basepath = "E://test";
		DownBaiduPicture dbp = new DownBaiduPicture(basepath);
		dbp.setPicture("壁纸", 3 , 1 , 1366 , 768);
		try {
			dbp.downLoad();
		} catch (IOException e) {
			System.out.println("Network unreachable!");
		}
	}
}
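
    As a possible improvement (not part of the original source), downloadEach could use try-with-resources and explicit timeouts so streams are always closed and a stalled connection cannot hang the crawler. A minimal sketch, with illustrative class and method names:

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

// Sketch only: a more defensive single-image download.
public class SafeDownload {
	public static void download(String srcUrl, String outputFile) throws IOException {
		HttpURLConnection conn = (HttpURLConnection) new URL(srcUrl).openConnection();
		conn.setRequestProperty("User-Agent", "Mozilla/5.0");
		conn.setConnectTimeout(5000);   // give up on unreachable hosts after 5 s
		conn.setReadTimeout(10000);     // give up on stalled transfers after 10 s
		try (InputStream in = conn.getInputStream()) {
			// The stream is closed automatically even if the copy fails.
			Files.copy(in, Paths.get(outputFile), StandardCopyOption.REPLACE_EXISTING);
		} finally {
			conn.disconnect();
		}
	}
}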