Java 爬蟲:實現按照關鍵詞爬取圖片,並下載到指定目錄下。百度圖片以瀑布流方式顯示,但 pn 參數決定了頁面第一張圖片為第 pn 張,且每頁顯示三十張,因此可以通過 pn=0、pn=30、pn=… 來實現圖片的不斷加載,而不用模擬滾輪效果去以瀑布流方式加載新的圖片。
用到了 jsoup 的 jar 包,後來發現 jsoup 解析不了百度的源代碼……,僅僅是用它取了個 Document,jar 包都放在源碼下載裡了。
源碼下載:點擊打開連結
DownBaiduPicture.java
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
* 爬取百度圖檔
* 可設定 <關鍵詞> <頁數(30張/頁)> <縮略/原 圖> <分辨率(縮略圖設定無效)>
* 原圖有一定幾率下載下傳失敗(程式本身魯棒性不強而且可能源站點有下載下傳限制)
* 縮略圖不會
* @author M
*
*/
public class DownBaiduPicture {
static int BUFFERSIZE = 819200;
static String UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36";
static String baseUrl = "https://image.baidu.com/search/index?ct=&z=&tn=baiduimage&ipn=r&word=";
static String pnUrl = "&pn=";
static String connect = "&;
static String widthUrl = "&width=";
static String heightUrl = "&height=";
private String key = "美女";
private int pn = 0;
private int width = 0;
private int height = 0;
private String file = null;
private int flag = 0;
/**
* 初始化下載下傳路徑
* @param str
*/
public DownBaiduPicture(String str){
file = str;
}
/**
* 設定下載下傳圖檔參數
* @param word <關鍵詞>
* @param page <頁數>
* @param flg <0為縮略圖/1為原圖>
* @param wid <分辨率寬>
* @param hei <分辨率高>
*/
public void setPicture(String word, int page, int flg, int wid, int hei){
key = word;
pn = page;
width = wid;
height = hei;
flag = flg;
}
/**
* 預設分辨率
* @param word
* @param page
* @param flg
*/
public void setPicture(String word, int page, int flg){
this.setPicture(word, page, flg, 0, 0);
}
/**
* 下載下傳圖檔
* @param srcUrl <圖檔源位址>
* @param outputFile <輸出檔案路徑名>
* @throws IOException <檔案異常>
*/
public void downloadEach(String srcUrl, String outputFile) throws IOException{
System.out.println(srcUrl+"\t"+"start");
URL url = new URL(srcUrl);
URLConnection uc = url.openConnection();
if(flag == 0){
HttpsURLConnection hus = (HttpsURLConnection)uc;
hus.setDoOutput(true);
hus.setRequestProperty("User-Agent", UserAgent);
hus.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
hus.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
hus.setRequestProperty("Connection", "keep-alive");
BufferedInputStream bis = null;
BufferedOutputStream bos = null;
try {
bis = new BufferedInputStream(hus.getInputStream());
bos = new BufferedOutputStream(new FileOutputStream(outputFile));
byte[] temp = new byte[BUFFERSIZE];
int count = 0;
while((count = bis.read(temp)) != -1){
bos.write(temp, 0, count);
bos.flush();
}
System.out.println(srcUrl+"\t"+"end");
}catch (IOException e) {
System.out.println(srcUrl+"\t"+"error");
errorFileDel(outputFile);
}finally {
bos.close();
bis.close();
}
return;
}
HttpURLConnection huc = (HttpURLConnection)uc;
huc.setDoOutput(true);
huc.setRequestProperty("User-Agent", UserAgent);
huc.setRequestProperty("Accept-Encoding", "gzip, deflate, sdch, br");
huc.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8");
huc.setRequestProperty("Connection", "keep-alive");
BufferedInputStream bis = null;
BufferedOutputStream bos = null;
try {
bis = new BufferedInputStream(huc.getInputStream());
bos = new BufferedOutputStream(new FileOutputStream(outputFile));
byte[] temp = new byte[BUFFERSIZE];
int count = 0;
while((count = bis.read(temp)) != -1){
bos.write(temp, 0, count);
bos.flush();
}
System.out.println(srcUrl+"\t"+"end");
}catch (IOException e) {
System.out.println(srcUrl+"\t"+"error");
errorFileDel(outputFile);
}finally {
bos.close();
bis.close();
}
}
/**
* 解析頁面的圖檔連結
* @throws IOException
*/
public void downLoad() throws IOException{
for(int i = 0; i < pn; i ++){
String urlRes = baseUrl+key+pnUrl+(i*30)+connect+widthUrl;
urlRes += width == 0? "": width;
urlRes += height == 0? heightUrl : heightUrl + height;
System.out.println(urlRes);
Document document = null;
document = Jsoup.connect(new String(urlRes.getBytes("utf-8")))
.userAgent(UserAgent)
.get();
String str = document.toString();
String reg = flag == 0? "thumbURL\":\"https://.+?\"" : "objURL\":\"http://.+?\"" ;
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(str);
String pathname = file+"/"+key+"/"+i;
new File(pathname).mkdirs();
int count = 0;
while(matcher.find()){
count++;
int start = flag == 0? 11 : 9;
String findUrl = matcher.group().substring(start, matcher.group().length()-1);
String opn;
int index;
if((index = findUrl.lastIndexOf("."))!=-1&&
(findUrl.substring(index).equals(".png")||
findUrl.substring(index).equals(".PNG")||
findUrl.substring(index).equals(".jif")||
findUrl.substring(index).equals(".GIF"))){
opn = count + findUrl.substring(index);
}
else{
opn = count + ".jpg";
}
try {
downloadEach(findUrl, pathname+"/"+opn);
} catch (Exception e) {
System.out.println(findUrl+"\terror");
continue;
}
}
}
}
/**
* 隻在下載下傳原圖時起作用,
* 删除出現錯誤的圖檔(仍然杜絕不了圖檔格式損壞問題)
* @param outputFile <錯誤檔案路徑>
*/
public static void errorFileDel(String outputFile){
File errorFile = new File(outputFile);
if(errorFile.exists()){
errorFile.delete();
}
}
}
Test.java
import java.io.IOException;
public class Test {
    /**
     * Demo entry point: downloads 3 result pages of original images for the
     * keyword "桌面" at 1366x768 into E://test.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String basepath = "E://test";
        DownBaiduPicture dbp = new DownBaiduPicture(basepath);
        dbp.setPicture("桌面", 3, 1, 1366, 768);
        try {
            dbp.downLoad();
        } catch (IOException e) {
            System.out.println("網絡不通!");
            // Also surface the underlying cause; not every IOException here is
            // necessarily a connectivity problem.
            System.err.println(e);
        }
    }
}