我做了個程式把新浪上的天氣新聞抓過來存到本地,考慮到存取速度問題,新聞中的圖片也要儲存到本地。
程式如下
package vnet.com.weather1;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import vnet.com.update.Getdata;
public class Newlist {
private static final Log log = LogFactory.getLog(Newlist.class);
public static void main(String args[]){
Newlist n=new Newlist();
String[] k=n.getNewList();
for (int i=0;i<k.length;i++){
System.out.println(k[i].replace("href=/"", "href=/"newinfo2.jsp?url="));
}
String[] m=n.getNewinfo("news/2008/1119/35261.html");
for (int l=0;l<m.length;l++){
System.out.println(m[l]);
}
}
public String[] getNewinfo(String url){
String URL="http://weather.news.sina.com.cn/"+url;
//30是指取30段滿足給出的正則條件的字元串,如果隻找出10個,那數組後面的全為null
String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30);
for (int i=0;i<s.length;i++){
Pattern sp = Pattern.compile("src=/"(.*?)/"");
Matcher matcher = sp.matcher(s[i]);
if (matcher.find()){
String imageurl=analysis("src=/"(.*?)/"" , s[i] , 1)[0];
if(!imageurl.startsWith("http://")){
imageurl="http://weather.news.sina.com.cn/"+imageurl;
}
System.out.println("新聞有圖檔:"+imageurl);
String content=getContent(imageurl);
String[] images=imageurl.split("/");
String imagename=images[images.length-1];
System.out.println("圖檔名:"+imagename);
try {
File fwl = new File(imagename);
PrintWriter outl = new PrintWriter(fwl);
outl.println(content);
outl.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("s[i]:"+s[i]);
//修改檔案圖檔位址
s[i]=s[i].replace(analysis("src=/"(.*?)/"" , s[i] , 1)[0], imagename);
}
}
return s;
}
public String[] getNewList(){
String url="http://weather.news.sina.com.cn/weather/news/index.html";
return getNewList(getContent(url));
}
private String[] getNewList(String content ){
//String[] s = analysis("align=/"center/" valign=/"top/"><img src=/"../images/a(.*?).gif/" width=/"70/" height=/"65/"></td>" , content , 50);
String[] s = analysis("<li>(.*?)</li>" , content , 50);
return s;
}
private String[] analysis(String pattern, String match , int i){
Pattern sp = Pattern.compile(pattern);
Matcher matcher = sp.matcher(match);
String[] content = new String[i];
for (int i1 = 0; matcher.find(); i1++){
content[i1] = matcher.group(1);
}
//下面一段是為了剔除為空的串
int l=0;
for (int k=0;k<content.length;k++){
if (content[k]==null){
l=k;
break;
}
}
String[] content2;
if (l!=0){
content2=new String[l];
for (int n=0;n<l;n++){
content2[n]=content[n];
}
return content2;
}else{
return content;
}
}
public static String getContent (String strUrl){
URLConnection uc = null;
String all_content=null;
try {
all_content =new String();
URL url = new URL(strUrl);
uc = url.openConnection();
uc.setRequestProperty("User-Agent",
"Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
System.out.println("-----------------------------------------");
System.out.println("Content-Length: "+uc.getContentLength());
System.out.println("Set-Cookie: "+uc.getHeaderField("Set-Cookie"));
System.out.println("-----------------------------------------");
//擷取檔案頭資訊
System.out.println("Header"+uc.getHeaderFields().toString());
System.out.println("-----------------------------------------");
if (uc == null)
return null;
InputStream ins = uc.getInputStream();
ByteArrayOutputStream outputstream = new ByteArrayOutputStream();
byte[] str_b = new byte[1024];
int i = -1;
while ((i=ins.read(str_b)) > 0) {
outputstream.write(str_b,0,i);
}
all_content = outputstream.toString();
// System.out.println(all_content);
} catch (Exception e) {
e.printStackTrace();
log.error("擷取網頁内容出錯");
}finally{
uc = null;
}
// return new String(all_content.getBytes("ISO8859-1"));
System.out.println(all_content.length());
return all_content;
}
}
現在的問題是:圖片下載不完整。我用後面兩種 getContent 方法下載圖片,下載下來的圖片大小都和檔頭裡取得的 Content-Length(也就是圖片的實際大小)不符,無法預覽。
而且反覆測試後發現,兩種方法每次下載下來的資料大小是固定的,所以重複下載也沒有用。
測試顯示 toString 之後的 length 比圖片實際大小要小,而生成的圖片檔案又比原始圖片資料大——下載後存儲的過程中圖片資料被改變了!
圖片資料流在 toString 的過程中大小發生了改變,無法還原;其它新聞內容沒有問題,估計是圖片編碼格式的問題。正確做法是在讀取圖片資料流時直接以二進位方式寫成檔案,如下:
/**
 * Downloads the image at {@code strUrl} straight to a local file (named
 * after the URL's last path segment) as raw bytes, so binary data is
 * preserved exactly.
 *
 * @param strUrl absolute image URL
 * @return 1 on success, 0 on failure (the original returned 1 even when
 *         the download threw, so callers could never detect errors)
 */
public int saveImage(String strUrl) {
    InputStream ins = null;
    FileOutputStream fos = null;
    try {
        URL url = new URL(strUrl);
        URLConnection uc = url.openConnection();
        uc.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
        //uc.setReadTimeout(30000);
        // File name = last path segment of the URL.
        String[] images = strUrl.split("/");
        String imagename = images[images.length - 1];
        ins = uc.getInputStream();
        fos = new FileOutputStream(new File(imagename));
        byte[] str_b = new byte[1024];
        int byteRead;
        // Copy the stream byte-for-byte; no charset decoding anywhere.
        while ((byteRead = ins.read(str_b)) > 0) {
            fos.write(str_b, 0, byteRead);
        }
        fos.flush();
    } catch (Exception e) {
        e.printStackTrace();
        log.error("擷取網頁内容出錯");
        return 0; // signal failure instead of claiming success
    } finally {
        // FIX: the original leaked both streams when an exception occurred.
        try {
            if (fos != null) fos.close();
            if (ins != null) ins.close();
        } catch (IOException ignored) {
            // best-effort close
        }
    }
    return 1;
}
方法二:
首先把搜尋後的頁面用流讀取出來,再寫個正則,去除不要的内容,再把最後的結果存成xml格式檔案、或者直接存入資料庫,用的時候再調用
本代碼隻是顯示html頁的源碼内容,如果需要抽取内容請自行改寫public static String regex()中的正則式
package rssTest;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class MyRSS {
    /**
     * Downloads a URL and returns its body decoded as gb2312, with lines
     * joined by "\n".
     *
     * @return the page source, or "" on failure (the original returned the
     *         still-null buffer and threw NullPointerException at the
     *         {@code toString()} call)
     */
    public static String getHtmlSource(String url) {
        StringBuilder code = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection uc = new URL(url).openConnection();
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            // The site serves gb2312-encoded pages.
            in = new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));
            String line;
            while ((line = in.readLine()) != null) {
                // FIX: the original appended the literal two characters "/n"
                // (a mangled escape) instead of a newline.
                code.append(line).append("\n");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // FIX: the reader was leaked when readLine() threw.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // best-effort close
                }
            }
        }
        return code.toString();
    }

    /**
     * Regex matching one Google result block. Capture groups:
     * 2 = link URL, 4 = title, 6 = summary.
     * (The original contained mangled {@code /"} escapes and did not compile.)
     */
    public static String regex() {
        return "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
    }

    /**
     * Scrapes the hard-coded Google search results page and returns a flat
     * list of [url, title, content] triples in match order.
     */
    public static List<String> GetNews() {
        List<String> newsList = new ArrayList<String>();
        String allHtmlSource = MyRSS
                .getHtmlSource("http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os- maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f");
        Matcher matcher = Pattern.compile(regex()).matcher(allHtmlSource);
        while (matcher.find()) {
            newsList.add(matcher.group(2)); // link is used verbatim
            newsList.add(stripMarkup(matcher.group(4)));
            newsList.add(stripMarkup(matcher.group(6)));
        }
        return newsList;
    }

    /** Removes the highlight markup Google wraps around matched terms. */
    private static String stripMarkup(String s) {
        return s.replaceAll("<font color=CC0033>", "")
                .replaceAll("</font>", "")
                .replaceAll("<b>...</b>", "");
    }

    public static void main(String[] args) {
        System.out
                .println(MyRSS
                        .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
    }
}
方法三:
JSP 自動抓取新聞
package com.news.spider;
import java.io.File;
import java.io.FileFilter;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.db.DBAccess;
public class SpiderNewsServer {
public static void main(String[] args) throws Exception{
//設定抓取資訊的首頁面
String endPointUrl = "http://cn.china.cn/zixun/";
//獲得目前時間
Calendar calendar=Calendar.getInstance();
SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");
String DateNews = sdf.format(calendar.getTime());
List listNewsType = new ArrayList();
//取入口頁面html
WebHtml webHtml = new WebHtml();
String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);
if(htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0){
return;
}
String strTemp1 = "http://cn.china.cn/article/";
String strTemp2 = "</li>";
int stopIndex=0;
int startIndex=0;
int dd=0;
while(true){
dd++;
startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);
System.out.println("=========="+startIndex);
stopIndex= htmlDocuemtnt1.indexOf(strTemp2, startIndex);
System.out.println("==========---------"+stopIndex);
if(startIndex!=-1 && stopIndex!=-1){
String companyType=htmlDocuemtnt1.substring(startIndex,stopIndex);
System.out.println("@@@@@--------"+companyType);
System.out.println("@@@@@--------"+companyType.indexOf("/""));
companyType=companyType.substring(0,companyType.indexOf("/""));
System.out.println("#####--------"+companyType);
listNewsType.add(companyType);
}
if(dd>10){
break;
}
if(stopIndex==-1 || startIndex==-1){
break;
}
}
System.out.println("listCompanyType====="+listNewsType.size());
String title="";
String hometext="";
String bodytext="";
String keywords="";
String counter = "221";
String cdate= "";
int begainIndex=0;//檢索字元串的起點索引
int endIndex=0;//檢索字元串的終點索引
String begainStr;//檢索開始字元串
String endStr;//檢索結束字元串
for (int rows = 1; rows < listNewsType.size(); rows++) {
String strNewsDetail = listNewsType.get(rows).toString();
System.out.println("strNewsDetail====="+strNewsDetail);
if(strNewsDetail != null && strNewsDetail.length() > 0){
WebHtml newsListHtml = new WebHtml();
String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);
System.out.println("$$$$$------"+htmlDocuemtntCom);
if(htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0){
return;
}
//截取時間
int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>時間:");
System.out.println("%%%%%--"+dateBegainIndex);
String newTime = htmlDocuemtntCom.substring(dateBegainIndex,dateBegainIndex+20);
System.out.println("^^^^^^^^^^^^^^^---"+newTime);
String newTimeM = newTime.substring(newTime.lastIndexOf("-")+1,newTime.lastIndexOf("-")+3);
String dateM = DateNews.substring(DateNews.lastIndexOf("-")+1);
System.out.println("^^^^^^^^^^^^^^^---"+newTimeM);
System.out.println("^^^^^^^^^^^^^^^---"+dateM);
if(newTimeM == dateM || newTimeM.equals(dateM)){
//檢索新聞标題
begainStr="<div class=/"divCon bg008 /">";
endStr="<div>時間:";
begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);
System.out.println("&&&&&&------"+begainIndex);
endIndex=htmlDocuemtntCom.indexOf(endStr,0);
System.out.println("&&&&&&------"+endIndex);
if(begainIndex!=-1 && endIndex!=-1){
title = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();
title = title.substring(title.indexOf("<h1>")+4,title.indexOf("</h1>"));
title = title.replace("'", "");
title = title.replace(";", "");
title = title.replace(" ", "");
}
//檢索新聞内容
begainStr="<div class=/"divCon bg008 /">";
endStr="<!-- page begin -->";
begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);
endIndex=htmlDocuemtntCom.indexOf(endStr,0);
if(begainIndex!=-1 && endIndex!=-1){
bodytext = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();
if(bodytext.indexOf("<p>")>0 && bodytext.indexOf("</p>")>bodytext.indexOf("<p>") && bodytext.indexOf("</p>")>0)
bodytext = bodytext.substring(bodytext.indexOf("<p>")+3,bodytext.indexOf("</p>"));
bodytext=bodytext.replace(" ", "");
bodytext=bodytext.replace("<br>", "");
bodytext=bodytext.replace("/n", "<br>");
bodytext=bodytext.replace("'", "");
bodytext=bodytext.replace(";", "");
}
//簡介
if(bodytext.length()>40)
hometext = bodytext.substring(0,40)+"......";
else{
hometext = bodytext+"......";
}
//浏覽量
String str = String.valueOf(Math.random());
counter = str.substring(str.lastIndexOf(".")+1,5);
Calendar cal = Calendar.getInstance();
cal.setTime(new Date());
cdate = cal.getTimeInMillis()+"";
cdate = cdate.substring(0,10);
}else{
continue;
}
}
System.out.println("-------------------------"+title);
System.out.println("-------------------------"+cdate);
System.out.println("-------------------------"+cdate);
System.out.println("-------------------------"+hometext);
System.out.println("-------------------------"+bodytext);
System.out.println("-------------------------"+keywords);
System.out.println("-------------------------"+counter);
}
}
}
package com.news.spider;
import java.net.URL;
import java.net.URLConnection;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class WebHtml {
    /**
     * Downloads a URL and returns its body decoded with the platform default
     * charset, lines joined by "\n".
     *
     * @param url absolute URL to fetch
     * @return the page source, or "" on any failure
     */
    public String getWebHtml(String url) {
        BufferedReader reader = null;
        try {
            URL myURL = new URL(url);
            URLConnection conn = myURL.openConnection();
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            StringBuffer document = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                // FIX: the original appended the literal two characters "/n"
                // (a mangled escape) instead of a newline.
                document.append(line).append("\n");
            }
            return document.toString();
        } catch (Exception e) {
            // Best-effort fetch: report the failure instead of silently
            // swallowing it, then fall through to the empty result.
            e.printStackTrace();
        } finally {
            // FIX: the reader was leaked when readLine() threw.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // best-effort close
                }
            }
        }
        return "";
    }
}
出處:【Gjava人才】
網址: http://www.gjrencai.com
轉載時請注明出處和網址