天天看點

新浪天氣預報新聞 Java 抓取程式

我做了個程式把新浪上的天氣新聞抓過來存到本地,考慮通路速度問題,新聞中的圖檔也要儲存到本地。

程式如下

package vnet.com.weather1;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import vnet.com.update.Getdata;

public class Newlist {

    private static final Log log = LogFactory.getLog(Newlist.class);

    public  static void main(String args[]){

        Newlist n=new Newlist();

        String[] k=n.getNewList();

        for (int i=0;i<k.length;i++){

        System.out.println(k[i].replace("href=/"", "href=/"newinfo2.jsp?url="));

        }

        String[] m=n.getNewinfo("news/2008/1119/35261.html");

        for (int l=0;l<m.length;l++){       

            System.out.println(m[l]);   

        }

    }

    public String[] getNewinfo(String url){

        String URL="http://weather.news.sina.com.cn/"+url;

        //30是指取30段滿足給出的正則條件的字元串,如果隻找出10個,那數組後面的全為null

        String[] s = analysis("<p>(.*?)</p>" , getContent(URL) , 30);

        for (int i=0;i<s.length;i++){

            Pattern sp = Pattern.compile("src=/"(.*?)/"");

            Matcher matcher = sp.matcher(s[i]);

            if (matcher.find()){

                 String imageurl=analysis("src=/"(.*?)/"" , s[i] , 1)[0];

                 if(!imageurl.startsWith("http://")){

                     imageurl="http://weather.news.sina.com.cn/"+imageurl;

                  }

                System.out.println("新聞有圖檔:"+imageurl);

                String content=getContent(imageurl);

                  String[] images=imageurl.split("/");

                  String imagename=images[images.length-1];

                  System.out.println("圖檔名:"+imagename);

        try {

            File fwl = new File(imagename);

            PrintWriter outl = new PrintWriter(fwl);

            outl.println(content);

            outl.close();

            } catch (IOException e) {

                // TODO Auto-generated catch block

                e.printStackTrace();

            }

            System.out.println("s[i]:"+s[i]);

            //修改檔案圖檔位址

            s[i]=s[i].replace(analysis("src=/"(.*?)/"" , s[i] , 1)[0], imagename);

            }

        }

        return s;

    }

    public  String[] getNewList(){

        String url="http://weather.news.sina.com.cn/weather/news/index.html";

        return getNewList(getContent(url));      

    }

    private  String[] getNewList(String content ){

        //String[] s = analysis("align=/"center/" valign=/"top/"><img src=/"../images/a(.*?).gif/" width=/"70/" height=/"65/"></td>" , content , 50);   

        String[] s = analysis("<li>(.*?)</li>" , content , 50);

        return s;

    }

    private String[] analysis(String pattern, String match , int i){

        Pattern sp = Pattern.compile(pattern);

        Matcher matcher = sp.matcher(match);

        String[] content = new String[i];

        for (int i1 = 0; matcher.find(); i1++){       

            content[i1] = matcher.group(1);      

        }

        //下面一段是為了剔除為空的串

        int l=0;

        for (int k=0;k<content.length;k++){

            if (content[k]==null){

                l=k;

                break;

            }

        }

        String[] content2;

        if (l!=0){

            content2=new String[l];

            for (int n=0;n<l;n++){

                content2[n]=content[n];

            }

             return content2;

        }else{

            return content;   

        }

    }

    public static  String getContent (String strUrl){

        URLConnection uc = null;

        String all_content=null;

    try {

               all_content =new  String();

               URL url = new URL(strUrl);

               uc = url.openConnection();

               uc.setRequestProperty("User-Agent", 

                                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");                 

              System.out.println("-----------------------------------------"); 

              System.out.println("Content-Length:     "+uc.getContentLength()); 

              System.out.println("Set-Cookie:     "+uc.getHeaderField("Set-Cookie")); 

              System.out.println("-----------------------------------------");

              //擷取檔案頭資訊

              System.out.println("Header"+uc.getHeaderFields().toString());

              System.out.println("-----------------------------------------"); 

               if (uc == null)

                   return null;

               InputStream ins = uc.getInputStream();

                ByteArrayOutputStream outputstream = new ByteArrayOutputStream();

               byte[] str_b = new byte[1024];

                   int i = -1;

                   while ((i=ins.read(str_b)) > 0) {

                    outputstream.write(str_b,0,i);

                   }

                   all_content = outputstream.toString();

                  // System.out.println(all_content);

           } catch (Exception e) {

               e.printStackTrace();

               log.error("擷取網頁内容出錯");

           }finally{

               uc = null;

           }

          // return new String(all_content.getBytes("ISO8859-1"));

           System.out.println(all_content.length());

           return all_content;

       }

}

現在的問題是:圖檔下載不全,我用後面兩種getContent方法下圖檔,下來的圖檔大小都和檔案頭裡獲得的Content-Length,也就是圖檔的實際大小不符,預覽不了。

  而且反複測試,兩種方法每次下來的東西大小是固定的,是以重複下載沒有用?

測試toString後length大小比圖檔實際的小,而生成的圖檔比圖檔資料大。下載後存儲過程中圖檔資料增加了!

  圖檔資料流toString過程中資料大小發生了改變,還原不回來。其它新聞内容沒有問題。估計是圖檔的編碼格式等的問題。在圖檔資料流讀過來時直接生成圖檔就可以了。

public  int saveImage (String strUrl){

        URLConnection uc = null;

    try {

               URL url = new URL(strUrl);

               uc = url.openConnection();

               uc.setRequestProperty("User-Agent", 

                                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");   

               //uc.setReadTimeout(30000);

         //擷取圖檔長度 

          //System.out.println("Content-Length:     "+uc.getContentLength());

          //擷取檔案頭資訊

           //System.out.println("Header"+uc.getHeaderFields().toString());        

               if (uc == null)

                   return 0;

               InputStream ins = uc.getInputStream();

                 byte[] str_b = new byte[1024];        

                 int byteRead=0;                          

                String[] images=strUrl.split("/");

        String imagename=images[images.length-1];

              File fwl = new File(imagename);

              FileOutputStream fos= new FileOutputStream(fwl);

                  while ((byteRead=ins.read(str_b)) > 0) {

                      fos.write(str_b,0,byteRead);

                     };

                    fos.flush(); 

                  fos.close();

           } catch (Exception e) {

               e.printStackTrace();

               log.error("擷取網頁内容出錯");

           }finally{

               uc = null;

           }

           return 1;

       }

方法二:

首先把搜尋後的頁面用流讀取出來,再寫個正則,去除不要的内容,再把最後的結果存成xml格式檔案、或者直接存入資料庫,用的時候再調用

本代碼隻是顯示html頁的源碼内容,如果需要抽取内容請自行改寫public static String regex()中的正則式

package rssTest;  

import java.io.BufferedReader;  

import java.io.IOException;  

import java.io.InputStreamReader;  

import java.net.HttpURLConnection;  

import java.net.MalformedURLException;  

import java.net.URL;  

import java.net.URLConnection;  

import java.util.ArrayList;  

import java.util.List;  

import java.util.regex.Matcher;  

import java.util.regex.Pattern;  

public class MyRSS
{
    /**
     * Fetches the page at url, decodes it as GB2312 and returns its source
     * with "\n" appended after every line.
     *
     * @param url page to fetch
     * @return the page source, or "" when the fetch failed (the old version
     *         threw an NPE because the buffer was still null on failure)
     */
    public static String getHtmlSource(String url)
    {
        // Initialise up front so a failed fetch yields "" instead of an NPE.
        StringBuffer codeBuffer = new StringBuffer();
        BufferedReader in = null;
        try
        {
            URLConnection uc = new URL(url).openConnection();
            uc.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)");
            // Read the whole response line by line.
            in = new BufferedReader(new InputStreamReader(uc.getInputStream(), "gb2312"));
            String tempCode;
            while ((tempCode = in.readLine()) != null)
            {
                codeBuffer.append(tempCode).append("\n");
            }
        }
        catch (MalformedURLException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Always release the stream, even when an exception fired.
            if (in != null)
            {
                try { in.close(); } catch (IOException e) { /* ignore */ }
            }
        }
        return codeBuffer.toString();
    }

    /**
     * Regex for pulling result entries out of a Google search page:
     * group(2) = link, group(4) = title, group(6) = snippet.
     */
    public static String regex()
    {
        String googleRegex =
                "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";
        return googleRegex;
    }

    /**
     * Runs a fixed Google query and returns a flat list of
     * [url, title, content, url, title, content, ...] entries.
     */
    public static List<String> GetNews()
    {
        List<String> newsList = new ArrayList<String>();
        String allHtmlSource = MyRSS.getHtmlSource(
                "http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-  maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f");
        Pattern pattern = Pattern.compile(regex());
        Matcher matcher = pattern.matcher(allHtmlSource);
        while (matcher.find())
        {
            String urlLink = matcher.group(2);
            String title = stripHighlight(matcher.group(4));
            String content = stripHighlight(matcher.group(6));
            newsList.add(urlLink);
            newsList.add(title);
            newsList.add(content);
        }
        return newsList;
    }

    /** Removes Google's highlight markup from a result fragment. */
    private static String stripHighlight(String s)
    {
        s = s.replaceAll("<font color=CC0033>", "");
        s = s.replaceAll("</font>", "");
        s = s.replaceAll("<b>...</b>", "");
        return s;
    }

    public static void main(String[] args)
    {
        System.out.println(MyRSS
                .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
    }
}

方法三:

jsp自動抓取新聞 自動抓取新聞

package com.news.spider;

import java.io.File;

import java.io.FileFilter;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Calendar;

import java.util.Date;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import com.db.DBAccess;

public class SpiderNewsServer {

public static void main(String[] args) throws Exception{

   //設定抓取資訊的首頁面

   String endPointUrl = "http://cn.china.cn/zixun/";

   //獲得目前時間

   Calendar calendar=Calendar.getInstance();

      SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd");

      String DateNews = sdf.format(calendar.getTime());

   List listNewsType = new ArrayList();

   //取入口頁面html

   WebHtml webHtml = new WebHtml();

   String htmlDocuemtnt1 = webHtml.getWebHtml(endPointUrl);

   if(htmlDocuemtnt1 == null || htmlDocuemtnt1.length() == 0){

    return;

   }

   String strTemp1 = "http://cn.china.cn/article/";

   String strTemp2 = "</li>";

   int stopIndex=0;

   int startIndex=0;

   int dd=0;

   while(true){

    dd++;

    startIndex = htmlDocuemtnt1.indexOf(strTemp1, stopIndex);

    System.out.println("=========="+startIndex);

    stopIndex= htmlDocuemtnt1.indexOf(strTemp2, startIndex);

    System.out.println("==========---------"+stopIndex);

    if(startIndex!=-1 && stopIndex!=-1){

     String companyType=htmlDocuemtnt1.substring(startIndex,stopIndex);

     System.out.println("@@@@@--------"+companyType);

     System.out.println("@@@@@--------"+companyType.indexOf("/""));

     companyType=companyType.substring(0,companyType.indexOf("/""));

     System.out.println("#####--------"+companyType);

     listNewsType.add(companyType);

    }

    if(dd>10){

     break;

    }

    if(stopIndex==-1 || startIndex==-1){

     break;

    }

   }

   System.out.println("listCompanyType====="+listNewsType.size());

   String title="";

     String hometext="";

     String bodytext="";

     String keywords="";

     String counter = "221";

     String cdate= "";

   int begainIndex=0;//檢索字元串的起點索引

   int endIndex=0;//檢索字元串的終點索引

   String begainStr;//檢索開始字元串       

   String endStr;//檢索結束字元串

   for (int rows = 1; rows < listNewsType.size(); rows++) {

    String strNewsDetail = listNewsType.get(rows).toString();

    System.out.println("strNewsDetail====="+strNewsDetail);

    if(strNewsDetail != null && strNewsDetail.length() > 0){

     WebHtml newsListHtml = new WebHtml();

     String htmlDocuemtntCom = newsListHtml.getWebHtml(strNewsDetail);

     System.out.println("$$$$$------"+htmlDocuemtntCom);

     if(htmlDocuemtntCom == null || htmlDocuemtntCom.length() == 0){

      return;

     }

     //截取時間    

     int dateBegainIndex = htmlDocuemtntCom.indexOf("<div>時間:");

     System.out.println("%%%%%--"+dateBegainIndex);

     String newTime = htmlDocuemtntCom.substring(dateBegainIndex,dateBegainIndex+20);

     System.out.println("^^^^^^^^^^^^^^^---"+newTime);

     String newTimeM = newTime.substring(newTime.lastIndexOf("-")+1,newTime.lastIndexOf("-")+3);

     String dateM = DateNews.substring(DateNews.lastIndexOf("-")+1);

     System.out.println("^^^^^^^^^^^^^^^---"+newTimeM);

     System.out.println("^^^^^^^^^^^^^^^---"+dateM);

     if(newTimeM == dateM || newTimeM.equals(dateM)){

      //檢索新聞标題

      begainStr="<div class=/"divCon bg008 /">";       

      endStr="<div>時間:";

      begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);

      System.out.println("&&&&&&------"+begainIndex);

      endIndex=htmlDocuemtntCom.indexOf(endStr,0);

      System.out.println("&&&&&&------"+endIndex);

      if(begainIndex!=-1 && endIndex!=-1){

       title = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();

       title = title.substring(title.indexOf("<h1>")+4,title.indexOf("</h1>"));

       title = title.replace("'", "");

       title = title.replace(";", "");

       title = title.replace(" ", "");

      }

      //檢索新聞内容

      begainStr="<div class=/"divCon bg008 /">";       

      endStr="<!-- page begin -->";

      begainIndex=htmlDocuemtntCom.indexOf(begainStr,0);

      endIndex=htmlDocuemtntCom.indexOf(endStr,0);

      if(begainIndex!=-1 && endIndex!=-1){

       bodytext = htmlDocuemtntCom.substring(begainIndex,endIndex).trim();

       if(bodytext.indexOf("<p>")>0 && bodytext.indexOf("</p>")>bodytext.indexOf("<p>") && bodytext.indexOf("</p>")>0)

        bodytext = bodytext.substring(bodytext.indexOf("<p>")+3,bodytext.indexOf("</p>"));

       bodytext=bodytext.replace("&nbsp;", "");

       bodytext=bodytext.replace("<br>", "");

       bodytext=bodytext.replace("/n", "<br>");

       bodytext=bodytext.replace("'", "");

       bodytext=bodytext.replace(";", "");

      }

      //簡介

      if(bodytext.length()>40)

       hometext = bodytext.substring(0,40)+"......";

      else{

       hometext = bodytext+"......";

      }

      //浏覽量

      String str = String.valueOf(Math.random());

      counter = str.substring(str.lastIndexOf(".")+1,5);

      Calendar cal = Calendar.getInstance();

      cal.setTime(new Date());

      cdate = cal.getTimeInMillis()+"";

      cdate = cdate.substring(0,10);

     }else{

      continue;

     }

    }

    System.out.println("-------------------------"+title);

    System.out.println("-------------------------"+cdate);

    System.out.println("-------------------------"+cdate);

    System.out.println("-------------------------"+hometext);

    System.out.println("-------------------------"+bodytext);

    System.out.println("-------------------------"+keywords);

    System.out.println("-------------------------"+counter);

   }

}

}

package com.news.spider;

import java.net.URL;

import java.net.URLConnection;

import java.io.BufferedReader;

import java.io.InputStreamReader;

public class WebHtml {

    /**
     * Fetches the page at url (decoded with the platform default charset)
     * and returns its source with "\n" appended after every line.
     *
     * @param url page to fetch
     * @return the page source, or "" when the fetch fails
     */
    public String getWebHtml(String url) {
        BufferedReader reader = null;
        try {
            URL myURL = new URL(url);
            URLConnection conn = myURL.openConnection();
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            StringBuffer document = new StringBuffer();
            String line;
            while ((line = reader.readLine()) != null) {
                // The old code appended the two-character literal "/n"
                // instead of a newline.
                document.append(line).append("\n");
            }
            return document.toString();
        } catch (Exception e) {
            // Keep the original best-effort contract ("" on failure), but
            // report the error instead of swallowing it silently.
            e.printStackTrace();
            return "";
        } finally {
            // Always release the stream, even when an exception fired.
            if (reader != null) {
                try { reader.close(); } catch (Exception e) { /* ignore */ }
            }
        }
    }

}

出處:【Gjava人才】

網址: http://www.gjrencai.com

轉載時請注明出處和網址