
Fetching Website Page Content with Java

I've recently been studying web crawling with Java. I've just gotten started, and I'd like to share what I've learned so far.

Three approaches are given below: the first uses a package provided by Apache, and the other two use classes that ship with Java itself.

The code is as follows:

// Method 1
// This approach uses the package provided by Apache; it is simple and convenient,
// but the following jars must be on the classpath:
//     commons-codec-1.4.jar
//     commons-httpclient-3.1.jar
//     commons-logging-1.0.4.jar
// Required imports (not shown in the original):
//     import org.apache.commons.httpclient.HttpClient;
//     import org.apache.commons.httpclient.methods.PostMethod;

    public static String createhttpClient(String url, String param) {
        HttpClient client = new HttpClient();
        String response = null;
        String keyword = null;
        PostMethod postMethod = new PostMethod(url);
//        try {
//            if (param != null)
//                keyword = new String(param.getBytes("gb2312"), "ISO-8859-1");
//        } catch (UnsupportedEncodingException e1) {
//            e1.printStackTrace();
//        }
        // NameValuePair[] data = { new NameValuePair("keyword", keyword) };
        // // Put the form values into the postMethod
        // postMethod.setRequestBody(data);
        // The commented-out block above fetches with POST parameters; I disabled it
        // myself. Feel free to uncomment it and experiment.
        try {
            int statusCode = client.executeMethod(postMethod);
            response = new String(postMethod.getResponseBodyAsString()
                    .getBytes("ISO-8859-1"), "gb2312"); // note: "gb2312" must match the encoding of the page you are fetching
            String p = response.replaceAll("\\&[a-zA-Z]{1,10};", "")
                    .replaceAll("<[^>]*>", ""); // strip HTML entities and tags from the page
            System.out.println(p);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            postMethod.releaseConnection(); // release the connection when done
        }
        return response;
    }
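Hard-coding "gb2312" in the decode step above is fragile as soon as the target site uses a different encoding. As an aside (my own addition, not part of the original post), commons-httpclient 3.x exposes the charset declared in the response's Content-Type header through getResponseCharSet(), so a variant like this minimal sketch can decode with whatever the server declares:

    // A minimal sketch (my own variant, not from the original post):
    // decode using the charset the server declares instead of hard-coding gb2312.
    // Extra import assumed: org.apache.commons.httpclient.methods.GetMethod
    public static String fetchWithDeclaredCharset(String url) {
        HttpClient client = new HttpClient();
        GetMethod getMethod = new GetMethod(url);
        try {
            client.executeMethod(getMethod);
            // getResponseCharSet() reads the charset from the Content-Type header,
            // falling back to the HTTP default (ISO-8859-1) when none is declared
            String charset = getMethod.getResponseCharSet();
            return new String(getMethod.getResponseBody(), charset);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            getMethod.releaseConnection();
        }
    }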

    // Method 2
    // This approach uses Java's built-in URL class to fetch the site content
    // (imports assumed: java.net.URL, java.net.HttpURLConnection,
    //  java.io.BufferedReader, java.io.InputStreamReader, java.io.OutputStreamWriter)
    public String getPageContent(String strUrl, String strPostRequest,
            int maxLength) {
        // Buffer for the resulting page
        StringBuffer buffer = new StringBuffer();
        System.setProperty("sun.net.client.defaultConnectTimeout", "5000");
        System.setProperty("sun.net.client.defaultReadTimeout", "5000");
        try {
            URL newUrl = new URL(strUrl);
            HttpURLConnection hConnect = (HttpURLConnection) newUrl
                    .openConnection();
            // Extra data for a POST request
            if (strPostRequest.length() > 0) {
                hConnect.setDoOutput(true);
                OutputStreamWriter out = new OutputStreamWriter(hConnect
                        .getOutputStream());
                out.write(strPostRequest);
                out.flush();
                out.close();
            }
            // Read the response body, up to maxLength characters
            BufferedReader rd = new BufferedReader(new InputStreamReader(
                    hConnect.getInputStream()));
            int ch;
            for (int length = 0; (ch = rd.read()) > -1
                    && (maxLength <= 0 || length < maxLength); length++)
                buffer.append((char) ch);
            String s = buffer.toString();
            s = s.replaceAll("\\&[a-zA-Z]{1,10};", "")
                    .replaceAll("<[^>]*>", ""); // replaceAll returns a new string, so the result must be reassigned
            System.out.println(s);
            rd.close();
            hConnect.disconnect();
            return buffer.toString().trim();
        } catch (Exception e) {
            // return "Error: failed to read the page!";
            return null;
        }
    }
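The sun.net.client.* system properties above are JVM-global and undocumented. A minimal sketch (my own variant, assuming Java 5 or later, not part of the original post) that sets the same 5-second limits per connection instead:

    // A minimal sketch (assumes Java 5+): per-connection timeouts instead of
    // the global sun.net.client.* system properties used above
    public String getPageContentWithTimeouts(String strUrl) throws java.io.IOException {
        HttpURLConnection hConnect = (HttpURLConnection) new URL(strUrl).openConnection();
        hConnect.setConnectTimeout(5000); // give up if connecting takes more than 5s
        hConnect.setReadTimeout(5000);    // give up if a read blocks for more than 5s
        BufferedReader rd = new BufferedReader(new InputStreamReader(
                hConnect.getInputStream()));
        StringBuffer buffer = new StringBuffer();
        int ch;
        while ((ch = rd.read()) > -1)
            buffer.append((char) ch);
        rd.close();
        hConnect.disconnect();
        return buffer.toString().trim();
    }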

Then write a test class:

    public static void main(String[] args) {
        String url = "http://www.renren.com";
        String keyword = "人人";
        createhttpClient p = new createhttpClient();
        String response = p.createhttpClient(url, keyword); // Method 1
        // p.getPageContent(url, "post", 100500); // Method 2
    }

Now take a look at the console: the page content should be printed there.

Method 3:

import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;

public class GetUrlToHtml {
    public static void main(String[] args) {
        InputStream in = null;
        OutputStream out = null;
        try {
            if ((args.length != 1) && (args.length != 2))
                throw new IllegalArgumentException("Wrong number of args");
            URL url = new URL(args[0]);
            in = url.openStream();
            if (args.length == 2)
                out = new FileOutputStream(args[1]);
            else
                out = System.out;
            // Copy the page to the output in 4 KB chunks
            byte[] buffer = new byte[4096];
            int bytes_read;
            while ((bytes_read = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytes_read);
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("Usage: java GetUrlToHtml <URL> [<filename>]");
        } finally {
            try {
                if (in != null) in.close();
                if (out != null && out != System.out) out.close(); // don't close System.out
            } catch (Exception e) {}
        }
    }
}
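Compile it and run it from the command line; the URL and output file name below are just examples:

    javac GetUrlToHtml.java
    java GetUrlToHtml http://www.renren.com                (prints the page to the console)
    java GetUrlToHtml http://www.renren.com page.html      (saves the page to page.html)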