天天看点

使用代理IP网络爬虫的三种方式

import java.io.IOException;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.SocketAddress;
import java.net.URL;
import java.net.URLConnection;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SocketChannel;
import java.nio.charset.Charset;
import java.util.Scanner;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLSession;

/**
 * Demonstrates three ways of sending a request through an HTTP proxy:
 * an explicit {@link Proxy} object ({@link #test1}), JVM-wide proxy system
 * properties ({@link #test2}) and a hand-written request over a raw socket
 * ({@link #test3}).
 *
 * <p>WARNING: {@link #IngoreSSL()} disables certificate and host name
 * verification for every {@link HttpsURLConnection} in the JVM. That is
 * acceptable for a throw-away experiment only.
 */
public class Tester {
    // NOTE(review): the numeric literals for the proxy port, both timeouts and
    // the buffer size were missing from the original listing. The values below
    // are placeholders chosen to make the class compile - adjust them to your
    // proxy and network.
    private static final int DEFAULT_PROXY_PORT = 8080;
    private static final int CONNECT_TIMEOUT_MILLIS = 5000;
    private static final int READ_TIMEOUT_MILLIS = 5000;
    private static final int BUFFER_SIZE = 1024;

    /** Charset used for every byte/char conversion in this class. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Runs one of the proxy demos against a sample URL and prints any failure
     * to standard error.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String uri = "https://fpcy.hb-n-tax.gov.cn:443/WebQuery/yzmQuery";
        // Alternative plain-HTTP target:
        //String uri = "http://pv.sohu.com/cityjson?ie=utf-8";

        String host = "52.183.30.241";
        int port = DEFAULT_PROXY_PORT;

        try {
            test1(uri, host, port);
            //test2(uri, host, port);
            //test3(uri, host, port);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches {@code uri} through an explicit HTTP {@link Proxy} and prints the
     * response body.
     *
     * @param uri  absolute URL to fetch
     * @param host proxy host name or IP address
     * @param port proxy port
     * @throws Exception if the SSL setup, the connection or the read fails
     */
    static void test1(String uri, String host, Integer port) throws Exception {
        SocketAddress addr = new InetSocketAddress(host, port);
        Proxy proxy = new Proxy(Proxy.Type.HTTP, addr);
        // Skip HTTPS certificate checks; must happen before openConnection.
        IngoreSSL();
        URLConnection connection = new URL(uri).openConnection(proxy);
        applyTimeouts(connection);
        show(connection.getInputStream());
    }

    /**
     * Fetches {@code uri} after configuring the proxy through JVM-wide system
     * properties, then prints the response body.
     *
     * <p>The properties stay set after this method returns, so they affect
     * later connections in the same JVM. The original author notes that the
     * local IP is used when the proxy fails (not verified here).
     *
     * @param uri  absolute URL to fetch
     * @param host proxy host name or IP address
     * @param port proxy port
     * @throws Exception if the SSL setup, the connection or the read fails
     */
    static void test2(String uri, String host, Integer port) throws Exception {
        String portText = String.valueOf(port);
        System.setProperty("http.proxySet", "true");
        System.setProperty("http.proxyHost", host);
        System.setProperty("http.proxyPort", portText);
        // https:// URLs are routed by a separate pair of properties; without
        // them an HTTPS request would not go through the proxy at all.
        System.setProperty("https.proxyHost", host);
        System.setProperty("https.proxyPort", portText);
        IngoreSSL();
        URLConnection connection = new URL(uri).openConnection();
        applyTimeouts(connection);
        show(connection.getInputStream());
    }

    /**
     * Connects to the proxy with a raw socket, sends a minimal HTTP/1.0 GET
     * for {@code uri} and prints everything the proxy sends back (status line
     * and headers included), one chunk per line.
     *
     * <p>NOTE(review): the request is sent in clear text, so this only makes
     * sense for {@code http://} targets. Bytes are decoded chunk by chunk, so
     * a multi-byte character that straddles two reads is printed garbled.
     *
     * @param uri  absolute URL to request
     * @param host proxy host name or IP address
     * @param port proxy port
     * @throws IOException if {@code uri} is malformed or the connection,
     *                     write or read fails or times out
     */
    static void test3(String uri, String host, Integer port) throws IOException {
        URL target = new URL(uri);
        String hostHeader = target.getPort() == -1
                ? target.getHost()
                : target.getHost() + ":" + target.getPort();

        SocketChannel sc = SocketChannel.open();
        try {
            // Bound both the connect and every read so a dead proxy cannot hang us.
            sc.socket().connect(new InetSocketAddress(host, port), CONNECT_TIMEOUT_MILLIS);
            sc.socket().setSoTimeout(READ_TIMEOUT_MILLIS);

            // A request line needs an HTTP version; HTTP/1.0 makes the peer
            // close the connection after the response, which ends the loop below.
            String request = "GET " + uri + " HTTP/1.0\r\n"
                    + "Host: " + hostHeader + "\r\n"
                    + "\r\n";
            ByteBuffer outgoing = UTF_8.encode(request);
            while (outgoing.hasRemaining()) {
                sc.write(outgoing);
            }

            ByteBuffer buffer = ByteBuffer.allocate(BUFFER_SIZE);
            // Read through the socket's stream so the read timeout set above applies.
            InputStream is = sc.socket().getInputStream();
            ReadableByteChannel readCh = Channels.newChannel(is);
            while (readCh.read(buffer) != -1) {
                buffer.flip();
                System.out.println(UTF_8.decode(buffer));
                buffer.clear();
            }
        } finally {
            sc.close();
        }
    }

    /**
     * Reads {@code in} to the end as UTF-8 text and prints it on a single
     * line: line terminators inside the input are dropped. The stream is
     * closed before returning.
     *
     * @param in stream to drain and close
     * @throws IOException if reading the stream fails
     */
    static void show(InputStream in) throws IOException {
        Scanner cin = new Scanner(in, "UTF-8");
        try {
            StringBuilder builder = new StringBuilder();
            while (cin.hasNext()) {
                builder.append(cin.nextLine());
            }
            // Scanner hides read errors and just reports "no more input";
            // surface them instead of printing a truncated body.
            IOException readFailure = cin.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
            System.out.println(builder.toString());
        } finally {
            cin.close();
        }
    }

    /**
     * Disables HTTPS certificate and host name verification for all
     * {@link HttpsURLConnection}s. Must be called before
     * {@code openConnection}.
     *
     * @throws Exception if the trust-all SSL context cannot be created
     */
    static void IngoreSSL() throws Exception {
        trustAllHttpsCertificates();
        HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                return true;
            }
        };
        HttpsURLConnection.setDefaultHostnameVerifier(hv);
    }

    /**
     * Installs, as the JVM-wide default for {@link HttpsURLConnection}, a
     * socket factory whose trust manager accepts every certificate.
     *
     * @throws Exception if the SSL context cannot be obtained or initialised
     */
    private static void trustAllHttpsCertificates() throws Exception {
        javax.net.ssl.TrustManager[] trustAllCerts = new javax.net.ssl.TrustManager[] { new miTM() };
        javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext.getInstance("TLS");
        sc.init(null, trustAllCerts, null);
        javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
    }

    /** Applies the shared connect and read timeouts to {@code connection}. */
    private static void applyTimeouts(URLConnection connection) {
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
    }

    /** Trust manager that accepts every certificate chain without checking it. */
    static class miTM implements javax.net.ssl.TrustManager, javax.net.ssl.X509TrustManager {
        /** @return an empty array; the interface contract forbids {@code null} */
        @Override
        public java.security.cert.X509Certificate[] getAcceptedIssuers() {
            return new java.security.cert.X509Certificate[0];
        }

        /** Not part of {@code X509TrustManager}; always returns {@code true}. */
        public boolean isServerTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Not part of {@code X509TrustManager}; always returns {@code true}. */
        public boolean isClientTrusted(java.security.cert.X509Certificate[] certs) {
            return true;
        }

        /** Accepts any server certificate chain: never throws. */
        @Override
        public void checkServerTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException {
            // Intentionally empty.
        }

        /** Accepts any client certificate chain: never throws. */
        @Override
        public void checkClientTrusted(java.security.cert.X509Certificate[] certs, String authType) throws java.security.cert.CertificateException {
            // Intentionally empty.
        }
    }
}
           

继续阅读