天天看点

Java 网络爬虫:爬取邮箱地址

package Socket;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal web crawler that downloads a page line by line and collects every
 * e-mail address found in its text.
 */
public class 网络爬虫 {
    /** Page scanned by {@link #getSocketMails()} when the caller supplies no URL. */
    private static final String DEFAULT_PAGE_URL = "http://www.lwlwlw.com/meiwen/38545.html";

    /**
     * E-mail matcher, compiled once and shared: one or more word characters,
     * {@code @}, then a domain made of at least two dot-separated labels.
     * The local part uses {@code \w+} (not {@code \w*}) so that a bare
     * {@code @example.com} with no local part is not reported as an address.
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    /**
     * Prints every e-mail address found on the default page, one per line.
     *
     * @param args unused
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        for (String mail : getSocketMails()) {
            System.out.println(mail);
        }
    }

    /**
     * Collects the e-mail addresses found on the default page.
     *
     * @return the matched addresses in order of appearance (duplicates kept);
     *         empty if none were found
     * @throws IOException if the page cannot be opened or read
     */
    public static List<String> getSocketMails() throws IOException {
        return getSocketMails(DEFAULT_PAGE_URL);
    }

    /**
     * Collects the e-mail addresses found on the given page.
     *
     * <p>NOTE(review): the page is decoded with the platform default charset,
     * exactly as before; confirm whether an explicit charset is wanted.
     *
     * @param pageUrl absolute URL of the page to scan
     * @return the matched addresses in order of appearance (duplicates kept);
     *         empty if none were found
     * @throws IOException if the URL is malformed or the page cannot be opened or read
     */
    public static List<String> getSocketMails(String pageUrl) throws IOException {
        URL url = new URL(pageUrl);
        List<String> mails = new ArrayList<>();
        // try-with-resources closes the stream even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                mails.addAll(findMails(line));
            }
        }
        return mails;
    }

    /**
     * Extracts every e-mail address from a piece of text.
     * Package-private so the matching logic can be tested without network access.
     *
     * @param text text to scan; must not be {@code null}
     * @return the matched addresses in order of appearance; empty if none
     */
    static List<String> findMails(String text) {
        List<String> found = new ArrayList<>();
        Matcher matcher = MAIL_PATTERN.matcher(text);
        while (matcher.find()) {
            found.add(matcher.group());
        }
        return found;
    }
}