
14. Go: a single-task crawler

1. Fetching the initial page content:

Fetch and print the detailed info of the users on the first page of every city.

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"

	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/transform"
)

func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: status code", resp.StatusCode)
		return
	}
	// Go strings are UTF-8; if the page is GBK it has to be converted.
	// Two extra packages are needed; they are official but not in the standard library:
	// gopm get -g -v golang.org/x/text
	utf8Reader := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder()) // hard-codes GBK, so not very general
	//all, err := ioutil.ReadAll(resp.Body)
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", all)
}
           

To detect the page encoding automatically we need one more library:

gopm get -g -v golang.org/x/net/html

func determineEncoding(r *bufio.Reader) encoding.Encoding {
	// Peek returns the first 1024 bytes without consuming them; taking a
	// *bufio.Reader lets the caller keep reading the same stream afterwards.
	bytes, err := r.Peek(1024)
	if err != nil {
		panic(err)
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}

func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: status code", resp.StatusCode)
		return
	}

	// Go strings are UTF-8; other page encodings have to be converted.
	//utf8Reader := transform.NewReader(resp.Body, simplifiedchinese.GBK.NewDecoder()) // hard-codes GBK, so not very general
	// Wrap the body in a single bufio.Reader and reuse it, so the bytes peeked
	// by determineEncoding are not lost before decoding.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", all)
}
           

2. Regular expressions:

Getting the city names and links; three common options:

1. CSS selectors (the $ function plus a CSS selector); a goquery sketch follows this list

$('#cityList>dd>a')

2. XPath

3. Regular expressions (less robust, but no extra dependency)
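
For comparison, here is a minimal sketch of option 1 using the third-party goquery library (an assumption of these notes for illustration; it is not used elsewhere in this course). The #cityList>dd>a selector is the one quoted above, and the encoding handling from section 1 is left out for brevity.

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/PuerkitoBio/goquery"
)

func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// goquery parses the HTML and exposes CSS-selector queries over it.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Every <a> under a <dd> inside #cityList, as in the selector above.
	doc.Find("#cityList > dd > a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		fmt.Printf("City: %s, URL: %s\n", s.Text(), href)
	})
}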

Extracting an email address with a regular expression:

const text = "My email is someone@example.com" // placeholder address

func main() {
	//re, err := regexp.Compile(...)   // Compile returns an error
	//re := regexp.MustCompile(...)    // MustCompile is for patterns known not to fail
	//re := regexp.MustCompile(".+@.+\\..+") // inside double quotes, \ has to be escaped
	//re := regexp.MustCompile(`.+@.+\..+`)  // inside backquotes (raw string) no \ escaping is needed
	//re := regexp.MustCompile(`[a-z0-9A-Z]+@.+\..+`) // too greedy: a later @ in the text (e.g. ...@def@abc.com) gets swallowed by .+
	re := regexp.MustCompile(`[a-z0-9A-Z]+@[a-z0-9A-Z]+\.[a-z0-9A-Z]+`)
	match := re.FindString(text) // first match only
	fmt.Println(match)
}

//match several addresses in a multi-line string
const text = `
My email is someone@example.com
email is another@example.com.cn
` // placeholder addresses

func main() {
	//re := regexp.MustCompile(`[a-z0-9A-Z]+@[a-z0-9A-Z]+\.[a-z0-9A-Z]+`)
	//match := re.FindString(text)
	re := regexp.MustCompile(`([a-z0-9A-Z]+)@([a-z0-9A-Z]+)(\.[a-z0-9A-Z.]+)`) // capture groups to extract the parts
	//match := re.FindAllString(text, -1) // returns whole matches only, no groups
	match := re.FindAllStringSubmatch(text, -1)
	for _, m := range match {
		// m[0] is the whole match, m[1..3] are the user, host and domain groups
		fmt.Println(m)
	}
}
           

3. Extracting cities and URLs:

func printCityList(contents []byte) {
	//regexp.Compile(`<a href="http://www.zhenai.com/zhenghun/yuxi" class="">玉溪</a>`) // matches one literal city link only
	//regexp.Compile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+".*>玉溪</a>`)   // .* is greedy and may match all the way to the last >
	re := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`) // [^>] / [^<]: anything except angle brackets
	matches := re.FindAll(contents, -1)
	for _, m := range matches {
		fmt.Printf("%s\n", m)
	}
}

//-------------------
//city-list parser: extract both the city name and its URL
func printCityList(contents []byte) {
	//re := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`)
	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`) // capture the URL and the city name
	matches := re.FindAllSubmatch(contents, -1)
	for _, m := range matches {
		fmt.Printf("City: %s, URL: %s\n", m[2], m[1])
	}
}

func main() {
	resp, err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		fmt.Println("Error: status code", resp.StatusCode)
		return
	}
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	all, err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	printCityList(all)
}
           

4. Architecture of the single-task crawler:

We still need a city parser and a user (profile) parser.

The output of the user parser is the final result we want.

The Parser:

Input: UTF-8 encoded text

Output: a list of Request{URL, corresponding Parser} and a list of Items

Single-machine crawler architecture:

Seed ->(request) Engine <-> task queue

Engine ->(URL) Fetcher -> (text) Engine

Engine -> (text) Parser -> (requests,items) Engine
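
Written out as code, the diagram becomes the loop below. This is only a toy, self-contained model with a stub fetch and a stub parser (the placeholder city URL and the simplified Items type are assumptions); the real implementation is the Run function in section 5.

package main

import "fmt"

// Request pairs a URL with the parser for that page, as in the diagram.
type Request struct {
	Url        string
	ParserFunc func(text string) ParseResult
}

// ParseResult is what a parser hands back to the engine.
type ParseResult struct {
	Requests []Request
	Items    []string
}

// fetch stands in for the real Fetcher: URL in, page text out.
func fetch(url string) string { return "<page of " + url + ">" }

func main() {
	// Seed ->(request) Engine; the slice plays the role of the task queue.
	requests := []Request{{
		Url: "http://www.zhenai.com/zhenghun",
		ParserFunc: func(text string) ParseResult {
			// Stub "city list parser": pretend we found one city page and one item.
			return ParseResult{
				Requests: []Request{{
					Url:        "http://www.zhenai.com/zhenghun/somecity", // placeholder
					ParserFunc: func(string) ParseResult { return ParseResult{} }, // NilParser
				}},
				Items: []string{"somecity"},
			}
		},
	}}

	for len(requests) > 0 {
		r := requests[0] // Engine <-> task queue
		requests = requests[1:]

		text := fetch(r.Url)         // Engine ->(URL) Fetcher ->(text) Engine
		result := r.ParserFunc(text) // Engine ->(text) Parser ->(requests, items) Engine
		requests = append(requests, result.Requests...)

		for _, item := range result.Items {
			fmt.Println("Got item", item)
		}
	}
}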

5. Engine and Parser:

//------
package fetcher

import (
	"bufio"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"

	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

func determineEncoding(r *bufio.Reader) encoding.Encoding {
	bytes, err := r.Peek(1024) // Peek returns a []byte without consuming it
	if err != nil {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8 // fall back to a default encoding instead of panicking
	}
	e, _, _ := charset.DetermineEncoding(bytes, "")
	return e
}

func Fetch(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("wrong status code: %d", resp.StatusCode)
	}

	// Share one buffered reader between encoding detection and decoding,
	// so the bytes peeked by determineEncoding are not lost.
	bodyReader := bufio.NewReader(resp.Body)
	e := determineEncoding(bodyReader)
	utf8Reader := transform.NewReader(bodyReader, e.NewDecoder())
	return ioutil.ReadAll(utf8Reader)
}
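
A quick way to exercise the fetcher on its own; the import path crawler/fetcher is a hypothetical location for the package above, not something given in these notes.

package main

import (
	"fmt"

	"crawler/fetcher" // hypothetical import path for the fetcher package above
)

func main() {
	body, err := fetcher.Fetch("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", body)
}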
           

//----------------- Packaging the Engine --------------------------

//-------
type.go
package engine

type Request struct {
	Url        string
	ParserFunc func([]byte) ParseResult
}

type ParseResult struct {
	Requests []Request
	Items    []interface{}
}

// NilParser parses nothing; it exists so a Request can flow through the
// pipeline without producing new requests or items.
func NilParser([]byte) ParseResult {
	return ParseResult{}
}
           
engine.go

// Run feeds any number of seed requests into the engine and keeps
// crawling until the task queue is empty.
func Run(seeds ...Request) {
	var requests []Request
	for _, r := range seeds {
		requests = append(requests, r)
	}
	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher: error fetching url %s: %v", r.Url, err)
			continue
		}

		parseResult := r.ParserFunc(body)
		// ... expands the slice so each element is appended individually;
		// otherwise we would have to write
		// append(requests, parseResult.Requests[0], parseResult.Requests[1], ...)
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			log.Printf("Got item %v", item) // %v prints numbers and strings alike in their default format
		}
	}
}
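
As a standalone illustration of the ... expansion used in the append call above:

package main

import "fmt"

func main() {
	a := []int{1, 2}
	b := []int{3, 4, 5}

	// b... expands the slice so each element is appended individually,
	// equivalent to append(a, 3, 4, 5).
	a = append(a, b...)

	fmt.Println(a) // [1 2 3 4 5]
}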
           

----------------- Packaging the Parser --------------------------

const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

func ParseCityList(contents []byte) engine.ParseResult {
	re := regexp.MustCompile(cityListRe) // [^>] / [^<]: anything except angle brackets
	matches := re.FindAllSubmatch(contents, -1)

	result := engine.ParseResult{}
	for _, m := range matches {
		result.Items = append(result.Items, string(m[2])) // the city name goes out as an Item
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: engine.NilParser, // city pages are not parsed yet
		})
	}
	return result
}

//rewrite main so it only seeds the engine
func main() {
	engine.Run(engine.Request{
		Url:        "http://www.zhenai.com/zhenghun",
		ParserFunc: parser.ParseCityList,
	})
}
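
Since every city Request still carries engine.NilParser, this run only logs the city names; the city and user parsers mentioned in section 4 are the next step.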