天天看點

14、go語言:單任務版爬蟲

1、獲得初始頁面内容:

擷取并列印所有城市第一頁使用者的詳細資訊

func main(){
	resp,err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	
	if resp.StatusCode != http.StatusOk {
		fmt.Println("Error: status code",resp.StatusCode)
		return
	}
	//go語言是utf-8編碼,網頁如果是gbk,需要解決
	//需要下載下傳兩個包
	//gopm get -g -v golang.org/x/text //也是官方的庫但是不在标準庫中
	utf8Reader := transform.NewReader(resp.Body,simplifiedchinese.GBK.NewDecoder()) //通用性較差
	//all,err := ioutil.ReadAll(resp.Body)
	all,err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n",all)
}
           

需要根據網頁自動識别編碼,就需要另外一個庫

gopm get -g -v golang.org/x/net/html

// determineEncoding guesses the character encoding of the document read from
// r by inspecting up to its first 1024 bytes.
//
// The bytes are peeked through a bufio.Reader. bufio.NewReader returns its
// argument unchanged when r is already a *bufio.Reader with a large enough
// buffer, so callers that pass one (and keep reading from that same reader)
// lose no data. Passing any other reader leaves the peeked bytes stranded in
// a private buffer.
//
// It panics on read errors other than io.EOF.
func determineEncoding(r io.Reader) encoding.Encoding {
	// Peek returns a []byte without advancing the reader. For documents
	// shorter than 1024 bytes it returns what is available together with
	// io.EOF, which is not a failure here.
	head, err := bufio.NewReader(r).Peek(1024)
	if err != nil && err != io.EOF {
		panic(err)
	}
	// The empty second argument means no Content-Type hint is supplied.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

func main(){
	resp,err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	
	if resp.StatusCode != http.StatusOk {
		fmt.Println("Error: status code",resp.StatusCode)
		return
	}
	
	//go語言是utf-8編碼,網頁如果是gbk,需要解決
	//需要下載下傳兩個包
	//gopm get -g -v golang.org/x/text //也是官方的庫但是不在标準庫中
	//utf8Reader := transform.NewReader(resp.Body,simplifiedchinese.GBK.NewDecoder()) //通用性較差
	e :=  determineEncoding(resp.Body)
	utf8Reader := transform.NewReader(resp.Body,e.NewDecoder()) 
	all,err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n",all)
}
           

2、正規表達式:

擷取城市名稱和連結

1、CSS選擇器(美元号+css屬性)

$('#cityList>dd>a')

2、使用Xpath

3、使用正規表達式(适用性弱一些)

正規表達式擷取郵箱賬号

// text is the sample input searched for an e-mail address. The address is a
// placeholder: the one in the published snippet had been replaced by an
// e-mail-protection marker and could not match any pattern.
const text = "My email is user@example.com"

// main prints the first e-mail address found in text.
func main() {
	// Alternatives tried on the way to the final pattern:
	//   regexp.Compile("user@example.com")      returns (re, err)
	//   regexp.MustCompile("user@example.com")  no error value to handle
	//   regexp.MustCompile(".+@.+\\..+")        interpreted string: the backslash must be escaped
	//   regexp.MustCompile(`.+@.+\..+`)         raw string: no escaping needed
	//   regexp.MustCompile(`[a-z0-9A-Z]+@.+\..+`)
	//       still too greedy: a trailing "@abc.com" in the text would be
	//       swallowed into the match as well
	re := regexp.MustCompile(`[a-z0-9A-Z]+@[a-z0-9A-Z]+\.[a-z0-9A-Z]+`)
	match := re.FindString(text)
	fmt.Println(match)
}

// text is a multi-line sample containing several e-mail addresses. The
// addresses are placeholders: the ones in the published snippet had been
// replaced by an e-mail-protection marker and could not match any pattern.
const text = `
My email is user@example.com
email is admin@mail.example.org
`

// main prints every e-mail address found in text together with its captured
// parts, one match per line.
func main() {
	// Each parenthesised group is captured separately: the local part, the
	// first domain label, and the remaining dotted suffix (the character
	// class of the last group includes '.', so it can span several labels).
	re := regexp.MustCompile(`([a-z0-9A-Z]+)@([a-z0-9A-Z]+)(\.[a-z0-9A-Z.]+)`)

	// FindString would stop at the first match and FindAllString would
	// return whole matches only; FindAllStringSubmatch also returns the
	// groups, as [whole, group1, group2, group3].
	match := re.FindAllStringSubmatch(text, -1)
	for _, m := range match {
		fmt.Println(m)
	}
}
           

3、提取城市和URL:

// printCityList prints, one per line, every anchor element in contents whose
// href points at a city page under http://www.zhenai.com/zhenghun/.
//
// contents is the HTML of the city-list page, already decoded to UTF-8.
func printCityList(contents []byte) {
	// A sample anchor from the page:
	//   <a href="http://www.zhenai.com/zhenghun/yuxi" class="">玉溪</a>
	// Using `.*>` for the rest of the tag is too greedy: it can run on to
	// the last '>' of the document. `[^>]*` stops at the end of the opening
	// tag and `[^<]+` stops at the start of the closing tag.
	//
	// The pattern does not spell out any attribute after href, so it matches
	// whatever attributes the anchor carries.
	re := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`)
	matches := re.FindAll(contents, -1)
	for _, m := range matches {
		fmt.Printf("%s\n", m)
	}
}

//-------------------

// printCityList is the city-list parser: for every city anchor in contents
// it prints the city name and the URL of the city page.
//
// contents is the HTML of the city-list page, already decoded to UTF-8.
func printCityList(contents []byte) {
	// Group 1 captures the city URL, group 2 the city name. `[^>]*` skips
	// any attributes after href without running past the opening tag, and
	// `[^<]+` takes the link text up to the closing tag.
	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)
	matches := re.FindAllSubmatch(contents, -1)
	for _, m := range matches {
		// m[0] is the whole anchor; m[1] and m[2] are the groups.
		fmt.Printf("City: %s , URL:%s\n", m[2], m[1])
	}
}

func main(){
	resp,err := http.Get("http://www.zhenai.com/zhenghun")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	
	if resp.StatusCode != http.StatusOk {
		fmt.Println("Error: status code",resp.StatusCode)
		return
	}
	e :=  determineEncoding(resp.Body)
	utf8Reader := transform.NewReader(resp.Body,e.NewDecoder()) 
	all,err := ioutil.ReadAll(utf8Reader)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n",all)
}
           

4、單任務版爬蟲的架構:

還需要城市解析器、使用者解析器

使用者解析器的輸出就是我們要的結果

解析器Parser

輸入:utf-8編碼的文本

輸出:Request{URL,對應Parser}清單,Item清單

單機版爬蟲架構:

Seed ->(request) Engine <->任務隊列

Engine ->(URL) Fetcher -> (text) Engine

Engine -> (text) Parser -> (requests,items) Engine

5、Engine和Parser:

//------
package fetcher
// determineEncoding guesses the character encoding of the document read from
// r by inspecting up to its first 1024 bytes.
//
// The bytes are peeked through a bufio.Reader. bufio.NewReader returns its
// argument unchanged when r is already a *bufio.Reader with a large enough
// buffer, so callers that pass one (and keep reading from that same reader)
// lose no data.
//
// On a read error other than io.EOF the error is logged and unicode.UTF8 is
// returned as the default.
func determineEncoding(r io.Reader) encoding.Encoding {
	// Peek returns a []byte without advancing the reader. For documents
	// shorter than 1024 bytes it returns what is available together with
	// io.EOF; those bytes are still good enough to sniff.
	head, err := bufio.NewReader(r).Peek(1024)
	if err != nil && err != io.EOF {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8 // fall back to a default encoding
	}
	// The empty second argument means no Content-Type hint is supplied.
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}

func Fetch(url string) ([]byte,error){
	resp,err := http.Get(url)
	if err != nil {
		return nil,err
	}
	defer resp.Body.Close()
	
	if resp.StatusCode != http.StatusOk {
		return nil,fmt.Errorf("Error: status code: %d",resp.StatusCode)
	}
	e :=  determineEncoding(resp.Body)
	utf8Reader := transform.NewReader(resp.Body,e.NewDecoder()) 
	return all,err := ioutil.ReadAll(utf8Reader)
}
           

//-----------------封裝Engine--------------------------

//-------
type.go
package engine

// Request is one unit of crawl work: a URL to fetch and the parser to run on
// the fetched content.
type Request struct {
	// Url is the address to fetch.
	Url string
	// ParserFunc receives the fetched bytes and returns what it extracted.
	ParserFunc func([]byte) ParseResult
}

// ParseResult is the output of a parser: follow-up requests to enqueue and
// the items extracted from the page.
type ParseResult struct {
	Requests []Request
	Items    []interface{}
}

// NilParser is a placeholder parser that ignores its input and returns an
// empty ParseResult. It exists so that requests can flow through the
// pipeline before a real parser has been written for them.
func NilParser([]byte) ParseResult {
	return ParseResult{}
}
           
engine.go

// Run crawls starting from the given seed requests and returns when the
// request queue is empty.
//
// Requests are processed first-in, first-out. For each one the URL is
// fetched, the request's ParserFunc is applied to the body, the requests it
// returns are appended to the queue and the items it returns are logged. A
// request whose fetch fails is logged and skipped.
func Run(seeds ...Request) {
	// Copy the seeds into the queue. This must assign to the queue itself:
	// declaring with := inside a loop would create a new variable on each
	// iteration and leave the queue empty.
	var requests []Request
	requests = append(requests, seeds...)

	for len(requests) > 0 {
		// Pop the head of the queue.
		r := requests[0]
		requests = requests[1:]

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher: error fetching url %s:%v", r.Url, err)
			continue
		}

		parseResult := r.ParserFunc(body)
		// The trailing ... spreads the slice into individual arguments,
		// appending every new request in one call.
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			// %v prints the value in its default format, whatever its type.
			log.Printf("Got item %v", item)
		}
	}
}
           

-----------------封裝Parser--------------------------

// cityListRe matches one city anchor on the city-list page. Group 1 captures
// the city URL and group 2 the city name. `[^>]*` skips any attributes after
// href without running past the opening tag, and `[^<]+` takes the link text
// up to the closing tag.
const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

// cityListPattern is cityListRe compiled once, so ParseCityList does not
// recompile it on every call.
var cityListPattern = regexp.MustCompile(cityListRe)

// ParseCityList extracts every city from the city-list page in contents.
//
// For each city the result holds the city name as an item and a request for
// the city page. The requests use engine.NilParser because no city-page
// parser is wired in here.
func ParseCityList(contents []byte) engine.ParseResult {
	matches := cityListPattern.FindAllSubmatch(contents, -1)

	result := engine.ParseResult{}
	for _, m := range matches {
		result.Items = append(result.Items, string(m[2])) // city name
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: engine.NilParser,
		})
	}
	return result
}

// main starts the crawler with a single seed: the city-list page, parsed by
// parser.ParseCityList.
func main() {
	engine.Run(engine.Request{
		Url: "http://www.zhenai.com/zhenghun",
		// The field declared on engine.Request is ParserFunc, not ParseFunc.
		ParserFunc: parser.ParseCityList,
	})
}