1、獲得初始頁面内容:
擷取并列印所有城市第一頁使用者的詳細資訊
func main(){
resp,err := http.Get("http://www.zhenai.com/zhenghun")
if err != nil {
panic(err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOk {
fmt.Println("Error: status code",resp.StatusCode)
return
}
//go語言是utf-8編碼,網頁如果是gbk,需要解決
//需要下載下傳兩個包
//gopm get -g -v golang.org/x/text //也是官方的庫但是不在标準庫中
utf8Reader := transform.NewReader(resp.Body,simplifiedchinese.GBK.NewDecoder()) //通用性較差
//all,err := ioutil.ReadAll(resp.Body)
all,err := ioutil.ReadAll(utf8Reader)
if err != nil {
panic(err)
}
fmt.Printf("%s\n",all)
}
若要根據網頁內容自動識別編碼,需要另外安裝一個庫(charset 套件位於 golang.org/x/net/html/charset):
gopm get -g -v golang.org/x/net/html
// determineEncoding guesses the character encoding of the data readable from
// r by passing up to its first 1024 bytes to charset.DetermineEncoding.
//
// The bytes are obtained with Peek on bufio.NewReader(r). Per the bufio
// documentation, NewReader returns r itself when r is already a *bufio.Reader
// with a large enough buffer, so in that case the peeked bytes remain
// available to the caller. For any other reader the temporary wrapper is
// dropped on return and whatever it buffered is lost to later reads from r;
// callers should therefore pass a *bufio.Reader and keep reading from it.
//
// It panics if peeking fails for any reason other than the input being
// shorter than 1024 bytes.
func determineEncoding(r io.Reader) encoding.Encoding {
	// Peek returns a []byte without advancing the reader. The local is not
	// named "bytes" so it cannot shadow the standard bytes package.
	peeked, err := bufio.NewReader(r).Peek(1024)
	// io.EOF only means the input is shorter than 1024 bytes; the partial
	// prefix is still usable for detection.
	if err != nil && err != io.EOF {
		panic(err)
	}
	e, _, _ := charset.DetermineEncoding(peeked, "")
	return e
}
func main(){
resp,err := http.Get("http://www.zhenai.com/zhenghun")
if err != nil {
panic(err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOk {
fmt.Println("Error: status code",resp.StatusCode)
return
}
//go語言是utf-8編碼,網頁如果是gbk,需要解決
//需要下載下傳兩個包
//gopm get -g -v golang.org/x/text //也是官方的庫但是不在标準庫中
//utf8Reader := transform.NewReader(resp.Body,simplifiedchinese.GBK.NewDecoder()) //通用性較差
e := determineEncoding(resp.Body)
utf8Reader := transform.NewReader(resp.Body,e.NewDecoder())
all,err := ioutil.ReadAll(utf8Reader)
if err != nil {
panic(err)
}
fmt.Printf("%s\n",all)
}
2、正規表達式:
擷取城市名稱和連結
1、CSS選擇器(美元号+css屬性)
$('#cityList>dd>a')
2、使用Xpath
3、使用正規表達式(适用性弱一些)
正規表達式擷取郵箱賬号
// text is the sample input searched by main. The address is a placeholder.
const text = "My email is user@example.com"

// main prints the first e-mail address found in text.
func main() {
	// Steps tried while building up the pattern:
	//
	//	re, err := regexp.Compile("user@example.com") // returns an error that must be checked
	//	re := regexp.MustCompile("user@example.com")  // panics instead; fine for a pattern known to be valid
	//	re := regexp.MustCompile(".+@.+\\..+")        // in a double-quoted string the backslash must itself be escaped
	//	re := regexp.MustCompile(`.+@.+\..+`)         // a back-quoted string needs no escaping
	//	re := regexp.MustCompile(`[a-z0-9A-Z]+@.+\..+`)
	//
	// The last attempt still uses ".+" after the "@", so any further "@..."
	// text is swallowed too, e.g. "user@example.com@abc.com" matches whole.
	re := regexp.MustCompile(`[a-z0-9A-Z]+@[a-z0-9A-Z]+\.[a-z0-9A-Z]+`)
	match := re.FindString(text)
	fmt.Println(match)
}
// text is a multi-line sample with one e-mail address per line. The
// addresses are placeholders.
const text = `
My email is alice@example.com
email is bob@example.com.cn
`

// main prints every e-mail address found in text together with its captured
// parts: the user name, the first domain label, and the rest of the domain.
func main() {
	// Without capture groups only whole addresses are available:
	//
	//	re := regexp.MustCompile(`[a-z0-9A-Z]+@[a-z0-9A-Z]+\.[a-z0-9A-Z]+`)
	//	match := re.FindString(text)
	//
	// Parentheses mark the parts to extract.
	re := regexp.MustCompile(`([a-z0-9A-Z]+)@([a-z0-9A-Z]+)(\.[a-z0-9A-Z.]+)`)
	// match := re.FindAllString(text, -1) // whole matches only, no extraction
	match := re.FindAllStringSubmatch(text, -1)
	for _, m := range match {
		fmt.Println(m)
	}
}
3、提取城市和URL:
// printCityList prints, one per line, every anchor element in contents whose
// href points at a zhenai.com city page.
func printCityList(contents []byte) {
	// How the pattern was arrived at:
	//   - Spelling out one literal link (.../zhenghun/yuxi ...>玉溪</a>) matches
	//     only that city.
	//   - Using ".*" for the remaining attributes is too greedy: it may run
	//     on to the last '>' in the input.
	//   - "[^>]*" stops at the first '>' and "[^<]+" takes the link text up
	//     to the next '<'.
	re := regexp.MustCompile(`<a href="http://www.zhenai.com/zhenghun/[0-9a-z]+"[^>]*>[^<]+</a>`)
	matches := re.FindAll(contents, -1)
	for _, m := range matches {
		fmt.Printf("%s\n", m)
	}
}
//-------------------
// printCityList is the city-list parser: for every zhenai.com city link in
// contents it prints the city name and the link URL on one line.
func printCityList(contents []byte) {
	// Group 1 captures the URL and group 2 the link text. "[^>]*" skips the
	// remaining attributes up to the first '>', and "[^<]+" takes the text
	// up to the next '<'.
	re := regexp.MustCompile(`<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`)
	matches := re.FindAllSubmatch(contents, -1)
	for _, m := range matches {
		fmt.Printf("City: %s , URL:%s\n", m[2], m[1])
	}
}
func main(){
resp,err := http.Get("http://www.zhenai.com/zhenghun")
if err != nil {
panic(err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOk {
fmt.Println("Error: status code",resp.StatusCode)
return
}
e := determineEncoding(resp.Body)
utf8Reader := transform.NewReader(resp.Body,e.NewDecoder())
all,err := ioutil.ReadAll(utf8Reader)
if err != nil {
panic(err)
}
fmt.Printf("%s\n",all)
}
4、單任務版爬蟲的架構:
還需要城市解析器、使用者解析器
使用者解析器的輸出就是我們要的結果
解析器Parser
輸入:utf-8編碼的文本
輸出:Request{URL,對應Parser}清單,Item清單
單機版爬蟲架構:
Seed ->(request) Engine <->任務隊列
Engine ->(URL) Fetcher -> (text) Engine
Engine -> (text) Parser -> (requests,items) Engine
5、Engine和Parser:
//------
package fetcher
// determineEncoding guesses the character encoding of the data readable from
// r by passing up to its first 1024 bytes to charset.DetermineEncoding.
//
// The bytes are obtained with Peek on bufio.NewReader(r). Per the bufio
// documentation, NewReader returns r itself when r is already a *bufio.Reader
// with a large enough buffer, so in that case the peeked bytes remain
// available to the caller. For any other reader the temporary wrapper is
// dropped on return and whatever it buffered is lost to later reads from r;
// callers should therefore pass a *bufio.Reader and keep reading from it.
//
// If peeking fails for a reason other than the input being shorter than 1024
// bytes, the error is logged and unicode.UTF8 is returned as the default.
func determineEncoding(r io.Reader) encoding.Encoding {
	// Peek returns a []byte without advancing the reader. The local is not
	// named "bytes" so it cannot shadow the standard bytes package.
	peeked, err := bufio.NewReader(r).Peek(1024)
	// io.EOF only means the input is shorter than 1024 bytes; the partial
	// prefix is still usable for detection.
	if err != nil && err != io.EOF {
		log.Printf("Fetcher error: %v", err)
		return unicode.UTF8 // fall back to a default encoding
	}
	e, _, _ := charset.DetermineEncoding(peeked, "")
	return e
}
func Fetch(url string) ([]byte,error){
resp,err := http.Get(url)
if err != nil {
return nil,err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOk {
return nil,fmt.Errorf("Error: status code: %d",resp.StatusCode)
}
e := determineEncoding(resp.Body)
utf8Reader := transform.NewReader(resp.Body,e.NewDecoder())
return all,err := ioutil.ReadAll(utf8Reader)
}
//-----------------封裝Engine--------------------------
//-------
type.go
package engine
type (
	// Request is one unit of work for the engine: a URL to fetch together
	// with the function that parses the fetched page.
	Request struct {
		// Url is the address handed to the fetcher.
		Url string
		// ParserFunc receives the fetched page body and returns what was
		// found in it.
		ParserFunc func([]byte) ParseResult
	}

	// ParseResult is what a parser returns for one page.
	ParseResult struct {
		// Requests are follow-up requests discovered on the page.
		Requests []Request
		// Items are the values extracted from the page; they may be of
		// any type.
		Items []interface{}
	}
)
//空解析,什麼都不做,隻是為了走流程
type NilParser([]byte) ParseResult{
return ParseResult{}
}
engine.go
// Run drives the single-task crawler. It accepts any number of seed requests
// and processes a first-in, first-out queue until it is empty: each request's
// URL is fetched, the body is passed to the request's ParserFunc, the
// returned requests are appended to the queue, and every returned item is
// logged.
//
// A request whose fetch fails is logged and skipped.
func Run(seeds ...Request) {
	// Copy the seeds into the queue. This must assign to the queue itself;
	// declaring a new variable with := inside a loop would leave it empty.
	var requests []Request
	requests = append(requests, seeds...)

	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		log.Printf("Fetching %s", r.Url)
		body, err := fetcher.Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher: error fetching url %s:%v", r.Url, err)
			continue
		}

		parseResult := r.ParserFunc(body)
		// The trailing "..." expands the slice into individual arguments,
		// equivalent to append(requests, rs[0], rs[1], ...).
		requests = append(requests, parseResult.Requests...)
		for _, item := range parseResult.Items {
			// %v prints the value in its default format, whatever its type.
			log.Printf("Got item %v", item)
		}
	}
}
-----------------封裝Parser--------------------------
// cityListRe matches one city link on the city-list page. Group 1 captures
// the city URL and group 2 the city name. "[^>]*" skips any further
// attributes up to the first '>', and "[^<]+" takes the text up to the next
// '<'.
const cityListRe = `<a href="(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

// cityListRegexp is cityListRe compiled once at package initialization.
var cityListRegexp = regexp.MustCompile(cityListRe)

// ParseCityList extracts every city link from contents. For each link the
// city name is added to the result's Items and a Request for the city URL is
// added to its Requests; those requests use engine.NilParser as their parser.
func ParseCityList(contents []byte) engine.ParseResult {
	matches := cityListRegexp.FindAllSubmatch(contents, -1)

	result := engine.ParseResult{}
	for _, m := range matches {
		result.Items = append(result.Items, string(m[2])) // city name
		result.Requests = append(result.Requests, engine.Request{
			Url:        string(m[1]),
			ParserFunc: engine.NilParser,
		})
	}
	return result
}
// main, rewritten on top of the engine: it seeds the crawler with the
// city-list page and lets parser.ParseCityList parse it.
func main() {
	engine.Run(engine.Request{
		Url: "http://www.zhenai.com/zhenghun",
		// The field declared on engine.Request is ParserFunc.
		ParserFunc: parser.ParseCityList,
	})
}