天天看點

Go語言解析Html

Go語言解析Html

思想來源:BeautifulSoup4

原則:簡單、快、省記憶體

特點:自造輪子随心用,不規則html照樣幹

結構體及其接口定義

package bs

// SelFunc lists the selection methods; in this package both *Soup and *Node
// define methods with these signatures.
type SelFunc interface {
    Sel(tag string, attrs *map[string]string) (nodes []*Node) // the only method meant to be offered to users directly
    SelById(id string) []*Node
    SelByTag(tag string) []*Node
    SelByClass(class string) []*Node
}

// Node is the basic node structure: one entry per tag found in the document.
type Node struct {
    Tag   string             // tag name
    Attrs *map[string]string // attributes of the tag
    Value string             // value (text content) of this node
    Sons  []*Node            // child nodes
    is    bool               // whether the node has already been traversed
    start bool               // whether this is a start (opening) tag
}

// Soup is the parsing structure that holds a document and its tag list.
type Soup struct {
    html  string  // the document text
    nodes []*Node // list of tags
    index []int   // positions of all tags in html (presumably parallel to nodes — confirm against setHtml)
}
           

解析步驟(核心)

1.初始化 Soup,此時生成Html文檔的各個節點清單以及節點位置的記錄表

2.使用者調用 Sel 方法,傳入解析規則(标簽名、标簽屬性限制等)

3.解析使用者請求并傳回子節點指針(指針省記憶體)

代碼:

package bs

import (
    "container/list"
    "fmt"
    "regexp"
    "strings"
)

var (
    // regTag matches a single tag: "<", then an ASCII letter or "/", then the
    // shortest run of characters (no newlines) up to the next ">".
    // The class is written without "|": inside [...] a pipe is a literal
    // character, not alternation, so the old pattern also accepted "<|...>".
    regTag = regexp.MustCompile(`<[a-zA-Z/].*?>`)
    // regAttrs matches name="value" pairs inside a tag; submatch 1 is the
    // attribute name (ASCII letters only) and submatch 2 is the value.
    regAttrs = regexp.MustCompile(`([a-zA-Z]+?)= *?"(.*?)"`)
    // DEBUG turns on the diagnostic messages printed through out.
    DEBUG = false
)

// out writes s followed by a newline to standard output, but only while the
// package-level DEBUG flag is set; otherwise it does nothing.
func out(s string) {
    if !DEBUG {
        return
    }
    fmt.Println(s)
}

// Init creates a Soup, feeds it the given HTML text through setHtml and
// returns a pointer to it.
func Init(html string) *Soup {
    soup := new(Soup)
    soup.setHtml(html)
    return soup
}

func (self *Soup) setHtml(text string) {
    self.html = text
    for _, ss := range regTag.FindAllStringIndex(self.html,) {
        s := self.html[ss]:ss]]
        if strings.Contains(s, "/>") || strings.Contains(s, "<br>") || strings.Contains(s, "<img") || strings.Contains(s, "<hr") || strings.Contains(s, "<input") { // 不要單獨的标簽
            continue
        }
        var nd Node
        if s[] == "</" { // 結束标簽
            nd.Tag = s : len(s]
            nd.start = false
        } else { // 開始标簽
            nd.Tag = strings.Split(s, " ")]:]
            nd.start = true
            if strings.Contains(nd.Tag, ">") {
                nd.Tag = nd.Tag[:len(nd.Tag]
            }
        }
        // fmt.Println("Tag:", nd.Tag, nd.start)
        attrs := make(map[string]string)
        for _, a := range regAttrs.FindAllStringSubmatch(s,) {
            if len(a) == {
                attrs[a]] = a]
            }
        }
        nd.Attrs = &attrs
        nd.is = false
        // fmt.Println(nd.Tag, *nd.Attrs)
        self.nodes = append(self.nodes, &nd)
        self.index = append(self.index, ss]) // 隻需要開始位置
    }
}

// right reports whether cur contains every key/value pair listed in attrs.
//
// A key that is absent from cur never matches, even when the wanted value is
// the empty string. A nil attrs imposes no constraint and yields true; a nil
// cur is treated as an empty attribute set.
func right(cur *map[string]string, attrs *map[string]string) bool {
    if attrs == nil {
        return true
    }
    for k, want := range *attrs {
        if cur == nil {
            return false
        }
        // Use the two-value lookup: a plain index would return "" for a
        // missing key and wrongly match a wanted empty value.
        got, ok := (*cur)[k]
        if !ok || got != want {
            return false
        }
    }
    return true
}

// trim reports whether c is one of the characters stripped from both ends of
// a node value: newline, tab or space.
func trim(c rune) bool {
    switch c {
    case '\n', '\t', ' ':
        return true
    }
    return false
}

func (self *Soup) parse(cur int) { // 解析cur節點
    if self.nodes[cur].is || !self.nodes[cur].start { // 目前節點已被解析
        out("已經解析/結束節點")
        return
    }
    leng := len(self.index)
    nds := list.New() // 節點樹
    nds.PushBack(cur) // 根節點入棧(位置)
    for cur < leng {  // 找結束節點
        cur++
        if cur >= leng {
            return
        }
        tp := nds.Back()
        iv := tp.Value.(int)
        if self.nodes[cur].start { // 是開始節點
            // 壓棧, 此節點為前一節點子節點
            self.nodes[iv].Sons = append(self.nodes[iv].Sons, self.nodes[cur])
            nds.PushBack(cur)

        } else if self.nodes[iv].Tag == self.nodes[cur].Tag { // 是結束節點, 且比對前一個,完成解析,出棧
            // 存其Value
            self.nodes[iv].Value = strings.TrimFunc(regTag.ReplaceAllString(self.html[self.index[iv]:self.index[cur]], ""), trim)
            // 将其置為已解析
            self.nodes[iv].is = true
            nds.Remove(tp)
        }
        if nds.Len() == {
            break
        }
    }
}

func (self *Soup) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
    cur :=
    leng := len(self.index)
    for cur < leng {
        if tag != "" && tag != self.nodes[cur].Tag { // 标簽不比對
            cur++
            continue
        }
        if attrs != nil && !right(self.nodes[cur].Attrs, attrs) { // 屬性不比對
            cur++
            continue
        }
        // 找到滿足條件的節點
        nodes = append(nodes, self.nodes[cur])
        // 解析該節點及其子節點
        self.parse(cur)
        cur++
    }
    return
}

// itool walks the Sons of n depth-first and appends to *nodes every
// descendant that passes both filters. An empty tag accepts any tag name and
// a nil attrs accepts any attributes. Children are visited whether or not
// their parent matched.
func itool(n *Node, tag string, attrs *map[string]string, nodes *[]*Node) {
    for _, child := range n.Sons {
        tagOK := tag == "" || child.Tag == tag
        if tagOK && (attrs == nil || right(child.Attrs, attrs)) {
            *nodes = append(*nodes, child)
        }
        itool(child, tag, attrs, nodes)
    }
}

// Sel collects the descendants of this node that match tag and attrs, using
// the same filter rules as itool. It only reads the existing Sons lists; the
// node is expected to have been parsed beforehand.
func (self *Node) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
    var found []*Node
    itool(self, tag, attrs, &found)
    return found
}

// SelById returns the tags of the document, of any name, whose "id"
// attribute equals id.
func (self *Soup) SelById(id string) []*Node {
    filter := map[string]string{"id": id}
    return self.Sel("", &filter)
}

// SelByTag returns the tags of the document named tag, with no attribute
// filter applied.
func (self *Soup) SelByTag(tag string) []*Node {
    var anyAttrs *map[string]string // nil: no attribute constraint
    return self.Sel(tag, anyAttrs)
}

// SelByClass returns the tags of the document, of any name, whose "class"
// attribute equals class exactly.
func (self *Soup) SelByClass(class string) []*Node {
    filter := map[string]string{"class": class}
    return self.Sel("", &filter)
}

// SelById returns the descendants of this node, of any tag name, whose "id"
// attribute equals id.
func (self *Node) SelById(id string) []*Node {
    filter := map[string]string{"id": id}
    return self.Sel("", &filter)
}

// SelByTag returns the descendants of this node named tag, with no attribute
// filter applied.
func (self *Node) SelByTag(tag string) []*Node {
    var anyAttrs *map[string]string // nil: no attribute constraint
    return self.Sel(tag, anyAttrs)
}

// SelByClass returns the descendants of this node, of any tag name, whose
// "class" attribute equals class exactly.
func (self *Node) SelByClass(class string) []*Node {
    filter := map[string]string{"class": class}
    return self.Sel("", &filter)
}
           

示例

package main

import (
    "fmt"
    "myspider/bs"
)

// html is the sample document used by the examples below. Its <html> and
// <body> tags have no matching end tags, and one <li> is nested inside another.
var html = `
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story" id="sp">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow"  class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow"  class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow"  class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>nothing in here</b>
</p>
<p class="story">...</p>
<ul class="story" id="0">
    <li class="t" id="1">
        <li class="t" id="2">asdf</li>
    </li>
    <li class="t" id="3">2</li>
    <li class="t" id="4">3</li>
</ul>
`

// soup is the parsed form of html, shared by all example functions.
var soup = bs.Init(html)

func t1() {

    // by tag
    fmt.Println("By Tag........................")
    for _, j := range soup.Sel("a", nil) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }
    // by attrs
    fmt.Println("By Attrs........................")
    for _, j := range soup.Sel("", &map[string]string{"class": "story"}) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }
    // by tag and attrs
    fmt.Println("By Tag And Attrs........................")
    for _, j := range soup.Sel("p", &map[string]string{"class": "story"}) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }

    // more
    fmt.Println("More.......................................")
    for _, j := range soup.Sel("", &map[string]string{"id": "sp"}) {
        for _, a := range j.Sel("a", nil) {
            fmt.Println("Tag:", a.Tag)
            fmt.Println("Attrs:", *a.Attrs)
            fmt.Println("Value:", a.Value)
        }
    }
    // Detail
    fmt.Println("Soup Details....................................")
    for _, j := range soup.SelById("sp") {
        fmt.Println("Tag:", j.Tag)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)

    }
    for _, j := range soup.SelByClass("sister") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range soup.SelByTag("title") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    fmt.Println("Node Details....................................")
    note := soup.SelById("sp")]
    for _, j := range note.SelByClass("sister") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range note.SelById("link3") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range note.SelByTag("a") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
}

func t2() {
    n := soup.SelByTag("ul")]
    for _, i := range n.Sons {
        fmt.Println(i.Value)
    }
}

func t3() {
    n := soup.SelById("sp")]
    for _, i := range n.Sons {
        fmt.Println(i.Tag)
    }
}

func t4() {
    n := soup.SelByTag("ul")]
    for _, j := range n.SelByTag("li") {
        fmt.Println(j.Value)
    }
}
// main runs the t1 example; t2, t3 and t4 are not called from here.
func main() {
    t1()
}
           

至于穩定性,示例的html片段都能解析還有什麼不能解析的?

項目位址:https://github.com/pysrc/bs