Go語言解析Html
思想來源:BeautifulSoup4
原則:簡單、快、省記憶體
特點:自造輪子随心用,不規則html照樣幹
結構體及其接口定義
package bs
// SelFunc is the set of selection methods shared by the document-level and
// node-level selectors declared in this package.
type SelFunc interface {
Sel(tag string, attrs *map[string]string) (nodes []*Node) // general selector; the author's note says this is the only method meant to be offered to users
SelById(id string) []*Node
SelByTag(tag string) []*Node
SelByClass(class string) []*Node
}
// Node is the basic node structure: one tag occurrence found in the document.
type Node struct { // basic node structure
Tag string // tag name
Attrs *map[string]string // attributes
Value string // value of this node
Sons []*Node // child nodes
is bool // whether this node has already been traversed
start bool // whether this is an opening-tag node
}
// Soup is the parsing structure: the document text plus the tag tables
// built from it.
type Soup struct { // parsing structure
html string // document text
nodes []*Node // list of tags
index []int // positions of all tags
}
解析步驟(核心)
1. 初始化 `Soup`,此時生成Html文檔的各個節點清單以及節點位置的記錄表
2. 使用者調用 `Sel` 方法,傳入解析規則(标簽名、标簽屬性限制等)
3. 解析使用者請求并傳回子節點指針(指針省記憶體)
代碼:
package bs
import (
"container/list"
"fmt"
"regexp"
"strings"
)
var (
	// regTag matches one opening or closing tag: '<', then an ASCII letter
	// or '/', then the shortest run of characters up to the next '>'.
	// The character class deliberately contains no '|': inside brackets a
	// pipe is a literal character, not alternation, so listing it would
	// accept input such as "<|x>" as a tag.
	regTag = regexp.MustCompile(`<[a-zA-Z/].*?>`)
	// regAttrs matches one double-quoted attribute. Group 1 is the name,
	// group 2 the value. The name may contain digits, '-', '_' and ':' after
	// its first letter so that an attribute such as data-id is captured
	// whole instead of being mistaken for a plain id attribute.
	regAttrs = regexp.MustCompile(`([a-zA-Z][-a-zA-Z0-9_:]*)= *"(.*?)"`)
	// DEBUG enables the diagnostic messages printed through out.
	DEBUG = false
)
// out prints s on its own line, but only while DEBUG is true.
func out(s string) {
	if !DEBUG {
		return
	}
	fmt.Println(s)
}
// Init creates a Soup for html and scans the document's tags via setHtml
// before returning it.
func Init(html string) *Soup {
	sp := new(Soup)
	sp.setHtml(html)
	return sp
}
func (self *Soup) setHtml(text string) {
self.html = text
for _, ss := range regTag.FindAllStringIndex(self.html,) {
s := self.html[ss]:ss]]
if strings.Contains(s, "/>") || strings.Contains(s, "<br>") || strings.Contains(s, "<img") || strings.Contains(s, "<hr") || strings.Contains(s, "<input") { // 不要單獨的标簽
continue
}
var nd Node
if s[] == "</" { // 結束标簽
nd.Tag = s : len(s]
nd.start = false
} else { // 開始标簽
nd.Tag = strings.Split(s, " ")]:]
nd.start = true
if strings.Contains(nd.Tag, ">") {
nd.Tag = nd.Tag[:len(nd.Tag]
}
}
// fmt.Println("Tag:", nd.Tag, nd.start)
attrs := make(map[string]string)
for _, a := range regAttrs.FindAllStringSubmatch(s,) {
if len(a) == {
attrs[a]] = a]
}
}
nd.Attrs = &attrs
nd.is = false
// fmt.Println(nd.Tag, *nd.Attrs)
self.nodes = append(self.nodes, &nd)
self.index = append(self.index, ss]) // 隻需要開始位置
}
}
// right reports whether the attribute set cur contains every key/value pair
// required by attrs.
//
// A nil attrs means "no constraints" and always matches. A nil cur is
// treated as an empty attribute set. A required key must actually be present
// in cur: an absent key does not satisfy a requirement for the empty string.
func right(cur *map[string]string, attrs *map[string]string) bool {
	if attrs == nil {
		return true
	}
	for k, want := range *attrs {
		if cur == nil {
			return false
		}
		got, ok := (*cur)[k]
		if !ok || got != want {
			return false
		}
	}
	return true
}
// trim reports whether c is a newline, a tab or a space. It is the predicate
// used to strip leading and trailing filler characters.
func trim(c rune) bool {
	switch c {
	case '\n', '\t', ' ':
		return true
	}
	return false
}
func (self *Soup) parse(cur int) { // 解析cur節點
if self.nodes[cur].is || !self.nodes[cur].start { // 目前節點已被解析
out("已經解析/結束節點")
return
}
leng := len(self.index)
nds := list.New() // 節點樹
nds.PushBack(cur) // 根節點入棧(位置)
for cur < leng { // 找結束節點
cur++
if cur >= leng {
return
}
tp := nds.Back()
iv := tp.Value.(int)
if self.nodes[cur].start { // 是開始節點
// 壓棧, 此節點為前一節點子節點
self.nodes[iv].Sons = append(self.nodes[iv].Sons, self.nodes[cur])
nds.PushBack(cur)
} else if self.nodes[iv].Tag == self.nodes[cur].Tag { // 是結束節點, 且比對前一個,完成解析,出棧
// 存其Value
self.nodes[iv].Value = strings.TrimFunc(regTag.ReplaceAllString(self.html[self.index[iv]:self.index[cur]], ""), trim)
// 将其置為已解析
self.nodes[iv].is = true
nds.Remove(tp)
}
if nds.Len() == {
break
}
}
}
func (self *Soup) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
cur :=
leng := len(self.index)
for cur < leng {
if tag != "" && tag != self.nodes[cur].Tag { // 标簽不比對
cur++
continue
}
if attrs != nil && !right(self.nodes[cur].Attrs, attrs) { // 屬性不比對
cur++
continue
}
// 找到滿足條件的節點
nodes = append(nodes, self.nodes[cur])
// 解析該節點及其子節點
self.parse(cur)
cur++
}
return
}
// itool walks the descendants of n depth-first and appends to *nodes every
// descendant that matches. An empty tag matches any tag name and a nil attrs
// matches any attribute set; otherwise the name must be equal and right must
// accept the descendant's attributes. n itself is never tested.
func itool(n *Node, tag string, attrs *map[string]string, nodes *[]*Node) {
	for _, child := range n.Sons {
		matched := tag == "" || child.Tag == tag
		if matched && attrs != nil {
			matched = right(child.Attrs, attrs)
		}
		if matched {
			*nodes = append(*nodes, child)
		}
		itool(child, tag, attrs, nodes)
	}
}
// Sel collects the descendants of the node that match tag and attrs by
// walking the existing Sons lists with itool. It performs no parsing of its
// own.
func (n *Node) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
	itool(n, tag, attrs, &nodes)
	return nodes
}
// SelById selects from the whole document by the "id" attribute, with no
// restriction on the tag name.
func (sp *Soup) SelById(id string) []*Node {
	want := map[string]string{"id": id}
	return sp.Sel("", &want)
}
// SelByTag selects from the whole document by tag name, with no attribute
// restriction.
func (sp *Soup) SelByTag(tag string) []*Node {
	return sp.Sel(tag, nil)
}
// SelByClass selects from the whole document by the "class" attribute, with
// no restriction on the tag name. The attribute value is compared as a whole
// string.
func (sp *Soup) SelByClass(class string) []*Node {
	want := map[string]string{"class": class}
	return sp.Sel("", &want)
}
// SelById selects among the node's descendants by the "id" attribute, with
// no restriction on the tag name.
func (n *Node) SelById(id string) []*Node {
	want := map[string]string{"id": id}
	return n.Sel("", &want)
}
// SelByTag selects among the node's descendants by tag name, with no
// attribute restriction.
func (n *Node) SelByTag(tag string) []*Node {
	return n.Sel(tag, nil)
}
// SelByClass selects among the node's descendants by the "class" attribute,
// with no restriction on the tag name. The attribute value is compared as a
// whole string.
func (n *Node) SelByClass(class string) []*Node {
	want := map[string]string{"class": class}
	return n.Sel("", &want)
}
示例
package main
import (
"fmt"
"myspider/bs"
)
// html is the sample document parsed by the demo functions in this file.
// The markup leaves <html> and <body> unclosed and nests one <li> directly
// inside another.
var html = `
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story" id="sp">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" target="_blank" rel="external nofollow" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" target="_blank" rel="external nofollow" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" target="_blank" rel="external nofollow" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>nothing in here</b>
</p>
<p class="story">...</p>
<ul class="story" id="0">
<li class="t" id="1">
<li class="t" id="2">asdf</li>
</li>
<li class="t" id="3">2</li>
<li class="t" id="4">3</li>
</ul>
`
// soup is the Soup built from html once, at package initialisation, and
// shared by all demo functions.
var soup = bs.Init(html)
func t1() {
// by tag
fmt.Println("By Tag........................")
for _, j := range soup.Sel("a", nil) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// by attrs
fmt.Println("By Attrs........................")
for _, j := range soup.Sel("", &map[string]string{"class": "story"}) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// by tag and attrs
fmt.Println("By Tag And Attrs........................")
for _, j := range soup.Sel("p", &map[string]string{"class": "story"}) {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
fmt.Println("Value:", j.Value)
}
// more
fmt.Println("More.......................................")
for _, j := range soup.Sel("", &map[string]string{"id": "sp"}) {
for _, a := range j.Sel("a", nil) {
fmt.Println("Tag:", a.Tag)
fmt.Println("Attrs:", *a.Attrs)
fmt.Println("Value:", a.Value)
}
}
// Detail
fmt.Println("Soup Details....................................")
for _, j := range soup.SelById("sp") {
fmt.Println("Tag:", j.Tag)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range soup.SelByClass("sister") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range soup.SelByTag("title") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
fmt.Println("Node Details....................................")
note := soup.SelById("sp")]
for _, j := range note.SelByClass("sister") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range note.SelById("link3") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
for _, j := range note.SelByTag("a") {
fmt.Println("Tag:", j.Tag)
fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Attrs:", *j.Attrs)
// fmt.Println("Value:", j.Value)
}
}
func t2() {
n := soup.SelByTag("ul")]
for _, i := range n.Sons {
fmt.Println(i.Value)
}
}
func t3() {
n := soup.SelById("sp")]
for _, i := range n.Sons {
fmt.Println(i.Tag)
}
}
func t4() {
n := soup.SelByTag("ul")]
for _, j := range n.SelByTag("li") {
fmt.Println(j.Value)
}
}
// main runs the t1 demonstration. t2, t3 and t4 are defined in this file but
// are not called here.
func main() {
t1()
}
至于穩定性,示例的html片段都能解析還有什麼不能解析的?
項目位址:https://github.com/pysrc/bs