Go语言解析Html

版权声明:本文为博主原创文章,转载请注明出处 https://blog.csdn.net/chenlnehc/article/details/78781432

Go语言解析Html

思想来源:BeautifulSoup4

原则:简单、快、省内存

特点:自造轮子随心用,不规则html照样干

结构体及其接口定义

package bs

// SelFunc is the selection API; the methods below are implemented by both
// *Soup (whole document) and *Node (subtree of an already-parsed node).
type SelFunc interface {
    Sel(tag string, attrs *map[string]string) (nodes []*Node) // the only method users are expected to call directly
    SelById(id string) []*Node
    SelByTag(tag string) []*Node
    SelByClass(class string) []*Node
}

type Node struct { // a single HTML element in the parse tree
    Tag   string             // tag name, e.g. "p", "a"
    Attrs *map[string]string // attribute name -> value (NOTE(review): a plain map would do; maps are already reference-like)
    Value string             // inner text of the element, tags stripped and surrounding whitespace trimmed (filled by parse)
    Sons  []*Node            // child element nodes (filled by parse)
    is    bool               // true once this node has been parsed (Value/Sons populated)
    start bool               // true for an opening tag, false for a closing tag
}

type Soup struct { // parser state for one HTML document
    html  string  // the raw HTML text
    nodes []*Node // every paired tag found, in document order
    index []int   // start offset in html of each entry in nodes
}

解析步骤(核心)

1.初始化 Soup ,此时生成Html文档的各个节点列表以及节点位置的记录表

2.用户调用 Sel 方法,传入解析规则(标签名、标签属性限制等)

3.解析用户请求并返回子节点指针(指针省内存)

代码:

package bs

import (
    "container/list"
    "fmt"
    "regexp"
    "strings"
)

var (
    // regTag matches any opening or closing HTML tag (non-greedy up to '>').
    // Inside a character class '|' is a literal character, so the original
    // `[a-z|A-Z|/]` also accepted "<|...>" as a tag; fixed to [a-zA-Z/].
    regTag = regexp.MustCompile(`<[a-zA-Z/].*?>`)
    // regAttrs captures key="value" attribute pairs (same '|' fix applied).
    regAttrs = regexp.MustCompile(`([a-zA-Z]+?)= *?"(.*?)"`)
    // DEBUG enables diagnostic output through out().
    DEBUG = false
)

// out prints s only when the package-level DEBUG flag is enabled.
func out(s string) {
    if !DEBUG {
        return
    }
    fmt.Println(s)
}

// Init builds a Soup for html: the document is scanned once and every
// paired tag recorded; tree construction itself is deferred until Sel.
func Init(html string) *Soup {
    soup := &Soup{}
    soup.setHtml(html)
    return soup
}

// setHtml scans text and records every paired (open/close) tag as a Node
// together with its start offset. Void and self-closing tags (<br>, <img>,
// <hr>, <input>, anything containing "/>") are skipped so the stack-based
// matcher in parse() only ever sees tags that come in open/close pairs.
func (self *Soup) setHtml(text string) {
    self.html = text
    // -1 = all matches; the previous hard limit of 100000 silently dropped
    // tags in very large documents.
    for _, ss := range regTag.FindAllStringIndex(self.html, -1) {
        s := self.html[ss[0]:ss[1]]
        // s always starts with '<', so HasPrefix is the right test here; the
        // old Contains(s, "<br>") missed variants such as `<br >` or
        // `<br class="x">`, which then corrupted pair matching in parse().
        if strings.Contains(s, "/>") || strings.HasPrefix(s, "<br") || strings.HasPrefix(s, "<img") || strings.HasPrefix(s, "<hr") || strings.HasPrefix(s, "<input") {
            continue
        }
        var nd Node
        if s[:2] == "</" { // closing tag: "</p>" -> "p"
            nd.Tag = s[2 : len(s)-1]
            nd.start = false
        } else { // opening tag: the name runs to the first space or '>'
            nd.Tag = strings.Split(s, " ")[0][1:]
            nd.start = true
            if strings.Contains(nd.Tag, ">") {
                nd.Tag = nd.Tag[:len(nd.Tag)-1]
            }
        }
        attrs := make(map[string]string)
        // -1 = all attributes; the old limit of 10 dropped attributes on
        // heavily-attributed tags.
        for _, a := range regAttrs.FindAllStringSubmatch(s, -1) {
            if len(a) == 3 {
                attrs[a[1]] = a[2]
            }
        }
        nd.Attrs = &attrs
        nd.is = false
        self.nodes = append(self.nodes, &nd)
        self.index = append(self.index, ss[0]) // only the start offset is needed later
    }
}

// right reports whether cur contains every key/value pair in attrs,
// i.e. attrs is a sub-map of cur. An empty attrs always matches.
func right(cur *map[string]string, attrs *map[string]string) bool {
    have := *cur
    for key, want := range *attrs {
        if have[key] != want {
            return false
        }
    }
    return true
}

// trim reports whether c is insignificant surrounding whitespace, for use
// with strings.TrimFunc when extracting a node's inner text. '\r' is
// included so documents with Windows (CRLF) line endings do not leave
// stray carriage returns in Node.Value.
func trim(c rune) bool {
    return c == '\n' || c == '\t' || c == ' ' || c == '\r'
}

// parse builds the subtree rooted at node cur: it walks forward through the
// tag list, attaching each opening tag as a child of the current stack top
// and, on a matching closing tag, filling in the opener's Value (inner text
// with tags stripped and whitespace trimmed) and marking it parsed.
// Closing tags that do not match the stack top are ignored, which is what
// makes irregular HTML tolerable.
func (self *Soup) parse(cur int) {
    if self.nodes[cur].is || !self.nodes[cur].start { // already parsed, or a closing tag
        out("已经解析/结束节点")
        return
    }
    total := len(self.index)
    stack := list.New() // indices of still-open tags
    stack.PushBack(cur)
    for cur < total {
        cur++
        if cur >= total { // ran off the end without closing the root
            return
        }
        top := stack.Back()
        open := top.Value.(int)
        node := self.nodes[cur]
        if node.start {
            // Another opening tag: it becomes a child of the current top.
            self.nodes[open].Sons = append(self.nodes[open].Sons, node)
            stack.PushBack(cur)
        } else if self.nodes[open].Tag == node.Tag {
            // Matching close: capture inner text, mark parsed, pop.
            raw := self.html[self.index[open]:self.index[cur]]
            self.nodes[open].Value = strings.TrimFunc(regTag.ReplaceAllString(raw, ""), trim)
            self.nodes[open].is = true
            stack.Remove(top)
        }
        if stack.Len() == 0 { // root closed: subtree complete
            break
        }
    }
}

// Sel returns every node matching tag ("" matches any tag) and attrs
// (nil matches any attributes; otherwise every given key/value pair must
// be present on the node). Each match is parsed on demand, so its Value
// and Sons are populated before it is returned.
func (self *Soup) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
    for cur := 0; cur < len(self.index); cur++ {
        candidate := self.nodes[cur]
        if tag != "" && tag != candidate.Tag {
            continue
        }
        if attrs != nil && !right(candidate.Attrs, attrs) {
            continue
        }
        nodes = append(nodes, candidate)
        self.parse(cur) // lazily build this node's subtree
    }
    return
}

// itool recursively walks the subtree under n, appending to nodes every
// descendant that passes both the tag filter ("" = any tag) and the attrs
// filter (nil = any attributes).
func itool(n *Node, tag string, attrs *map[string]string, nodes *[]*Node) {
    for _, child := range n.Sons {
        tagOK := tag == "" || child.Tag == tag
        attrOK := attrs == nil || right(child.Attrs, attrs)
        if tagOK && attrOK {
            *nodes = append(*nodes, child)
        }
        itool(child, tag, attrs, nodes)
    }
}

// Sel searches this node's subtree (already built by a prior Soup.Sel);
// see Soup.Sel for the meaning of tag and attrs.
func (self *Node) Sel(tag string, attrs *map[string]string) (nodes []*Node) {
    itool(self, tag, attrs, &nodes)
    return nodes
}

// SelById returns every node whose "id" attribute equals id.
func (self *Soup) SelById(id string) []*Node {
    byID := map[string]string{"id": id}
    return self.Sel("", &byID)
}

// SelByTag returns every node with the given tag name, ignoring attributes.
func (self *Soup) SelByTag(tag string) []*Node {
    return self.Sel(tag, nil)
}

// SelByClass returns every node whose "class" attribute equals class.
func (self *Soup) SelByClass(class string) []*Node {
    byClass := map[string]string{"class": class}
    return self.Sel("", &byClass)
}

// SelById returns every descendant whose "id" attribute equals id.
func (self *Node) SelById(id string) []*Node {
    byID := map[string]string{"id": id}
    return self.Sel("", &byID)
}

// SelByTag returns every descendant with the given tag name, ignoring attributes.
func (self *Node) SelByTag(tag string) []*Node {
    return self.Sel(tag, nil)
}

// SelByClass returns every descendant whose "class" attribute equals class.
func (self *Node) SelByClass(class string) []*Node {
    byClass := map[string]string{"class": class}
    return self.Sel("", &byClass)
}

示例

package main

import (
    "fmt"
    "myspider/bs"
)

// Sample document; it deliberately mixes well-formed markup with nested
// <li> tags to exercise the parser's tolerance of irregular HTML.
var html = `
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story" id="sp">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>nothing in here</b>
</p>
<p class="story">...</p>
<ul class="story" id="0">
    <li class="t" id="1">
        <li class="t" id="2">asdf</li>
    </li>
    <li class="t" id="3">2</li>
    <li class="t" id="4">3</li>
</ul>
`

// Package-level Soup shared by all the demo functions below.
var soup = bs.Init(html)

// t1 demonstrates every selection mode: by tag, by attributes, by both,
// nested selection on an already-selected node, and the SelBy* shortcuts.
func t1() {

    // by tag
    fmt.Println("By Tag........................")
    for _, j := range soup.Sel("a", nil) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }
    // by attrs
    fmt.Println("By Attrs........................")
    for _, j := range soup.Sel("", &map[string]string{"class": "story"}) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }
    // by tag and attrs
    fmt.Println("By Tag And Attrs........................")
    for _, j := range soup.Sel("p", &map[string]string{"class": "story"}) {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        fmt.Println("Value:", j.Value)
    }

    // more: select a node, then select again within its subtree
    fmt.Println("More.......................................")
    for _, j := range soup.Sel("", &map[string]string{"id": "sp"}) {
        for _, a := range j.Sel("a", nil) {
            fmt.Println("Tag:", a.Tag)
            fmt.Println("Attrs:", *a.Attrs)
            fmt.Println("Value:", a.Value)
        }
    }
    // Detail: the SelBy* convenience wrappers on the Soup
    fmt.Println("Soup Details....................................")
    for _, j := range soup.SelById("sp") {
        fmt.Println("Tag:", j.Tag)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)

    }
    for _, j := range soup.SelByClass("sister") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range soup.SelByTag("title") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    fmt.Println("Node Details....................................")
    // NOTE(review): indexing [0] panics if no node with id "sp" exists.
    note := soup.SelById("sp")[0]
    for _, j := range note.SelByClass("sister") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range note.SelById("link3") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
    for _, j := range note.SelByTag("a") {
        fmt.Println("Tag:", j.Tag)
        fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Attrs:", *j.Attrs)
        // fmt.Println("Value:", j.Value)
    }
}

// t2 prints the inner text of each direct child of the first <ul>.
func t2() {
    ul := soup.SelByTag("ul")[0]
    for _, son := range ul.Sons {
        fmt.Println(son.Value)
    }
}

// t3 prints the tag name of each direct child of the node with id "sp".
func t3() {
    sp := soup.SelById("sp")[0]
    for _, son := range sp.Sons {
        fmt.Println(son.Tag)
    }
}

// t4 selects every <li> anywhere under the first <ul> and prints its text.
func t4() {
    ul := soup.SelByTag("ul")[0]
    for _, item := range ul.SelByTag("li") {
        fmt.Println(item.Value)
    }
}
// main runs the full selector demo; t2/t3/t4 are smaller experiments kept
// for manual use.
func main() {
    t1()
}

至于稳定性,示例的html片段都能解析还有什么不能解析的?

项目地址:https://github.com/pysrc/bs

猜你喜欢

转载自blog.csdn.net/chenlnehc/article/details/78781432