foreword
Many people may think that crawling is the exclusive skill of Python, but in fact, using Go language may achieve better results
What is a crawler
One thing we must understand before starting to implement a crawler is what a crawler is. A web crawler (also known as a web spider, web robot, and more often called web chaser in the FOAF community) is a program or script that automatically crawls information from the World Wide Web according to certain rules . Other less commonly used names are ant, autoindex, emulator, or worm.
How to write a crawler
The following points are worth noting when writing a crawler program
-
Specify the target URL
-
Send a request, get a response packet
-
save filter data
-
Use analytics data
crawler implementation
Simple crawl of a Baidu Tieba webpage
Take Chongqing University of Posts and Telecommunications as an example
Comparing the URLs of successive pages, we find that the pn query parameter increases by 50 for each page.
After finding the rule, we can implement it concretely. We need to use the net/http package to get the data of the page and save it in the file through IO operation
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
)
// httpGet downloads the page at url and returns the whole response
// body as a string. Any transport or read error is returned to the
// caller through the named return err.
func httpGet(url string) (res string, err error) {
	resp, errGet := http.Get(url)
	// BUG fix: the original tested `err` (always nil at this point)
	// instead of the error returned by http.Get, so a failed request
	// was silently ignored and resp.Body was dereferenced on a nil
	// response, panicking.
	if errGet != nil {
		err = errGet
		return
	}
	defer resp.Body.Close()

	// Read the body in 4 KiB chunks and accumulate it for the caller.
	buf := make([]byte, 4096)
	for {
		n, errRead := resp.Body.Read(buf)
		if n == 0 {
			fmt.Println("读取完成")
			break
		}
		if errRead != nil && errRead != io.EOF {
			err = errRead
			return
		}
		res += string(buf[:n])
	}
	return
}
// query crawls pages start..end sequentially and saves each page's
// HTML to a local file named "第N页.html".
func query(start int, end int) {
	fmt.Printf("正在爬取%d页到%d页...\n", start, end)
	for i := start; i <= end; i++ {
		// Each successive page advances the pn parameter by 50.
		url := "https://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E9%82%AE%E7%94%B5%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		res, err := httpGet(url)
		if err != nil {
			fmt.Println("err = ", err)
			continue
		}
		// os.WriteFile creates, writes and closes in one call, and
		// surfaces the write error the original version discarded.
		name := "第" + strconv.Itoa(i) + "页" + ".html"
		if err := os.WriteFile(name, []byte(res), 0644); err != nil {
			fmt.Println("err = ", err)
		}
	}
}
// main reads the start and end page numbers from stdin, validates
// them, and kicks off the crawl.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("err = ", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("err = ", err)
		return
	}
	// Reject impossible ranges instead of silently building bogus URLs.
	if start < 1 || end < start {
		fmt.Println("页码范围无效")
		return
	}
	query(start, end)
}
Concurrent version of the Baidu Tieba crawler
A major language feature of the Go language is that it naturally supports high concurrency, and crawler and concurrency can be perfectly combined. Crawling with high concurrency can greatly improve the efficiency of crawler. The implementation of high concurrency is not difficult. We only need to start a coroutine and synchronize it with the main coroutine. Other operations are similar to the non-concurrent version.
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
)
// httpGet downloads url and returns the whole response body as a
// string. A transport or read error is returned to the caller.
func httpGet(url string) (res string, err error) {
	// BUG fix: the original tested `err` (always nil at this point)
	// instead of the error returned by http.Get, so failures were
	// silently dropped and resp.Body was dereferenced on a nil
	// response, panicking.
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// io.ReadAll replaces the manual 4 KiB read loop and its quadratic
	// string concatenation.
	data, errRead := io.ReadAll(resp.Body)
	if errRead != nil {
		err = errRead
		return
	}
	res = string(data)
	return
}
// spiderPage crawls page i of the forum and saves its HTML to
// "第i页.html". It ALWAYS reports i on the page channel, success or
// failure, so the receiver in query can never block forever.
func spiderPage(i int, page chan int) {
	// BUG fix: the original returned early on error WITHOUT sending on
	// the channel; query's receive loop then waited forever (deadlock).
	defer func() { page <- i }()
	url := "https://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E9%82%AE%E7%94%B5%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	res, err := httpGet(url)
	if err != nil {
		fmt.Println("err = ", err)
		return
	}
	// os.WriteFile creates, writes and closes in one call, and also
	// surfaces the write error the original version discarded.
	name := "第" + strconv.Itoa(i) + "页" + ".html"
	if err := os.WriteFile(name, []byte(res), 0644); err != nil {
		fmt.Println("err = ", err)
	}
}
// query fans out one crawler goroutine per page, then waits until
// every launched page has reported completion before returning.
func query(start int, end int) {
	fmt.Printf("正在爬取%d页到%d页...\n", start, end)
	done := make(chan int)
	// Launch one goroutine per page in the requested range.
	for p := start; p <= end; p++ {
		go spiderPage(p, done)
	}
	// Collect exactly one completion signal per launched goroutine;
	// this also synchronizes main with the workers.
	total := end - start + 1
	for k := 0; k < total; k++ {
		fmt.Printf("第%d个页面完成爬取完成\n", <-done)
	}
}
// main reads the start and end page numbers from stdin, validates
// them, and kicks off the concurrent crawl.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("err = ", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("err = ", err)
		return
	}
	// Reject impossible ranges instead of silently building bogus URLs.
	if start < 1 || end < start {
		fmt.Println("页码范围无效")
		return
	}
	query(start, end)
}
regular expression
A regular expression (often abbreviated as regex, regexp, or RE in code) is a text pattern that includes ordinary characters (for example, letters between a and z) and special characters (called "metacharacters"); it is a concept in computer science. A regular expression uses a single string to describe and match a series of strings that conform to a syntactic rule, and is usually used to retrieve and replace text that matches a pattern (rule).
Many programming languages support string manipulation using regular expressions. For example, a powerful regular expression engine is built into Perl . The concept of regular expressions was first popularized by tools in Unix (such as sed and grep ), and later widely used in Scala, PHP, C#, Java, C++, Objective-c, Perl, Swift, VBScript, Javascript , Ruby, Python, and more. Regular expressions are usually abbreviated as "regex", with regexp, regex in the singular , and regexps, regexes, and regexen in the plural .
character test
package main
import (
"fmt"
"regexp"
)
// main demonstrates the `.` metacharacter: the pattern `a.c` matches
// an 'a', any single character, then a 'c'.
func main() {
	sample := "abc a7c mfc cat 8ca azc cba"
	// Compile the pattern once.
	pattern := regexp.MustCompile(`a.c`)
	// Collect every non-overlapping match (with submatch slices).
	matches := pattern.FindAllStringSubmatch(sample, -1)
	fmt.Println(matches)
}
//输出
[[abc] [a7c] [azc]]
进程 已完成,退出代码为 0
decimal test
package main
import (
"fmt"
"regexp"
)
// main demonstrates matching decimal numbers of the form digits.digits.
func main() {
	str := "3.14 123.123 .68 haha 1.0 abc 7. ab.3 66.6 123."
	// BUG fix: the original pattern `[0-9]+.[0-9]+` left the dot
	// unescaped, so it matched ANY character between the digit runs
	// (e.g. it would wrongly match "12x34"). `\.` matches only a
	// literal decimal point.
	ret := regexp.MustCompile(`[0-9]+\.[0-9]+`)
	// Extract all matches.
	res := ret.FindAllStringSubmatch(str, -1)
	// Print the result slice.
	fmt.Println(res)
}
//输出
[[3.14] [123.123] [1.0] [66.6]]
进程 已完成,退出代码为 0
Web Tab Test
package main
import (
"fmt"
"regexp"
)
// main demonstrates extracting href attribute values from HTML list
// items using a lazy capture group.
func main() {
	str := `<div class="wrapper">
<ul style='margin-left: 3px;' id='main-menu' >
<li><a href="index.php">首 页</a></li>
<li ><a href="user.php" >个人服务</a></li>
<li><a href="jwglFiles/index.php" target='_blank' >教学管理文件</a></li>
<li><a href="#">培养方案</a>
<ul>
<li><a href="pyfa2020/index.php" target='_blank'>2020版培养方案</a></li>
<li><a href="pyfa/index.php" target='_blank'>2016版培养方案</a></li>
<li><a href="infoNavi.php?dId=000303" target='_blank'>其他版培养方案</a></li>
<li><a href="lxs/pyfa/index.php" target='_blank'>留学生</a></li>
</ul>
</li>
<li><a href="bszn/index.php" target='_blank' >办事指南</a></li>
<li><a href="kebiao/index.php" target='_bank'>课表查询</a></li>
<li><a href="jxjc/index.php" target='_bank' >进程与调停课</a></li>
<li><a href="ksap/index.php" target='_bank' >考试安排</a></li>
<li><a href="infoNavi.php?dId=000308" target='_bank' >表格下载</a></li>
<li><a href="infoNavi.php?dId=000310" target='_bank' >校历</a></li>
<!--
<li ><a href="history/index.php" target="_blank">历史数据</a></li>
<li><a href="websiteNavi.php" class="topMenu" >功能网站</a></li>
<li><a href="historyData.php" class="topMenu" >数据中心</a></li>
<li ><a href="jwzxlxs/index.php" target="_blank">留学生</a></li>
-->
<li><a href="infoNavi.php?dId=0007" target='_bank' >党建工作</a></li>
<li><a href="contact.php" class="popInfo" >联系我们</a></li>
</ul>
<div style="float: right;color: rgb(221, 221, 221);padding: 9px 10px;">`
	// Parse: the lazy group (?s:(.*?)) captures the shortest possible
	// href value; the s flag lets `.` also match newlines. Note it only
	// matches lines that start exactly with `<li><a href="` — entries
	// written as `<li ><a ...` (and the commented-out `<!-- -->` block
	// using double quotes around target) are presumably skipped on
	// purpose; the sample output below confirms this.
	ret := regexp.MustCompile(`<li><a href="(?s:(.*?))"`)
	// Extract every match together with its capture groups.
	res := ret.FindAllStringSubmatch(str, -1)
	// Print only the captured href text.
	for _, one := range res {
		//fmt.Println("one[0]=", one[0])
		fmt.Println(one[1]) // one is a []string: full match, then capture group 1
	}
}
//输出内容
index.php
jwglFiles/index.php
#
pyfa2020/index.php
pyfa/index.php
infoNavi.php?dId=000303
lxs/pyfa/index.php
bszn/index.php
kebiao/index.php
jxjc/index.php
ksap/index.php
infoNavi.php?dId=000308
infoNavi.php?dId=000310
websiteNavi.php
historyData.php
infoNavi.php?dId=0007
contact.php
Use regular expressions to crawl titles in Baidu Tieba (high concurrency)
check title regex
class="j_th_tit ">(?s:(.*?))</a>
package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
// httpGet downloads url and returns the whole response body as a
// string. A transport or read error is returned to the caller.
func httpGet(url string) (res string, err error) {
	// BUG fix: the original tested `err` (always nil at this point)
	// instead of the error returned by http.Get, so failures were
	// silently dropped and resp.Body was dereferenced on a nil
	// response, panicking.
	resp, err := http.Get(url)
	if err != nil {
		return
	}
	defer resp.Body.Close()

	// io.ReadAll replaces the manual 4 KiB read loop and its quadratic
	// string concatenation.
	data, errRead := io.ReadAll(resp.Body)
	if errRead != nil {
		err = errRead
		return
	}
	res = string(data)
	return
}
// saveFile writes one crawled page's titles to "第i页.txt", one title
// per line. Each element of title is a regexp submatch slice whose
// index 1 holds the captured title text.
func saveFile(i int, title [][]string) {
	f, err := os.Create("第" + strconv.Itoa(i) + "页.txt")
	if err != nil {
		fmt.Println("err = ", err)
		return
	}
	defer f.Close()
	for _, m := range title {
		// BUG fix: the original discarded WriteString's error; a full
		// disk or failed write would go unnoticed.
		if _, err := f.WriteString(m[1] + "\n"); err != nil {
			fmt.Println("err = ", err)
			return
		}
	}
}
// titleRe extracts post titles from the Tieba list page. Compiled once
// at package scope instead of on every spiderPage call.
var titleRe = regexp.MustCompile(`class="j_th_tit ">(?s:(.*?))</a>`)

// spiderPage crawls page i, extracts every post title, and saves them
// via saveFile. It ALWAYS reports i on the page channel, success or
// failure, so the receiver in query can never block forever.
func spiderPage(i int, page chan int) {
	// BUG fix: the original returned early on error WITHOUT sending on
	// the channel; query's receive loop then waited forever (deadlock).
	defer func() { page <- i }()
	url := "https://tieba.baidu.com/f?kw=%E9%87%8D%E5%BA%86%E9%82%AE%E7%94%B5%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	res, err := httpGet(url)
	if err != nil {
		fmt.Println("err = ", err)
		return
	}
	titles := titleRe.FindAllStringSubmatch(res, -1)
	saveFile(i, titles)
}
// query fans out one crawler goroutine per page, then waits until
// every launched page has reported completion before returning.
func query(start int, end int) {
	fmt.Printf("正在爬取%d页到%d页...\n", start, end)
	done := make(chan int)
	// Launch one goroutine per page in the requested range.
	for p := start; p <= end; p++ {
		go spiderPage(p, done)
	}
	// Collect exactly one completion signal per launched goroutine;
	// this also synchronizes main with the workers.
	total := end - start + 1
	for k := 0; k < total; k++ {
		fmt.Printf("第%d个页面完成爬取完成\n", <-done)
	}
}
// main reads the start and end page numbers from stdin, validates
// them, and kicks off the concurrent title crawl.
func main() {
	var start, end int
	fmt.Print("请输入爬取的起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("err = ", err)
		return
	}
	fmt.Print("请输入爬取的终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("err = ", err)
		return
	}
	// Reject impossible ranges instead of silently building bogus URLs.
	if start < 1 || end < start {
		fmt.Println("页码范围无效")
		return
	}
	query(start, end)
}
Epilogue
If there is anything unclear, please feel free to ask me questions, and I will try my best to answer them.
Here is my GitHub home page github.com/L2ncE
Welcome everyone to Follow /Star/Fork