贴吧爬虫案例

首先看一个小案例:用 urllib 向百度发送一个带查询参数的搜索请求。

import urllib
import urllib.parse
import urllib.request

# Target address the search request is sent to.
url = "http://www.baidu.com/s"
# Present a browser-like User-Agent instead of urllib's default one.
headers = {"User-Agent":"Mozilla..."}
# Read the search term from the user.
keyword = input("请输入你所要查询的字符串:")

# urlencode percent-ENCODES the parameters into a query string
# (e.g. non-ASCII text becomes %XX escapes); it does not decode anything.
wd = urllib.parse.urlencode({"wd":keyword})
# Join base address and query string into the complete URL.
fullUrl = url + "?" + wd
print(fullUrl)
# Build the request object carrying the custom headers.
request = urllib.request.Request(fullUrl,headers = headers)
# Send the request; the with-block closes the response (and its
# connection) as soon as the body has been read.
with urllib.request.urlopen(request) as response:
    html = response.read()
# Uncomment to dump the raw response body:
#print(html)

贴吧小爬虫案例

爬取一个贴吧

import urllib
import urllib.request
def loadPage(url,filename):
    """
    Send a request to ``url`` and return the raw response body.

    url: address of the page to fetch
    filename: name the page will be saved under; used here only for the
        progress message
    Returns the body as bytes exactly as received (it is not decoded).
    Any error raised by urllib.request.urlopen propagates to the caller.
    """
    print("正在下载"+str(filename))
    # Present a browser-like User-Agent instead of urllib's default one.
    headers = {"User-Agent":"Mozilla..."}
    # Build the request object carrying the custom headers.
    request = urllib.request.Request(url,headers = headers)
    # The with-block closes the response (and its connection) as soon as
    # the body has been read, so repeated calls do not leak open handles.
    with urllib.request.urlopen(request) as response:
        return response.read()

def writePage(html,filename):
    """
    Save page content to a local file, creating or overwriting it.

    html: content to store; bytes (e.g. the result of response.read()) are
        written unchanged, any other value is converted with str() and
        written as UTF-8 text
    filename: path of the file to write
    """
    print("正在保存"+str(filename))
    if isinstance(html,(bytes,bytearray)):
        # Write the raw bytes in binary mode: str() on bytes would store
        # the Python repr "b'...'" instead of the actual page.
        with open(filename,"wb") as f:
            f.write(html)
    else:
        # Fixed encoding so the output does not depend on the platform default.
        with open(filename,"w",encoding="utf-8") as f:
            f.write(str(html))
    print ("-" * 30)

def tiebaSpider(url,beginPage,endPage):
    """
    Crawl scheduler: for every page number in the requested range, build
    that page's URL, download it and save it to a local file.

    url: leading part of the Tieba URL; the "&pn=" offset is appended to it
    beginPage: first page number to fetch
    endPage: last page number to fetch (inclusive)
    """
    pageNumbers = range(beginPage,endPage + 1)
    for pageNo in pageNumbers:
        # The pn parameter advances by 50 per page, so page 1 maps to pn=0.
        offset = 50 * (pageNo - 1)
        pageUrl = "".join([url,"&pn=",str(offset)])
        saveAs = "".join(["第",str(pageNo),"页.html"])
        print(pageUrl)
        # Download the page, then hand the body straight to the writer.
        writePage(loadPage(pageUrl,saveAs),saveAs)

if __name__ == "__main__":
    # Imported explicitly: this block calls urllib.parse.urlencode and should
    # not depend on another module having loaded urllib.parse as a side effect.
    import urllib.parse

    # Ask for the forum name and the inclusive page range to crawl.
    kw = input("请输入你要爬取的贴吧名:")
    beginPage = int(input("请输入起始页:"))
    endPage = int(input("请输入结束页:"))

    url = "http://tieba.com/f?"
    # urlencode percent-encodes the forum name so it is safe inside a URL.
    key = urllib.parse.urlencode({"kw":kw})
    fullurl = url + key
    tiebaSpider(fullurl,beginPage,endPage)

结果:

请输入你要爬取的贴吧名:python
请输入起始页:1
请输入结束页:3
http://tieba.com/f?kw=python&pn=0
正在下载第1页.html
正在保存第1页.html
------------------------------
http://tieba.com/f?kw=python&pn=50
正在下载第2页.html
正在保存第2页.html
------------------------------
http://tieba.com/f?kw=python&pn=100
正在下载第3页.html
正在保存第3页.html
------------------------------

Process finished with exit code 0

猜你喜欢

转载自blog.csdn.net/qq_38709565/article/details/81111048