# 人狠话不多，直接上源码（爬虫小白，大神勿喷）
"""
糗事百科爬虫
"""
import csv
import os
from urllib import error
from urllib import request, parse

import chardet
from lxml import etree
def qiushibaikeSpider(url, beginPage, endPage):
    """Scrape qiushibaike posts and append one CSV file per page.

    For every page in [beginPage, endPage] (inclusive — the prompt asks for a
    "终止页"/end page, so the range is closed), fetch the page, extract avatar
    URL, author, post text, vote count and comment count for each post, print
    the row and append it to ./data/qiushibaike_<page>.csv.

    Args:
        url: Base site URL, e.g. "https://www.qiushibaike.com/8hr/".
        beginPage: First page number to fetch (inclusive).
        endPage: Last page number to fetch (inclusive).

    Network failures for a page are printed and that page is skipped.
    """
    # Invariant across pages — build once instead of per iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    }
    # Ensure the output directory exists so open() below cannot fail on it.
    os.makedirs('./data', exist_ok=True)

    # endPage + 1: the original range(beginPage, endPage) silently dropped
    # the last requested page.
    for page in range(beginPage, endPage + 1):
        fullurl = url + "page/" + str(page)
        req = request.Request(fullurl, headers=headers)
        try:
            # Context manager closes the HTTP response (original leaked it).
            with request.urlopen(req) as response:
                resHtml = response.read().decode("utf-8", "ignore")
        except error.URLError as e:
            print(e)
            continue  # skip this page, keep crawling the rest

        html = etree.HTML(resHtml)
        results = html.xpath('//div[contains(@id,"qiushi_tag")]')

        filename = './data/qiushibaike_' + str(page) + '.csv'
        # Open once per page (original reopened the file for every row);
        # newline='' avoids blank lines in the CSV on Windows.
        with open(filename, 'a', encoding='utf-8', newline='') as file:
            wr = csv.writer(file)
            for site in results:
                # Avatar URL — may be absent for some posts.
                imgs = site.xpath('./div/a/img/@src')
                imgUrl = imgs[0] if imgs else ""
                # Author — anonymous posts have no <h2> node.
                authorNodes = site.xpath("./div/a/h2")
                author = authorNodes[0].text if authorNodes else "匿名用户"
                # Post text — guard both a missing node and a None .text
                # (the original crashed with IndexError/AttributeError here).
                contentNodes = site.xpath('.//div[@class="content"]/span')
                content = (contentNodes[0].text or "").strip() if contentNodes else ""
                # Vote count.
                voteNodes = site.xpath('.//span[@class="stats-vote"]/i')
                vote = voteNodes[0].text if voteNodes else "0"
                # Comment count.
                commentNodes = site.xpath('.//span[@class="stats-comments"]/a/i')
                comment = commentNodes[0].text if commentNodes else "0"

                print(imgUrl, author, content, vote, comment)
                wr.writerow([imgUrl, author, content, vote, comment])
if __name__ == "__main__":
proxy = {"http": "118.31.220.3:8080"}
proxy_support = request.ProxyHandler(proxy)
opener = request.build_opener(proxy_support)
request.install_opener(opener)
beginPage = int(input("请输入起始页:"))
endPage = int(input("请输入终止页:"))
url = "https://www.qiushibaike.com/8hr/"
qiushibaikeSpider(url,beginPage, endPage)