最近闲来无事,写一篇关于爬虫的文章。
抓取糗事百科 目标: 1、抓取糗事百科热门帖,获取其**发布者**、**评论(及数量)**、**赞踩(及数量)**等 2、将信息清洗并打印,循环输出 3、设计程序,使可以选择抓取的页面范围 4、将每一页的信息保存到文本
废话不多说 代码如下(具体的每一步都有详细的解释):
# -*-coding:utf-8-*-
import re
import requests
import time

# Base URL of the site being scraped; listing pages hang off this root.
siteURL = "https://www.qiushibaike.com/"

# Compiled once at module level instead of on every replace() call.
_BR_PATTERN = re.compile('<br>|</br>|/>|<br')


def replace(x):
    """Strip <br>-style markup fragments and surrounding whitespace from x.

    :param x: raw text captured from the page markup
    :return: cleaned text
    """
    x = _BR_PATTERN.sub("", x)
    return x.strip()


def getSource(url):
    """Fetch the HTML source of a page.

    :param url: page URL
    :return: response body as text
    """
    user_agent = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
    # Bug fix: the standard header name is 'User-Agent'. The original
    # 'User_agent' key meant the custom UA string was never actually sent.
    headers = {'User-Agent': user_agent}
    r = requests.get(url, headers=headers)
    return r.text


def getDetaipage(detailURL):
    """Extract all post tuples from one listing page.

    Each matched tuple is (author, age, content, funny-count,
    comment-count, up-votes, down-votes), per the site's markup.

    :param detailURL: URL of the listing page
    :return: list of matched tuples (empty if the markup does not match)
    """
    source = getSource(detailURL)
    # NOTE(review): this pattern is tied to the site's 2017-era markup;
    # if the page layout changed, findall simply returns [].
    pattern = re.compile(
        '<div class="author.*?<h2>(.*?)</h2>.*?Icon">(.*?)</div>'
        '.*?<div class="content">.*?<span>(.*?)</span>'
        '.*?<span.*?stats-vote.*?number">(.*?)</i>'
        '.*?stats-comments.*?number">(.*?)</i>'
        '.*?up.*?number hidden">(.*?)</span>'
        '.*?down.*?number hidden">(.*?)</span>',
        re.S)
    return re.findall(pattern, source)


def saveDetailpage(data):
    """Append one post's formatted text to qiushibaike.txt.

    :param data: text to write
    """
    with open("qiushibaike.txt", "a+", encoding='utf-8') as f:
        f.write(data)


def OnePage(detailURL):
    """Scrape a single page (thin wrapper kept for the original API).

    :param detailURL: URL of the page
    :return: list of post tuples
    """
    return getDetaipage(detailURL)


def getAllPage(start, end):
    """Scrape pages start..end (inclusive) and accumulate their posts.

    Page 1 lives at the site root; pages >= 2 live under /8hr/page/<n>/.

    Bug fix: the original returned None when the final-page check inside
    the loop never fired (e.g. start == end == 1, or start > end); the
    accumulated list is now always returned after the loop.

    :param start: first page number (>= 1)
    :param end: last page number
    :return: list of post tuples from all requested pages
    """
    items = []
    for page in range(start, end + 1):
        print(u'正在获取第', page, u'页的数据...')
        if page == 1:
            detailURL = siteURL
        else:
            detailURL = siteURL + '8hr/page/' + str(page) + '/?s=4964625'
        items += OnePage(detailURL)
        # Throttle requests so we don't hammer the server.
        time.sleep(2)
    print(u'', u'\n加载结束!')
    return items


def main():
    """Entry point: scrape pages 1-13, then print and save every post."""
    items = getAllPage(start=1, end=13)
    for number, item in enumerate(items, start=1):
        data = (str(number) + u'楼' + u'\t楼主:' + replace(item[0]) + u'\t'
                + item[1] + u'岁' + u'\n发言:' + replace(item[2])
                + u'\n好笑:' + item[3] + u'\t评论:' + item[4]
                + u'\t赞:' + item[5] + u'\t踩:' + item[6] + '\n')
        print(data)
        saveDetailpage(data)


if __name__ == "__main__":
    main()