A crawler example: scraping hot-post user info from Qiushibaike with requests and BeautifulSoup

This walkthrough uses the requests and BeautifulSoup libraries to collect user information from Qiushibaike's hot posts. Qiushibaike's anti-scraping measures are not strict and no login is required to fetch the data, so the task is fairly simple.

The approach: first request the hot-post listing pages to collect each post's user detail link, then request every user detail page and parse it with BeautifulSoup to extract the basic user info. A minimal demonstration of the link-extraction step follows; the full script comes after it.
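
A standalone sketch of that first step, before the full script. The div class author clearfix and the base URL are taken from the script below; the sample HTML itself is illustrative only, not copied from the live page:

from bs4 import BeautifulSoup

# Illustrative markup modeled on the structure the script below expects.
sample_html = '''
<div class="author clearfix">
    <a href="/users/12345678/"><h2>some_user</h2></a>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
for node in soup.find_all('div', class_='author clearfix'):
    item = node.find('a')
    if item is not None:  # anonymous posts have no author link
        print('https://www.qiushibaike.com' + item.get('href'))

Running this prints https://www.qiushibaike.com/users/12345678/, which is exactly the kind of user detail link the full script feeds into its parsing step.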

Code:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import requests
import json
import urllib3
import pymongo
from bs4 import BeautifulSoup

urllib3.disable_warnings()

class Qsbk():
    def __init__(self, url):
        self.url = url  # link to a Qiushibaike hot-post listing page
        self.base_url = 'https://www.qiushibaike.com'  # prefix for building user links
        self.user_info = {}  # holds one user's info at a time
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }  # request headers
        self.proxies = {
            'http': '192.168.105.71:80',
            'https': '192.168.105.71:80'
        }  # proxy settings
        self.file = open('qsbk.json', 'a', encoding='utf-8')  # append user records to a JSON-lines file

    def get_data(self):
        try:
            response = requests.get(url=self.url, headers=self.headers, proxies=self.proxies, verify=False).text  # request the hot-post listing page
            soup = BeautifulSoup(response, 'lxml')
            node_list = soup.find_all('div', class_='author clearfix')
            for node in node_list:
                item = node.find('a')
                if item is not None:  # anonymous posts have no user link and must be skipped
                    link = item.get('href')  # relative user detail link
                    user_link = self.base_url + link
                    # print(user_link)
                    self.parse_data(user_link)  # request the detail page and parse the user info
        except Exception as e:
            print(e)

    def parse_data(self, user_link):
        try:
            result = requests.get(user_link, headers=self.headers, proxies=self.proxies, verify=False).content.decode()
            soup = BeautifulSoup(result, 'lxml')
            status = soup.find('h3').get_text()
            if '当前用户已关闭糗百个人动态' in status:  # skip users who have closed their profile feed
                pass
            else:
                username = soup.find('h2').get_text()
                self.user_info['用户名'] = username
                node_list = soup.find_all('div', class_="user-statis user-block")
                # each field comes back as e.g. "粉丝数:111", so split on ":" once and keep the value; the remaining fields work the same way
                fans = node_list[0].select('ul > li')[0].get_text().split(':', 1)[-1]
                concerns = node_list[0].select('ul > li')[1].get_text().split(':', 1)[-1]
                comments = node_list[0].select('ul > li')[3].get_text().split(':', 1)[-1]
                coins = node_list[0].select('ul > li')[4].get_text().split(':', 1)[-1]
                marriage = node_list[1].select('ul > li')[0].get_text().split(':', 1)[-1]
                job = node_list[1].select('ul > li')[2].get_text().split(':', 1)[-1]
                web_age = node_list[1].select('ul > li')[4].get_text().split(':', 1)[-1]
                self.user_info['粉丝数'] = fans
                self.user_info['关注数'] = concerns
                self.user_info['评论'] = comments
                self.user_info['笑脸'] = coins
                self.user_info['婚姻'] = marriage
                self.user_info['职业'] = job
                self.user_info['糗龄'] = web_age
                # print(self.user_info)
                self.save_json()   # write the record to the JSON-lines file
                self.save_mongo()  # write the record to MongoDB
        except Exception as e:
            print(e)

    def save_json(self):
        content = json.dumps(self.user_info, ensure_ascii=False) + '\n'  # one JSON object per line
        self.file.write(content)  # keep the file open; closing it here would break later writes

    def save_mongo(self):
        try:
            self.client = pymongo.MongoClient(host='localhost', port=27017)  # 27017 is MongoDB's default port
            self.col = self.client['qsbk']['qsbk']
            self.col.insert_one(dict(self.user_info))  # insert a copy: insert_one adds an _id to the dict it is given
        except Exception as e:
            print(e)

    def close(self):
        self.file.close()

if __name__ == '__main__':
    for i in range(1, 3):
        url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(i)
        # print(url)
        qsbk = Qsbk(url)
        qsbk.get_data()
        qsbk.close()
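
After a run, both sinks can be sanity-checked. A minimal sketch, assuming the qsbk.json file and the local MongoDB instance used by the script above:

import json
import pymongo

# Read back the JSON-lines file written by save_json (one object per line).
with open('qsbk.json', encoding='utf-8') as f:
    for line in f:
        user = json.loads(line)
        print(user['用户名'], user.get('粉丝数'))

# Count the documents written by save_mongo.
client = pymongo.MongoClient(host='localhost', port=27017)
print(client['qsbk']['qsbk'].count_documents({}))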
