爬虫项目汇总

不登录下获取数据

 1 # coding=utf-8
 2 """
 3 用类封装爬虫任务，
 4 目的：获取豆瓣某地区按热度排列的全部电影
 5 思路：
 6     用Chrome分析目标url，
 7     构建url
 8     发请求获取数据
 9     保存数据
10     循环上三步直到最后一页
11 注意：目前代码中的url地址已经失效
12 """
13 import requests
14 import json
15 
class DoubanSpider:
    """Crawl Douban's mobile "hot" collections page by page.

    Each entry of ``url_temp_list`` pairs a URL template (its ``{}`` slot is
    the ``start`` offset) with a country tag. ``run`` walks every template,
    fetches consecutive pages and appends the items to ``douban.txt`` as one
    JSON object per line.
    """

    # Items per page; must agree with the ``count=18`` query parameter that is
    # baked into the URL templates below.
    PAGE_SIZE = 18
    # Seconds to wait for the server before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.url_temp_list = [
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?start={}&count=18&loc_id=108288",
                "country": "US",
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_english_hot/items?start={}&count=18&loc_id=108288",
                "country": "UK",
            },
            {
                "url_temp": "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_domestic_hot/items?start={}&count=18&loc_id=108288",
                "country": "CN",
            },
        ]
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36",
            "Referer": "https://m.douban.com/movie/",
        }

    def parse_url(self, url):
        """Send a GET request to *url* and return the response body as text.

        The URL is printed first so progress is visible on the console. The
        body is decoded with ``bytes.decode()``'s default codec (UTF-8).
        """
        print(url)
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode()

    def get_content_list(self, json_str):
        """Extract the page items and the reported total from a JSON response.

        Returns:
            A ``(content_list, total)`` tuple taken from the
            ``subject_collection_items`` and ``total`` keys.

        Raises:
            KeyError: if either key is missing from the response.
        """
        dict_ret = json.loads(json_str)
        content_list = dict_ret["subject_collection_items"]
        # Total item count as reported by the server; the original author
        # notes that it is not guaranteed to be accurate.
        total = dict_ret["total"]
        return content_list, total

    def save_content_list(self, content_list, country):
        """Append every item to ``douban.txt`` as a JSON line tagged with *country*.

        Each item dict is modified in place: a ``country`` key is added (or
        overwritten) before the item is serialised.
        """
        with open("douban.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                content["country"] = country
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")  # one JSON object per line
        print("保存成功")

    def run(self):
        """Fetch and save every page of every configured collection."""
        for url_temp in self.url_temp_list:
            num = 0  # value of the ``start`` query parameter (item offset)
            total = 100  # placeholder so that the first request is always sent
            # Strictly less than: ``num == total`` means the previous request
            # already returned the last page, so there is nothing left to get.
            while num < total:
                # 1. build the URL of the current page
                url = url_temp["url_temp"].format(num)
                # 2. send the request and read the response
                json_str = self.parse_url(url)
                # 3. extract the items and the real total
                content_list, total = self.get_content_list(json_str)
                # An empty page means the data ran out earlier than ``total``
                # claimed; stop instead of requesting further empty pages.
                if not content_list:
                    break
                # 4. save after every page, so that a failure half-way through
                #    does not throw away what was already downloaded
                self.save_content_list(content_list, url_temp["country"])
                # 5. advance to the next page
                num += self.PAGE_SIZE
74 
75 
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    DoubanSpider().run()

01.豆瓣获取最热电影信息

自动登录案例

 1 """
 2 套路：访问首页的时候，服务器已经给浏览器设置了cookies，但此时尚未激活；
 3 登录成功后返回的是假的cookies，真正被激活的是首页设置的那份cookies。
 4 
 5 """
 6 import requests
 7 from bs4 import BeautifulSoup
 8 
 9 headers = {
10     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
11 }
12 
13 index = requests.get("https://dig.chouti.com/", headers=headers)
14 cookies = index.cookies.get_dict()
15 
16 
17 # ===========================点赞=================
18 
19 # 1.登录
20 login = requests.post(
21     "https://dig.chouti.com/login",
22     data={
23         "phone": 8615026809593,
24         "password":'dajiahaa',
25     },
26     headers=headers,
27     cookies=cookies)
28 
29 # 2.点赞
30 dizan = requests.post(
31     url="https://dig.chouti.com/link/vote?linksId=25389911",
32     cookies=cookies,
33     headers=headers)
34 
35 print(dizan.text)

01.抽屉网

 1 """
 2 套路：
 3 - 带请求头
 4 - 带cookie
 5 - 请求体中：
 6     commit:Sign in
 7     utf8:✓
 8     authenticity_token:放在页面隐藏表单中
 9     login:asdfasdfasdf
10     password:woshiniba8
11 
12 """
# Log in to GitHub by replaying the browser's sign-in form: fetch the login
# page, read the hidden ``authenticity_token`` field, then POST the form to
# /session together with the cookies received from the login page.
import os

import requests
from bs4 import BeautifulSoup

# Seconds to wait for each HTTP call; without a timeout ``requests`` can
# block forever on a stalled connection.
REQUEST_TIMEOUT = 10

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
}

# NOTE(review): credentials should not live in source code. They can now be
# supplied through the environment; the literals are only kept as defaults so
# that the listing behaves as before when the variables are unset.
github_login = os.environ.get("GITHUB_LOGIN", "[email protected]")
github_password = os.environ.get("GITHUB_PASSWORD", "cs11187")

login = requests.get(
    "https://github.com/login",
    headers=headers,
    timeout=REQUEST_TIMEOUT,
)
cookies = login.cookies.get_dict()
login_par = BeautifulSoup(login.content, 'html.parser')
token_input = login_par.find(name='input', attrs={"name": "authenticity_token"})
if token_input is None:
    # ``find`` returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError(
        "authenticity_token input not found on https://github.com/login"
    )

authenticity_token = token_input.attrs.get("value")

# 1. Log in: same user agent as before plus a Referer pointing at the form.
re_login = requests.post(
    "https://github.com/session",
    data={
        "commit": "Sign in",
        "utf8": "✓",
        "login": github_login,
        "password": github_password,
        "authenticity_token": authenticity_token,
        "webauthn-support": "supported",
    },
    cookies=cookies,
    headers={**headers, "Referer": "https://github.com/login"},
    timeout=REQUEST_TIMEOUT,
)

print(re_login.text)

02.github

不登录下获取数据

自动登录案例

猜你喜欢