Below is this session's Python homework.
# Crawl the info for all 250 movies on Douban Top 250
import requests
import re

def crow(i):
    url = 'https://movie.douban.com/top250?start=' + str(25 * i)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # detail-page url, image link, movie name, director/cast info, rating, number of ratings, synopsis
    movie_content_list = re.findall(
        # regex rule
        '<div class="item">.*?href="(.*?)">.*?src="(.*?)" .*?<span class="title">(.*?)</span>.*?<p class="">(.*?)</p>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(.*?)人评价.*?<span class="inq">(.*?)</span>',
        # text to parse
        response.text,
        # matching mode
        re.S
    )
    for movie_content in movie_content_list:
        # unpack each movie tuple
        detail_url, movie_jpg, name, actor, point, num, brief = movie_content
        data = f'Movie: {name}, detail page url: {detail_url}, image url: {movie_jpg}, cast info: {actor}, rating: {point}, number of ratings: {num}, synopsis: {brief}\n'
        print(data)
        # save the data: write the movie info to a file
        with open('豆瓣top250.txt', 'a', encoding='utf-8') as f:
            f.write(data)

for i in range(10):
    crow(i)
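One caveat worth noting, as a sketch rather than part of the assignment: because the file is opened in 'a' (append) mode, re-running the script appends all the records again. Truncating the file once before the loop avoids duplicates:

# clear the output file once, so a re-run doesn't append duplicate records
open('豆瓣top250.txt', 'w', encoding='utf-8').close()

for i in range(10):
    crow(i)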
Screenshot of part of this run's results:
Notes from the fourth day of class
Part 1: scraping Pear Video
'''
Scrape Pear Video
Request url:
    https://www.pearvideo.com/
Request method:
    GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
'''
# import requests
# import re  # regex module
#
# # send a request to the Pear Video home page and get the response
# response = requests.get('https://www.pearvideo.com/')
# print(response.status_code)
# print(response.text)
#
# # re.findall('regex rule', 'text to parse', 'regex mode')
# # re.S: lets . also match newlines, so the pattern can run across the whole text
# # .  matches any single character
# # *  matches zero or more of the preceding token
# '''
# <a href="video_1543373"
# <a href="video_(.*?)"   # extracts 1543373
# '''
#
# # get the video detail-page IDs from the home page
# res = re.findall('<a href="video_(.*?)"', response.text, re.S)
# print(res)
#
# for m_id in res:
#     # build the detail-page url
#     detail_url = 'https://www.pearvideo.com/video_' + m_id
#     print(detail_url)

import requests
import re  # regex module
# uuid.uuid4() generates a random, practically world-unique string
import uuid

# the three steps of a crawler
# 1. send the request
def get_page(url):
    response = requests.get(url)
    return response

# 2. parse the data
# parse the home page to get the video detail-page IDs
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    # print(res)
    detail_url_list = []
    for m_id in res:
        # build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        detail_url_list.append(detail_url)
    return detail_url_list

# parse a detail page to get the video url
def parse_detail(text):
    '''
    (.*?): captures the content inside the parentheses
    .*?  : matches without capturing
    regex: srcUrl="(.*?)"
    '''
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url

# 3. save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()

if __name__ == '__main__':
    # 1. send a request to the home page
    index_res = get_page(url='https://www.pearvideo.com/')

    # 2. parse the home page to get the detail-page urls
    detail_url_list = parse_index(index_res.text)

    # 3. send a request to each detail-page url
    for detail_url in detail_url_list:
        detail_res = get_page(url=detail_url)

        # 4. parse the detail page to get the video url
        movie_url = parse_detail(detail_res.text)
        print(movie_url)

        # 5. save the video
        save_movie(movie_url)
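An aside of my own, not from the course notes: save_movie above reads the whole video into memory via response.content. For larger files, requests supports streaming downloads with stream=True and iter_content; a minimal sketch of an alternative save_movie:

import uuid
import requests

def save_movie_streamed(movie_url):
    # stream=True avoids loading the whole video into memory at once
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        # write the body in 8 KB chunks as it arrives
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)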
Part 2 improves on part 1, switching from synchronous to asynchronous requests via a thread pool, which raises crawling performance.
import requests
import re  # regex module
# uuid.uuid4() generates a random, practically world-unique string
import uuid
# import the thread pool module
from concurrent.futures import ThreadPoolExecutor

# limit the pool to 50 threads
pool = ThreadPoolExecutor(50)

# the three steps of a crawler
# 1. send the request
def get_page(url):
    print(f'starting async task: {url}')
    response = requests.get(url)
    return response

# 2. parse the data
# parse the home page to get the video detail-page IDs
def parse_index(res):
    response = res.result()
    res = re.findall('<a href="video_(.*?)"', response.text, re.S)
    # print(res)
    # loop over the id list
    for m_id in res:
        # build the detail-page url
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        # print(detail_url)
        # submit the detail-page url to get_page
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

# parse a detail page to get the video url
def parse_detail(res):
    '''
    (.*?): captures the content inside the parentheses
    .*?  : matches without capturing
    regex: srcUrl="(.*?)"
    '''
    response = res.result()
    movie_url = re.findall('srcUrl="(.*?)"', response.text, re.S)[0]
    # asynchronously hand the video url to get_page, and its result to save_movie
    pool.submit(get_page, movie_url).add_done_callback(save_movie)

# 3. save the data
def save_movie(res):
    movie_res = res.result()
    # write the video to a local file
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(movie_res.content)
        print(f'video download finished: {movie_res.url}')
        f.flush()

if __name__ == '__main__':
    # # 1. send a request to the home page
    # index_res = get_page(url='https://www.pearvideo.com/')
    # # 2. parse the home page to get the detail-page urls
    # detail_url_list = parse_index(index_res.text)

    # send an async request via get_page and hand the result to parse_index
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url).add_done_callback(parse_index)

    # # 3. send a request to each detail-page url
    # for detail_url in detail_url_list:
    #     detail_res = get_page(url=detail_url)
    #
    #     # 4. parse the detail page to get the video url
    #     movie_url = parse_detail(detail_res.text)
    #     print(movie_url)
    #
    #     # 5. save the video
    #     save_movie(movie_url)
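To make the chain of pool.submit(...).add_done_callback(...) calls above easier to follow, here is a minimal self-contained sketch of the pattern (toy functions of my own, not from the course): the callback receives a Future object, and calling .result() on it returns whatever the submitted function returned.

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(2)

def square(n):
    return n * n

def on_done(future):
    # add_done_callback hands the callback a Future; .result() yields square's return value
    print(future.result())  # 9

pool.submit(square, 3).add_done_callback(on_done)
pool.shutdown(wait=True)  # block until all submitted work has finished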
Part 3: a walkthrough of requests usage in more detail
'''
Visit Zhihu Explore
Request url:
    https://www.zhihu.com/explore
Request method:
    GET
Request headers:
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
    cookies
'''
# # visit Zhihu without headers
# import requests
# response = requests.get(url='https://www.zhihu.com/explore')
# print(response.status_code)  # 400 (Zhihu has anti-crawler checks, so we need a user-agent to pose as a browser)
# print(response.text)

# visit Zhihu carrying request headers
import requests

# request-headers dict
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# add the user-agent inside the GET request
response = requests.get(url='https://www.zhihu.com/explore', headers=headers)
print(response.status_code)  # 200
# print(response.text)
with open('zhihu.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

'''
params request parameters
Searching Baidu for 安徽工程大学 produces urls such as
    https://www.baidu.com/s?wd=安徽工程大学&pn=10
    https://www.baidu.com/s?wd=安徽工程大学&pn=20
'''
from urllib.parse import urlencode  # (imported in class; requests' params argument does the url-encoding below)

url = 'https://www.baidu.com/s'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# via params, wd carries the search keyword and pn the page offset seen in the url
response = requests.get(url, headers=headers, params={"wd": "安徽工程大学", "pn": "20"})
with open('安徽工程大学.html', 'w', encoding='utf-8') as f:
    f.write(response.text)

'''
Carrying cookies
Use logged-in cookies to get past GitHub's login check
Request url:
    https://github.com/settings/emails
Request method:
    GET
Request headers:
    User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36
    Cookie: _ga=GA1.2.340859838.1560497661; _octo=GH1.1.1008046484.1560497665; _device_id=e9262adad4546dc55306dccadf395fc8; user_session=Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC; __Host-user_session_same_site=Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC; logged_in=yes; dotcom_user=zcj925; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; _gh_sess=OUV6aXA5bFdXV1E4UjhYN3lYRGY5bzFnTFBuRlA2azg3YmIvaXhXcTF1eVdhdzJ4cWhyOCszV2JZSjhqdXdjWHAxSVZrZ0R3NmoxbkN0Q3NaZU03NDVnMms1Q1psRVZkWFMyVjlueE05NlYvbGd1NCt2dHZ5cU9hMUFONHBSZiszdm03RVhTa0NjNlcwM2w0ckl2Q2dUUVl3R3FZaHBOeWxYeHpmejlsbUxBPS0tN0lNS3A1QVdzOHcxbU0vMHk4VVNGQT09--4603443955da96a2744014bf1cc137fb4c62c8ad
'''
import requests

# method 1: put the whole cookie string into the request headers
url = 'https://github.com/settings/emails'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
    # 'cookie': 'has_recent_activity=1; _ga=GA1.2.340859838.1560497661; tz=Asia%2FShanghai; _octo=GH1.1.1008046484.1560497665; _device_id=e9262adad4546dc55306dccadf395fc8; user_session=Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC; __Host-user_session_same_site=Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC; logged_in=yes; dotcom_user=zcj925; _gat=1; _gh_sess=MHNtbkxGQ0Y4TlN5R0N1djVrQmRXVG02czM5bHhKNXF4bWJtQ2lPSzhDVWxML2VKa2Zpa3o4SU95WlMycDU5UDdWK1RYV0gwQWlaVlh6MHVqMEQ0KytvYjNnOXJMQWphdHE5dUNvZUVjcktKZm5laDNXdjVYL0VkZzZzS3QwYzFydkgxSlhlcGdycTJnSTdDTUtUKzVmdHZGRGYrWlUyRlZVTmJOckdpYWVORFVMU2UzVjA2NFlyZXpmV1ZRMWtISEE2LzB6TExZaDZqbHBMZHZSTWpjenRvSURWREx6NEd5a0hyVlJrOFNRbGlXMUQxcmdIbUM2czh6Z2xGZ0pZMnZxaVpXb3JHTFJ6SFZ1ZkxIQWlENnFXOUNZUXVDRzdSV08yeFlNNDF6SkE9LS01VHpTeE1Gb080Q01iamVyVzVoNHlBPT0%3D--7b81cc2d608a25eaae9c7ea2b86f20ee63b8c052'
}
# github_res = requests.get(url, headers=headers)

# method 2: pass the cookies via the cookies argument
# (requests expects a dict of cookie name -> value pairs here, not one 'Cookie' key holding the whole string)
cookies = {
    '_ga': 'GA1.2.340859838.1560497661',
    '_octo': 'GH1.1.1008046484.1560497665',
    '_device_id': 'e9262adad4546dc55306dccadf395fc8',
    'user_session': 'Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC',
    '__Host-user_session_same_site': 'Lxlfi6wERyaSG5eP2DGv0cmX5abz1UrlGblpbsMqlfcZaAxC',
    'logged_in': 'yes',
    'dotcom_user': 'zcj925',
    'has_recent_activity': '1',
    '_gat': '1',
    'tz': 'Asia%2FShanghai',
    '_gh_sess': 'OUV6aXA5bFdXV1E4UjhYN3lYRGY5bzFnTFBuRlA2azg3YmIvaXhXcTF1eVdhdzJ4cWhyOCszV2JZSjhqdXdjWHAxSVZrZ0R3NmoxbkN0Q3NaZU03NDVnMms1Q1psRVZkWFMyVjlueE05NlYvbGd1NCt2dHZ5cU9hMUFONHBSZiszdm03RVhTa0NjNlcwM2w0ckl2Q2dUUVl3R3FZaHBOeWxYeHpmejlsbUxBPS0tN0lNS3A1QVdzOHcxbU0vMHk4VVNGQT09--4603443955da96a2744014bf1cc137fb4c62c8ad',
}
github_res = requests.get(url, headers=headers, cookies=cookies)
# check whether the account's email/QQ number shows up in the logged-in page
print('1053918575' in github_res.text)
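A related option worth knowing (my own note, not from the class): requests.Session keeps cookies across requests, so instead of copying the cookie header by hand each time, the values can be set once on a session object. A hedged sketch with placeholder cookie values:

import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
})
# placeholder values; real ones would be copied from a logged-in browser
session.cookies.update({'user_session': '<session-token>', 'logged_in': 'yes'})

# the session sends its stored cookies automatically on every request
github_res = session.get('https://github.com/settings/emails')
print(github_res.status_code)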
Part 4 scrapes the first 25 entries of Douban Movie Top 250.
'''
Home page:
    https://movie.douban.com/top250
GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re regex:
    # detail-page url, image link, movie name, rating, number of ratings
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
}
# 1. send a request to Douban Top 250 and get the response
response = requests.get(url, headers=headers)
# print(response.text)

# 2. extract the data with a regex
# detail-page url, image link, movie name, rating, number of ratings
movie_content_list = re.findall(
    # regex rule
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价',
    # text to parse
    response.text,
    # matching mode
    re.S)

for movie_content in movie_content_list:
    # unpack each movie tuple
    detail_url, movie_jpg, name, point, num = movie_content
    data = f'Movie: {name}, detail page url: {detail_url}, image url: {movie_jpg}, rating: {point}, number of ratings: {num}\n'
    print(data)

    # 3. save the data: write the movie info to a file
    with open('douban.txt', 'a', encoding='utf-8') as f:
        f.write(data)
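All of the regexes in these notes pass re.S. Since the earlier note on it was terse: by default . does not match a newline, so a pattern cannot span lines without the flag. A quick self-contained demonstration:

import re

text = 'line1\nline2'
print(re.findall('line1(.*?)2', text))        # [] - '.' stops at the newline
print(re.findall('line1(.*?)2', text, re.S))  # ['\nline'] - re.S lets '.' match '\n'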
Summary: this day of class felt like heavy going for me. The early material was understandable, but some of the later content suddenly became hard to follow, and when I sat down to write the code myself I found I couldn't. I also assumed I could do the homework, but my first idea didn't work: putting the page variable into params only ever printed the first page's content, and it never seemed to do what I wanted. After searching Baidu I switched to a different approach (concatenating the start offset into the url) and got the feature working. I clearly still need to study the relevant Python material more seriously.
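For future reference, a hedged sketch of the params approach described above, assuming the same Douban url and headers as the homework: passing the page offset through params should be equivalent to concatenating it into the url, as long as each loop iteration issues its own request with its own params value.

import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

for i in range(10):
    # requests url-encodes params and appends them: .../top250?start=0, ?start=25, ...
    response = requests.get('https://movie.douban.com/top250',
                            headers=headers,
                            params={'start': 25 * i})
    print(response.url)  # verify that each request hits a different page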