Crawler Basics 5-1: Applications of Regular Expressions in Web Scraping

1. Scraping jokes from Budejie (百思不得姐)

import requests
import re


def parse_url(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=10)
    text = response.text
    # 1. Regex rule: a lazy .*? must follow each tag so the match can
    #    skip the intervening markup (re.DOTALL lets . cross newlines)
    contents = re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # 2. Build a list holding the cleaned-up jokes
    jokes = []
    for content in contents:
        # content is a str; strip the <br> tags out of the joke text
        dz = re.sub(r"<br.*?>", "", content)
        # 3. Append the cleaned joke to the list
        jokes.append(dz)
    # 4. Append the jokes to a file, one blank line between them
    with open('3.txt', 'a', encoding='utf-8') as f:
        for joke in jokes:
            f.write(joke)
            f.write('\n\n')


def main():
    for x in range(1, 10):
        url = 'http://www.budejie.com/text/%s' % x
        parse_url(url)


if __name__ == '__main__':
    main()
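The comment about adding .*? after each tag is the crux of this example. Here is a minimal standalone sketch (with a made-up HTML snippet) of how the lazy quantifier and re.DOTALL interact:

import re

# Hypothetical snippet mimicking the joke-list markup above
html = '<div class="j-r-list-c-desc">\n<a href="/1">First joke</a></div>'

# Without re.DOTALL, . does not match the newline, so nothing is found
print(re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', html))
# []

# With re.DOTALL, the lazy .*? skips across lines up to the first <a> tag
print(re.findall(r'<div class="j-r-list-c-desc">.*?<a.*?>(.*?)</a>', html, re.DOTALL))
# ['First joke']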

2. Scraping the Douban new books page

import os
import re

import requests

# Free proxy taken from the original post; it is likely stale, so swap in
# a working proxy or drop the proxies argument entirely
PROXY = {
    'https': '116.209.55.208:9999'
}


def spider():
    url = 'https://book.douban.com/latest?icn=index-latestbook-all'
    response = requests.get(url, proxies=PROXY)
    # Match against response.text (the decoded str), not response.content
    # (raw bytes), since the regex patterns below are str patterns
    html = response.text
    titles = re.findall(r'<div class="detail-frame">.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    # For the covers, a regex on the src attribute alone is enough; the
    # slice skips the first <img> on the page, which is not a book cover
    imgs = re.findall(r'img src="(.*?)"', html, re.DOTALL)[1:]
    for title, img in zip(titles, imgs):
        with open('pic/' + title + '.jpg', 'wb') as f:
            f.write(requests.get(img).content)


if __name__ == '__main__':
    if not os.path.exists('pic'):
        os.mkdir('pic')
    spider()
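The text-versus-content distinction in the comment above is worth spelling out. A small sketch of the two response attributes; the rule of thumb is str for parsing, bytes for saving binary files:

import requests

response = requests.get('https://book.douban.com/latest')

# .text is the body decoded to str using the detected encoding:
# the right choice for regex matching and HTML parsing
print(type(response.text))      # <class 'str'>

# .content is the raw bytes: the right choice for binary payloads
# such as image files
print(type(response.content))   # <class 'bytes'>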

3. Scraping classical Chinese poems from gushiwen.org

import requests
import re


def parse_page(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    text = response.text
    # Poem titles
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # The author's dynasty
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # The author's name
    authors = re.findall(r'<p\sclass="source">.*?<a.*?><a.*?>(.*?)</a>', text, re.DOTALL)
    # Poem bodies; the tags left inside are stripped out below
    content_tags = re.findall(r'<div\sclass="contson" .*?>(.*?)</div>', text, re.DOTALL)
    contents = []
    for content in content_tags:
        x = re.sub(r'<.*?>', "", content)
        contents.append(x.strip())
    poems = []
    for title, dynasty, author, content in zip(titles, dynasties, authors, contents):
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)
    for poem in poems:
        print(poem)
        print('=' * 50)


def main():
    for x in range(1, 20):
        url = 'https://www.gushiwen.org/default_%s.aspx' % x
        parse_page(url)


if __name__ == '__main__':
    main()
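The re.sub call that cleans the poem bodies is a general trick for flattening HTML to plain text. A toy example (the snippet below is made up, standing in for one contson div's inner HTML):

import re

content = '床前明月光，<br />疑是地上霜。<br /><p>举头望明月，低头思故乡。</p>'

# The lazy <.*?> matches one tag at a time, so re.sub deletes the markup
# while leaving the poem text itself intact
plain = re.sub(r'<.*?>', '', content).strip()
print(plain)  # 床前明月光，疑是地上霜。举头望明月，低头思故乡。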

4. Scraping images from xiaohuar.com

import os
import re

import requests

# Folder where downloaded images are stored
DOWNLOAD_DIR = 'downloads'


# Save one image under DOWNLOAD_DIR
def save_image(image, name='temp'):
    fpath = os.path.join(DOWNLOAD_DIR, name + '.jpg')
    data = requests.get('http://www.xiaohuar.com/d/file/' + image).content
    with open(fpath, 'wb') as f:
        f.write(data)


# Collect the (name, relative image path) pairs on one listing page
def get_images(page_url):
    page = requests.get(page_url)
    # The site serves GBK-encoded pages, so set the encoding before
    # reading page.text
    page.encoding = 'gbk'
    imglist = re.findall(r'alt="(.*?)" src="/d/file/(.*?\.jpg)"', page.text)
    for name, url in imglist:
        print(url, name)
        save_image(url, name)


# Create the download folder if it does not exist yet
if not os.path.exists(DOWNLOAD_DIR):
    os.mkdir(DOWNLOAD_DIR)

# Crawl listing pages list-1-0.html through list-1-3.html
for page_num in range(0, 4):
    get_images('http://www.xiaohuar.com/list-1-%s.html' % page_num)
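Setting page.encoding = 'gbk' by hand works here because we happen to know the site's charset. When the charset is unknown, one sketch (using requests' apparent_encoding, which guesses from the body bytes) looks like this:

import requests

response = requests.get('http://www.xiaohuar.com/list-1-0.html')

# requests picks the encoding from the HTTP headers; when the server
# declares none, Chinese pages often decode garbled
print(response.encoding)

# apparent_encoding is guessed from the body bytes; assigning it
# (or 'gbk' explicitly, as above) fixes the decoding of .text
response.encoding = response.apparent_encoding
html = response.text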


Reposted from www.cnblogs.com/min-R/p/10506752.html