"""Simple page extraction: scrape (author, joke) pairs from a web page.

The page is downloaded with a browser-like User-Agent header and the
entries are pulled out of the raw HTML with regular expressions.
"""
from urllib import request
import re

# Page that is scraped when the script is run directly.
PAGE_URL = r'https://www.qiushibaike.com/'

# Browser-like User-Agent sent with the request so it mimics a real browser.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

# One entry: everything from the author header up to the stats footer.
# re.S lets "." match newlines, because the markup spans several lines.
_ITEM_RE = re.compile(
    r'<div class="author clearfix">.+?<div class="stats">', re.S)
_AUTHOR_RE = re.compile(r'<h2>(.+?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<span>(.+?)</span>', re.S)


def fetch_page(url=PAGE_URL, timeout=10):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # 1. Build a request that carries the browser-like header.
    rq = request.Request(url, headers={"User-Agent": USER_AGENT})
    # 2. Fetch the data; the ``with`` block closes the connection even when
    #    reading or decoding fails.
    with request.urlopen(rq, timeout=timeout) as response:
        return response.read().decode('utf-8')


def extract_duanzi(html: str) -> list:
    """Extract ``(author, content)`` tuples from the page HTML.

    Args:
        html: Raw HTML text of the page.

    Returns:
        A list of ``(author, content)`` string tuples in page order. Both
        values are stripped of surrounding whitespace, and every ``<br/>``
        inside the content is replaced by a newline. Entries lacking an
        ``<h2>`` author or a ``<span>`` body are skipped.
    """
    duanzi = []
    for item in _ITEM_RE.findall(html):
        author = _AUTHOR_RE.search(item)
        content = _CONTENT_RE.search(item)
        # An entry without both parts cannot produce a pair; skip it rather
        # than crash with AttributeError on ``None.group``.
        if author is None or content is None:
            continue
        text = content.group(1).strip().replace('<br/>', '\n')
        duanzi.append((author.group(1).strip(), text))
    return duanzi


def main():
    """Download the page and print the extracted (author, content) list."""
    duanzi = extract_duanzi(fetch_page())
    print(duanzi)


if __name__ == '__main__':
    main()
Python 简单页面提取
猜你喜欢
转载自blog.csdn.net/weixin_42218889/article/details/81483980
今日推荐
周排行