# Python 简单页面提取 (Python: simple page extraction)

#页面简单的提取
from urllib import request
import re

# Scrape the qiushibaike.com front page and collect (author, content) pairs.
#
# Steps:
#   1. Build a request carrying a browser-like User-Agent header.
#   2. Download the page and decode it as UTF-8.
#   3. Cut the HTML into one fragment per post with a regex.
#   4. Pull the author (<h2>) and the text (<span>) out of each fragment.

# Patterns are compiled once instead of being rebuilt on every loop iteration.
# re.S lets "." match newlines, because each fragment spans several lines.
_ITEM_RE = re.compile(
    r'<div class="author clearfix">.+?<div class="stats">', re.S)
_AUTHOR_RE = re.compile(r'<h2>(.+?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<span>(.+?)</span>', re.S)


def _parse_item(item):
    """Return ``(author, content)`` for one post fragment, or ``None``.

    ``None`` is returned when the fragment has no ``<h2>`` author or no
    ``<span>`` content, so a single malformed entry does not abort the run
    with an ``AttributeError`` on a failed match.
    """
    author = _AUTHOR_RE.search(item)
    content = _CONTENT_RE.search(item)
    if author is None or content is None:
        return None
    # Strip surrounding whitespace first, then turn HTML line breaks into
    # real newlines (same order as the original script).
    text = content.group(1).strip().replace('<br/>', '\n')
    return (author.group(1).strip(), text)


# 1. Build the request with an explicit browser User-Agent header.
rq = request.Request(r'https://www.qiushibaike.com/',headers={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"})

# 2. Fetch the page; the context manager closes the HTTP response even if
#    reading or decoding fails.
with request.urlopen(rq) as data:
    string = data.read().decode('utf-8')

# 3. One fragment per post: from the author header up to the stats block.
items = _ITEM_RE.findall(string)

# 4. Keep only the fragments that yielded both an author and a content match.
duanzi = [pair for pair in map(_parse_item, items) if pair is not None]

print(duanzi)

# 猜你喜欢 (You may also like)
#
# 转载自 (reposted from) blog.csdn.net/weixin_42218889/article/details/81483980