import re
import os
import os.path
import time
from urllib.request import urlopen
# Crawl the member index page of the Chinese Academy of Engineering
# (www.cae.cn) and save each member's introduction text to
# <dstDir>/<name>.txt, one UTF-8 file per member.

# Output directory for the per-member text files.
dstDir = 'YuanShi'
# exist_ok=True replaces the isdir() check followed by mkdir(), which could
# race with another process creating the directory in between.
os.makedirs(dstDir, exist_ok=True)

# Index page that lists every member with a link to their personal page.
startUrl = r'http://www.cae.cn/cae/html/main/col48/column_48_1.html'

# Seconds to wait on a single request, so that one stalled connection cannot
# hang the whole crawl.
requestTimeout = 30

# The patterns are compiled once, outside the loop.
# Link entries: group 1 is the href, group 2 the member's name. The groups are
# non-greedy / quote-bounded so that several <li> entries on the same line are
# matched separately instead of being merged into one.
linkPattern = re.compile(
    r'<li class="name_list"><a href="([^"]+)" target="_blank">(.+?)</a></li>')
# Introduction paragraphs. '.' does not match newlines, so only paragraphs
# that sit on a single line are captured (same as the original pattern).
paraPattern = re.compile(r'<p>(.+?)</p>')
# Markup removed from the introduction: embedded <a>...</a> elements and
# non-standard spaces.
# NOTE(review): the published pattern contained blank characters here,
# presumably '&ensp;' / '&nbsp;' entities decoded by the page the code was
# copied from. Both the entities and the characters they decode to (U+2002,
# U+00A0) are stripped; ordinary ASCII spaces are kept. Confirm against
# real pages.
junkPattern = re.compile(r'<a.+?</a>|&ensp;|&nbsp;|[\u2002\u00a0]')

with urlopen(startUrl, timeout=requestTimeout) as fp:
    indexHtml = fp.read().decode('utf-8')

# Extract every member link from the index page and visit each one.
for perUrl, name in linkPattern.findall(indexHtml):
    # Progress output: the link exactly as it appears in the index page.
    print(perUrl)
    # Strip <h3> tags from the captured name (added by the original author
    # after a first crawl).
    name = name.replace('<h3>', '').replace('</h3>', '')
    name = os.path.join(dstDir, name)
    # lstrip('/') avoids 'http://www.cae.cn//...' for root-relative hrefs.
    perUrl = r'http://www.cae.cn/' + perUrl.lstrip('/')
    try:
        with urlopen(perUrl, timeout=requestTimeout) as fp:
            pageHtml = fp.read().decode('utf-8')
    except (OSError, UnicodeDecodeError) as err:
        # URLError, HTTPError and socket timeouts are all OSError subclasses.
        # Skip this member rather than aborting the remaining downloads.
        print('skipped', perUrl, err)
        continue
    # findall returns the text of every matching paragraph as a list.
    paragraphs = paraPattern.findall(pageHtml)
    if paragraphs:
        intro = junkPattern.sub('', '\n'.join(paragraphs))
        with open(name + '.txt', 'w', encoding='utf8') as fp:
            fp.write(intro)
# Python爬虫,获取中国工程院院士信息
# 猜你喜欢
# 转载自 blog.csdn.net/Jocks5/article/details/121716308
# 今日推荐
# 周排行