博客园文章爬取(乱写的,有的文章爬不下来)

博客爬取(乱写的)

import re
import requests
# Blogs to scan: each entry pairs a display name ("name") with the URL of
# that person's cnblogs home page ("博客地址").
# NOTE(review): the entries for 甲 and 乙 point at the same URL — confirm
# whether that is intentional.
web = [
    {"name": "张三", "博客地址": "http://www.cnblogs.com/bladecheng/"},
    {"name": "甲", "博客地址": "http://www.cnblogs.com/pythonywy/"},
    {"name": "乙", "博客地址": "http://www.cnblogs.com/pythonywy/"},
    {"name": "丙", "博客地址": "http://www.cnblogs.com/zrx19960128/"},
    {"name": "丁", "博客地址": "http://www.cnblogs.com/itboy-newking/"},
    {"name": "帅哥", "博客地址": "http://www.cnblogs.com/chuwanliu/"},
    {"name": "浪哥", "博客地址": "http://www.cnblogs.com/einsam/"},
    {"name": "强哥", "博客地址": "http://www.cnblogs.com/wsxiaoyao"},
    {"name": "云哥", "博客地址": "http://www.cnblogs.com/yellowcloud/"},
]
# For every blog listed in `web`, download its page and print one line per
# post link found in the HTML, then print a completion message.
#
# Compiled once outside the loop. Captures the text between the literal
# `postTitle2" href="` and the next `</a>`; `.` does not match newlines by
# default, so only anchors whose URL and title sit on one line are matched.
post_link_pattern = re.compile(r'postTitle2" href="(.*?)</a>')

for blog in web:
    print("%s的博客文章地址如下:" % (blog["name"]))
    # A timeout keeps a single unresponsive server from stalling the whole
    # run; network errors (including timeouts) still propagate to the caller.
    response = requests.get(blog["博客地址"], timeout=10)
    for link_and_title in post_link_pattern.findall(response.text):
        # Each capture is the URL, then `">`, then the anchor text. Only the
        # first `">` is swapped for the label so that any later `">` inside
        # the anchor text is left untouched.
        print(link_and_title.replace('">', '  文章标题:', 1))
print("爬取完毕!")

猜你喜欢

转载自www.cnblogs.com/bladecheng/p/10883555.html
今日推荐