爬取博客文章列表

目标:爬取博客园首页的文章标题和URL

1,打开博客园首页( https://www.cnblogs.com/ ),用浏览器的“检查”功能查看网页元素,定位到博客文章所在的位置:
在这里插入图片描述
2,文章标题与URL都在class值为"post-item-title"的a标签中,可以用正则表达式过滤:

from urllib3 import *
from re import *

http = PoolManager()    # Use a connection pool to speed up responses.
disable_warnings()      # Silence urllib3's HTTPS security warnings.


def download(url):
    """Fetch ``url`` with a GET request and return the response body as text.

    The request is sent through the module-level ``http`` pool manager and
    the raw response bytes are decoded as UTF-8.
    """
    response = http.request('GET', url)
    return response.data.decode('utf-8')


def analyse(htmlStr):
    """Extract post links from an HTML page.

    Finds every ``<a>`` element whose opening tag contains
    ``post-item-title`` and whose content is plain text (no nested tags),
    and returns one dict per element.

    Args:
        htmlStr: The HTML document as a string.

    Returns:
        A list of ``{'url': ..., 'title': ...}`` dicts in document order.
        Anchors without an ``href`` attribute are skipped, so a title is
        never paired with a missing or stale URL.
    """
    # Raw strings keep regex escapes such as \s from being interpreted
    # (and warned about) as string escapes.
    anchors = findall(r'<a[^>]*post-item-title[^>]*>[^<]*</a>', htmlStr)
    result = []
    for anchor in anchors:
        # The first '>' closes the opening tag; the last '<' starts '</a>'.
        tag_end = anchor.find(">")
        text_end = anchor.rfind("<")
        # Look for href only inside the opening tag, not in the link text.
        href = search(r'href\s*=\s*[\'"]([^>\'"]*)[\'"]', anchor[:tag_end])
        if href is None:
            continue
        result.append({
            'url': href.group(1),
            'title': anchor[tag_end + 1:text_end],  # text between the tags
        })
    return result


def crawler(url):
    """Download the page at ``url`` and print each extracted post.

    For every entry returned by ``analyse`` the title is printed first,
    followed by its URL.
    """
    for entry in analyse(download(url)):
        print("title:", entry["title"])
        print("url:", entry["url"])


# Run the crawler against the cnblogs home page.
crawler('https://www.cnblogs.com')
  • 网页请求:urllib3库
  • 页面解析:正则表达式

3,运行:
在这里插入图片描述

猜你喜欢

转载自blog.csdn.net/dangfulin/article/details/107848020
今日推荐