Python3 获取CSDN博客所有文章标题、阅读数及评论数

#coding=utf-8
import re
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

def getHtml(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: If the connection fails, the request
            times out, or the server answers with a 4xx/5xx status.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    # requests waits indefinitely by default; bound the wait so one stalled
    # connection cannot hang the whole crawl.
    page = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    page.raise_for_status()
    return page.text

def parseCount(text, default=0):
    """Return the first run of decimal digits in *text* as an int.

    Args:
        text: Text expected to contain a number (e.g. a read counter).
        default: Value returned when *text* contains no digits.

    Returns:
        The first number found in *text*, or *default* if there is none, so
        a paragraph without a number cannot abort the crawl with an
        IndexError.
    """
    match = re.search(r'\d+', text)
    return int(match.group()) if match else default


def main():
    """Crawl the blog's article-list pages and print a table plus totals.

    Pages 1 through 17 of the hard-coded blog are downloaded with
    ``getHtml``. Titles, read counts and comment counts are gathered with
    three independent CSS selectors, then printed as a PrettyTable followed
    by the summed read and comment counts.
    """
    import sys  # only needed for the length-mismatch warning below

    titles = []
    reads = []
    comments = []
    table = PrettyTable(['NO.','文章标题','阅读数','评论数'])

    for page in range(1, 18):
        url = "https://blog.csdn.net/qq523176585/article/list/{}".format(page)
        soup = BeautifulSoup(getHtml(url), "html.parser")

        # NOTE(review): the first match of each selector is skipped, exactly
        # as before; presumably it is not a real article entry -- confirm
        # against the current page markup.
        for anchor in soup.select('h4 > a[href]')[1:]:
            # Keep only the last line of the link text, trimmed.
            titles.append(anchor.text.split('\n')[-1].strip())
        for paragraph in soup.select('div > p:nth-of-type(2)')[1:]:
            reads.append(parseCount(paragraph.text))
        for paragraph in soup.select('div > p:nth-of-type(3)')[1:]:
            comments.append(parseCount(paragraph.text))

    # The three lists are matched purely by position, so a length mismatch
    # means the selectors no longer line up. Report it instead of crashing
    # with an IndexError while building the table.
    if not (len(titles) == len(reads) == len(comments)):
        print(
            "警告: 标题({})、阅读数({})、评论数({})数量不一致, 表格按最短的一列截断".format(
                len(titles), len(reads), len(comments)
            ),
            file=sys.stderr,
        )

    rows = zip(titles, reads, comments)
    for number, (title, read_count, comment_count) in enumerate(rows, start=1):
        table.add_row([number, title, read_count, comment_count])

    print(table)
    print("总阅读数:{}\n总评论数:{}".format(sum(reads), sum(comments)))


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/qq523176585/article/details/82891782