Added a proxy function. The code is easy to follow, so I won't explain it at length.
import re
import time
import random

import requests
from requests import RequestException
from bs4 import BeautifulSoup


# Fetch a page and return its HTML
def get_response(url):
    try:
        headers = {
            'Referer': 'https://blog.csdn.net',  # pretend the request came from a CSDN blog search
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'  # disguise as a browser
        }
        # Pool of proxy IPs
        proxy_list = [
            {"http": "http://218.60.8.99:3129"},
            {"http": "http://114.226.244.78:9999"},
            {"http": "http://39.137.95.71:80"},
            {"http": "http://115.159.31.195:8080"},
            {"http": "http://39.137.69.7:8080"},
            {"http": "http://39.106.66.178:80"},
            {"http": "http://101.4.136.34:81"},
            # newly added
            {"http": "http://1.197.10.199:9999"},
            {"http": "http://115.216.79.93:9999"},
            {"http": "http://123.149.136.215:999"},
            {"http": "http://39.108.92.182:8888"},
            {"http": "http://221.1.200.242:43399"},
            {"http": "http://175.42.123.88:9999"},
            {"http": "http://223.241.119.0:9999"},
            {"http": "http://59.44.78.30:54069"},
            {"http": "http://114.104.185.114:9999"},
            {"http": "http://163.204.247.84:9999"},
            {"http": "http://123.149.141.128:9999"},
            {"http": "http://223.215.6.181:9999"},
            {"http": "http://106.85.143.27:9999"},
            {"http": "http://123.163.27.131:9999"},
            {"http": "http://61.145.4.204:9999"},
            {"http": "http://183.166.162.198:9999"},
            {"http": "http://110.243.2.57:9999"},
        ]
        proxy = random.choice(proxy_list)
        response = requests.get(url, headers=headers, proxies=proxy)
        if response.status_code == requests.codes.ok:  # 200; requests ships a built-in status-code lookup
            return response.text
        return None
    except RequestException:
        print('request error')
        return None


# Collect the links of all articles on one list page. If the page uses a different
# encoding, pass from_encoding='UTF-8' when creating the soup object.
def get_url(html, u_name):
    url_list = []
    num = re.findall(r'<div.*?article-item-box csdn-tracking-statistics.*?data-articleid.*?(\d+).*?>', html)
    for x in range(len(num)):
        url = f'https://blog.csdn.net/{u_name}/article/details/{num[x]}'
        url_list.append(url)
    return url_list


# Count how many list pages the blog has (I haven't found a better way yet; to be improved)
def get_page(u_name):
    var = 1
    while True:
        url = f'https://blog.csdn.net/{u_name}/article/list/{var}'
        list1 = get_url(get_response(url), u_name)
        if len(list1):
            var += 1
        else:
            break
    return var - 1


# Get the total read count across all articles
def get_all(html):
    read_num = int(re.compile(r'<dl.*?text-center.*?title.*?(\d[0-9][0-9][0-9]*).*?>').search(html).group(1))
    return read_num


# Get the read count of a single article
def parse_page(html):
    try:
        read_num = int(re.compile(r'<span.*?read-count.*?(\d+).*?</span>').search(html).group(1))
        return read_num
    except Exception:
        print('parse error')
        return None


# Get an article's title
def get_name(url):
    html = get_response(url)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title.string


# Entry point
def main():
    url_old = []  # links of the user's article-list pages
    url_new = []  # links of every single article
    var_all = 0   # running total of reads gained over all rounds
    user_name = input("Please enter your CSDN user name: ")
    page_num = get_page(user_name)
    print(f'Your blog has {page_num} pages in total')
    # Build the list of all article links
    for num in range(page_num):
        temp = num + 1
        url_old.append(f'https://blog.csdn.net/{user_name}/article/list/{temp}')
        url_new += get_url(get_response(url_old[num]), user_name)
    art_num = len(url_new)
    print(f'You currently have {art_num} blog articles')
    var1 = get_all(get_response(url_new[0]))  # total read count before the run starts
    print('Current total read count:', var1)
    while True:
        for x in range(len(url_new)):
            html = get_response(url_new[x])
            read_num = parse_page(html)
            print('Current read count:', read_num)
            if art_num < 40:
                sleep_time = random.randint(60, 65)
            else:
                sleep_time = 1
            print('please wait', sleep_time, 's')
            time.sleep(sleep_time)  # throttle requests: visiting too often trips the anti-crawler
            print(f'Article {x + 1}/{art_num}:')
            print(get_name(url_new[x]), 'has been visited successfully')
        var2 = get_all(get_response(url_new[0]))  # total read count after this round
        print('Reads gained this round:', var2 - var1)
        var_all += (var2 - var1)
        var1 = var2  # reset the baseline so the next round's gain is not double-counted
        print(f'Total reads gained during this run: {var_all}')


if __name__ == '__main__':
    main()
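The free proxies in the pool above go stale quickly, and requests.get() will hang or raise on a dead one. A minimal sketch for pruning the pool once at startup, at the cost of one probe request per proxy; the helper name filter_alive_proxies and the httpbin.org test URL are my own choices, not part of the original script:

import requests
from requests import RequestException


def filter_alive_proxies(proxy_list, test_url='http://httpbin.org/ip', timeout=3):
    """Keep only the proxies that relay a small request within the timeout."""
    alive = []
    for proxy in proxy_list:
        try:
            r = requests.get(test_url, proxies=proxy, timeout=timeout)
            if r.status_code == requests.codes.ok:
                alive.append(proxy)
        except RequestException:
            pass  # dead, slow, or blocking proxy: drop it
    return alive

get_response() could then draw from the filtered list with random.choice() instead of the raw one.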
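The regexes in get_all() and parse_page() are tied to the exact HTML CSDN served at the time of writing and fail silently when the markup changes. A hedged alternative for parse_page() using the BeautifulSoup parser the script already imports; the 'read-count' class name is an assumption carried over from the original regex:

from bs4 import BeautifulSoup


def parse_page_bs(html):
    """Read the per-article view counter from the page, or return None."""
    soup = BeautifulSoup(html, 'html.parser')
    span = soup.find('span', class_='read-count')  # assumed class name, as in the original regex
    if span is None:
        return None
    digits = ''.join(ch for ch in span.get_text() if ch.isdigit())
    return int(digits) if digits else None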
From: https://blog.csdn.net/solitudi/article/details/104209520