Python 3.7: automatically boost blog traffic (only a user ID is needed) (repost)

A proxy feature has been added. The code is easy to follow, so there is not much extra to explain.

import re
import requests
from requests import RequestException
import time
import random
from bs4 import BeautifulSoup


# Get the response page
def get_response(url):
    try:
        headers = {
            'Referer': 'https://blog.csdn.net',  # pretend the request comes from a CSDN blog page
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36'
            # disguise as a browser
        }
        # set proxy IPs
        proxy_list = [
            {"http": "http://218.60.8.99:3129"},
            {"http": "http://114.226.244.78:9999"},
            {"http": "http://39.137.95.71:80"},
            {"http": "http://115.159.31.195:8080"},
            {"http": "http://39.137.69.7:8080"},
            {"http": "http://39.106.66.178:80"},
            {"http": "http://101.4.136.34:81"},
            # recently added
            {"http": "http://1.197.10.199:9999"},
            {"http": "http://115.216.79.93:9999"},
            {"http": "http://123.149.136.215:999"},
            {"http": "http://39.108.92.182:8888"},
            {"http": "http://221.1.200.242:43399"},
            {"http": "http://175.42.123.88:9999"},
            {"http": "http://223.241.119.0:9999"},
            {"http": "http://59.44.78.30:54069"},
            {"http": "http://114.104.185.114:9999"},
            {"http": "http://163.204.247.84:9999"},
            {"http": "http://123.149.141.128:9999"},
            {"http": "http://223.215.6.181:9999"},
            {"http": "http://106.85.143.27:9999"},
            {"http": "http://123.163.27.131:9999"},
            {"http": "http://61.145.4.204:9999"},
            {"http": "http://183.166.162.198:9999"},
            {"http": "http://110.243.2.57:9999"},
        ]
        proxy = random.choice(proxy_list)
        response = requests.get(url, headers=headers, proxies=proxy)
        if response.status_code == requests.codes.ok:  # status code 200; requests also provides a built-in status-code lookup object
            return response.text
        return None
    except RequestException:
        print('request error')
        return None
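
# Note: requests.get() above is called without a timeout, so a single dead proxy can make
# the script hang indefinitely. A possible tweak (an assumption, not part of the original
# code) is
#     response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
# so that an unresponsive proxy raises an exception the except branch already catches.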


# Get all of the blog's article links; when creating the soup object below, if the page uses a different encoding, add from_encoding='UTF-8'
def get_url(html, u_name):
    url_list = []
    num = re.findall(r'<div.*?article-item-box csdn-tracking-statistics.*?data-articleid.*?(\d+).*?>', html)
    for x in range(len(num)):
        url = f'https://blog.csdn.net/{u_name}/article/details/{num[x]}'
        url_list.append(url)
    return url_list


# Query how many pages the blog has (haven't thought of a better way yet; this will be improved later)
def get_page(u_name):
    var = 1
    while True:
        url = f'https://blog.csdn.net/{u_name}/article/list/{var}'
        list1 = get_url(get_response(url), u_name)
        if len(list1):
            var += 1
        else:
            break
    return var - 1


# Get the total read count across all articles
def get_all(html):
    read_num = int(re.compile(r'<dl.*?text-center.*?title.*?(\d[0-9][0-9][0-9]*).*?>').search(html).group(1))
    return read_num


def parse_page(html):
    try:
        read_num = int(re.compile('<span.*?read-count.*?(\d+).*?</span>').search(html).group(1))
        return read_num
    except Exception:
        print('parse error')
        return None
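
# Example (illustrative only, not actual CSDN markup): the regex in parse_page would
# extract 123 from a snippet such as
#     <span class="read-count">123</span>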


# Get the title of each article
def get_name(url):
    html = get_response(url)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.title.string


# Entry point
def main():
    url_old = []         # stores the user's article-list page links
    url_new = []         # stores the link of every one of the user's articles
    var_all = 0          # var_all accumulates the total read-count gain of each round
    user_name = input("Please enter your CSDN user name: ")
    page_num = get_page(user_name)
    print(f'Your blog has {page_num} pages in total')
    # get the list of all articles
    for num in range(page_num):
        temp = num + 1
        url_old.append(f'https://blog.csdn.net/{user_name}/article/list/{temp}')
        url_new += get_url(get_response(url_old[num]), user_name)
    art_num = len(url_new)
    print(f'Your current number of blog articles: {art_num}')
    var1 = get_all(get_response(url_new[0]))         # var1 stores the total read count before boosting
    print('Current total read count:', var1)
    while True:
        for x in range(len(url_new)):
            html = get_response(url_new[x])
            read_num = parse_page(html)
            print('Current read count:', read_num)
            if art_num < 40:
                sleep_time = random.randint(60, 65)
            else:
                sleep_time = 1
            print('please wait', sleep_time, 's')
            time.sleep(sleep_time)   # throttle the visit frequency; visiting too often triggers the anti-crawler
            print(f'Article {x + 1}/{art_num}:')
            print(get_name(url_new[x]), 'has been visited successfully')
        var2 = get_all(get_response(url_new[0]))   # var2 stores the total read count after this round
        print('Read count gained in this round:', var2 - var1)
        var_all += (var2 - var1)
        print(f'Total read count gained during this run: {var_all}')
        var1 = var2   # update the baseline so the next round's gain is measured correctly


if __name__ == '__main__':
    main()
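
The free proxies in proxy_list go stale quickly, and a dead one only wastes requests. Below is a minimal sketch (an addition to the original script, using only the requests library it already imports) that filters the list down to proxies that still respond; the test URL and the 5-second timeout are assumptions to adjust as needed. Because each entry only defines an 'http' proxy, the test URL should be a plain http:// address, otherwise requests bypasses the proxy entirely.

import requests

def check_proxies(proxy_list, test_url='http://httpbin.org/ip', timeout=5):
    # Return only the proxies that can fetch test_url within the timeout
    alive = []
    for proxy in proxy_list:
        try:
            r = requests.get(test_url, proxies=proxy, timeout=timeout)
            if r.status_code == requests.codes.ok:
                alive.append(proxy)
        except requests.RequestException:
            pass  # dead or unreachable proxy, skip it
    return alive

Calling check_proxies(proxy_list) once at startup and letting get_response pick only from the surviving entries would avoid repeatedly routing requests through proxies that no longer work.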

From: https://blog.csdn.net/solitudi/article/details/104209520

Origin www.cnblogs.com/gisoracle/p/12283877.html