利用Python刷CSDN访问量

访问间隔最好在60秒以上:原文设置的60到75秒会出问题,我修改为90到120秒后运行正常。

测试结果:间隔设为90到120秒时,第17次访问仍然会出问题,是不是CSDN已经有拦截机制了?

代码如下:

# -*- coding:UTF-8 -*-
import re
import time
import random
import requests
import urllib.request
from bs4 import BeautifulSoup

# Browser-like request headers used for every HTTP request this script makes.
firefoxHead = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
# IPv4 dotted-quad pattern, each octet in 0-255. The separators are escaped
# (`\.`) so that only a literal dot is accepted between octets; an unescaped
# `.` would match any character and let strings such as "18-09-22 10:30" pass.
IPRegular = r"(([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}([1-9]?\d|1\d{2}|2[0-4]\d|25[0-5])"
# Site root, requested before each article page.
host = "https://blog.csdn.net"
# Article URL template; "{}" is filled with one of the ids in `codes`.
url = "https://blog.csdn.net/WASEFADG/article/details/{}"
# Article ids that can be substituted into `url`.
codes = ["88854724","88822550"]


def parseIPList(url="http://www.xicidaili.com/"):
    """Collect candidate proxy IP strings from the table cells of a web page.

    Args:
        url: Page to download; defaults to the xicidaili proxy listing.

    Returns:
        A list of strings, one per ``<td>`` cell whose whole (stripped) text
        matches ``IPRegular``, in document order. Empty if no cell matches.

    Raises:
        Any error raised by ``urllib.request.urlopen`` (e.g.
        ``urllib.error.URLError``) propagates to the caller.
    """
    request = urllib.request.Request(url, headers=firefoxHead)
    # The response is parsed while still open and then closed deterministically
    # instead of being left for the garbage collector.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response, "lxml")

    IPs = []
    for td in soup.find_all("td"):
        # `.string` is None when the cell has no single string child.
        if td.string is None:
            continue
        text = str(td.string).strip()
        # Require the whole cell to match: a cell that merely *contains* a
        # matching substring is not usable as a proxy host.
        if re.fullmatch(IPRegular, text):
            IPs.append(text)
    return IPs


def PV(code):
    """Request one article page over and over, pausing 90-120 s between rounds.

    Each round re-scrapes the proxy list, picks one entry at random, requests
    the site root and then the article page, and prints the text of the third
    ``<span>`` found in the article HTML.

    Args:
        code: Article id substituted into the module-level ``url`` template.

    The loop has no exit condition: the function only stops when it is
    interrupted or when a request raises.
    """
    s = requests.Session()
    s.headers = firefoxHead
    count = 0
    while True:
        count += 1
        print("正在进行第{}次访问\t".format(count), end="\t")
        IPs = parseIPList()
        if IPs:
            # Sample from the first 41 entries (indices 0-40, as before), but
            # never index past the end when fewer were scraped.
            proxy_ip = random.choice(IPs[:41])
            # NOTE(review): only the "http" scheme is mapped, while `host` and
            # `url` are https, so these requests presumably bypass the proxy;
            # the port is also fixed at 8080 rather than taken from the proxy
            # list. Confirm intent before adding an "https" entry, since that
            # changes which requests can fail.
            s.proxies = {"http": "{}:8080".format(proxy_ip)}
            s.get(host)
            r = s.get(url.format(code))
            soup = BeautifulSoup(r.text, "html.parser")
            spans = soup.find_all("span")
            if len(spans) > 2:
                print(spans[2].string)
            else:
                # The page did not have the expected layout; report it instead
                # of crashing with IndexError.
                print("页面中未找到预期的span元素")
        else:
            # No proxy could be scraped this round; skip the visit and retry
            # after the usual pause.
            print("未获取到代理IP,稍后重试")
        time.sleep(random.randint(90, 120))


def main():
    """Script entry point: start the visit loop for the first id in ``codes``."""
    first_code = codes[0]
    PV(first_code)


# Only start the loop when executed as a script, not when imported.
if __name__ == "__main__":
    main()


转载自:https://blog.csdn.net/CY19980216/article/details/82825833#commentBox

猜你喜欢

转载自blog.csdn.net/WASEFADG/article/details/88892909