Python brush CSDN blog script v2.0

__author__ = 'change'
# coding=utf-8

"""
 ** Python Blog's Visit Count V2.0
 ** (V1.0 http://blog.csdn.net/change518/article/details/14108511)
 ** By change
 ** 2015.11.4
 ** http://blog.csdn.net/change518
 ** First walk the article list pages and extract the address of each post.
 ** Then build HTTP requests for those addresses, using threads to speed things up.
 ** Every article in the blog is visited in order to raise its view count.
 ** Because of caching, the counts are only updated after some delay.
"""

import urllib2
import re
import datetime
import Queue
import threading

# record the start time of the program running
startTime = datetime.datetime.now()

# Threads
threadNum = 10
threadList = []

# All article link address list
myList = list()
myLinks = Queue.Queue()

# add request header
requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}

# Walk the paginated article index (pages 1-4) and collect every article path.
# The capture group pulls the path straight out of the title anchor, so a single
# findall per page is enough; the pattern is compiled once, outside the loop.
linkPattern = re.compile(
    r'<span class="link_title"><a href="(/change518/article/details/\d{7,8})">')

for i in range(1, 5):
    url = "http://blog.csdn.net/change518/article/list/" + str(i) + "?viewmode=contents"
    request = urllib2.Request(url, headers=requestHeader)
    response = urllib2.urlopen(request)
    try:
        htmlResult = response.read()
    finally:
        # Python 2 urllib2 responses cannot be used in a `with` statement,
        # so release the connection explicitly.
        response.close()

    myList += linkPattern.findall(htmlResult)

# Turn each collected path into an absolute URL and queue it for the workers
for linkAddress in myList:
    myLinks.put('http://blog.csdn.net' + linkAddress)


def mySpiderThread(j):
    """
    Worker loop: take URLs from the shared ``myLinks`` queue until it is
    empty and request each one ten times.

    :param j: index of this worker; only used to tag the progress output
    :return: None, once no more links are available
    """
    while True:
        # Checking empty() and then calling a blocking get() is racy: another
        # worker can take the last link in between, which would leave this
        # thread blocked forever and the main thread stuck in join().
        # A non-blocking get either yields a link or raises Queue.Empty.
        try:
            singleLink = myLinks.get_nowait()
        except Queue.Empty:
            break

        request = urllib2.Request(singleLink, headers=requestHeader)
        print(singleLink + " :" + str(j))
        for _ in range(10):
            try:
                response = urllib2.urlopen(request)
            except urllib2.URLError as e:
                # One unreachable article should not kill the whole worker;
                # report it and move on to the next queued link.
                print("request failed for " + singleLink + ": " + str(e))
                break
            # Close the response so the connection is not leaked
            response.close()


# Build the pool: one worker per index, each told its own number
for workerIndex in range(threadNum):
    threadList.append(
        threading.Thread(target=mySpiderThread, args=(workerIndex,)))

# Launch every worker
for worker in threadList:
    worker.start()

# Block here until every worker has returned
for worker in threadList:
    worker.join()


"""
If nothing needs to run after all threads have finished (for example, timing the whole run),
the three for loops above can be collapsed into a single loop:

# Start threadNum threads
for i in range(threadNum):
    t = threading.Thread(target=mySpiderThread, args=(i,))
    t.start()

"""


print('Done')

# Timestamp taken once all workers have finished
endTime = datetime.datetime.now()
# Report the elapsed run time in whole seconds. timedelta.seconds is only the
# seconds component and drops whole days, so use total_seconds() instead,
# truncated to an int to keep the printed value an integer.
print(int((endTime - startTime).total_seconds()))



.




Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325524384&siteId=291194637