# coding=utf-8
"""
Python Blog's Visit Count V2.0
(V1.0: http://blog.csdn.net/change518/article/details/14108511)

By change, 2015.11.4
http://blog.csdn.net/change518

How it works:
  * Traverse the paginated article list and extract the address of each post.
  * Rebuild HTTP requests for those addresses and issue them from several
    threads to improve speed.
  * Visit every article of the blog, which raises its visit counter.
  * Due to caching, the displayed counts are only updated after a period
    of time.

Runs on Python 2 (urllib2 / Queue) and Python 3 (urllib.request / queue).
"""
__author__ = 'change'

import contextlib
import datetime
import re
import threading

try:  # Python 2 module names
    import Queue
    import urllib2
except ImportError:  # Python 3: same APIs under new module names
    import queue as Queue
    import urllib.request as urllib2

# Record the start time of the program.
startTime = datetime.datetime.now()

# Number of worker threads and the list holding them.
threadNum = 10
threadList = []

# Paths of all articles found, and the queue the workers consume.
myList = list()
myLinks = Queue.Queue()

# Request header sent with every HTTP request.
requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}

# Listing pages to read (inclusive) and how often each article is requested.
firstPage = 1
lastPage = 4
visitsPerLink = 10

# Seconds before a stalled request is abandoned instead of hanging a thread.
requestTimeout = 30

siteRoot = 'http://blog.csdn.net'

# Matches the title anchor of one article on a listing page; group 1 is the
# site-relative path of the article.
articleLinkPattern = re.compile(
    r'<span class="link_title"><a href="(/change518/article/details/\d{7,8})">')


def extractArticlePaths(htmlText):
    """
    Return the article paths found in the HTML of one listing page.

    :param htmlText: text of a listing page
    :return: list of site-relative paths, in document order
    """
    return articleLinkPattern.findall(htmlText)


def collectArticleLinks():
    """
    Read every listing page, store the article paths in ``myList`` and put
    the absolute article URLs into ``myLinks``.

    A listing page that cannot be fetched is reported and skipped so that
    the remaining pages are still processed.
    :return: None
    """
    for pageNumber in range(firstPage, lastPage + 1):
        url = "http://blog.csdn.net/change518/article/list/" + str(pageNumber) + "?viewmode=contents"
        request = urllib2.Request(url, headers=requestHeader)
        try:
            # closing() releases the connection even if read() fails.
            with contextlib.closing(
                    urllib2.urlopen(request, timeout=requestTimeout)) as response:
                # read() yields bytes on Python 3; decode so the text
                # pattern can be applied on both Python versions.
                htmlResult = response.read().decode('utf-8', 'replace')
        except IOError as error:
            # URLError, HTTPError and socket errors all derive from IOError.
            print("Failed to read " + url + ": " + str(error))
            continue
        myList.extend(extractArticlePaths(htmlResult))

    # Add all elements of the list to the queue.
    for linkAddress in myList:
        myLinks.put(siteRoot + linkAddress)


def mySpiderThread(j):
    """
    Worker: take URLs from ``myLinks`` until it is empty and request each
    one ``visitsPerLink`` times. The Queue synchronizes the threads.

    :param j: index of the worker, only used in the progress output
    :return: None
    """
    while True:
        try:
            # get_nowait() instead of empty() followed by get(): another
            # worker may take the last item between the two calls, and a
            # blocking get() would then wait forever and stall join().
            singleLink = myLinks.get_nowait()
        except Queue.Empty:
            return
        request = urllib2.Request(singleLink, headers=requestHeader)
        print(singleLink + " :" + str(j))
        for _ in range(visitsPerLink):
            try:
                # The body is not needed; just make sure the connection
                # is closed again.
                with contextlib.closing(
                        urllib2.urlopen(request, timeout=requestTimeout)):
                    pass
            except IOError as error:
                # One failed request must not end the whole worker.
                print("Failed to fetch " + singleLink + ": " + str(error))


def main():
    """
    Collect the article links, visit them from ``threadNum`` threads and
    print the elapsed time in whole seconds.
    :return: None
    """
    collectArticleLinks()

    # Create threadNum threads.
    for i in range(threadNum):
        threadList.append(threading.Thread(target=mySpiderThread, args=(i,)))

    # Start all threads.
    for worker in threadList:
        worker.start()

    # Block until all threads have ended.
    for worker in threadList:
        worker.join()

    # If nothing has to happen after all threads have finished (such as
    # measuring the total execution time), the three loops above can be
    # reduced to creating and immediately starting each thread:
    #
    #     for i in range(threadNum):
    #         t = threading.Thread(target=mySpiderThread, args=(i,))
    #         t.start()

    print('Done')

    # Record the end time and print the running time of the program.
    # total_seconds() is used because timedelta.seconds wraps after a day.
    endTime = datetime.datetime.now()
    print(int((endTime - startTime).total_seconds()))


if __name__ == '__main__':
    main()