Python crawler learning (1)

The pseudocode for the basic crawler framework (stand-alone version) is essentially a level-order traversal of a multi-way tree, that is, a breadth-first search of the web graph:

import queue

# Set the initial web page, the root of the tree
initial_page = "http://www.badtom.cn"

# Create the to-crawl queue and the already-crawled set
url_queue = queue.Queue()
seen = set()

# Initialize the to-crawl queue and the crawled set
seen.add(initial_page)
url_queue.put(initial_page)

# While the to-crawl queue is not empty
while not url_queue.empty():
    current_url = url_queue.get()                # take the first url from the to-crawl queue
    store(current_url)                           # store the web page this url points to
    for next_url in extract_urls(current_url):   # extract the other urls linked from this page
        if next_url not in seen:                 # deduplicate
            seen.add(next_url)                   # update the crawled set
            url_queue.put(next_url)              # update the to-crawl queue
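
To make the framework above actually runnable, the two pseudo-helpers store and extract_urls have to be filled in. The following is only a minimal sketch, assuming the standard library urllib and a naive regex for link extraction (here the helpers take the downloaded HTML instead of the url, and error handling, politeness delays and robots.txt are all left out):

import queue
import re
import urllib.request

def store(url, html):
    # Persist the page; this sketch just reports its size
    print("fetched %s: %d bytes" % (url, len(html)))

def extract_urls(html):
    # Very naive link extraction with a regex; a real crawler would use
    # an HTML parser and resolve relative links
    return re.findall(r'href="(http[^"]+)"', html)

def crawl(initial_page, max_pages=50):
    url_queue = queue.Queue()
    seen = {initial_page}
    url_queue.put(initial_page)
    while not url_queue.empty() and len(seen) <= max_pages:
        current_url = url_queue.get()
        try:
            html = urllib.request.urlopen(current_url, timeout=10).read().decode("utf-8", "ignore")
        except Exception:
            continue                      # skip pages that fail to download
        store(current_url, html)
        for next_url in extract_urls(html):
            if next_url not in seen:      # deduplicate
                seen.add(next_url)
                url_queue.put(next_url)

if __name__ == "__main__":
    crawl("http://www.badtom.cn")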

Pseudocode for the basic distributed crawler framework: the master node maintains the to-crawl queue and the crawled set, while the slave nodes process pages and extract new links to crawl:

# Master node: master.py
# Create the to-crawl queue and the deduplication structure bf
distributed_queue = DistributedQueue()
bf = BloomFilter()

# Add the initial page to the to-crawl queue
initial_page = "www.badtom.cn"
distributed_queue.put(initial_page)

# server listens for requests from the slave nodes and handles the responses
server = RPCServer()
server.listen()

# server handles incoming requests
while True:
    request = server.request()                 # receive a request from a slave node, blocking otherwise
    if request.head == 'GET':                  # a slave node asks for a task
        if distributed_queue.size() > 0:
            page = distributed_queue.get()     # take the head of the to-crawl queue
            request.sendResponse(page)         # send it to the slave node
            bf.put(page)                       # add it to the crawled set
        else:
            request.sendResponse(None)         # all pages have been crawled, stop the program
            break
    elif request.head == 'POST':               # a slave node reports back urls to crawl
        for next_url in request.urls:
            if next_url not in bf:             # add to the to-crawl queue only if not already crawled
                distributed_queue.put(next_url)
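
The bf structure above is a Bloom filter: a compact, probabilistic crawled-set that can give false positives but never false negatives, which is acceptable here because the worst case is skipping a url that was never actually crawled. Below is a minimal sketch of such a filter, assuming salted md5 hashing and the put / in interface used in the pseudocode (the class body and parameters are illustrative, not part of the original framework):

import hashlib

class BloomFilter:
    # Minimal Bloom filter: k salted hashes over an m-bit array
    def __init__(self, m=2**23, k=4):
        self.m = m                       # number of bits
        self.k = k                       # number of hash functions
        self.bits = bytearray(m // 8)

    def _positions(self, item):
        # Derive k bit positions from salted md5 digests of the item
        for salt in range(self.k):
            digest = hashlib.md5(("%d:%s" % (salt, item)).encode()).hexdigest()
            yield int(digest, 16) % self.m

    def put(self, item):
        for pos in self._positions(item):
            self.bits[pos // 8] |= 1 << (pos % 8)

    def __contains__(self, item):
        return all(self.bits[pos // 8] & (1 << (pos % 8))
                   for pos in self._positions(item))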
 
 
# Slave node: slave.py
# client is used to send requests to the master node
client = RPCClient()

# Fetch url tasks from the master node until the to-crawl queue is empty and the program stops
while (current_url := client.request_from_master('GET')) is not None:
    # Process the current page
    store(current_url)

    # Extract all links on this page
    to_send = []
    for next_url in extract_urls(current_url):
        to_send.append(next_url)

    # Send the links to be crawled back to the master node
    client.send_to_master('POST', to_send)
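
In practice, the custom DistributedQueue, BloomFilter and RPC layer are often replaced by a shared store such as Redis, which gives every slave node a common to-crawl queue and deduplication set without writing a master server by hand. The sketch below only illustrates that idea under the assumption that a Redis instance is reachable on localhost (the key names are made up, a plain Redis set stands in for the Bloom filter, and store / extract_urls are the same pseudo-helpers as in the framework above):

import redis

r = redis.Redis(host="localhost", port=6379)

def master_seed(initial_page="http://www.badtom.cn"):
    # Seed the shared to-crawl queue with the root page, once
    if r.sadd("seen", initial_page):            # sadd returns 1 only the first time
        r.lpush("to_crawl", initial_page)

def slave_loop():
    # Each slave pops urls, stores the page, and pushes unseen links back
    while True:
        task = r.brpop("to_crawl", timeout=30)  # block until a url is available
        if task is None:
            break                               # queue stayed empty: stop this worker
        current_url = task[1].decode()
        store(current_url)                           # pseudo-helper: download and persist the page
        for next_url in extract_urls(current_url):   # pseudo-helper: extract linked urls
            if r.sadd("seen", next_url):             # atomic "add if unseen" deduplication
                r.lpush("to_crawl", next_url)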
