Crawling a resume-template website

# Crawl the free resume templates from the Chinaz webmaster-material site
# and save each template archive as a .rar file in the current directory.
import random

import requests
from lxml import etree

# 'Connection: close' makes each request release its connection immediately,
# so the connection pool is not exhausted over many downloads.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'),
    'Connection': 'close',
}

# The first listing page has a different URL from the other pages.
URL_PAGE_ONE = 'http://sc.chinaz.com/jianli/free.html'
# Generic URL for listing pages 2, 3, ...
URL_TEMPLATE = 'http://sc.chinaz.com/jianli/free_%d.html'


def fetch_html(url):
    """Return the decoded HTML text of *url* (the site serves utf-8)."""
    response = requests.get(url=url, headers=HEADERS)
    response.encoding = 'utf-8'
    return response.text


def download_template(detail_url, name):
    """Download one resume template.

    Fetches the detail page at *detail_url*, picks one of its download
    mirror links at random, and writes the archive to ``<name>.rar``.
    """
    detail_page_text = requests.get(url=detail_url, headers=HEADERS).text
    tree = etree.HTML(detail_page_text)

    # Every <li> in the download list is a mirror of the same archive;
    # choose one at random to spread the load across mirrors.
    li_list = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')
    download_url = random.choice(li_list).xpath('./a/@href')[0]

    # .content yields the raw bytes of the archive.
    data = requests.get(url=download_url, headers=HEADERS).content
    filename = name + '.rar'
    with open(filename, 'wb') as fp:
        fp.write(data)
    print(filename, 'download successful')


def main():
    """Prompt for a page range and download every template in it."""
    start_page = int(input('enter a start page num:'))
    end_page = int(input('enter a end page num:'))

    for page in range(start_page, end_page + 1):
        url = URL_PAGE_ONE if page == 1 else URL_TEMPLATE % page
        page_text = fetch_html(url)

        # Each <div> under #container is one template card holding the
        # detail-page link and the template's display name.
        tree = etree.HTML(page_text)
        for div in tree.xpath('//div[@id="container"]/div'):
            detail_url = div.xpath('./p/a/@href')[0]
            name = div.xpath('./p/a/text()')[0]
            download_template(detail_url, name)


if __name__ == '__main__':
    main()
        

 

Source: www.cnblogs.com/kenD/p/11111669.html