# urllib: a simple crawler for Baidu Tieba (Post Bar) pages

import time
import urllib
import urllib.parse
import urllib.request
from urllib import request

# Send the request for the given url and return the server's response body
def loadPage(url, filename):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Full URL to request.
        filename: Label used only in the progress message; nothing is
            written to disk by this function.

    Returns:
        The response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated unchanged from ``urlopen`` when
            the request fails.
    """
    print('downloading' + filename)
    # Send a browser-like User-Agent instead of urllib's default one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    req = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req) as response:
        return response.read()

# Write the html content to a local file
def writePage(html, filename):
    """Save ``html`` to ``filename``, overwriting any existing file.

    Args:
        html: Page content as ``bytes`` (the file is opened in binary mode).
        filename: Destination path; its parent directory must already exist.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print('saving' + filename)
    with open(filename, 'wb') as f:
        f.write(html)
    print(' ------------------------------- ')


# Alias for the capitalised spelling this definition previously carried;
# the call site in this file uses ``writePage``.
WritePage = writePage



# URL pattern of the listing pages (pn grows by 50 per page):
# http://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5    first page
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0    first page again: following the pattern below, this URL is equivalent to the one above
#
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50   second page
#
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100  third page
#
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=150  fourth page
#
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=200  fifth page



# Build the url of each page, then download and save it
def tiebaSpider(url, beginPage, endPage, saveDir='D:/yemian'):
    """Download pages ``beginPage``..``endPage`` (inclusive) and save each one.

    Args:
        url: Base URL that already contains the query string (for example
            ``http://tieba.baidu.com/f?kw=python``); ``&pn=<offset>`` is
            appended for every page.
        beginPage: First page number to fetch (1-based).
        endPage: Last page number to fetch, inclusive. Nothing is fetched
            when it is smaller than ``beginPage``.
        saveDir: Directory the pages are written to. It must already exist.
    """
    for page in range(beginPage, endPage + 1):
        # Page N is addressed by the offset pn = (N - 1) * 50.
        pn = (page - 1) * 50
        filename = saveDir + '/page_' + str(page) + '.html'
        fullurl = url + '&pn=' + str(pn)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


# Script entry point: ask for the bar name and page range, then crawl.
if __name__ == '__main__':
    kw = input('Please enter the required crawling pages of Post Bar name:')
    # int() raises ValueError if the user types something that is not a number.
    beginPage = int(input('Please enter start:'))
    endPage = int(input('Please enter end page: '))

    url = 'http://tieba.baidu.com/f?'
    # urlencode percent-encodes the bar name so non-ASCII names are URL-safe.
    key = urllib.parse.urlencode({'kw': kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)

    # Kept inside the guard so importing this module neither prints nor sleeps.
    print('Thank you for using')
    # Presumably pauses so a console window stays visible before it closes.
    time.sleep(10)

  

# (blog page footer: "Guess you like")
#
# Origin: www.cnblogs.com/wshr210/p/11305159.html