# Tieba_Spider (爬虫 / web crawler) — Python 3.x

import urllib
import urllib.request
import time

def loadPage(url, filename):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        filename: Name of the file the page will later be saved to. It is
            not used here; the parameter is kept so existing callers keep
            working.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    print ("the sys is loading the file you wanted")
    # Send a browser-like User-Agent along with the request.
    headers = {"User-Agent":" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
    # On Python 3, Request and urlopen live in urllib.request; the bare
    # urllib package has neither attribute.
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()

def writePage(html, filename):
    """Save page content to ``filename``, overwriting any existing file.

    Args:
        html: Page content. ``bytes`` (what ``response.read()`` returns on
            Python 3) is written unchanged in binary mode; ``str`` is
            written as UTF-8 text.
        filename: Path of the file to create.
    """
    print ("Writing the file" + filename)
    if isinstance(html, (bytes, bytearray)):
        # A text-mode file rejects bytes with TypeError, so raw response
        # data has to be written in binary mode.
        with open(filename, "wb") as myfile:
            myfile.write(html)
    else:
        # Explicit encoding so non-ASCII pages do not depend on the
        # platform's default codec.
        with open(filename, "w", encoding="utf-8") as myfile:
            myfile.write(html)

def tiebaSpider(url, startPage, endPage):
    """Download pages ``startPage`` through ``endPage`` (inclusive) to disk.

    For each page number an offset ``pn = (page - 1) * 50`` is appended to
    ``url``, the page is fetched with ``loadPage`` and saved with
    ``writePage`` as ``di<page>ye.html`` in the current directory.

    Args:
        url: Base URL that already contains the query string (e.g. the
            encoded ``kw=...`` part).
        startPage: First page number to fetch.
        endPage: Last page number to fetch; included in the run.
    """
    print ("we are ready to spidering")

    # endPage + 1 so the page the user named as the last one is fetched too.
    for page in range(startPage, endPage + 1):
        pn = (page - 1) * 50
        # A query parameter needs a key; a bare "&<number>" carries no
        # name. NOTE(review): "pn" is assumed to be the offset parameter
        # the site expects — confirm against a real listing URL.
        fullurl = url + "&pn=" + str(pn)
        print (fullurl)

        filename = "di" + str(page) + "ye.html"
        html = loadPage(fullurl,filename)

        writePage(html,filename)

        # Pause between requests so the server is not hit back-to-back.
        time.sleep(2)

    # Printed once, after every page has been processed.
    print ("Thanks for using")

# Script entry point. The guard must compare against "__main__" (with the
# trailing underscores); otherwise the body never runs.
if __name__ == "__main__":
    # urlencode lives in urllib.parse on Python 3; imported here so this
    # block does not rely on the file-level imports providing it.
    from urllib.parse import urlencode

    url = "http://tieba.baidu.com/f?"
    kw = input("please input what you wanted spider:")
    startpage = int(input("please input the startpage:"))
    endpage = int(input("please input the endpage:"))

    # Percent-encode the keyword so non-ASCII input is safe inside a URL.
    key = urlencode({"kw":kw})
    fullurl = url + key
    print (fullurl)
    tiebaSpider(fullurl, startpage, endpage)

# 猜你喜欢

# 转载自 blog.csdn.net/weixin_42694291/article/details/81166255