import urllib
import urllib.request
import time
def loadPage(url, filename):
    """Fetch ``url`` and return the raw response body.

    The request is sent with a desktop Chrome ``User-Agent`` header instead
    of urllib's default one.

    Args:
        url: Address of the page to download.
        filename: Name the page will later be saved under. It is not used
            by this function; the parameter is kept so existing callers
            keep working.

    Returns:
        bytes: The undecoded response body.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    print("the sys is loading the file you wanted")
    headers = {"User-Agent":" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}
    # On Python 3, Request and urlopen live in urllib.request; the bare
    # urllib package exposes neither name.
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()
def writePage(html, filename):
    """Save downloaded page content to ``filename``, replacing any existing file.

    Args:
        html: Page content, either raw ``bytes`` (as returned by an HTTP
            response's ``read()``) or an already-decoded ``str``.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print("Writing the file" + filename)
    if isinstance(html, (bytes, bytearray)):
        # Raw response bodies are bytes; a text-mode file would reject them
        # with TypeError, so write them unchanged in binary mode.
        with open(filename, "wb") as myfile:
            myfile.write(html)
    else:
        # An explicit encoding keeps the output independent of the
        # platform's locale default.
        with open(filename, "w", encoding="utf-8") as myfile:
            myfile.write(html)
def tiebaSpider(url, startPage, endPage, delay=2):
    """Download pages ``startPage`` through ``endPage`` (inclusive) and save each one.

    For every page number an offset ``pn = (page - 1) * 50`` is appended to
    ``url`` as the ``pn`` query parameter, the page is fetched with
    ``loadPage`` and stored by ``writePage`` under ``di<page>ye.html`` in the
    current working directory.

    Args:
        url: Base URL that already contains a query string; ``&pn=<offset>``
            is appended to it.
        startPage: First page number to fetch.
        endPage: Last page number to fetch (included).
        delay: Seconds to pause after each page. Defaults to 2.
    """
    print("we are ready to spidering")
    # endPage + 1 so the page the user named as the last one is fetched too.
    for page in range(startPage, endPage + 1):
        # Offset grows by 50 per page — presumably the number of entries on
        # one listing page; confirm against the site if this changes.
        pn = (page - 1) * 50
        # The offset must be sent under its parameter name; a bare
        # "&<number>" is not a pn parameter.
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        filename = "di" + str(page) + "ye.html"
        html = loadPage(fullurl, filename)
        writePage(html, filename)
        # Pause between requests to avoid hitting the server back-to-back.
        time.sleep(delay)
    print("Thanks for using")
# Script entry point: ask for a forum keyword and a page range, then crawl it.
if __name__ == "__main__":
    # Imported here so this block does not rely on urllib.parse having been
    # loaded by some other import in the file.
    import urllib.parse

    url = "http://tieba.baidu.com/f?"
    kw = input("please input what you wanted spider:")
    startpage = int(input("please input the startpage:"))
    endpage = int(input("please input the endpage:"))
    # urlencode percent-escapes the keyword so that non-ASCII input still
    # yields a valid query string ("kw=...").
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    print(fullurl)
    tiebaSpider(fullurl, startpage, endpage)
# Tieba_Spider (crawler) (py2.xx)
# You may also like
# Reposted from blog.csdn.net/weixin_42694291/article/details/81166255
# Today's recommendations
# Weekly ranking