First, a small example:
import urllib.parse
import urllib.request

# Target address to scrape
url = "http://www.baidu.com/s"
headers = {"User-Agent": "Mozilla..."}

# Read the query string from the user
keyword = input("Enter the string you want to search for: ")
wd = {"wd": keyword}

# URL-encode the query parameters
wd = urllib.parse.urlencode(wd)

# Assemble the full URL
fullUrl = url + "?" + wd
print(fullUrl)

# Build a request object
request = urllib.request.Request(fullUrl, headers=headers)

# Fetch the response
response = urllib.request.urlopen(request)
# print(response.read())
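Why is urlencode needed at all? Non-ASCII characters such as Chinese cannot appear directly in a URL, so urllib.parse.urlencode percent-encodes every key/value pair and joins the pairs with "&". A minimal sketch of what it produces (the keyword "爬虫" is just an illustrative value, not from the original example):

import urllib.parse

# urlencode takes a dict and returns a percent-encoded query string
params = urllib.parse.urlencode({"wd": "爬虫"})
print(params)  # wd=%E7%88%AC%E8%99%AB

# unquote reverses the percent-encoding on a plain string
print(urllib.parse.unquote(params))  # wd=爬虫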
A small Tieba spider example
Crawl a given Tieba forum
import urllib.parse
import urllib.request


def loadPage(url, filename):
    """
    Send a request to the given url and return the server's response.
    url: the url to crawl
    filename: name of the file being processed (for logging)
    """
    print("Downloading " + filename)
    headers = {"User-Agent": "Mozilla..."}
    # Build the request object
    request = urllib.request.Request(url, headers=headers)
    # Return the response body (bytes)
    response = urllib.request.urlopen(request)
    return response.read()


def writePage(html, filename):
    """
    Write the html content to a local file.
    html: response body returned by the server (bytes)
    filename: the file to write
    """
    print("Saving " + filename)
    # "with" closes the file automatically, so no explicit close() is needed;
    # "wb" because the response body is bytes
    with open(filename, "wb") as f:
        f.write(html)
    print("-" * 30)


def tiebaSpider(url, beginPage, endPage):
    """
    Spider scheduler: builds and processes the url of every page.
    url: the fixed front part of the Tieba url
    beginPage: first page to crawl
    endPage: last page to crawl
    """
    for page in range(beginPage, endPage + 1):
        # Tieba shows 50 posts per page, so pn is the post offset
        pn = (page - 1) * 50
        filename = "page_" + str(page) + ".html"
        fullurl = url + "&pn=" + str(pn)
        print(fullurl)
        html = loadPage(fullurl, filename)
        writePage(html, filename)


if __name__ == "__main__":
    kw = input("Enter the name of the Tieba forum to crawl: ")
    beginPage = int(input("Enter the first page: "))
    endPage = int(input("Enter the last page: "))

    url = "http://tieba.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
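In a real crawl some requests will time out or be rejected, and the spider above would simply crash. A hedged sketch of how loadPage could be hardened with urllib.error handling (the retry count, timeout, and backoff values here are illustrative choices, not part of the original tutorial):

import time
import urllib.error
import urllib.request

def loadPageSafe(url, filename, retries=3):
    """Like loadPage, but retries on transient network errors.
    retries=3 and the 2-second backoff are illustrative values."""
    headers = {"User-Agent": "Mozilla..."}
    request = urllib.request.Request(url, headers=headers)
    for attempt in range(retries):
        try:
            response = urllib.request.urlopen(request, timeout=10)
            return response.read()
        except urllib.error.URLError as e:
            # URLError also covers HTTPError, so both failure kinds land here
            print("Attempt %d failed: %s" % (attempt + 1, e))
            time.sleep(2)
    raise RuntimeError("Failed to download " + url)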
Result:
Enter the name of the Tieba forum to crawl: python
Enter the first page: 1
Enter the last page: 3
http://tieba.com/f?kw=python&pn=0
Downloading page_1.html
Saving page_1.html
------------------------------
http://tieba.com/f?kw=python&pn=50
Downloading page_2.html
Saving page_2.html
------------------------------
http://tieba.com/f?kw=python&pn=100
Downloading page_3.html
Saving page_3.html
------------------------------

Process finished with exit code 0