网络爬虫的尺寸规模
| 小规模,数据量小,爬取速度不敏感 | 中规模,数据规模比较大,爬取速度敏感 | 大规模,搜索引擎,爬取速度关键 |
| --- | --- | --- |
| Requests库 | Scrapy库 | 定制开发 |
| 爬取网页,玩转网页 | 爬取网站,爬取系列网站 | 爬取全网 |
爬取网页的通用代码框架
import requests
def getHTMLText(url):
    """Fetch ``url`` with a browser-like User-Agent and return its text.

    Sends a GET request with a 30-second timeout, treats 4xx/5xx
    responses as failures, and decodes the body using the encoding
    requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text on success, or the string '产生异常' when
        the request fails (connection error, timeout, bad HTTP status).
    """
    # Header values must be latin-1 encodable to be sent by http.client;
    # the previous Chinese placeholder text could not be encoded, so every
    # call failed and was silently reported as '产生异常'.
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-level failures map to the sentinel string;
        # programming errors are no longer hidden by a bare except.
        return '产生异常'
if __name__ == "__main__":
    # Demo run: download the Baidu home page and print its text.
    url = "http://www.baidu.com"
    print(getHTMLText(url))
京东商品页面爬取
import requests
def getHTMLText(url):
    """Fetch ``url`` and return the first 1000 characters of its text.

    Sends a GET request with a 30-second timeout, treats 4xx/5xx
    responses as failures, and decodes the body using the encoding
    requests detects from the content (``apparent_encoding``).

    Args:
        url: Address of the page to download.

    Returns:
        At most the first 1000 characters of the decoded page, or the
        string '产生异常' when the request fails.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return r.text[:1000]
    except requests.RequestException:
        # Only request-level failures map to the sentinel string;
        # programming errors are no longer hidden by a bare except.
        return '产生异常'
if __name__ == "__main__":
    # Demo run: print the beginning of a JD product page.
    url = "https://item.jd.com/100004404920.html"
    print(getHTMLText(url))
百度搜索关键字提交
import requests
def getHTMLText(url, keyword=None):
    """Submit a search keyword to ``url`` and return the response length.

    The keyword is sent as the ``wd`` query parameter. It is now an
    explicit argument instead of a module-level global, so the function
    also works when imported from another module.

    Args:
        url: Search endpoint to query.
        keyword: Search term sent as ``wd``. When ``None`` the parameter
            is left out of the query string (requests drops keys whose
            value is ``None``).

    Returns:
        The number of characters in the decoded response, or the string
        '产生异常' when the request fails.
    """
    kv = {'wd': keyword}
    try:
        # timeout added so a stalled server cannot block forever.
        r = requests.get(url, params=kv, timeout=30)
        r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
        r.encoding = r.apparent_encoding
        return len(r.text)
    except requests.RequestException:
        # Only request-level failures map to the sentinel string;
        # programming errors are no longer hidden by a bare except.
        return '产生异常'


if __name__ == "__main__":
    # Demo run: search Baidu for 'python' and print the result size.
    keyword = 'python'
    url = "https://www.baidu.com/s"
    print(getHTMLText(url, keyword))
IP 地址归属地自动查询
import requests
# Lookup endpoint; the IP address to query is appended to this prefix.
url = "http://m.ip138.com/ip.asp?ip="
try:
    # timeout added so a stalled server cannot block the script forever.
    r = requests.get(url + '202.204.80.112', timeout=30)
    r.raise_for_status()  # raise HTTPError for 4xx/5xx responses
    r.encoding = r.apparent_encoding
    print(r.text)
except requests.RequestException:
    # Report request-level failures only; other errors should surface.
    print('爬取失败')