目录
1.requests模块
1.常用方法
1.get():发起请求获取响应对象
2.response方法
1.response.text:字符串
字符编码:ISO-8859-1
response.encoding = "utf-8"
2.response.content:bytes
3.response.status_code:返回响应码
3.get():查询参数 params(字典格式)
1.没有查询参数
res = requests.get(url,headers=headers)
2.有查询参数
params={"wd":"达内"}
res = requests.get(url,params=params,headers=headers)
import requests

# Baidu search endpoint; requests appends the encoded query string itself.
url = "http://www.baidu.com/s?"
headers = {"User-Agent":"Mozilla5.0/"}

# Read the search keyword from the user.
keyword = input("请输入要搜索的内容:")

# `params` must be a dict; requests URL-encodes it automatically.
response = requests.get(url, params={"wd": keyword}, headers=headers)

# Set the encoding explicitly so that `.text` decodes the body as UTF-8.
response.encoding = "utf-8"
print(response.text)
4.post():参数名data
1.data={} #data参数为字典,不用转为bytes数据类型
res = requests.post(url,data=data,headers=headers)
5.代理:proxies
1.爬虫与反爬虫斗争的第二步
获取代理IP的网站
1.西刺代理
2.快代理
3.全网代理
2.普通代理:proxies={"协议":"协议://IP地址:端口号"}(协议名必须小写,如 "http"、"https",否则 requests 匹配不到该代理)
proxies = {"http":"http://171.221.239.11:808"}
import requests

url = "http://www.taobao.com/"
# requests selects a proxy by the lowercase scheme of the target URL, so the
# key must be "http"; an "HTTP" key is never matched and the request would be
# sent directly, bypassing the proxy.
proxies = {"http":"http://222.221.11.119:3128"}
headers = {"User-Agent":"Mozilla5.0/"}
res = requests.get(url,proxies=proxies,headers=headers)
# Decode the body as UTF-8 before printing it.
res.encoding = "utf-8"
print(res.text)
3.私密代理:
proxies = {"http":"http://309435365:密码@114.67.228.126:16819"}  # 格式: http://用户名:密码@IP地址:端口号
import requests

url = "http://www.taobao.com/"
headers = {"User-Agent":"Mozilla5.0/"}
# Authenticated proxy format: http://user:password@host:port
# The key must be the lowercase scheme ("http"); requests never matches "HTTP".
# NOTE(review): the password part of the original literal was destroyed by
# e-mail obfuscation ("[email protected]"); the host is taken from the comment
# below. Replace the "password" placeholder with the real credential.
proxies = {"http":"http://309435365:password@114.67.228.126:16819"}
#114.67.228.126:16819
res = requests.get(url,proxies=proxies,headers=headers)
res.encoding = "utf-8"
# Only the status code is printed, to confirm the request went through.
print(res.status_code)
4.案例:爬取链家地产二手房信息
目标:爬取小区名称 总价
步骤:
1.找URL
https://gz.fang.lianjia.com/loupan/
2.正则匹配
3.写入本地文件
6.Web客户端验证:auth
1. auth = ("用户名","密码")
2. 爬取http://code.tarena.com.cn
正则:p = re.compile(r'<a href="\w+/">(.*?)</a>',re.S)
7.SSL证书认证:verify
1.verify=True:默认,做SSL证书认证
2.verify=False:忽略证书认证
import requests

# verify=False makes requests skip SSL certificate verification for this call.
response = requests.get(
    "https://www.12306.cn/mormhweb/",
    headers={"User-Agent":"Mozilla5.0/"},
    verify=False,
)

# Decode the body as UTF-8 before printing it.
response.encoding = "utf-8"
print(response.text)
2.Handler处理器(urllib.request)
1.定义
自定义的urlopen()方法,urlopen方法是一个特殊的opener
2.常用方法
1.build_opener(Handler处理器对象)
2.opener.open(url):作用相当于 urlopen()
3.使用流程
1.创建相关的Handler处理器对象
http_handler = urllib.request.HTTPHandler()
2.创建自定义opener对象
opener = urllib.request.build_opener(http_handler)
3.利用opener对象的open方法发送请求
import urllib.request

target = "http://www.baidu.com/"

# Steps 1 and 2: create an HTTPHandler and build a custom opener around it.
opener = urllib.request.build_opener(urllib.request.HTTPHandler())

# Step 3: send the request through the opener's open() method.
response = opener.open(urllib.request.Request(target))
body = response.read()
print(body.decode("utf-8"))
4.Handler处理器分类
1.HTTPHandler()
2.ProxyHandler(代理IP)
3.ProxyBasicAuthHandler(密码管理器对象):私密代理
import urllib.request

url = "http://www.baidu.com/"
# Use the lowercase scheme as the key. ProxyHandler registers a
# "<scheme>_open" method per key, and an uppercase "HTTP" key is not matched
# against the request's "http" scheme on every Python version.
proxy = {"http":"183.62.196.10:3128"}
# 1. Create the handler
proxy_handler = urllib.request.ProxyHandler(proxy)
# 2. Build a custom opener from it
opener = urllib.request.build_opener(proxy_handler)
# 3. Send the request through the opener's open() method
req = urllib.request.Request(url)
res = opener.open(req)
print(res.read().decode("utf-8"))
链家二手房数据爬取
# 存储到本地
import requests
import re
class LianJiaSpider:
    """Crawl Lianjia second-hand listings page by page into a local text file.

    Each page is downloaded, parsed with a regular expression into
    (community name, total price) pairs, and appended to a text file, one
    listing per line.
    """

    # Compiled once and shared by every page: group 1 is the community name,
    # group 2 is the total price text.
    _HOUSE_PATTERN = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>',re.S)

    def __init__(self):
        """Set the base URL, request headers, proxy settings and start page."""
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent":"Mozilla5.0/"}
        # NOTE(review): requests looks proxies up by lowercase scheme, so this
        # "HTTP" entry is never applied (and the target URL is https anyway).
        # The URL itself looks mangled by e-mail obfuscation. Left unchanged so
        # the crawler keeps working without a proxy; supply a real
        # {"http": ..., "https": ...} mapping to actually use one.
        self.proxies = {"HTTP":"http://309435365:[email protected]:16819"}
        self.page = 1

    def getPage(self,url):
        """Download *url* and hand the decoded HTML to parsePage()."""
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(url,proxies=self.proxies,headers=self.headers,
                           timeout=10)
        res.encoding = "utf-8"
        self.parsePage(res.text)

    def parsePage(self,html):
        """Extract (name, price) tuples from *html* and pass them to writePage()."""
        # Example result: [("首科花园","595"), ...]
        r_list = self._HOUSE_PATTERN.findall(html)
        self.writePage(r_list)

    def writePage(self,r_list,filename="链家二手房.txt"):
        """Append one line per listing to *filename*.

        Every field is stripped and followed by a single space, and every
        listing ends with a newline. Nothing is written (and no file is
        created) when *r_list* is empty.

        Args:
            r_list: iterable of tuples of strings, e.g. [("首科花园","595")].
            filename: output path; defaults to the original fixed file name.
        """
        lines = [
            "".join(field.strip() + " " for field in r_tuple) + "\n"
            for r_tuple in r_list
        ]
        if not lines:
            return
        # Open the file once per page instead of once per field, and pin the
        # encoding so the Chinese text does not depend on the platform default.
        with open(filename,"a",encoding="utf-8") as f:
            f.writelines(lines)

    def workOn(self):
        """Interactive main loop: crawl a page, then ask whether to continue."""
        while True:
            print("正在爬取%d页" % self.page)
            # Build the URL of the current page.
            url = self.baseurl + str(self.page) + "/"
            self.getPage(url)
            print("第%d页爬取成功" % self.page)
            c = input("是否继续爬取(y/n):")
            if c.strip().lower() == "y":
                self.page += 1
            else:
                print("爬取结束,谢谢使用!")
                break
if __name__ == "__main__":
    # Script entry point: create the spider and start the interactive loop.
    LianJiaSpider().workOn()
# 存储到mysql
import requests
import re
import pymysql
import warnings
class LianJiaSpider:
    """Crawl Lianjia second-hand listings page by page into a MySQL table.

    Each page is downloaded, parsed with a regular expression into
    (community name, total price) pairs, and inserted into the
    ``spider.lianjia`` table.
    """

    # Compiled once and shared by every page: group 1 is the community name,
    # group 2 is the total price text.
    _HOUSE_PATTERN = re.compile('<div class="houseInfo">.*?data-el="region">(.*?)</a>.*?<div class="totalPrice">.*?<span>(.*?)</span>',re.S)

    _CREATE_DB = "create database if not exists spider;"
    _USE_DB = "use spider;"
    _CREATE_TABLE = (
        "create table if not exists lianjia("
        "id int primary key auto_increment,"
        "name varchar(30),"
        "price decimal(20,2))charset=utf8;"
    )
    # Placeholders let the driver escape scraped text (quotes etc.) safely.
    _INSERT = "insert into lianjia(name,price) values(%s,%s);"

    def __init__(self):
        """Set the crawl parameters and open the MySQL connection and cursor."""
        self.baseurl = "https://bj.lianjia.com/ershoufang/pg"
        self.headers = {"User-Agent":"Mozilla5.0/"}
        # NOTE(review): requests looks proxies up by lowercase scheme, so this
        # "HTTP" entry is never applied (and the target URL is https anyway).
        # The URL itself looks mangled by e-mail obfuscation. Left unchanged so
        # the crawler keeps working without a proxy.
        self.proxies = {"HTTP":"http://309435365:[email protected]:16819"}
        self.page = 1
        # Keyword arguments: PyMySQL 1.0+ no longer accepts positional
        # connection parameters.
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration.
        self.db = pymysql.connect(host="localhost",user="root",
                                  password="123456",charset="utf8")
        self.cursor = self.db.cursor()

    def getPage(self,url):
        """Download *url* and hand the decoded HTML to parsePage()."""
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(url,proxies=self.proxies,headers=self.headers,
                           timeout=10)
        res.encoding = "utf-8"
        html = res.text
        print("页面已获取,正在解析页面...")
        self.parsePage(html)

    def parsePage(self,html):
        """Extract (name, price) tuples from *html* and store them in MySQL."""
        # Example result: [("首科花园","595"), ...]
        r_list = self._HOUSE_PATTERN.findall(html)
        print("正在存入mysql数据库...")
        self.writeToMysql(r_list)

    def writeToMysql(self,r_list):
        """Create the database/table if needed and insert one row per listing.

        Args:
            r_list: iterable of (name, price) string pairs, e.g.
                [("首科花园","595")]. The price text is converted to float
                and multiplied by 10000 before it is stored.

        Raises:
            ValueError: if a price string cannot be converted to float; in
                that case nothing from this page is inserted.
        """
        # Silence "already exists" warnings only around the DDL statements,
        # instead of turning every warning in the process into an error.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self.cursor.execute(self._CREATE_DB)
            self.cursor.execute(self._USE_DB)
            self.cursor.execute(self._CREATE_TABLE)

        # Convert everything first so a malformed price fails before any
        # insert is sent.
        rows = [
            (name.strip(), float(price.strip()) * 10000)
            for name, price in r_list
        ]
        if rows:
            self.cursor.executemany(self._INSERT, rows)
        self.db.commit()
        print("第%d页存入数据库成功" % self.page)

    def workOn(self):
        """Interactive main loop; closes the database resources on exit."""
        try:
            while True:
                print("正在爬取%d页" % self.page)
                # Build the URL of the current page.
                url = self.baseurl + str(self.page) + "/"
                self.getPage(url)
                print("第%d页爬取成功" % self.page)
                c = input("是否继续爬取(y/n):")
                if c.strip().lower() == "y":
                    self.page += 1
                else:
                    print("爬取结束,谢谢使用!")
                    break
        finally:
            # Release the cursor and connection even if a page fails.
            self.cursor.close()
            self.db.close()
if __name__ == "__main__":
    # Script entry point: create the spider and start the interactive loop.
    LianJiaSpider().workOn()