第一部分:
概念: 爬虫概念 工具和HTTP:
第二部分:
概念: request模块的学习:
案例代码:
01_try_requests.py → get方式获取请求:
import requests

# Pretend to be desktop Chrome so the server returns the normal page.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

target_url = "http://www.baidu.com"
resp = requests.get(target_url, headers=request_headers)

# Decode the raw body bytes (UTF-8 by default) and print the HTML string.
print(resp.content.decode())
02_try_request_post.py → post方式获取请求:
import requests

# POST endpoint of Baidu's mobile translation API.
api_url = "http://fanyi.baidu.com/basetrans"

# Form body: the text to translate plus source/target language codes.
form_data = {
    "query": "你好,世界",
    "from": "zh",
    "to": "en",
}

# Mobile-Safari User-Agent so the server answers like it would a phone browser.
mobile_headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
}

# For a POST request the body goes in `data=`; headers mimic the browser.
response = requests.post(api_url, data=form_data, headers=mobile_headers)
print(response)
print(response.content.decode("unicode-escape"))
print(type(response.content.decode("unicode-escape")))  # inspect the Python type
运行结果:
parse.py → 用于处理请求一次不成功,仍将继续请求的方法(retrying模块的学习)
import requests
from retrying import retry
'''
Helper module dedicated to fetching URL contents (with retries).
'''
# Alternative desktop-Chrome User-Agent, kept for reference (unused):
# headers = {
#     "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
# Mobile UA plus Referer: mimics requests made by Douban's own mobile site.
headers = {
"User-Agent":"Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
"Referer": "https://m.douban.com/tv/american"}
# Retry the decorated function up to 3 times; the exception only propagates
# if all three attempts fail — one success lets the program continue.
@retry(stop_max_attempt_number=3)
def _parse_url(url):
    """Fetch *url* with the module headers and return the decoded body."""
    print("*" * 100)  # visual marker so each (re)attempt shows in the console
    resp = requests.get(url, headers=headers, timeout=5)
    return resp.content.decode()
def parse_url(url):
    """Best-effort fetch: return the page HTML, or None when the request fails.

    Wraps _parse_url (which retries 3 times) so callers never see an
    exception; any request/decoding failure yields None.
    """
    try:
        html_str = _parse_url(url)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        html_str = None
    return html_str
if __name__ == '__main__':
    # A URL without a scheme makes requests raise, demonstrating the
    # None fallback of parse_url.
    good_url = "http://www.baidu.com"
    bad_url = "www.baidu.com"
    print(parse_url(bad_url))
运行结果:
03_try_login1 → 在 headers 中携带浏览器复制的 Cookie(即已登录状态)爬取人人网个人主页并保存到本地
# coding=utf-8
import requests

# Profile page to fetch. The Cookie header carries an existing login
# session copied from the browser, so the server serves the logged-in page.
profile_url = "http://www.renren.com/327550029/profile"
request_headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
    "Cookie": "anonymid=jcysvok0-5ad7y0; depovince=GW; jebecookies=7c8f517c-cd19-43e2-a3aa-167372392e98|||||; _r01_=1; JSESSIONID=abcCwQQelI5Mxd1cx48ew; ick_login=d616d74d-6f98-420c-af4d-f9e855b11e0d; _de=BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5; p=d0cd0b910f5be6e2328b8ce0fc7ab0569; first_login_flag=1; [email protected]; ln_hurl=http://hdn.xnimg.cn/photos/hdn421/20171230/1635/main_JQzq_ae7b0000a8791986.jpg; t=69282122a1706bf87220ce140d1d44579; societyguester=69282122a1706bf87220ce140d1d44579; id=327550029; xnsid=c31e4689; loginfrom=syshome; ch_id=10016",
}

page = requests.get(profile_url, headers=request_headers)

# Persist the decoded HTML so it can be opened in a browser afterwards.
with open("renren1.html", "w", encoding="utf-8") as f:
    f.write(page.content.decode())
运行结果:
04_try_login2.py → 打印人人网Cookie
# coding=utf-8
import requests

url = "http://www.renren.com/327550029/profile"
headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
}

# Raw Cookie header string copied from the browser's dev tools.
cookie = "anonymid=jcysvok0-5ad7y0; depovince=GW; jebecookies=7c8f517c-cd19-43e2-a3aa-167372392e98|||||; _r01_=1; JSESSIONID=abcCwQQelI5Mxd1cx48ew; ick_login=d616d74d-6f98-420c-af4d-f9e855b11e0d; _de=BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5; p=d0cd0b910f5be6e2328b8ce0fc7ab0569; first_login_flag=1; [email protected]; ln_hurl=http://hdn.xnimg.cn/photos/hdn421/20171230/1635/main_JQzq_ae7b0000a8791986.jpg; t=69282122a1706bf87220ce140d1d44579; societyguester=69282122a1706bf87220ce140d1d44579; id=327550029; xnsid=c31e4689; loginfrom=syshome; ch_id=10016"

# Parse "name=value; name=value" into a dict for requests' `cookies=` argument.
# Fix: split("=", 1) splits only on the FIRST "=", so cookie values that
# themselves contain "=" (e.g. base64 padding) stay intact; the original
# split("=")[-1] would keep only the text after the LAST "=".
cookie_dict = {pair.split("=", 1)[0]: pair.split("=", 1)[1] for pair in cookie.split("; ")}
print(cookie_dict)

# Pass the cookies separately instead of embedding them in the headers.
response = requests.get(url, headers=headers, cookies=cookie_dict)
with open("renren2.html", "w", encoding="utf-8") as f:
    f.write(response.content.decode())
运行结果:
05_try_login3.py → 使用 POST 请求以账户密码登录人人网,并爬取登录后的网页数据
# coding=utf-8
import requests

# A Session keeps the cookies the server sets at login, so the follow-up
# request is automatically authenticated.
session = requests.session()

login_url = "http://www.renren.com/PLogin.do"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
}
credentials = {"email": "[email protected]", "password": "alarmchime"}

# POST the credentials; the session stores the login cookie it receives.
session.post(login_url, headers=ua_headers, data=credentials)

# Reuse the same session to fetch a page that requires being logged in.
profile_url = "http://www.renren.com/327550029/profile"
reply = session.get(profile_url, headers=ua_headers)

with open("renren3.html", "w", encoding="utf-8") as f:
    f.write(reply.content.decode())
运行结果: 在这里已经看到用户的信息被爬取了下来
第三部分:
概念: 数据提取方式
案例代码:
06_try_json.py → 实现Json字符串转换为python的dict类型,达到输入翻译效果
# coding=utf-8
import requests
import json

translate_url = "http://fanyi.baidu.com/basetrans"
text = input("请输入要翻译的中文:")

# Request body understood by Baidu's mobile translation endpoint.
form = {
    "query": text,
    "from": "zh",
    "to": "en",
}
mobile_headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
}

reply = requests.post(translate_url, data=form, headers=mobile_headers)

# The response body is a JSON string; json.loads turns it into a Python dict.
reply_dict = json.loads(reply.content.decode())
print(reply_dict)

# "trans" holds a list of translations; take the first entry's "dst" text.
translation = reply_dict["trans"][0]["dst"]
print("翻译结果是:", translation)
运行结果:
07_try_json.py → json.dumps是把python类型转换为字符串(将豆瓣的爬取数据以json格式存储到本地)
import json
import requests

# Douban mobile API endpoint (first page, 18 items), copied from the
# browser's XHR request.
api_url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=ios&for_mobile=1&start=0&count=18&loc_id=108288&_=1517148631877"
api_headers = {
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1",
    "Referer": "https://m.douban.com/tv/american",
}

reply = requests.get(api_url, headers=api_headers)
parsed = json.loads(reply.content.decode())
print(parsed)

# json.dumps converts the Python object back into a JSON string;
# ensure_ascii=False keeps Chinese readable, indent=2 pretty-prints it.
with open("douban.txt", "w", encoding="utf-8") as f:
    f.write(json.dumps(parsed, ensure_ascii=False, indent=2))
运行结果:
08_douban_spider.py → 从豆瓣中爬取数据保存json到本地
from parse import parse_url
import json
class DoubanSpider:
    """Crawler for Douban's mobile API of hot American TV shows.

    Pages through the JSON API 18 items at a time and appends every item
    to douban.json, one JSON object per line.
    """

    def __init__(self):
        # {} receives the 0-based start offset of the page (pages are 18 wide).
        self.temp_url = "https://m.douban.com/rexxar/api/v2/subject_collection/filter_tv_american_hot/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=0"

    def get_contentf_list(self, html_str):
        """Parse one API response (a JSON string); return (items, total).

        NOTE(review): the name keeps the original "contentf" typo so any
        existing callers keep working.
        """
        dict_data = json.loads(html_str)
        content_list = dict_data["subject_collection_items"]
        total = dict_data["total"]
        return content_list, total

    def save_content_list(self, content_list):
        """Append each item to douban.json as one JSON object per line."""
        with open("douban.json", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                # Fix: newline after each record — without it the file is one
                # unparseable run of concatenated JSON objects (the sibling
                # QiubaiSpider already writes JSON lines this way).
                f.write("\n")
        print("保存成功")

    def run(self):
        """Main loop: request a page, extract, save, advance to the next page."""
        num = 0
        total = 100  # placeholder; replaced by the API's real total after page 1
        while num < total + 18:
            # 1. build the page URL
            url = self.temp_url.format(num)
            print(url)
            # 2. fetch (parse_url retries and returns None on failure)
            html_str = parse_url(url)
            # 3. extract items and the true total count
            content_list, total = self.get_contentf_list(html_str)
            # 4. persist
            self.save_content_list(content_list)
            # 5. next page starts 18 items later
            num += 18
if __name__ == '__main__':
    # Entry point: crawl every page and persist the results.
    spider = DoubanSpider()
    spider.run()
运行结果:
09_try_lxml.py → 从豆瓣中获取热门电影的数据
# coding=utf-8
from lxml import etree
import requests

chart_url = "https://movie.douban.com/chart"
ua_headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
}

resp = requests.get(chart_url, headers=ua_headers)
page_source = resp.content.decode()

# Build an element tree from the HTML so we can query it with XPath.
doc = etree.HTML(page_source)
print(doc)

# 1. detail-page URL of every movie on the chart
movie_urls = doc.xpath("//div[@class='indent']/div/table//div[@class='pl2']/a/@href")

# 2. poster image URL of every movie
poster_urls = doc.xpath("//div[@class='indent']/div/table//a[@class='nbg']/img/@src")

# 3. build one dict per movie (title, url, poster, comment count, rating).
#    Strategy: first group by <table> — one movie each — then extract every
#    field relative to its own group.
movie_tables = doc.xpath("//div[@class='indent']/div/table")
print(movie_tables)
for movie in movie_tables:
    item = {}
    item["title"] = movie.xpath(".//div[@class='pl2']/a/text()")[0].replace("/", "").strip()
    item["href"] = movie.xpath(".//div[@class='pl2']/a/@href")[0]
    item["img"] = movie.xpath(".//a[@class='nbg']/img/@src")[0]
    item["comment_num"] = movie.xpath(".//span[@class='pl']/text()")[0]
    item["rating_num"] = movie.xpath(".//span[@class='rating_nums']/text()")[0]
    print(item)
运行结果:
10_qiubaispider → 从糗事百科中获取热门数据
# coding=utf-8
import requests
from lxml import etree
import json
class QiubaiSpider:
    """Crawl the "hot" listing pages of qiushibaike.com and append every post
    (author, text, vote/comment counts, thumbnail) to qiubai.txt as JSON lines."""

    def __init__(self, max_page=13):
        """max_page: number of listing pages to crawl.

        Generalized from the previously hard-coded 13; the default keeps
        the original behavior, so existing callers are unaffected.
        """
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        self.max_page = max_page

    def get_url_list(self):
        """Build the listing-page URLs from the site's 1-based numbering."""
        return [self.url_temp.format(i) for i in range(1, self.max_page + 1)]

    def parse_url(self, url):
        """GET *url* with a browser User-Agent and return the decoded HTML."""
        print("now parsing :", url)  # typo "parseing" fixed
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def get_content_list(self, html_str):
        """Extract one dict per post from a listing page's HTML."""
        html = etree.HTML(html_str)
        # Posts are grouped one <div> each under #content-left; extract every
        # field relative to its own group.
        div_list = html.xpath("//div[@id='content-left']/div")
        content_list = []
        for div in div_list:
            item = {}
            # Any field may be absent (anonymous author, no image, ...),
            # so each XPath result is checked before indexing.
            author = div.xpath(".//h2/text()")
            item["author_name"] = author[0].strip() if author else None
            item["content"] = [line.strip() for line in div.xpath(".//div[@class='content']/span/text()")]
            votes = div.xpath(".//span[@class='stats-vote']/i/text()")
            item["stats_vote"] = votes[0] if votes else None
            comments = div.xpath(".//span[@class='stats-comments']//i/text()")
            item["stats_comments"] = comments[0] if comments else None
            imgs = div.xpath(".//div[@class='thumb']//img/@src")
            item["img"] = "https:" + imgs[0] if imgs else None
            content_list.append(item)
        return content_list

    def save_content_list(self, content_list):
        """Append the items to qiubai.txt, one JSON object per line."""
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False))
                f.write("\n")
        print("保存成功")

    def run(self):
        """Main flow: build the URL list, then fetch / extract / save each page."""
        url_list = self.get_url_list()
        for url in url_list:
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)
if __name__ == '__main__':
    # Entry point: crawl all listing pages and persist the posts.
    spider = QiubaiSpider()
    spider.run()
运行结果:(下面的运行结果中注释掉了多个item的结果,只留了content)
总结:
通过这次爬虫学习,对爬虫技术有了初步的认识,也对爬虫形成了一定的概念性了解。学习之后感觉爬虫并没有以前想象中的那么难,这次实践算是对 Python 爬虫的初步入门,以后会继续加深爬虫方面的学习!