# ----2018-7-15 ------世界杯总决赛
import os
import re

import requests
from lxml import etree
class TiBa_Image(object):
    """Crawler that saves album pictures of the Baidu Tieba "美女" forum.

    The flow is: fetch the forum's album tab, collect the thread links found
    there, ask the picture-list endpoint for the picture ids of each thread
    and store every picture as ``image/<thread id>/<n>.jpg`` relative to the
    current working directory.
    """

    # Seconds allowed per HTTP request; without a timeout ``requests`` can
    # wait forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    # Endpoint queried for the picture ids of one thread.
    PICTURE_LIST_URL = 'http://tieba.baidu.com/photo/g/bw/picture/list'

    # Template of the URL a single picture is downloaded from.
    PICTURE_URL = 'https://imgsa.baidu.com/forum/pic/item/{}.jpg'

    # Captures every picture id in the picture-list response text.
    # Compiled once here instead of once per thread.
    PIC_ID_PATTERN = re.compile(r'"pic_id":"(.*?)"')

    def __init__(self):
        """Set up the URLs, request headers and XPath expressions."""
        self.base_url = 'http://tieba.baidu.com/f'
        self.second_url = 'https://tieba.baidu.com'
        # The header value must not carry literal quote characters of its
        # own, otherwise they are sent to the server as part of the UA.
        self.headers = {
            "User-Agent": (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) '
                'Gecko/20100101 Firefox/4.0.1'
            ),
        }
        # Selects the thread links on the album tab.
        self.first_xpath = '''//div[@class="grbm_row_wrapper"]/div[@class="grbm_ele_wrapper"]/a[@class="grbm_ele_a grbm_ele_big"]/@href'''
        self.second_xpath = '//div[@class="ag_main_list"]/div/a/@href'

    def send_request(self, url, params=None):
        """GET ``url`` with the crawler's headers and return the decoded body.

        Args:
            url: Address to request.
            params: Optional mapping of query-string parameters.

        Returns:
            The response body decoded with ``bytes.decode()`` (UTF-8).
        """
        response = requests.get(
            url,
            params=params,
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.content.decode()

    def write_file(self, data, pic_id, i):
        """Write ``data`` to ``image/<pic_id>/<i>.jpg``.

        The target directory is created when it is missing, so the method
        also works when called on its own.

        Args:
            data: Raw bytes to store.
            pic_id: Name of the sub-directory below ``image/``.
            i: Number used as the file name.
        """
        file_name = "image/" + str(pic_id) + "/" + str(i) + '.jpg'
        print(file_name)
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'wb') as f:
            f.write(data)
        print('保存成功')

    def JieXi_data(self, data, xpath_str):
        """Parse ``data`` as HTML and return the matches of ``xpath_str``.

        Args:
            data: HTML text.
            xpath_str: XPath expression to evaluate against the document.

        Returns:
            Whatever ``xpath()`` yields for the expression.
        """
        html_data = etree.HTML(data)
        return html_data.xpath(xpath_str)

    def Home_Page(self):
        """Fetch the forum's album tab and return the thread links on it.

        Returns:
            The values matched by ``self.first_xpath``.
        """
        dict_parms = {
            "kw": "美女",
            "ie": "utf-8",
            "tab": "album",
        }
        data = self.send_request(self.base_url, dict_parms)
        # Drops everything from each "<!--" up to the end of that line ('.'
        # does not cross newlines here because re.S is not set).
        # NOTE(review): presumably meant to stop HTML comment markers from
        # hiding markup from the parser - confirm against a live page.
        data = re.sub('<!--.*', '', data)
        return self.JieXi_data(data, self.first_xpath)

    def Details_Page(self):
        """Download every picture of every thread listed on the album tab.

        A directory ``image/<thread id>/`` is created for each thread, also
        when the thread yields no picture ids; pictures are numbered from 1.
        """
        for details_url in self.Home_Page():
            # Links look like "/p/1879660227"; removing "/p/" leaves the id.
            details = details_url.replace('/p/', '')
            # makedirs + exist_ok: works on a first run (no "image/" yet)
            # and on re-runs (directory already present).
            os.makedirs('image/' + details, exist_ok=True)
            print(details)
            params = {
                "kw": "美女",
                "alt": "jview",
                "tid": str(details),
            }
            listing = requests.get(
                self.PICTURE_LIST_URL,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            ).text
            pic_ids = self.PIC_ID_PATTERN.findall(listing)
            for i, pic_id in enumerate(pic_ids, start=1):
                url = self.PICTURE_URL.format(pic_id)
                img_bytes = requests.get(
                    url=url, timeout=self.REQUEST_TIMEOUT
                ).content
                self.write_file(img_bytes, details, i)

    def run(self):
        """Entry point: run the complete download."""
        self.Details_Page()
# Script entry point: start the crawl only when the file is executed directly.
if __name__ == '__main__':
    crawler = TiBa_Image()
    crawler.run()
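# Python crawler: scraping pictures from the Baidu Tieba "美女" forum.
# Reposted from blog.csdn.net/great_zhou/article/details/81050172
# (Residue of the source web page: "You may also like",
# "Today's recommendations", "Weekly ranking".)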