# Fetch the images posted in a Baidu Tieba forum (rewritten to use XPath).

import os
import re
import requests
from lxml import etree


class Tb(object):
    """Crawl a Baidu Tieba forum and save the images found in its threads.

    The crawl walks the forum's index pages, opens every thread linked from
    them, and writes each ``BDE_Image`` picture into a directory named after
    the forum (relative to the current working directory).
    """

    # Step of the ``pn`` query parameter between consecutive index pages.
    PAGE_SIZE = 50
    # Seconds to wait on a single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    def __init__(self, url, ):
        """Remember the forum name and the headers sent with every request.

        Args:
            url: Forum name used as the ``kw`` query value (not a full URL,
                despite the parameter name). Also the output directory name.
        """
        self.url = url
        # The key must be exactly "User-Agent"; with embedded spaces it is a
        # different (invalid) header and the browser string is never applied.
        self.User_Agent = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/61.0.3163.91 Safari/537.36"
            )
        }

    def _select(self, url, expression):
        """Download *url* and return the XPath matches for *expression*."""
        # Headers must be passed by keyword: the second positional argument
        # of requests.get is ``params`` and would end up in the query string.
        response = requests.get(
            url, headers=self.User_Agent, timeout=self.REQUEST_TIMEOUT
        )
        html = etree.HTML(response.content)
        # Guard against a parse that produced no root element.
        if html is None:
            return []
        return html.xpath(expression)

    @staticmethod
    def _last_page_offset(hrefs):
        """Return the ``pn`` offset found in the "last page" links, else 0."""
        for href in hrefs:
            found = re.search(r"[?&]pn=(\d+)", href)
            if found:
                return int(found.group(1))
        return 0

    @classmethod
    def _page_offsets(cls, last_offset):
        """Return every index-page offset from 0 up to and including the last."""
        return range(0, last_offset + 1, cls.PAGE_SIZE)

    @staticmethod
    def _picture_filename(picture_link):
        """Derive a filesystem-safe file name from an image URL."""
        path = picture_link.split("?", 1)[0].split("#", 1)[0]
        name = path.rstrip("/").rsplit("/", 1)[-1]
        name = re.sub(r"[^\w.-]", "_", name) or "picture"
        # NOTE: an earlier draft of this code remarked that the extracted
        # link may not carry a file-type suffix; fall back to .jpg then.
        if not os.path.splitext(name)[1]:
            name += ".jpg"
        return name

    def get_page_list(self):
        """Visit every index page of the forum and crawl the threads on it.

        A forum without a "last page" link is treated as a single page.
        """
        first_page = "http://tieba.baidu.com/f?kw=" + self.url
        last_links = self._select(
            first_page, u"//div/div/a[@class='last pagination-item ']/@href"
        )
        for page in self._page_offsets(self._last_page_offset(last_links)):
            url = "http://tieba.baidu.com/f?kw=" + self.url + "&pn=%s" % str(page)
            self.get_tiezi_list(url)

    def get_tiezi_list(self, url):
        """Collect the thread links on one index page and crawl each thread.

        Args:
            url: Absolute URL of a forum index page.
        """
        tiezi_list = self._select(
            url, "//li/div[@class='t_con cleafix']/div/div/div/a/@href"
        )
        print(tiezi_list)
        for tiezi in tiezi_list:
            # Thread hrefs are site-relative, so prefix the host.
            self.get_picture("http://tieba.baidu.com" + tiezi)

    def get_picture(self, url):
        """Download every ``BDE_Image`` picture found on one thread page.

        Args:
            url: Absolute URL of a thread page.
        """
        picture_list = self._select(url, u"//img[@class='BDE_Image']/@src")
        print(picture_list)
        for picture_link in picture_list:
            print(picture_link)
            self.download_picture(picture_link)

    def download_picture(self, picture_link):
        """Fetch one image and write it into the forum's output directory.

        Failures are reported on stdout and do not interrupt the crawl.

        Args:
            picture_link: URL of the image to download.
        """
        target = os.path.join(self.url, self._picture_filename(picture_link))
        try:
            data = requests.get(
                picture_link,
                headers=self.User_Agent,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Do not store an HTTP error page under an image file name.
            data.raise_for_status()
            with open(target, "wb") as f:
                f.write(data.content)
        except (requests.RequestException, OSError) as error:
            print(error)

    def run(self):
        """Create the output directory if needed and crawl the whole forum.

        Any error that escapes the crawl is printed rather than raised.
        """
        try:
            os.makedirs(self.url, exist_ok=True)
            self.get_page_list()
        except Exception as error:
            print(error)


def main():
    """Prompt for a Tieba forum name and download the images posted in it."""
    name = input("请输入要获取的贴吧名称:").strip()
    # An empty name would produce a query with no forum and an empty
    # output directory name, so there is nothing useful to crawl.
    if not name:
        return
    Tb(name).run()


if __name__ == "__main__":
    main()

# (Web-page residue: "猜你喜欢" / "You may also like" widget heading.)
#
# Reposted from www.cnblogs.com/guducp/p/9090377.html