# ---- BeautifulSoup version ----
import requests
from bs4 import BeautifulSoup
import os
from hashlib import md5
def get_html(url, headers):
    """Fetch *url* and return the response body decoded as text.

    :param url: page URL to request
    :param headers: dict of HTTP request headers (e.g. User-Agent)
    :return: response body as str
    """
    # BUG FIX: the second positional argument of requests.get is `params`,
    # not `headers` -- the original silently sent no User-Agent header.
    response = requests.get(url, headers=headers)
    return response.text
def parse_html(html):
    """Extract the href of every thread link on a Tieba forum index page.

    :param html: HTML source of the forum index page
    :return: list of relative thread hrefs (may contain None for bare anchors)
    """
    soup = BeautifulSoup(html, 'lxml')
    # Thread-title anchors live under .j_thread_list .threadlist_title
    hrefs = []
    for anchor in soup.select('.j_thread_list .threadlist_title a'):
        hrefs.append(anchor.get('href'))
    return hrefs
def parse_image(img_html):
    """Collect the src attribute of every in-post image in a thread page.

    :param img_html: HTML source of a single thread page
    :return: list of image URLs (class BDE_Image marks user-posted images)
    """
    soup = BeautifulSoup(img_html, 'lxml')
    return [tag.get('src') for tag in soup.select('.BDE_Image')]
def download_img(src, headers):
    """Download one image and save it under ./zhaoliying, deduplicated by MD5.

    :param src: absolute image URL
    :param headers: HTTP headers forwarded to the request
    """
    dirname = 'zhaoliying'
    # exist_ok avoids the check-then-create race of the original exists/mkdir pair
    os.makedirs(dirname, exist_ok=True)
    # BUG FIX: pass headers as a keyword -- the second positional argument
    # of requests.get is `params`, so the original sent no headers at all.
    img_content = requests.get(src, headers=headers).content
    # BUG FIX: hash the raw bytes directly; md5(str(content).encode()) hashed
    # the textual repr of the bytes, which is wasteful and fragile.
    digest = md5(img_content).hexdigest()
    # os.path.join is portable; the hard-coded '\\' only worked on Windows.
    filename = os.path.join(dirname, digest + '.jpg')
    # Identical images hash to the same name, so skip files already on disk.
    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(img_content)
def main():
    """Crawl the first index page of the Tieba forum and download every
    in-post image from every thread listed on it."""
    # Step 1: fetch the forum index page (kw is the URL-encoded forum name).
    url = 'http://tieba.baidu.com/f?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&pn=0'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    index_html = get_html(url, headers)
    # Step 2: pull the relative href of each thread off the index page.
    href_list = parse_html(index_html)
    print(href_list)
    # Step 3: visit each thread, scrape its image URLs, and download them.
    for href in href_list:
        thread_url = 'http://tieba.baidu.com' + href
        thread_html = get_html(thread_url, headers)
        for src in parse_image(thread_html):
            download_img(src, headers)
if __name__ == '__main__':
    main()
# ---- XPath version ----
import requests
from lxml import etree
import os
from hashlib import md5
def get_html(url, headers):
    """Fetch *url* and return the response body decoded as text.

    :param url: page URL to request
    :param headers: dict of HTTP request headers (e.g. User-Agent)
    :return: response body as str
    """
    # BUG FIX: the second positional argument of requests.get is `params`,
    # not `headers` -- the original silently sent no User-Agent header.
    response = requests.get(url, headers=headers)
    return response.text
def parse_html(html):
    """Extract the href of every thread link on a Tieba forum index page.

    :param html: HTML source of the forum index page
    :return: list of relative thread hrefs
    """
    tree = etree.HTML(html)
    xpath_expr = ('//li[contains(@class,"j_thread_list")]'
                  '//div[contains(@class,"threadlist_title")]//a/@href')
    return tree.xpath(xpath_expr)
def parse_image(img_html):
    """Collect the src of every in-post image (class BDE_Image) in a thread page.

    :param img_html: HTML source of a single thread page
    :return: list of image URLs
    """
    tree = etree.HTML(img_html)
    return tree.xpath('//img[@class="BDE_Image"]/@src')
def download_img(url, headers):
    """Download one image and save it under ./zhaoliying2, deduplicated by MD5.

    :param url: absolute image URL
    :param headers: HTTP headers forwarded to the request
    """
    # BUG FIX: pass headers as a keyword -- the second positional argument
    # of requests.get is `params`, so the original sent no headers at all.
    img_content = requests.get(url, headers=headers).content
    dirname = "zhaoliying2"
    # exist_ok avoids the check-then-create race of the original exists/mkdir pair
    os.makedirs(dirname, exist_ok=True)
    # BUG FIX: hash the raw bytes directly; md5(str(content).encode()) hashed
    # the textual repr of the bytes, which is wasteful and fragile.
    digest = md5(img_content).hexdigest()
    # os.path.join is portable; the hard-coded '\\' only worked on Windows.
    filename = os.path.join(dirname, digest + ".jpg")
    # Identical images hash to the same name, so skip files already on disk.
    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(img_content)
def main():
    """Crawl the first 10 index pages (50 threads per page) of the Tieba
    forum and download every in-post image from every listed thread."""
    base_url = "http://tieba.baidu.com/f?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&ie=utf-8&pn="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    pn = 0
    for _ in range(10):
        # BUG FIX: rebuild the index-page URL from `pn` on every iteration.
        # The original built `url` once (pn=0) and then the inner loop
        # overwrote `url` with a thread URL, so despite `pn += 50` the
        # pages after the first were never actually fetched.
        page_url = base_url + str(pn)
        html = get_html(page_url, headers)
        for href in parse_html(html):
            # e.g. href == "/p/5787343916"; join to a full thread URL
            thread_url = "http://tieba.baidu.com" + href
            img_html = get_html(thread_url, headers)
            for src in parse_image(img_html):
                download_img(src, headers)
        pn += 50  # Tieba paginates index pages in steps of 50
if __name__ == '__main__':
    main()