爬虫小案例:爬取百度贴吧"赵丽颖吧"图片(XPath 与 BeautifulSoup/美丽汤 两种实现)

美丽汤(BeautifulSoup)版本:

import requests
from bs4 import BeautifulSoup
import os
from hashlib import md5


def get_html(url, headers):
   '''
   Fetch a page and return its decoded text.

   :param url: page URL to request
   :param headers: dict of HTTP request headers (e.g. User-Agent)
   :return: response body as text
   '''
   # BUG FIX: the second positional argument of requests.get() is
   # `params`, not `headers` -- the original call silently sent the
   # headers as a query string instead of HTTP headers. Pass them by
   # keyword so the User-Agent is actually applied; add a timeout so
   # a stalled connection cannot hang the crawler forever.
   response = requests.get(url, headers=headers, timeout=10)
   return response.text


def parse_html(html):
   '''
   Collect the href of every thread link on the forum index page.

   :param html: HTML source of the tieba index page
   :return: list of href strings, one per thread
   '''
   soup = BeautifulSoup(html, 'lxml')
   # Each thread title anchor lives under .j_thread_list / .threadlist_title;
   # pull the href attribute out of every matching <a>.
   anchors = soup.select('.j_thread_list .threadlist_title a')
   return [anchor.get('href') for anchor in anchors]


def parse_image(img_html):
   '''
   Collect the src of every content image inside a thread page.

   :param img_html: HTML source of a single thread page
   :return: list of image src URLs
   '''
   soup = BeautifulSoup(img_html, 'lxml')
   # Tieba marks user-posted content images with the BDE_Image class.
   src_list = []
   for image in soup.select('.BDE_Image'):
      src_list.append(image.get('src'))
   return src_list


def download_img(src, headers):
   '''
   Download one image and save it under ./zhaoliying, using the MD5 of
   the raw image bytes as the file name so duplicate images are only
   written once.

   :param src: image URL
   :param headers: dict of HTTP request headers
   '''
   # Create the target folder on first use.
   dirname = 'zhaoliying'
   if not os.path.exists(dirname):
      os.mkdir(dirname)
   # BUG FIX: pass the headers by keyword -- the second positional
   # argument of requests.get() is `params`, not `headers`.
   img_content = requests.get(src, headers=headers, timeout=10).content
   # BUG FIX: hash the raw bytes directly. The original hashed
   # str(content).encode('utf-8'), i.e. the textual repr of the bytes,
   # which is wasteful and fragile; md5 accepts bytes as-is.
   file = md5(img_content).hexdigest()
   # os.path.join keeps the path portable; the hard-coded '\\' separator
   # only worked on Windows.
   filename = os.path.join(dirname, file + '.jpg')
   # Identical content produces an identical name -> skip rewrites.
   if not os.path.exists(filename):
      with open(filename, 'wb') as f:
         f.write(img_content)


def main():
   '''Crawl page one of the forum and download every thread's images.'''
   # Step 1: fetch the HTML of the forum index page.
   url = 'http://tieba.baidu.com/f?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&pn=0'
   headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
   }
   html = get_html(url, headers)

   # Step 2: extract the href of every thread link on the page.
   href_list = parse_html(html)
   print(href_list)

   # Step 3: visit each thread, collect its image URLs, and download
   # every image.
   for href in href_list:
      # The href is relative; prepend the site root to form a full URL.
      thread_url = 'http://tieba.baidu.com' + href
      img_html = get_html(thread_url, headers)
      for src in parse_image(img_html):
         download_img(src, headers)


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
   main()

XPath(lxml)版本:

import requests
from lxml import etree
import os
from hashlib import md5


def get_html(url, headers):
	'''
	Fetch a page and return its decoded text.

	:param url: page URL to request
	:param headers: dict of HTTP request headers (e.g. User-Agent)
	:return: response body as text
	'''
	# BUG FIX: requests.get()'s second positional argument is `params`,
	# not `headers`; pass the headers by keyword so they are actually
	# sent, and add a timeout so a dead connection cannot hang forever.
	response = requests.get(url, headers=headers, timeout=10)
	return response.text


def parse_html(html):
	'''
	Extract the href of every thread link from the forum index page.

	:param html: HTML source of the tieba index page
	:return: list of href strings, one per thread
	'''
	tree = etree.HTML(html)
	# Thread title anchors sit inside li.j_thread_list / div.threadlist_title.
	return tree.xpath('//li[contains(@class,"j_thread_list")]//div[contains(@class,"threadlist_title")]//a/@href')


def parse_image(img_html):
	'''
	Extract the src of every content image from a thread page.

	:param img_html: HTML source of a single thread page
	:return: list of image src URLs
	'''
	tree = etree.HTML(img_html)
	# User-posted content images carry the BDE_Image class.
	return tree.xpath('//img[@class="BDE_Image"]/@src')


def download_img(url, headers):
	'''
	Download one image into ./zhaoliying2, using the MD5 of the raw
	image bytes as the file name so duplicates are only written once.

	:param url: image URL
	:param headers: dict of HTTP request headers
	'''
	# BUG FIX: pass headers by keyword; the second positional argument
	# of requests.get() is `params`, not `headers`.
	img_content = requests.get(url, headers=headers, timeout=10).content
	if not os.path.exists("zhaoliying2"):
		os.mkdir("zhaoliying2")
	# BUG FIX: hash the raw bytes directly instead of md5 of the bytes'
	# textual repr (str(bytes).encode('utf-8')).
	file = md5(img_content).hexdigest()
	# os.path.join keeps the path portable; the hard-coded '\\'
	# separator only worked on Windows.
	filename = os.path.join("zhaoliying2", file + ".jpg")
	# Same content -> same MD5 name -> skip the rewrite.
	if not os.path.exists(filename):
		with open(filename, 'wb') as f:
			f.write(img_content)


def main():
	'''Crawl 10 index pages (50 threads per page) and download all images.'''
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
	}
	pn = 0
	for i in range(10):
		# BUG FIX: rebuild the index URL on every iteration. The original
		# built it once before the loop and then overwrote `url` with
		# thread URLs inside the loop, so `pn += 50` never changed which
		# page was fetched -- pagination was completely broken.
		url = "http://tieba.baidu.com/f?kw=%E8%B5%B5%E4%B8%BD%E9%A2%96&ie=utf-8&pn=" + str(pn)
		html = get_html(url, headers)
		href_list = parse_html(html)
		for href in href_list:
			# Example thread URL: "http://tieba.baidu.com/p/5787343916"
			thread_url = "http://tieba.baidu.com" + href
			img_html = get_html(thread_url, headers)
			src_list = parse_image(img_html)
			for src in src_list:
				download_img(src, headers)
		# Tieba paginates 50 threads per page via the `pn` query parameter.
		pn += 50


# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
	main()

猜你喜欢

转载自blog.csdn.net/antian1991/article/details/81054728