A small crawler example: scraping Yang Mi images from Baidu Tieba with XPath and Beautiful Soup

XPath version:

import requests
from lxml import etree
import os
from hashlib import md5


def get_html(url, headers):
	# headers must be passed by keyword; as a positional argument requests would treat it as params
	html = requests.get(url, headers=headers)
	return html.text


def parse_html(html):
	content = etree.HTML(html)
	href_list = content.xpath(
		'//li[contains(@class,"j_thread_list")]//div[contains(@class,"threadlist_title")]/a/@href'
	)
	return href_list


def parse_image(img_html):
	content = etree.HTML(img_html)
	src_list = content.xpath('//img[@class="BDE_Image"]/@src')
	return src_list


def download_image(url, headers):
	# Fetch the image bytes and name the file after the md5 of the content,
	# so the same image is never written twice.
	image_content = requests.get(url, headers=headers).content
	if not os.path.exists("yangmi"):
		os.mkdir("yangmi")
	file = md5(image_content).hexdigest()
	filename = os.path.join("yangmi", file + ".jpg")
	if not os.path.exists(filename):
		with open(filename, 'wb') as f:
			f.write(image_content)


def main():
	pn = 0
	url = "http://tieba.baidu.com/f?kw=%E6%9D%A8%E5%B9%82&ie=utf-8&pn=" + str(pn)
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
	}
	for i in range(10):
		html = get_html(url, headers)
		href_list = parse_html(html)
		for href in href_list:
			src = "http://tieba.baidu.com" + href
			img_html = get_html(src, headers)
			src_list = parse_image(img_html)
			for src in src_list:
				download_image(src, headers)
		pn += 50


if __name__ == '__main__':
	main()
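
The key part of this version is the two XPath expressions: the first pulls the thread links out of the list page, the second pulls the image URLs out of a thread page. As a quick sanity check they can be run against a small, hypothetical snippet of Tieba markup (real pages are much larger, but these are the class names the spider relies on):

from lxml import etree

# Hypothetical fragment using the same class names the crawler targets.
sample = '''
<li class="j_thread_list clearfix">
  <div class="threadlist_title pull_left">
    <a href="/p/1234567890">a thread</a>
  </div>
</li>
<img class="BDE_Image" src="http://example.com/pic/abc.jpg">
'''

tree = etree.HTML(sample)
# Thread links from the list page -> ['/p/1234567890']
print(tree.xpath('//li[contains(@class,"j_thread_list")]//div[contains(@class,"threadlist_title")]/a/@href'))
# Image sources from a thread page -> ['http://example.com/pic/abc.jpg']
print(tree.xpath('//img[@class="BDE_Image"]/@src'))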

Beautiful Soup version:

import requests
from bs4 import BeautifulSoup
import os
from hashlib import md5


def get_html(url, headers):
	# headers must be passed by keyword; as a positional argument requests would treat it as params
	html = requests.get(url, headers=headers)
	return html.text


def parse_html(html):
	html_soup = BeautifulSoup(html, 'lxml')
	a_list = html_soup.select('.j_thread_list .threadlist_title a')
	href_list = [a.get('href') for a in a_list]
	return href_list


def parse_image(img_html):
	img_html_soup = BeautifulSoup(img_html, 'lxml')
	img_list = img_html_soup.select('.BDE_Image')
	src_list = [img.get('src') for img in img_list]
	return src_list


def download_img(url, headers):
	# Fetch the image bytes and name the file after the md5 of the content to avoid duplicates.
	imgcontent = requests.get(url, headers=headers).content
	if not os.path.exists("yangmi2"):
		os.mkdir("yangmi2")
	file = md5(imgcontent).hexdigest()
	filename = os.path.join("yangmi2", file + ".jpg")
	if not os.path.exists(filename):
		with open(filename, 'wb') as f:
			f.write(imgcontent)


def main():
	base_url = "http://tieba.baidu.com/f?kw=%E6%9D%A8%E5%B9%82&ie=utf-8&pn="
	headers = {
		"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
	}
	pn = 0
	for i in range(10):
		# Rebuild the list-page URL each iteration, otherwise pn never takes effect.
		url = base_url + str(pn)
		html = get_html(url, headers)
		href_list = parse_html(html)
		for href in href_list:
			thread_url = "http://tieba.baidu.com" + href
			img_html = get_html(thread_url, headers)
			src_list = parse_image(img_html)
			for src in src_list:
				print(src)
				download_img(src, headers)
		pn += 50


if __name__ == '__main__':
	main()
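
The same sanity check works for the CSS selectors used in this version, again against a hypothetical snippet containing only the relevant class names:

from bs4 import BeautifulSoup

# Hypothetical fragment using the same class names the crawler targets.
sample = '''
<li class="j_thread_list clearfix">
  <div class="threadlist_title pull_left">
    <a href="/p/1234567890">a thread</a>
  </div>
</li>
<img class="BDE_Image" src="http://example.com/pic/abc.jpg">
'''

soup = BeautifulSoup(sample, 'lxml')
# Thread links from the list page -> ['/p/1234567890']
print([a.get('href') for a in soup.select('.j_thread_list .threadlist_title a')])
# Image sources from a thread page -> ['http://example.com/pic/abc.jpg']
print([img.get('src') for img in soup.select('.BDE_Image')])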

Reposted from blog.csdn.net/antian1991/article/details/81054704