Python爬虫(一)-初探豆瓣Top250

最近突然对python兴趣浓厚,在看过几本基础书籍后,便想动手开始做做小的实践,第一个尝试的便是爬虫,在学习了很多大神的爬虫入门贴后,深感佩服,在这里感谢以下两位博主的精彩分享:

1.xlzd 

2.Jecvay

其中,由于目前好多网站都采取了反爬行动,所以在学习的时候并不能完全采用上述两位博主的源码,但是参考价值很大。本文是在xlzd的基础篇的例子上稍微加了点爬取内容,即可以下载Top250的影片图片和对Top250评分进行提取。

代码如下:

# coding=utf-8

'''
爬取豆瓣电影Top250
'''
import requests
from bs4 import BeautifulSoup
import codecs # 字符转换
import re
from contextlib import closing
import os

DownLoad_url = 'https://movie.douban.com/top250'
'''   下载页面内容content  '''
def download_page(url):
	"""Fetch *url* and return the response body as text.

	A browser User-Agent is sent because douban rejects the default
	requests UA; a timeout keeps a dead connection from hanging the
	crawler forever (the original call had no timeout at all).
	"""
	headers = {
		'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
		              'Chrome/63.0.3239.132 Safari/537.36'}
	return requests.get(url, headers=headers, timeout=10).text

'''   解析html  '''
def parse_html(html):
	"""Parse one Top250 list page.

	Returns ``(movie_list, next_url)`` where *movie_list* holds
	"<title><score>" strings (title and rating concatenated) and
	*next_url* is the absolute URL of the next page, or ``None`` on
	the last page.
	"""
	soup = BeautifulSoup(html, "html.parser")
	movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})

	# One entry per film: title immediately followed by its rating.
	movie_name_list = []
	for movie_li in movie_list_soup.find_all('li'):
		detail = movie_li.find('div', attrs={'class': 'hd'})
		movie_name = detail.find('span', attrs={'class': 'title'}).getText()
		score = movie_li.find('div', attrs={'class': 'bd'})
		movie_score = score.find('span', attrs={'class': 'rating_num'}).getText()
		movie_name_list.append(movie_name + movie_score)

	# Guard the "next" span itself: the original called .find('a') on it
	# unconditionally and would raise AttributeError if the span were
	# missing (e.g. a layout change); on the last page the span exists
	# but has no <a>, yielding None.
	next_span = soup.find('span', attrs={'class': 'next'})
	next_page = next_span.find('a') if next_span else None
	if next_page:
		return movie_name_list, DownLoad_url + next_page['href']
	return movie_name_list, None

def getPic(data):
	"""Return every src="http...jpg" fragment found in *data* (raw HTML).

	The matched text keeps its 'src="' prefix and closing '"'; the
	caller strips them with a [5:-1] slice, so the return format must
	stay exactly as matched.
	"""
	# Bug fix: the '.' before jpg was unescaped, so e.g. 'Xjpg' would
	# also terminate a match; \. requires a literal dot.
	return re.findall(r'src="http.+?\.jpg"', data)

def download_pic(url, name):
	"""Stream the image at *url* into ``rootPath + name + <ext>``.

	The extension is taken from the last '.'-separated segment of the
	URL. The destination directory is created on first use.
	"""
	rootPath = 'F:\\PythonCode\\SavePath\\doubanTop250\\'
	if not os.path.exists(rootPath):
		os.makedirs(rootPath)
	pic_type = '.' + url.split('.')[-1]
	# Bug fix: the original issued requests.get(url, stream=True) twice;
	# the first response was never read or closed, leaking a connection
	# per image. One streamed request inside closing() is enough.
	with closing(requests.get(url, stream=True)) as response:
		with open(rootPath + name + pic_type, 'wb') as file:
			for data in response.iter_content(128):
				file.write(data)

def main():
	"""Debug helper: dump the raw HTML of the first Top250 page."""
	page = download_page(DownLoad_url)
	print(page)

if __name__ == '__main__':
	# Crawl every Top250 page: save each poster image and append the
	# "<title><score>" lines to a UTF-8 file named 'movies'.
	url = DownLoad_url
	n = 1  # global rank counter across pages (Top1, Top2, ...)
	with codecs.open('movies', 'w', encoding='utf-8') as fp:
		while url:
			html = download_page(url)
			picdata = getPic(html)
			movies, url = parse_html(html)
			# Robustness fix: the original indexed movies[] with a
			# separate counter and would raise IndexError if a page
			# ever carried more poster images than parsed titles;
			# zip() pairs them and stops at the shorter list.
			for picinfo, movie in zip(picdata, movies):
				# picinfo looks like src="...jpg"; strip the
				# leading 'src="' (5 chars) and the trailing '"'.
				download_pic(picinfo[5:-1], 'Top' + str(n) + '-' + movie)
				print(movie + '下载完成!')
				n += 1
			fp.write(u'\n'.join(movies))





猜你喜欢

转载自blog.csdn.net/xiaoxun2802/article/details/79001871