python爬虫实战：利用beautiful soup爬取猫眼电影TOP100榜单内容-1

但是他是用正则表达式完成的。作为编程小白，实在脑力不够去理解那些眼花缭乱的正则表达式。

于是直接利用后面学习的 Beautiful Soup 实战一回，同时个人增加了一些实用功能（例如程序运行计时、检查并创建目标文件夹）。

直接把代码分享在下面，soup 使用得不是特别熟练，有待优化。后续再用 pyquery 练练手，顺便搞下数据存储实战。

以后有空再分享我的实战操作心得。

import requests
from bs4 import BeautifulSoup
import os 
import time

# Program-run timer start.
# time.clock() was removed in Python 3.8. It is still used where it exists so
# that this reading stays comparable with any other time.clock() reading taken
# later in the script; on newer interpreters time.perf_counter() is used.
_clock = getattr(time, 'clock', time.perf_counter)
start = _clock()

# Target folder; kept separate so its existence can be checked later.
# Raw string: the backslashes are Windows path separators, not escape
# sequences ('\p' and '\s' are invalid escapes and warn on current Python).
file_path = r'D:\python3.6\scrapy\猫眼'
file_name = 'maoyan.txt'   # Name of the output text file.
file = os.path.join(file_path, file_name)   # Full path of the output file.

url = "http://maoyan.com/board/4"  # First page of the board to scrape.

# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}


def Create_file(file_path, file):
    """Ensure the target folder exists and leave ``file`` as an empty file.

    Args:
        file_path: Folder that should contain the output file. It is created
            (including missing parents) when it does not exist yet.
        file: Full path of the output file. It is created if missing and
            truncated to zero length if it already exists.

    Raises:
        OSError: If the folder or the file cannot be created.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between checking and creating. An empty folder string means
    # "current directory" and needs no creation.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    # Opening in 'w' mode both creates and truncates the file; the context
    # manager guarantees the handle is closed in every case.
    with open(file, 'w', encoding='utf-8'):
        pass

def get_all_pages(start, total=100, page_size=10, timeout=10):
    """Download every page of the board and collect all ``<dd>`` tags.

    The first page is fetched from ``start`` itself; each following page is
    fetched from ``start + '?offset=<n>'`` with ``n`` stepping by
    ``page_size`` up to (but not including) ``total``.

    Args:
        start: URL of the first page.
        total: Upper bound (exclusive) of the offset range. Defaults to 100.
        page_size: Offset step between consecutive pages. Defaults to 10.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A flat list with the ``<dd>`` tags of all pages, in page order.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status (raised via ``raise_for_status``).
    """
    pages = []
    for offset in range(0, total, page_size):
        # The first page has no query string; later pages carry the offset.
        page_url = start if offset == 0 else start + '?offset=' + str(offset)

        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(page_url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'lxml')
        # Flatten: extend (not append) so the result is one list of tags.
        pages.extend(soup.find_all(name='dd'))

    return pages

	
def _parse_film(film):
    """Convert one ``<dd>`` tag of the board into a plain dict.

    Args:
        film: A BeautifulSoup tag for a single ``<dd>`` entry.

    Returns:
        A dict with the keys ``Index``, ``name``, ``star``, ``releasetime``
        and ``score``, in that insertion order.
    """
    # Third-level nested <div> of the entry (presumably the
    # "movie-item-info" block, per the original author's note); its first
    # three <p> tags are read as name, cast and release time.
    info = film.div.div.div
    paragraphs = info.find_all(name='p')  # Looked up once, indexed below.

    # The element right after the info block holds the score split over two
    # adjacent tags (presumably integer part and fractional part).
    score_box = info.find_next_sibling()
    score_head = score_box.i.string
    score_tail = score_box.i.find_next_sibling().string

    return {
        'Index': film.i.string,  # Text of the first <i> tag in the entry.
        'name': paragraphs[0].string,
        'star': paragraphs[1].string.strip(),
        'releasetime': paragraphs[2].string,
        'score': score_head + score_tail.strip(),
    }


# Timer for this run. time.perf_counter() is used because time.clock() was
# removed in Python 3.8.
_run_started = time.perf_counter()

Create_file(file_path, file)

text = get_all_pages(url)

# Open the output file once and append one dict per line, instead of
# reopening the file for every film.
with open(file, 'a', encoding='utf-8') as f:
    for film in text:
        record = _parse_film(film)
        print(record)  # Echo each record so the output can be checked.
        f.write(str(record) + '\n')  # Trailing newline: one record per line.

print('抓取完成', '\n', '耗时：', time.perf_counter() - _run_started)

运行结果参考：

python爬虫实战：利用beautiful soup爬取猫眼电影TOP100榜单内容-1

猜你喜欢