爬取猫眼电影TOP100(回顾正则表达式)

回头来复习一下正则表达式，选了猫眼电影 TOP100 榜单来练练手。

import requests
import re
import csv
import codecs
import time

# HTTP headers sent with every request: a desktop-browser User-Agent plus a
# Referer that points back at the board being crawled.
headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36',
	'Referer': 'https://maoyan.com/board/4?offset=90'
}

# Open the output CSV once, in append mode. The "utf-8-sig" codec writes the
# UTF-8 BOM only when output starts at offset 0, so re-running the script no
# longer injects a second BOM into the middle of an existing file (the old
# code appended codecs.BOM_UTF8 unconditionally on every run).
# newline='' is what the csv module requires to control line endings itself.
fp = open('F://猫眼电影.csv','a+',newline='',encoding='utf-8-sig')
writer = csv.writer(fp)

# Append mode positions the stream at end-of-file, so a position of 0 means
# the file is new or empty; only then does it need the column header row
# (movie, actors, release time, score).
if fp.tell() == 0:
	writer.writerow(['电影','演员','上映时间','评分'])

# Patterns for the per-movie fields scraped from a board page. Raw strings
# keep "\s" a regex escape rather than an (invalid) string escape, and
# compiling once avoids re-resolving the patterns for every page.
_TITLE_RE = re.compile(r'<p class="name">.*?<.*?title="(.*?)"', re.S)
_STAR_RE = re.compile(r'<p class="star">\s*(.*?)\s*</p>')
_RELEASE_RE = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)
_SCORE_INTEGER_RE = re.compile(r'<i class="integer">(.*?)</i>')
_SCORE_FRACTION_RE = re.compile(r'<i class="fraction">(.*?)</i>')


def _parse_board(html):
	"""Return one [title, star, release, score] row per movie found in *html*.

	The score is the concatenation of the "integer" and "fraction" text
	fragments. Fields are paired positionally with zip(), so the result is
	truncated to the shortest list of matches; if an entry lacks one of the
	fields, later rows may pair values from different movies.
	"""
	titles = _TITLE_RE.findall(html)
	stars = _STAR_RE.findall(html)
	releases = _RELEASE_RE.findall(html)
	integers = _SCORE_INTEGER_RE.findall(html)
	fractions = _SCORE_FRACTION_RE.findall(html)
	return [
		[title, star, release, whole + fraction]
		for whole, fraction, title, star, release
		in zip(integers, fractions, titles, stars, releases)
	]


def explian_HTML(url):
	"""Fetch one board page and append its movies to the module-level CSV writer.

	Args:
		url: Address of the board page to download.

	Raises:
		requests.exceptions.RequestException: if the request fails or does
			not answer within the timeout.
	"""
	print(url)
	# Without a timeout a stalled connection would hang the crawl forever.
	response = requests.get(url, headers=headers, timeout=10)
	# Read .text once: it is decoded from the raw bytes on every access.
	writer.writerows(_parse_board(response.text))

# The CSV handle is opened at module level, so it is closed at module level
# too. try/finally guarantees the close (and the flush of buffered rows) even
# when one of the page requests raises part-way through the crawl.
try:
	if __name__ == '__main__':
		# Offsets 0, 10, ..., 90: ten pages of the board.
		urls = ["https://maoyan.com/board/4?offset={}".format(i) for i in range(0, 91, 10)]
		for url in urls:
			explian_HTML(url)
			# Pause between requests; per the original author's note,
			# requesting too quickly makes the crawl fail.
			time.sleep(0.5)
finally:
	fp.close()

猜你喜欢

转载自blog.csdn.net/weixin_43901998/article/details/88382065