思路分析,首先分析猫眼电影的top100榜的网页特点,然后根据网页信息编写用于匹配的正则表达式,然后提取对应的信息即可。
接下来是对存储文件的操作,将最终的结果存储在csv文件中。因为在编写过程中,要试很多次,为了不要每次都手动删除文件,因此在程序运行开始之前判断文件是否已经存在,如果存在则删除,然后便开始爬取信息。代码如下:
import re
import requests
import csv
import os
class MaoYan():
    """Scrape the Maoyan Top-100 movie board and save each movie to a CSV file."""

    def __init__(self):
        # Base URL of the board; pagination is driven by the "offset" query param.
        self.url = "http://maoyan.com/board/4"
        # CSV column order for each movie record.
        self.fieldnames = ['index', 'title', 'actor', 'time', 'score']
        # Output file handle; None until file_cre() opens it (was the int 0,
        # a misleading null sentinel).
        self.file = None

    def file_del(self):
        """Delete every .csv file in the current directory so each run starts clean."""
        for file in os.listdir(os.getcwd()):
            # endswith() handles names without a dot correctly and is clearer
            # than split('.')[-1].
            if file.endswith('.csv'):
                os.remove(file)
                print(file, "文件已经被删除")

    def file_cre(self):
        """Open the output CSV file and write the header row."""
        # newline='' prevents blank lines on Windows; explicit utf-8 keeps the
        # Chinese titles/actors intact regardless of the platform default.
        self.file = open('my.csv', 'a', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file, fieldnames=self.fieldnames)
        self.writer.writeheader()

    def url_cre(self):
        """Yield the 10 page URLs of the Top-100 board (10 movies per page)."""
        for i in range(10):
            yield "http://maoyan.com/board/4?offset=" + str(i * 10)

    def html_get(self):
        """Yield the HTML text of each board page."""
        # A browser-like User-Agent avoids the site's anti-bot rejection of
        # the default python-requests UA.
        headers = {'User-Agent': 'Mozilla/5.0'}
        for url in self.url_cre():
            r = requests.get(url, headers=headers)
            yield r.text

    def html_parse(self):
        """Extract each movie's fields with a regex and hand them to save_to_csv()."""
        # Compile once, outside the page loop; re.S lets '.' span newlines.
        pattern = re.compile(
            '<dd>.*?board-index.*?>(.*?)</i>.*?class="name".*?><a.*?>(.*?)</a>'
            '.*?"star".*?>(.*?)</p>.*?releasetime.*?>(.*?)</p>'
            '.*?integer.*?>(.*?)</i>.*?fraction.*?>(.*?)</i>.*?</dd>',
            re.S)
        for html in self.html_get():
            for item in re.findall(pattern, html):
                info = {
                    'index': item[0],
                    'title': item[1],
                    # Strip the "主演:" / "上映时间:" label prefixes.
                    # NOTE(review): this splits on the ASCII colon; if the page
                    # uses the fullwidth '：' the label survives — verify.
                    'actor': item[2].strip().split(':')[-1],
                    'time': item[3].split(':')[-1],
                    # Integer and fraction halves of the score sit in two tags.
                    'score': item[4] + item[5],
                }
                print(info)
                self.save_to_csv(info)

    def save_to_csv(self, info):
        """Write one movie record to the CSV file.

        writerow() returns the number of characters written, so a truthy
        result means the row was emitted.
        """
        # BUG FIX: the original declared `global writer`, naming a module-level
        # variable that never existed; the instance attribute is the right one.
        if self.writer.writerow(info):
            print("保存成功")

    def file_close(self):
        """Close the output CSV file."""
        self.file.close()

    def main(self):
        """Run the full pipeline: clean old CSVs, create output, scrape, close."""
        self.file_del()
        self.file_cre()
        self.html_parse()
        self.file_close()
if __name__ == "__main__":
app = MaoYan()
app.main()