# Scrape the Maoyan Top 100 movie board using an object-oriented design and XPath (使用面向对象以及 XPath 爬取猫眼电影排行 Top 100)
import json
import time
from typing import Any, Dict, Iterator, TextIO

import requests
from lxml import etree


class MaoYanTop100(object):
    """Scraper that downloads pages of the Maoyan movie board and stores them.

    Each page is fetched with ``requests``, parsed with ``lxml`` and every
    movie entry (title, actors, release time) is appended to ``maoyan.json``.
    """

    def __init__(self):
        # Board URL; the page offset is passed as a query parameter in run().
        self.url = 'https://maoyan.com/board/4'
        # Browser-like User-Agent sent with every request.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/74.0.3729.108 Safari/537.36"
            )
        }

    def get_data(self, url: str, param: Dict[str, Any],
                 timeout: float = 10) -> str:
        """Fetch *url* with query parameters *param* and return the body text.

        Args:
            url: Address to request.
            param: Query-string parameters.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the whole run.

        Returns:
            The decoded response body.

        Raises:
            requests.RequestException: On connection errors or timeout.
        """
        # NOTE(review): verify=False disables TLS certificate validation and
        # makes the request vulnerable to interception. It is kept to preserve
        # existing behaviour; remove it unless it is a deliberate workaround.
        response = requests.get(
            url,
            params=param,
            headers=self.headers,
            verify=False,
            timeout=timeout,
        )
        # Echo the final URL (including the query string) for traceability.
        print(response.request.url)
        return response.text

    def parse_content(self, text: str) -> Any:
        """Parse the HTML *text* and return the resulting lxml document."""
        return etree.HTML(text)

    def parse(self, dom: Any) -> Iterator[Dict[str, str]]:
        """Yield one dict per movie entry (``<dd>`` element) found in *dom*.

        Entries missing the title, actor or release-time field are skipped
        instead of aborting the rest of the page.

        Args:
            dom: Parsed document as returned by :meth:`parse_content`;
                ``None`` is tolerated and yields nothing.

        Yields:
            Dicts with the keys ``title``, ``actor`` and ``time``.
        """
        if dom is None:
            # Nothing parseable came back (e.g. an empty response body).
            return
        for node in dom.xpath('//dd'):
            titles = node.xpath('.//p/a/@title')
            actors = node.xpath('.//p[@class="star"]/text()')
            release_times = node.xpath('.//p[@class="releasetime"]/text()')
            if not (titles and actors and release_times):
                # Incomplete entry: indexing [0] would raise IndexError.
                continue
            yield {
                'title': titles[0],
                'actor': actors[0].strip(),
                'time': release_times[0],
            }

    def save(self, f: TextIO, item: Dict[str, Any]) -> None:
        """Write *item* to the open text file *f* as pretty-printed JSON.

        Every object is followed by ``,`` and a newline, so the file is a
        comma-terminated sequence of JSON objects rather than a single valid
        JSON document.
        """
        f.write(json.dumps(item, indent=2, ensure_ascii=False) + ',\n')

    def run(self) -> None:
        """Prompt for a page count, then scrape and save that many pages.

        Raises:
            ValueError: If the entered page count is not an integer.
        """
        page = int(input('请输入页码:'))
        # Explicit UTF-8: items are dumped with ensure_ascii=False, so the
        # platform default encoding may be unable to represent them.
        with open('maoyan.json', 'w', encoding='utf-8') as f_obj:
            for i in range(page):
                # NOTE(review): the board may expect a different paging
                # parameter name (e.g. ``offset``) — confirm that successive
                # pages really return different entries.
                param = {
                    'start': i * 10
                }
                text = self.get_data(self.url, param)
                # Pause between requests to avoid hammering the site.
                time.sleep(1)
                dom = self.parse_content(text)
                for item in self.parse(dom):
                    self.save(f_obj, item)
                print(f'第{i + 1}页数据已保存完成')


if __name__ == '__main__':
    maoyan = MaoYanTop100()
    maoyan.run()