Python 反反爬:爬取猫眼评分

#!/usr/bin/env python
# coding:utf-8
# __author__ = "南楼"
# __date__ = "2019/4/24 15:29"

import requests
import re
import os
from fontTools.ttLib import TTFont


class MaoYan(object):
    """Scrape the star rating of one maoyan.com film page.

    The page renders its digits with a web font whose glyph names change
    from one downloaded font file to another, so the characters in the HTML
    cannot be read directly.  A reference font (``./fonts/base.woff``) whose
    glyph-name -> digit mapping is known is kept locally; glyphs of the font
    referenced by the page are compared with the reference glyphs to recover
    the real digits.
    """

    # One hexadecimal character reference such as "&#xf3ba;".
    _ENTITY_RE = re.compile(r'&#x([0-9a-f]+);', re.IGNORECASE)

    def __init__(self):
        """Load the reference font and build the reference lookup tables.

        Raises whatever ``TTFont`` raises when ``./fonts/base.woff`` is
        missing or unreadable.
        """
        self.url = 'http://maoyan.com/films/1198214'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/59.0.3071.115 Safari/537.36"
        }
        # Glyph name -> digit for base.woff.  The pairs were read off the
        # font with a font editor (FontCreator).
        self.base_num = {
            "uniF3BA": "0",
            "uniF2A9": "1",
            "uniE6A5": "2",
            "uniF680": "3",
            "uniE69C": "4",
            "uniE710": "5",
            "uniE07D": "6",
            "uniE5A7": "7",
            "uniEC7A": "8",
            "uniE2A3": "9",
        }
        # Glyph name -> glyph object of base.woff, filled by baseobj().
        self.base_obj = {}
        # base.woff is one font file previously downloaded from the site.
        self.base_font_file = TTFont('./fonts/base.woff')
        self.baseobj()

    def baseobj(self):
        """Refresh and return the glyph-name -> glyph-object map of the base font.

        Returns:
            ``self.base_obj``, updated in place with one entry per key of
            ``self.base_num``.
        """
        glyf_table = self.base_font_file['glyf']
        for key in self.base_num:
            self.base_obj[key] = glyf_table[key]
        return self.base_obj

    def get_html(self, url, timeout=10):
        """Fetch *url* with the scraper's headers and return the raw body bytes.

        Args:
            url: Address to request.
            timeout: Seconds passed to ``requests.get``; without a timeout
                the call can block indefinitely on an unresponsive server.

        Raises:
            requests.HTTPError: On a 4xx/5xx response, so that an error page
                is never mistaken for HTML or stored as a font file.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        response.raise_for_status()
        return response.content

    def create_font(self, re_font_file):
        """Ensure font *re_font_file* is in ``./fonts`` and open it.

        The file is downloaded only when it is not already present.  The
        opened font is stored in ``self.font_file``.

        Args:
            re_font_file: File name of the font referenced by the page.
        """
        font_path = './fonts/' + re_font_file
        # Check for the one file instead of listing the whole directory.
        if not os.path.exists(font_path):
            print('不在字体库中, 下载:', re_font_file)
            url = 'http://vfile.meituan.net/colorstone/' + re_font_file
            new_file = self.get_html(url)
            with open(font_path, 'wb') as f:
                f.write(new_file)
        self.font_file = TTFont(font_path)

    def get_num_from_font_file(self, re_star):
        """Decode an obfuscated number such as ``"&#xf3ba;.&#xe6a5;"``.

        Every ``&#x....;`` reference is replaced by the digit whose base
        glyph equals the corresponding glyph of ``self.font_file``; all
        other characters (for example the decimal point) are kept as they
        are.  Any number of digits is supported, so "10.0" decodes as well
        as "9.5".

        Args:
            re_star: Text captured from the page, made of character
                references and literal characters.

        Returns:
            The decoded text, e.g. ``"9.5"``.

        Raises:
            KeyError: If a referenced glyph name is absent from the font
                (raised by the glyph table lookup).
            ValueError: If a glyph equals none of the base glyphs.
        """
        # Refresh the base map once per call rather than once per digit.
        base_glyphs = self.baseobj()
        # Glyph names in a downloaded font look like 'uniF680', 'uniE2A3', ...
        glyf_table = self.font_file['glyf']

        def decode(match):
            glyph_name = 'uni' + match.group(1).upper()
            glyph = glyf_table[glyph_name]
            for key, base_glyph in base_glyphs.items():
                if glyph == base_glyph:
                    return self.base_num[key]
            # Failing loudly beats silently dropping a digit from the result.
            raise ValueError(
                'glyph %s matches no glyph of the base font' % glyph_name
            )

        return self._ENTITY_RE.sub(decode, re_star.strip())

    def start_crawl(self):
        """Download the film page, decode its star rating and print it.

        Raises:
            IndexError: If the font file name or the rating markup is not
                found in the page.
        """
        html = self.get_html(self.url).decode('utf-8')

        # Name of the font file referenced by the page.
        re_font_file = re.findall(r'vfile\.meituan\.net\/colorstone\/(\w+\.woff)', html)[0]
        self.create_font(re_font_file)

        # Obfuscated star rating as it appears in the markup.
        re_star_rating = re.findall(r'<span class="index-left info-num ">\s+<span class="stonefont">(.*?)</span>\s+</span>', html)[0]
        star_rating = self.get_num_from_font_file(re_star_rating)
        print("星级评分:", star_rating)


if __name__ == '__main__':
    m = MaoYan()
    m.start_crawl()

猜你喜欢

转载自www.cnblogs.com/cola-lxj/p/10773563.html