#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: zty
"""Scrape movie titles and scores from Douban's Beijing "now playing" page.

Each list item found on the page is printed to stdout and appended to
``beijing.txt`` as one ``<title> <score>`` line.
"""
import io
import re
import sys

import requests
from bs4 import BeautifulSoup

myurl = 'https://movie.douban.com/cinema/nowplaying/beijing/'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    ),
}

OUTPUT_FILE = 'beijing.txt'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# "(?:...)" is a non-capturing group: it selects either class name without
# adding it to the match result, so group(1) is the score text itself.
# "[^>]*" and "[^<]*" cannot run past the end of the current tag, unlike a
# greedy ".*".
SCORE_RE = re.compile(r'(?:subject-rate|text-tip)[^>]*>([^<]*)<')


def fetch_page(url, request_headers):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=request_headers,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def parse_movies(html):
    """Return a list of ``(title, score)`` tuples extracted from *html*.

    Title and score are read from the same ``<li class="list-item">``
    element, so a title can never be paired with another item's score.
    Items lacking either a ``data-title`` attribute or a score element are
    skipped.
    """
    soup = BeautifulSoup(html, 'lxml')
    movies = []
    for item in soup.find_all('li', class_='list-item'):
        title = item.get('data-title')
        score_match = SCORE_RE.search(str(item))
        if title is None or score_match is None:
            continue
        movies.append((title, score_match.group(1).strip()))
    return movies


def save_movies(movies, path=OUTPUT_FILE, mode='a'):
    """Write one ``<title> <score>`` line per movie to *path*.

    The file is opened once for the whole batch. Reopening it for every
    line with mode ``'w'`` would truncate it each time and leave only the
    last line. The default ``'a'`` keeps the results of earlier runs; pass
    ``mode='w'`` to overwrite them instead.
    """
    with open(path, mode) as f:
        f.writelines('{} {}\n'.format(title, score)
                     for title, score in movies)


def main():
    """Fetch the page, then print and save every movie found on it."""
    # Re-wrap stdout so the Chinese titles are encoded as gb18030.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

    movies = parse_movies(fetch_page(myurl, headers))
    for title, score in movies:
        print(title + ' ' + score + '\n')
    if movies:  # do not create or touch the output file when nothing matched
        save_movies(movies)


if __name__ == '__main__':
    main()
# 遗留问题:
# 1、urlopen() 和 requests.get() 的区别?
# 2、正确截取评分的正则表达式
# 3、文件模式问题