爬取豆瓣正在上映的电影名称及评分

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: zty
import io
import sys
import requests
import re
from bs4 import BeautifulSoup
# Page listing the movies currently showing in Beijing cinemas.
myurl = 'https://movie.douban.com/cinema/nowplaying/beijing/'
# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# Movie title, stored in the ``data-title`` attribute of each list item.
_TITLE_RE = re.compile(r'data-title="(.*?)"')
# Rating text. ``(?:...)`` is a non-capturing group, so the class name is
# matched but not returned; ``[^>]*`` stops at the end of the opening tag
# instead of relying on greedy ``.*`` backtracking.
_SCORE_RE = re.compile(r'(?:subject-rate|text-tip)[^>]*>(.*?)<')


def parse_movies(items):
    """Extract ``(title, score)`` pairs from the movie list items.

    Each item is converted with ``str()`` and searched on its own, so a title
    is always paired with the score found in the same item. Items in which
    either the title or the score markup cannot be found are skipped rather
    than shifting every following pair.

    Args:
        items: Iterable of objects whose ``str()`` is the item's HTML
            (e.g. the tags returned by ``find_all``, or plain strings).

    Returns:
        A list of ``(title, score)`` tuples in input order. ``score`` is the
        text of the ``subject-rate`` / ``text-tip`` element, unmodified.
    """
    movies = []
    for item in items:
        html = str(item)
        title = _TITLE_RE.search(html)
        score = _SCORE_RE.search(html)
        if title is None or score is None:
            continue
        movies.append((title.group(1), score.group(1)))
    return movies


def main():
    """Download the page, print every title/score pair, save to beijing.txt."""
    # Re-wrap stdout so Chinese titles can be printed on a GB18030 console.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status() fails loudly on an HTTP error page
    # instead of silently parsing it.
    req = requests.get(myurl, headers=headers, timeout=10)
    req.raise_for_status()

    mysoup = BeautifulSoup(req.text, 'lxml')
    pt = mysoup.find_all('li', class_='list-item')
    movies = parse_movies(pt)

    # Open the file ONCE, outside the loop. Opening with 'w' inside the loop
    # truncates the file on every iteration (only the last line survives),
    # which is why append mode used to be needed; append mode in turn
    # duplicates all entries on every run. An explicit encoding avoids
    # UnicodeEncodeError under a platform default that cannot encode a title.
    with open('beijing.txt', 'w', encoding='utf-8') as f:
        for name, score in movies:
            line = name+' '+score+'\n'
            print(line)
            f.write(line)


if __name__ == '__main__':
    main()

遗留问题:

1、urlopen()和requests.get()的区别?

2、正确截取评分的正则表达式

3、文件模式问题

猜你喜欢

转载自www.cnblogs.com/dannvivian/p/9493890.html