使用 Python 3 爬取豆瓣即将上映和正在热映的电影,代码如下:
先用 requests 获取页面,再用 bs4 解析;通过标签名和 class 定位到对应的信息后,使用字符串拼接的方式,将正在热映和即将上映的信息拼接出来并写入到 html 页面中,在爬取完毕后用浏览器打开生成的 html。
内容比较简单。
[码云的地址](https://gitee.com/xlelou/spider/blob/master/doubai.py)
import html
import json
import pathlib
import webbrowser

import requests
from bs4 import BeautifulSoup
# URL of the Douban "now playing" listing page passed to getM.getNowPlaying.
# NOTE(review): the trailing path segment ("dongying") presumably selects the city — confirm.
nowplayingUrl = 'https://movie.douban.com/cinema/nowplaying/dongying/'
class getM():
    """Scrapers that turn Douban movie listing pages into HTML-ready text.

    The public methods are meant to be called on the class itself, e.g.
    ``getM.getNowPlaying(url)``.  They are declared static so that calling
    them on an instance works as well.

    Both methods return a text fragment in which every field sits on its own
    line terminated by ``'\\r\\n'``.  Scraped text and URLs are HTML-escaped
    because the caller embeds the fragment in an HTML page.
    """

    # Seconds to wait for the server.  requests applies no timeout by
    # default, so without this a stalled connection would hang forever.
    _TIMEOUT = 10

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return it parsed with the 'html.parser' backend.

        Raises:
            requests.RequestException: on connection problems, timeouts, or
                a 4xx/5xx response (via ``raise_for_status``).
        """
        response = requests.get(url, timeout=getM._TIMEOUT)
        # Surface HTTP errors here instead of failing later while parsing an
        # error page that lacks the expected markup.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _find(parent, *args, **kwargs):
        """Return ``parent.find(...)``, or None when *parent* itself is None."""
        if parent is None:
            return None
        return parent.find(*args, **kwargs)

    @staticmethod
    def _attr(tag, name):
        """Return attribute *name* of *tag*, or None when *tag* is None."""
        if tag is None:
            return None
        return tag.get(name)

    @staticmethod
    def _link(href):
        """Return an ``<a>`` element that opens *href* in a new tab.

        The URL is used both as the target and as the visible link text.  It
        is escaped and the attribute is quoted so that nothing following the
        URL can leak into the attribute value.
        """
        safe = html.escape(href, quote=True)
        return '<a target="_blank" href="{0}">{0}</a>'.format(safe)

    @staticmethod
    def getNowPlaying(url):
        """Scrape the "now playing" page and describe every listed movie.

        Args:
            url: address of a Douban now-playing listing page.

        Returns:
            A string with four lines per movie (title, rating, detail link,
            ticket link).  A field that cannot be found in the page is
            replaced by its placeholder text.  Returns ``''`` when the page
            has no element with id ``nowplaying``.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        soup = getM._fetch_soup(url)
        container = soup.find(id='nowplaying')
        if container is None:
            return ''

        parts = []
        for item in container.find_all('li', attrs={'class': 'list-item'}):
            # Look each element up once; any of them may be absent.
            title_link = getM._find(
                item.find('li', attrs={'class': 'stitle'}), 'a')
            rating = getM._find(
                item.find('li', attrs={'class': 'srating'}),
                'span', 'subject-rate')
            ticket_link = getM._find(
                item.find('li', attrs={'class': 'sbtn'}), 'a')

            title = getM._attr(title_link, 'title')
            detail_href = getM._attr(title_link, 'href')
            ticket_href = getM._attr(ticket_link, 'href')

            if title is not None:
                parts.append('电影名称:' + html.escape(title) + '\r\n')
            else:
                parts.append('电影名称:' + '暂无名称' + '\r\n')

            if rating is not None:
                parts.append('评分:' + html.escape(rating.text) + '\r\n')
            else:
                parts.append('评分:' + '暂无评分' + '\r\n')

            if detail_href is not None:
                parts.append('电影简介:' + getM._link(detail_href) + '\r\n')
            else:
                parts.append('电影简介:' + '暂无简介' + '\r\n')

            if ticket_href is not None:
                parts.append('购票地址:' + getM._link(ticket_href) + '\r\n')
            else:
                parts.append('购票地址:' + '暂无地址' + '\r\n')

        return ''.join(parts)

    @staticmethod
    def getComing(url):
        """Scrape the "coming soon" page and describe every listed movie.

        Args:
            url: address of a Douban coming-soon listing page.

        Returns:
            A string with six lines per movie (release date, title, genre,
            region, want-to-see count, detail link) followed by two empty
            lines.  Rows with fewer than five ``<td>`` cells (for example a
            header row) are skipped.  Returns ``''`` when the page has no
            ``<table class="coming_list">``.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
        """
        soup = getM._fetch_soup(url)
        table = soup.find('table', 'coming_list')
        if table is None:
            return ''

        # 'html.parser' does not insert a <tbody> that the markup omits, so
        # fall back to the table itself when there is none.
        body = table.find('tbody')
        if body is None:
            body = table

        # Column labels, in the order of the first five cells of each row.
        labels = ('上映日期:', '片名:', '类型:', '制片地区:', '想看:')

        parts = []
        for row in body.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < len(labels):
                continue

            for label, cell in zip(labels, cells):
                parts.append(label + html.escape(cell.text.strip()) + '\r\n')

            # The detail link lives inside the title cell.
            href = getM._attr(cells[1].find('a'), 'href')
            if href is not None:
                parts.append('简介:' + getM._link(href.strip()) + '\r\n')
            else:
                parts.append('简介:' + '暂无简介' + '\r\n')

            # Blank lines separate one movie from the next.
            parts.append('\r\n' + '\r\n')

        return ''.join(parts)
# Name of the HTML report, written relative to the current working directory.
GEN_HTML = 'asd.html'

# Scrape each listing exactly once and reuse the text for both the console
# dump and the HTML report, so no page is downloaded twice.
now_playing_text = getM.getNowPlaying(nowplayingUrl)
coming_text = getM.getComing('https://movie.douban.com/coming')
print(now_playing_text)
print(coming_text)

# The scrapers separate fields with CRLF; turn those into HTML line breaks.
content = now_playing_text.replace('\r\n', '<br/>')
coming = coming_text.replace('\r\n', '<br/>')

message = """
<html>
<head>
<meta name="renderer" content="webkit" />
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
<p>Hello,World!</p>
<p>豆瓣电影</p>
<div>
<p>热映</p>
%s
</div>
<div>
<p>即将上映</p>
%s
</div>
</body>
</html>"""%(content,coming)

# The context manager closes the file even if the write fails.
with open(GEN_HTML, 'w', encoding='utf8') as f:
    f.write(message)

# Hand the browser an absolute file:// URL; passing a bare filename to
# webbrowser.open is documented as neither supported nor portable.
webbrowser.open(pathlib.Path(GEN_HTML).resolve().as_uri(), new=1)