# Crawler for a movie website (unfinished; cannot fetch the actual video links yet)

import requests
import bs4
import lxml
import re
import time
from bs4 import BeautifulSoup
# Base URL of the target site.
url = 'https://www.88ys.cc'

# Name of the movie / TV series to search for.
film = '家有女友'

# Proxy IP (expired; needs renewal before the proxied requests work).
proxy = '120.24.245.33:16818'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}

# Request gzip-compressed responses for faster transfers.
headers = {
    "Accept-Encoding": "gzip"
}
#### Search results ####
def search():
    """POST the search form for `film` and return the result-page links.

    Returns the list of site-relative hrefs extracted from every
    ``<a class="link-hover" ...>`` anchor in the response HTML.
    """
    # Search endpoint of the site.
    url_search = url + '/index.php?m=vod-search'
    # Form fields the search endpoint expects.
    data = {
        'wd': film,
        'submit': '',
    }
    # (1) without proxy
    r_s = requests.post(url_search, data=data)
    # (2) with proxy:
    # r_s = requests.post(url_search, data=data, proxies=proxies, headers=headers)
    r_s.encoding = 'utf-8'
    # Pull every result link out of the raw HTML.
    pat = re.compile(r'<a class="link-hover" href="(.*?)"')
    return re.findall(pat, r_s.text)
#### Episode list ####
def List(pat_search):
    """Fetch a detail page and return the episode links of the first source.

    Parameters:
        pat_search: site-relative path of a search result (joined with `url`).

    Returns:
        List of href strings from the ``div#stab81`` block, or an empty list
        when the page has no such block (previously this raised IndexError).
    """
    # Full URL of the detail page.
    url_list = url + pat_search
    # r_list = requests.get(url_list, proxies=proxies, headers=headers)
    r_list = requests.get(url_list)
    r_list.encoding = 'utf-8'
    # Locate the first video source's episode container.
    soup = BeautifulSoup(r_list.text, 'lxml')
    stab81 = soup.find_all(name='div', attrs={'id': 'stab81'})
    if not stab81:
        # Page layout changed or no sources available — don't crash on [0].
        return []
    return re.findall(re.compile(r'href="(.*?)"'), str(stab81[0]))
#### Text info for a search result ####
def search_news(pat_search):
    """Scrape a detail page and return its textual metadata.

    Parameters:
        pat_search: site-relative path of a search result (joined with `url`).

    Returns:
        A 3-element list: [0] title, [1] genre, [2] language.
        Fields that cannot be found come back as '' instead of raising
        IndexError (the original code indexed [0] unconditionally).
    """
    information = []
    url_search = url + pat_search
    r = requests.get(url_search)
    r.encoding = 'utf-8'
    # The 'ct-c' div holds the movie's info panel.
    bs = str(BeautifulSoup(r.text, 'lxml').find_all('div', class_='ct-c'))
    # Title: text of the first <h1> inside the panel.
    h1_bs = BeautifulSoup(bs, 'lxml')
    h1_re = re.findall(re.compile(r'>(.*?)<'), str(h1_bs.h1))
    information.append(h1_re[0] if h1_re else '')
    # Genre.
    type_re = re.findall(re.compile(r'类型:</span>(.*?)</dd>'), bs)
    information.append(type_re[0] if type_re else '')
    # Language.
    language_re = re.findall(re.compile(r'语言:</span>(.*?)</dd>'), bs)
    information.append(language_re[0] if language_re else '')
    return information
#### Main: loop over search results, print info and episode lists ####
def _for_():
    """Search for `film`, then print metadata and episode links per result."""
    pat_search = search()
    for link in pat_search:
        # Textual metadata for this result.
        information = search_news(link)
        # Episode links of the first source.
        stab81 = List(link)
        print(information)
        print(stab81)
        # Throttle so the site does not forcibly drop the connection.
        time.sleep(3)

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    _for_()
#### Unfinished: the browser-rendered DOM differs from the fetched source ####
def a():
    """Debug helper: fetch one play page and pretty-print its raw HTML."""
    play_url = url + '/vod-play-id-56106-src-1-num-1.html'
    resp = requests.get(play_url)
    resp.encoding = 'utf-8'
    print(BeautifulSoup(resp.text, 'lxml').prettify())
 
 


# Reprinted from: www.cnblogs.com/Ly-233/p/11205661.html