Python爬虫爬取电影网站–独播库(小白实战)

本文使用 Python 爬虫(requests + lxml)爬取电影网站“独播库”的数据。话不多说,直接上代码。
独播库链接

详细代码

from lxml import etree
import requests

# Site root of Duboku; relative links scraped from pages are appended to it.
HomePage_url = 'https://www.duboku.net'

# Headers sent with each request: a desktop Chrome user-agent string.
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36'
    ),
}

def get_HomePage(url):
    """Return the absolute detail-page URLs linked from one listing page.

    Args:
        url: URL of a listing (pagination) page, e.g.
            https://www.duboku.net/vodshow/13--------1---.html

    Returns:
        A list of absolute detail-page URLs in document order, e.g.
        https://www.duboku.net/voddetail-1174.html. The list is empty when
        the page contains no matching links.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout an unresponsive server would block the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)
    # The thumbnail anchor of each list entry carries the site-relative
    # link to that entry's detail page.
    relative_links = html.xpath("//li[@class='col-lg-8 col-md-6 col-sm-4 col-xs-3']//a[@class='myui-vodlist__thumb lazyload']/@href")
    # Build a real list (not a lazy one-shot iterator) so callers can take
    # its length or iterate over it more than once.
    return [HomePage_url + link for link in relative_links]

def parse_href(url):
    """Fetch one detail page and extract the show's metadata.

    Args:
        url: Detail-page URL, e.g.
            https://www.duboku.net/voddetail-1174.html

    Returns:
        A dict with these keys, in this order:
            '电视剧'   title (str)
            '图片url'  poster image URL(s) found in the thumbnail block (list)
            '评分'     rating (str)
            '分类'     category (str)
            '地区'     region (str)
            '年份'     year (str)
            '主演'     starring actors (list of str)
            '导演'     director (str)
            '更新时间' last update time (str)
            '更新情况' update status text node(s) (list)
            '简介'     synopsis text node(s) (list)

    Raises:
        IndexError: if the title, thumbnail block, rating, metadata links
            or update-time element is missing from the page.
        requests.RequestException: if the request fails or times out.
    """
    # Without a timeout an unresponsive server would block the crawl forever.
    response = requests.get(url, headers=HEADERS, timeout=10)
    html = etree.HTML(response.text)

    movie = {}
    movie['电视剧'] = html.xpath("//div[@class='myui-content__detail']//h1/text()")[0]

    thumb = html.xpath("//div[@class='myui-content__thumb']")[0]
    # './/' restricts the search to the thumbnail block. A leading '//' is
    # an absolute path and would match every <img> in the whole document,
    # even when evaluated on a sub-element.
    movie['图片url'] = thumb.xpath(".//img/@data-original")

    movie['评分'] = html.xpath("//div[@id='rating']//span[@class='branch']/text()")[0]

    # All metadata links, queried once. Assumed layout (TODO confirm against
    # the live page): [category, region, year, actor..., director].
    links = html.xpath("//p[@class='data']//a/text()")
    movie['分类'] = links[0]
    movie['地区'] = links[1]
    movie['年份'] = links[2]
    # Everything between the year and the director is the cast. This works
    # for any number of actors; a fixed "last four before the director"
    # window would pull in region/year when fewer than four are listed.
    movie['主演'] = links[3:-1]
    movie['导演'] = links[-1]

    # The update time is the last <span> text inside the hidden data row.
    movie['更新时间'] = html.xpath("//p[@class='data hidden-sm hidden-xs']//span/text()")[-1]
    movie['更新情况'] = html.xpath("//span[@class='pic-text text-right']/text()")
    movie['简介'] = html.xpath("//p[@class='desc text-collapse hidden-xs']/text()")
    return movie

def execute(pages=range(1, 6)):
    """Crawl listing pages and print the metadata of every show on them.

    Args:
        pages: Iterable of listing page numbers to crawl. Defaults to
            pages 1 through 5 (``range(1, 6)`` excludes 6).

    Each show is printed as a dict, followed by a separator line.
    """
    base_url = 'https://www.duboku.net/vodshow/13--------{}---.html'
    for page in pages:
        # Listing page URL, e.g.
        # https://www.duboku.net/vodshow/13--------1---.html
        url = base_url.format(page)
        for href_url in get_HomePage(url):
            movie = parse_href(href_url)
            print(movie)
            print('==========='*30)

# Start the crawl only when this file is run as a script, not on import.
if __name__ == "__main__":
    execute()

运行结果如下(部分数据):

{'电视剧': '锦衣之下', '图片url': ['https://www.duboku.net/upload/vod/20200131-1/fd4228a1a0a7481168437156952625da.jpg'], '评分': '9.1', '分类': '陆剧', '地区': '内地', '年份': '2019', '主演': ['任嘉伦', '谭松韵', '韩栋', '叶青'], '导演': '尹涛', '更新时间': '2020-02-02 12:29:09', '更新情况': ['更新至46集'], '简介': ['《锦衣之下》线上看,全集共55集 每周五至周一12点更新2集。《锦衣之下》剧情简介:天赋异禀的六扇门女捕快袁今夏(谭松韵饰)因为一桩案件和性情狠辣的锦衣卫陆绎(任嘉伦饰)结下梁子,今夏本以为此生与他再']}
=============================================================
{'电视剧': '下一站是幸福', '图片url': ['https://www.duboku.net/upload/vod/20200126-1/84525120d89870c828b6faaa6b464064.jpg'], '评分': '8.8', '分类': '陆剧', '地区': '内地', '年份': '2020', '主演': ['张雨剑', '虞书欣', '杨之楹', '张磊'], '导演': '丁梓光', '更新时间': '2020-02-01 11:29:35', '更新情况': ['更新至14集'], '简介': ['《下一站是幸福》线上看,全集共41集,每周日至周四21点更2集,周五六1集。《下一站是幸福》简介:贺繁星的公司面临被收购的危机,与元宋的感情也因年龄的差距而受到诸多非议,感情和事业几乎同时出现的危机让']}
=============================================================
{'电视剧': '还没爱够', '图片url': ['https://www.duboku.net/upload/vod/20191231-1/e913c73d9b63144ce3a0b3dae4714350.jpg'], '评分': '7.9', '分类': '陆剧', '地区': '内地', '年份': '2020', '主演': ['韩庚', '王晓晨', '叶祖新', '张晓晨'], '导演': '王迎', '更新时间': '2020-02-01 10:14:24', '更新情况': ['全46集'], '简介': ['《还没爱够》线上看,全集共46集。《还没爱够》讲述了“恐婚”青年陈炯再遇“被逃婚”前任姜小汐,互相理解再次相爱并克服“恐婚症”,走向婚姻的故事。']}
=============================================================
{'电视剧': '新世界', '图片url': ['https://www.duboku.net/upload/vod/20200111-1/5f2180a62b249e9495a7f95bd3973825.jpg'], '评分': '7.5', '分类': '陆剧', '地区': '内地', '年份': '2020', '主演': ['胡静', '李纯', '秦汉', '赵峥'], '导演': '徐兵', '更新时间': '2020-02-01 10:00:39', '更新情况': ['更新至34集'], '简介': ['《新世界》线上看,全集共70集,每周日至周五22点更2集,周六更1集。《新世界》剧情简介:新中国解放前夕,白纸坊警署小警察徐天在追查未婚妻贾小朵被害案件过程中,意外参与到中国共产党和平解放北平的事业当']}

这是我的第一篇博客,希望能对你有所帮助

发布了9 篇原创文章 · 获赞 2 · 访问量 299

猜你喜欢

转载自blog.csdn.net/weixin_44941564/article/details/104148567
今日推荐