利用requests和lxml库爬取电影天堂中最新电影前10页

今天为了磨练我的爬虫技术,并且顺便复习一下requests和lxml库,写了一个小小的爬虫--------爬取电影天堂中最新电影前10页的内容。说实话,经历了很多坎坷才爬取成功,每写一段代码都必须检验爬取的结果是否正确,不过,最终完成了这个小小的任务。

第一步:

第二步: 

第三步:

 

话不多说,直接上代码!

import requests
from lxml import etree

# Base URL of the site; relative detail-page links found on list pages
# are joined onto it to form absolute URLs.
BASE_URL = "https://www.dytt8.net"

# Accumulates one info dict per scraped movie (filled by get_movie_detail).
MOVIES = []

# Accumulates the absolute detail-page URL of every movie found
# (filled by get_movie_urls).
MOVIE_URLS = []

# HTTP request headers; a browser User-Agent avoids trivial bot blocking.
HEADERS = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400"
}

# Fetch the source code of a web page.
def getHtml(url):
    """Fetch *url* and return its decoded HTML text, or None on failure.

    The site does not always declare its charset correctly, so the
    apparent (detected) encoding is used before decoding.
    """
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as exc:
        # The original had a bare `except:` whose body was the string
        # expression "爬取失败" — a no-op that silently swallowed every
        # error. Report the failure and return None explicitly.
        print("爬取失败: {}".format(exc))
        return None

# Collect the detail-page URL of every movie listed on one index page.
def get_movie_urls(page):
    """Parse one list page and append each movie's absolute URL to MOVIE_URLS."""
    tree = etree.HTML(page)
    hrefs = tree.xpath("//div[@class = 'co_content8']//ul//table//a/@href")
    for href in hrefs:
        # Links on the list page are site-relative; make them absolute.
        MOVIE_URLS.append(BASE_URL + href)


# Scrape one movie detail page and append the parsed info to MOVIES.
def get_movie_detail(url):
    """Fetch the detail page at *url*, extract the movie's metadata
    (title, poster, year, cast, ...) into a dict and append it to MOVIES.

    An (possibly empty) dict is appended even when fetching fails, so the
    caller's positional indexing into MOVIES stays aligned with MOVIE_URLS.
    """
    movie = {}
    page = getHtml(url)
    if page is not None:
        html = etree.HTML(page)
        # Movie title and poster image.
        movie["title"] = html.xpath("//h1/font[@color = '#07519a']/text()")[0]
        movie["poster"] = html.xpath("//div[@id = 'Zoom']//img/@src")[0]
        # All text lines of the info section.
        content = html.xpath("//div[@id = 'Zoom']//p//text()")
        # One-line fields: page label -> dict key.
        # Bug fix: the original stored "◎片  名" and "◎年  代" under the
        # key "translate_name", overwriting the translated name.
        simple_fields = {
            "◎译  名": "translate_name",
            "◎片  名": "movie_name",
            "◎年  代": "year",
            "◎产  地": "country",
            "◎类  别": "typ",
            "◎语  言": "language",
            "◎字  幕": "zimu",
            "◎上映日期": "data",
            "◎豆瓣评分": "douban",
            "◎片  长": "time",
            "◎导  演": "diretion",
            "◎标  签": "label",
        }
        for index, info in enumerate(content):
            matched = False
            for label, key in simple_fields.items():
                if info.startswith(label):
                    movie[key] = info.replace(label, "").strip()
                    matched = True
                    break
            if matched:
                continue
            if info.startswith("◎主  演"):
                # The cast spans several lines, up to the next section label.
                actors = [info.replace("◎主  演", "").strip()]
                for follow in content[index + 1:]:
                    if follow.startswith("◎标  签") or follow.startswith("◎简  介"):
                        break
                    actors.append(follow.strip())
                movie["actors"] = actors
            elif info.startswith("◎简  介"):
                # Bug fix: the original tested "◎简  介 " (trailing space),
                # which never matched, and stripped the wrong label.
                # Take the first line of the synopsis that follows.
                for follow in content[index + 1:]:
                    if follow.startswith("【下载地址】"):
                        break
                    movie['introdu'] = follow.strip()
                    break

    MOVIES.append(movie)

if __name__ == '__main__':
    # Collect the movie detail URLs from the first 10 list pages.
    for page_num in range(1, 11):
        list_url = "https://www.dytt8.net/html/gndy/dyzz/list_23_" + str(page_num) + ".html"
        get_movie_urls(getHtml(list_url))
    # Scrape the details of every collected movie and print them.
    for count, movie_url in enumerate(MOVIE_URLS, start=1):
        print("---------------正在爬取第{}个电影-----------------".format(count))
        get_movie_detail(movie_url)
        print(MOVIES[count - 1])
        print("--------------------------------------------------")

欢迎各位志同道合的小伙伴评论哦!!

猜你喜欢

转载自blog.csdn.net/yanzhiguo98/article/details/86566617