爬取电影天堂

电影天堂爬虫之网页分析

from lxml import etree
import requests
BASE_DOMAIN = 'http://www.ygdy8.net'
url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
#response.text
#response.context
#requests库,默认会使用自己猜测是编码方式将抓取下来
# 的网页进行解码,然后存储到text属性中
#在电影天堂的网页中,因为编码方式,requests库猜错了,所以产生乱码
response = requests.get(url,headers=headers)
text = response.content.decode('gbk')
# etree = html.etree
html = etree.HTML(text)
#//代表的是子孙节点,table[@class='tbspan']代表的是class为tbspan的table,//a/@href这是获取a标签下的href属性值
detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
for detail_url in detail_urls:
    print(BASE_DOMAIN+detail_url)

# print(text)

电影天堂爬虫之爬取详情页url

from lxml import etree
import requests
BASE_DOMAIN = 'http://www.ygdy8.net'
url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}
def get_detail_urls(url):
    #response.text
    #response.context
    #requests库,默认会使用自己猜测是编码方式将抓取下来
    # 的网页进行解码,然后存储到text属性中
    #在电影天堂的网页中,因为编码方式,requests库猜错了,所以产生乱码
    response = requests.get(url,headers=headers)
    text = response.content.decode('gbk')
    # etree = html.etree
    html = etree.HTML(text)
    #//代表的是子孙节点,table[@class='tbspan']代表的是class为tbspan的table,//a/@href这是获取a标签下的href属性值
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)
    return detail_urls
    # for detail_url in detail_urls:
    #     print(BASE_DOMAIN+detail_url)
    # print(text)

def spider():
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    for x in range(1,8):
        url = base_url.format(x)
        print(url)
if __name__ == '__main__':
    spider()

电影天堂爬虫之爬虫完成

# -*- coding: utf-8 -*-
from lxml import etree
import requests
import time
BASE_DOMAIN = 'http://www.ygdy8.net'
url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
}


def get_detail_urls(url):
    #response.text
    #response.context
    #requests库,默认会使用自己猜测是编码方式将抓取下来
    # 的网页进行解码,然后存储到text属性中
    #在电影天堂的网页中,因为编码方式,requests库猜错了,所以产生乱码
    response = requests.get(url,headers=headers)
    text = response.text
    # etree = html.etree
    html = etree.HTML(text)
    #//代表的是子孙节点,table[@class='tbspan']代表的是class为tbspan的table,//a/@href这是获取a标签下的href属性值
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    detail_urls = map(lambda url:BASE_DOMAIN+url,detail_urls)
    return detail_urls
    # print(30*"=")
    # for detail_url in detail_urls:
    #     print(BASE_DOMAIN+detail_url)
    # print(text)

def get_detail_page(url):
    movie = {}
    response = requests.get(url,headers=headers)
    text = response.content.decode('gbk')
    html = etree.HTML(text)
    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    # print(title)
    movie['title'] = title

    Zoom = html.xpath("//div[@id='Zoom']")[0]
    imgs = Zoom.xpath(".//img/@src")
    cover= imgs[0]
    print(cover)
    screenshot = imgs[1]
    movie['cover']=cover
    movie['screenshot'] = screenshot
    infos = Zoom.xpath(".//text()")
    for index,info in enumerate(infos):
        # print(info)
        if info.startswith("◎年  代"):
            info = info.replace("◎年  代","").strip()
            movie['year'] = info
            # print(info)
        elif info.startswith("◎产  地"):
            info = info.replace("◎产  地","").strip()
            movie['country'] = info
            # print(info)

        elif info.startswith("◎类  别"):
            info = info.replace("◎类  别","").strip()
            movie['category'] = info
            # print(info)
        elif info.startswith("◎豆瓣评分"):
            info = info.replace("◎豆瓣评分", "").strip()
            movie['douban_rating'] = info
            # print(info)
        elif info.startswith("◎片  长"):
            info = info.replace("◎片  长", "").strip()
            movie['duration'] = info
            # print(info)
        elif info.startswith("◎导  演"):
            info = info.replace("◎导  演", "").strip()
            movie['director'] = info
            # print(info)
        elif info.startswith("◎主  演"):
            info = info.replace("◎主  演", "").strip()
            # movie['year'] = info
            # print(info)
            actors = [info]
            for x in range(index+1,len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎"):
                    break
                # print(actor)
                actors.append(actor)
            print(actors)
            movie['actors']=actors
        elif info.startswith("◎简  介"):
            info = info.replace("◎简  介","")
            context = ""
            for x in range(index+1,len(infos)):
                profile = infos[x].strip()
                if profile.startswith("【下载地址】"):
                    break
                context+=infos[x]
                print(profile)
                movie['profile']=profile

    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    print(download_url)
    movie['download_url'] = 'download_url'
    return movie

def spider():
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for x in range(1,8):
        url = base_url.format(x)
        # print(url)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            movie = get_detail_page(detail_url)
            movies.append(movie)
            # print(movie)
            # print(30*"=")
            with open("demo.txt",'a+',encoding='utf-8') as fp:#a是追加
                # for key,volue in movie:
                fp.write(str(movie))
                fp.write("\n======================================================\n")
        #     break
        # break
    # print(movies)
if __name__ == '__main__':
    start = time.time()
    spider()
    end = time.time()
    print(end-start)

猜你喜欢

转载自blog.csdn.net/henusyb/article/details/89415601