python爬虫六:爬取电影图片及简介

# -*- coding: utf-8 -*-

#2345电影排行榜
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Generic helper for fetching a web page.
def get_html(url, encoding='gbk', timeout=30):
    """Download *url* and return its body as decoded text.

    Args:
        url: Address of the page to fetch.
        encoding: Encoding used to decode the response body. Defaults to
            'gbk', which is what this script uses for the target site.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body, or None when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are expected here; anything else is a
        # programming error and must surface instead of being swallowed.
        print("wrong", exc)
        return None

    # Show the encoding requests guessed before it is overridden.
    print(r.encoding)
    r.encoding = encoding
    return r.text

# Generic helper for downloading an image.
def get_pic_from_url(url, filename='filename', timeout=30):
    """Download the binary content at *url* and write it to *filename*.

    Args:
        url: Address of the image to download.
        filename: Path of the file to write. Defaults to 'filename', the
            name this helper has always written to.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an HTTP error status.
        OSError: If the destination file cannot be written.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly rather than saving an HTML error page as an image.
    resp.raise_for_status()
    # The context manager guarantees the handle is closed even on error.
    with open(filename, 'wb') as f:
        f.write(resp.content)



def _safe_filename(name):
    """Replace characters that are invalid in file names with underscores."""
    return ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)


def _parse_movie(li):
    """Extract poster URL, title, release date, cast and synopsis from one <li>.

    Returns:
        A (img_src, name, release, cast, intro) tuple, or None when the entry
        has no poster image or no title link.
    """
    # Poster: <img onerror="..." src="//imgwx4.2345.com/.../sup198834_223x310.jpg" ...>
    img = li.find('img')
    # Title: <span class="sTit"><a href="..." target="_blank">title</a></span>
    title_span = li.find('span', attrs={'class': 'sTit'})
    if img is None or not img.get('src'):
        return None
    if title_span is None or title_span.a is None:
        return None
    name = title_span.a.text

    # Release date: <span class="sIntro">...2015-05-12</span>
    # The span is absent for some entries; fall back to the placeholder text.
    release_span = li.find('span', attrs={'class': 'sIntro'})
    release = release_span.text if release_span is not None else '还没上映'

    # Cast: <p class="pActor">label <a ...>actor</a> <a ...>actor</a> ...</p>
    # get_text collapses the whitespace between the links and tolerates
    # nested markup, unlike concatenating each child's .string.
    cast_tag = li.find('p', attrs={'class': 'pActor'})
    cast = cast_tag.get_text(' ', strip=True) if cast_tag is not None else ''

    # Synopsis: <p class="pTxt pIntroShow">... <a class="aMore ...">...</a></p>
    intro_tag = li.find('p', attrs={'class': 'pTxt pIntroShow'})
    intro = intro_tag.text if intro_tag is not None else ''

    return img['src'], name, release, cast, intro


def _download_poster(img_url, dest_path, timeout=30):
    """Fetch *img_url* and write the response bytes to *dest_path*."""
    resp = requests.get(img_url, timeout=timeout)
    resp.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(resp.content)


def main(url, save_dir='C:testdata/image/'):
    """Scrape the movie ranking page at *url*, print each entry, save posters.

    For every <li> inside the <ul class="picList clearfix"> list this prints
    the title, release date, cast and synopsis, then downloads the poster to
    ``<save_dir>/<title>.png``.

    Args:
        url: Address of the ranking page.
        save_dir: Directory that receives the poster images; created when it
            does not exist yet.
            NOTE(review): on Windows 'C:testdata' (no slash after the colon)
            is relative to the current directory of drive C; confirm whether
            'C:/testdata/image/' was intended.
    """
    html = get_html(url)
    if html is None:
        # get_html has already reported the failure.
        return

    soup = BeautifulSoup(html, 'lxml')
    movie_list = soup.find('ul', attrs={'class': 'picList clearfix'})
    if movie_list is None:
        print('movie list not found on page: {}'.format(url))
        return

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    for li in movie_list.find_all('li'):
        movie = _parse_movie(li)
        if movie is None:
            continue
        img_src, name, release, cast, intro = movie

        print('{}\t{}\n{}\n{}\n'.format(name, release, cast, intro))

        # src is protocol-relative ("//host/path"); urljoin resolves it
        # against the page URL and also copes with absolute/relative forms.
        img_url = urljoin(url, img_src)
        # NOTE(review): the sample poster URL ends in .jpg, but the '.png'
        # suffix is kept so existing output file names do not change.
        dest = os.path.join(save_dir, _safe_filename(name) + '.png')
        try:
            _download_poster(img_url, dest)
        except (requests.RequestException, OSError) as exc:
            # One bad poster must not abort the remaining entries.
            print('failed to save poster for {}: {}'.format(name, exc))



# Ranking page that the script scrapes.
url = 'http://dianying.2345.com/top/'

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main(url)

猜你喜欢

转载自blog.csdn.net/qq_38788128/article/details/80481708