Python爬取电影信息

版权声明:未经允许不得转载 https://blog.csdn.net/qq_42952437/article/details/85528042
from bs4 import BeautifulSoup
import re
import requests
from selenium import webdriver
import time

def _first_text(box, css_class):
    """Return the stripped text of the first descendant of *box* with *css_class*, or ''."""
    node = box.find(class_=css_class)
    return node.get_text(strip=True) if node else ''


# Compile once, outside the page loop (the original rebuilt it per page and
# contained a redundant `.*?.*?`).
_IMG_PATTERN = re.compile(r'<img.*?data-lazysrc="(.*?)".*?lazyImg.*?>')

for i in range(1, 4):  # scrape pages 1 through 3 (typo fixed: 爬取, not 爬去)
    firefox = webdriver.Firefox()
    try:
        firefox.get('http://vip.1905.com/list/t_1/p%so6.shtml' % i)
        time.sleep(10)  # wait for the JS-rendered movie list to load
        html = firefox.page_source
    finally:
        # BUG FIX: the browser was leaked if get()/page_source raised.
        firefox.quit()

    soup = BeautifulSoup(html, 'lxml')

    # Open once per page (the original reopened the file for every movie card).
    with open('file1.txt', 'a+', encoding='utf-8') as f:
        for box in soup.find_all(class_='borderBox'):
            # BUG FIX: the original reused loop variables *after* their loops,
            # which raises NameError on the first card with a missing field and
            # silently carries stale values over from the previous card otherwise.
            # Extract each field explicitly; '' when the element is absent.
            score = _first_text(box, 'fr score')
            title = _first_text(box, 'name')
            year = _first_text(box, 'hidden year')
            zhu_actor = _first_text(box, 'hidden actor')
            page = _first_text(box, 'hidden descr')
            url = _first_text(box, 'hidden url')

            print("得分:", score)
            print("剧名:", title)
            print("时间:", year)
            print("主演:", zhu_actor)
            print("简介:", page)
            print("网址:", url)

            f.write("得分:" + score + '\n' + "标题:" + title + '\n' + "时间:" + year + '\n' +
                    "主演:" + zhu_actor + '\n' + "简介:" + page + '\n' + "网址:" + url + '\n')
            f.write('=' * 50 + '\n')

    # Download the lazy-loaded poster images referenced in the raw HTML.
    imgs = _IMG_PATTERN.findall(html)
    print(imgs)
    for img in imgs:
        # BUG FIX: the original assigned to `url`, clobbering the movie-page
        # URL extracted above; use a dedicated name.
        img_url = 'http:' + img
        print(img)
        tupian = img.split('/')[-1]
        try:
            # BUG FIX: no timeout could hang forever on a dead link, and a
            # missing status check saved HTML error pages as image files.
            res = requests.get(img_url, timeout=30)
            res.raise_for_status()
        except requests.RequestException as exc:
            print('下载失败:', img_url, exc)  # best-effort: skip broken images
            continue
        # BUG FIX: 'wb' instead of 'ab' — re-running the script must not
        # append to and corrupt previously downloaded images.
        with open('D:\爬虫\Video' + '\\' + tupian, 'wb') as f:
            f.write(res.content)

利用火狐浏览器模拟访问页面,爬取渲染后的网页源码!

猜你喜欢

转载自blog.csdn.net/qq_42952437/article/details/85528042