# Source: https://blog.csdn.net/qq_42952437/article/details/85528042 (版权声明: reproduction requires the author's permission)
from bs4 import BeautifulSoup
import re
import requests
from selenium import webdriver
import time
# Scrape pages 1-3 of the 1905.com VIP listing. Each page is rendered in a
# real Firefox browser (the listing is built by JavaScript, so a plain HTTP
# fetch would miss it), then parsed with BeautifulSoup.
for i in range(1, 4):  # pages 1..3
    firefox = webdriver.Firefox()
    try:
        # NOTE(review): '%s' lands between 'p' and 'o6', producing e.g.
        # 'p1o6.shtml' — confirm this matches the site's real URL scheme.
        firefox.get('http://vip.1905.com/list/t_1/p%so6.shtml' % i)
        time.sleep(10)  # crude wait for the JS-rendered listing to appear
        html = firefox.page_source
    finally:
        # Always release the browser process, even if get() raises;
        # otherwise each failed iteration leaks a Firefox instance.
        firefox.quit()

    soup = BeautifulSoup(html, 'lxml')
    for soups in soup.find_all(class_='borderBox'):  # one box per movie entry
        for scores in soups.find_all(class_='fr score'):
            for score in scores:
                print("得分:", score)
        for titles in soups.find_all(class_='name'):
            for title in titles:
                print("剧名:", title)
        for years in soups.find_all(class_='hidden year'):
            for year in years:
                print("时间:", year)
        for zhu_actors in soups.find_all(class_="hidden actor"):
            for zhu_actor in zhu_actors:
                print("主演:", zhu_actor)
        for pages in soups.find_all(class_='hidden descr'):
            for page in pages:
                print("简介:", page)
        for urls in soups.find_all(class_='hidden url'):
            for url in urls:
                print("网址:", url)
        # NOTE(review): only the *last* value of each loop variable above is
        # written here; if a borderBox ever holds several entries, earlier
        # ones are printed but not saved.
        with open('file1.txt', 'a+', encoding='utf-8') as f:
            f.write("得分:" + score + '\n' + "标题:" + title + '\n' + "时间:" + year + '\n' +
                    "主演:" + zhu_actor + '\n' + "简介:" + page + '\n' + "网址:" + url + '\n')
            f.write('=' * 50 + '\n')
# Pull the lazy-loaded image URLs out of the last fetched page source and
# save each image file under D:\爬虫\Video.
# Raw string for the regex; the original's trailing '.*?.*?' was redundant
# (equivalent to a single '.*?').
patter = re.compile(r'<img.*?data-lazysrc="(.*?)".*?lazyImg.*?>')
imgs = re.findall(patter, html)
print(imgs)
for img in imgs:
    # data-lazysrc values are protocol-relative ('//...'), so prepend a scheme.
    url = 'http:' + img
    print(img)
    tupian = img.split('/')[-1]  # file name = last path segment of the URL
    res = requests.get(url)
    # Raw string keeps the Windows path literal free of accidental escapes.
    with open(r'D:\爬虫\Video' + '\\' + tupian, 'ab') as f:
        f.write(res.content)
# Uses a Firefox browser (via Selenium) to render the pages and scrape the resulting source.