#-*- coding:utf-8 -*-
import urllib.request
import os
import io
import sys
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Re-wrap stdout to force UTF-8 output so printed URLs/filenames don't raise
# UnicodeEncodeError on consoles (e.g. Windows GBK) with a narrower codec.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='utf-8')
# Front page of the jandan.net "ooxx" image board; page URLs are derived from it.
url="http://jandan.net/ooxx/"
def url_open(url):
    """Fetch *url* and return the raw response body as bytes.

    A desktop-browser User-Agent header is attached because the site
    rejects the default urllib user agent.
    """
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")
    # Context manager closes the connection deterministically
    # (original left the response object unclosed).
    with urllib.request.urlopen(req) as response:
        return response.read()
def urlimg_open(page_url):
    """Render *page_url* in headless Chrome and return the page HTML.

    A real browser is used because the image list is inserted by
    JavaScript and is absent from the raw HTML response.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode: load the page without opening a browser window
    wb = webdriver.Chrome(chrome_options=chrome_options)
    try:
        wb.get(page_url)
        time.sleep(2)  # crude wait for the JS-inserted content to load
        return wb.page_source
    finally:
        # Always terminate the browser process; the original leaked one
        # headless Chrome instance per call.
        wb.quit()
def get_page(url):
    """Parse the front page and return the current comment-page number."""
    soup = BeautifulSoup(url_open(url), 'lxml')
    # The span's text looks like "[57]"; drop the brackets before converting.
    current = soup.find_all('span', class_='current-comment-page')[0]
    return int(current.get_text().strip('[]'))
def find_imgs(page_url):
    """Return every <img> tag found on the rendered comment page."""
    print (page_url)
    rendered_html = urlimg_open(page_url)
    return BeautifulSoup(rendered_html, 'lxml').find_all('img')
def save_imgs(img_addrs):
    """Download each image tag in *img_addrs* into the current directory.

    Tags whose ``src`` is not an absolute http(s) URL (e.g. protocol-relative
    ``//...`` or ``data:`` URIs) are skipped.
    """
    # Iterate the tags directly instead of indexing with range(len(...)).
    for tag in img_addrs:
        img_addr = tag.attrs['src']
        # Fixed syntax error: original read "i f img_addr.startswith(...)".
        if img_addr.startswith('http'):
            print(img_addr)
            img_name = img_addr.split('/')[-1]  # last path component as the local file name
            urllib.request.urlretrieve(img_addr, img_name)
def download_mm(url):
    """Download images from every comment page, newest to oldest.

    Creates a local 'OOXXx' folder, chdirs into it, reads the current
    page number from the site, then walks backwards from page N-1 to 0.
    """
    folder = 'OOXXx'
    # exist_ok avoids the FileExistsError the original os.mkdir raised on re-run.
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    page_num = get_page(url)  # already an int; redundant int() wrapper removed
    # Decrement-then-use mirrors the original loop: pages N-1, N-2, ..., 0.
    for _ in range(page_num):
        page_num -= 1
        page_url = url + 'page-' + str(page_num) + '#comments'
        img_addrs = find_imgs(page_url)
        save_imgs(img_addrs)
if __name__=='__main__':
    # Script entry point: start scraping from the board's front page.
    download_mm(url)