My First Successful Web Crawler

# -*- coding: utf-8 -*-
import urllib.request
import os
import io
import sys
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # re-wrap stdout as UTF-8 so non-ASCII output prints correctly on Windows consoles
url = "http://jandan.net/ooxx/"

def url_open(url):  # fetch a page with urllib, identifying as a desktop Chrome browser
  req = urllib.request.Request(url)
  req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")
  response = urllib.request.urlopen(req)
  html = response.read()
  return html
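# Note: the User-Agent header above makes the request look like an ordinary
# browser; many sites reject urllib's default "Python-urllib" identifier.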
def urlimg_open(page_url):  # fetch a page with headless Chrome so dynamically loaded images are present
  chrome_options = Options()
  chrome_options.add_argument('--headless')  # headless mode: load the page in the background without opening a browser window
  wb = webdriver.Chrome(options=chrome_options)
  wb.get(page_url)
  time.sleep(2)  # give the page time to finish rendering
  page1 = wb.page_source
  wb.quit()  # close the browser so each page does not leak a Chrome process
  return page1
def get_page(url):  # parse the front page and extract the current page number
  html = url_open(url)
  soup = BeautifulSoup(html, 'lxml')
  page_num = soup.find_all('span', class_='current-comment-page')  # the text looks like "[123]", so the brackets must be stripped
  page_num = page_num[0].get_text()
  return int(page_num.strip('[]'))
def find_imgs(page_url):  # collect all <img> tags on one comment page
  print(page_url)
  img_addrs = urlimg_open(page_url)
  soup_img_addrs = BeautifulSoup(img_addrs, 'lxml')
  img_addrs = soup_img_addrs.find_all('img')
  return img_addrs
def save_imgs(img_addrs):  # download every image whose src is an absolute http(s) URL
  for img in img_addrs:
    img_addr = img.get('src', '')  # some <img> tags have no src attribute
    if img_addr.startswith('http'):
      print(img_addr)
      img_name = img_addr.split('/')[-1]  # use the last path segment as the file name
      urllib.request.urlretrieve(img_addr, img_name)
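# Note: urllib.request.urlretrieve raises an exception on any failed download,
# which aborts the whole run; the sketch after the script wraps it in try/except.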
def download_mm(url):
  folder = 'OOXXx'
  os.makedirs(folder, exist_ok=True)  # os.mkdir would crash if the folder already exists
  os.chdir(folder)
  page_num = get_page(url)  # get_page already returns an int
  for page in range(page_num, 0, -1):  # walk from the newest page down to page 1
    page_url = url + 'page-' + str(page) + '#comments'
    img_addrs = find_imgs(page_url)
    save_imgs(img_addrs)
if __name__=='__main__':
  download_mm(url)
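
One follow-up worth noting: the script above launches a fresh headless Chrome for every page and stops at the first failed download. Below is a minimal refactor sketch, untested against the live site; crawl() and its folder parameter are hypothetical names of my own, not from the original program. It reuses a single browser for the whole run and skips broken image links instead of crashing.

# A minimal refactor sketch, assuming the same jandan.net page structure as above.
# crawl() is a hypothetical helper, not part of the original program.
import os
import time
import urllib.request
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def crawl(url, folder='OOXXx'):
  os.makedirs(folder, exist_ok=True)
  os.chdir(folder)
  chrome_options = Options()
  chrome_options.add_argument('--headless')
  driver = webdriver.Chrome(options=chrome_options)  # one browser for the whole run
  try:
    driver.get(url)
    time.sleep(2)  # wait for the page to render
    soup = BeautifulSoup(driver.page_source, 'lxml')
    page_num = int(soup.find('span', class_='current-comment-page').get_text().strip('[]'))
    for page in range(page_num, 0, -1):
      driver.get(url + 'page-' + str(page) + '#comments')
      time.sleep(2)
      for img in BeautifulSoup(driver.page_source, 'lxml').find_all('img'):
        src = img.get('src', '')
        if not src.startswith('http'):
          continue
        try:
          urllib.request.urlretrieve(src, src.split('/')[-1])
        except OSError as e:  # skip broken links instead of aborting the run
          print('skip:', src, e)
  finally:
    driver.quit()  # always close the browser, even after an error

if __name__ == '__main__':
  crawl("http://jandan.net/ooxx/")

Reusing one driver avoids paying the multi-second Chrome startup cost once per page, which dominates the runtime of the original version.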

Reposted from www.cnblogs.com/oldsnow/p/9582940.html