从零开始学习--Python-爬虫 7月3日

Python

---小白121的记录笔记

用爬虫爬取煎蛋网妹纸图片

源码：

import os
import re
import requests
from bs4 import BeautifulSoup
from  selenium import webdriver
from  selenium.webdriver.support.ui import WebDriverWait

# Location of the chromedriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (which newer Python versions warn about); the resulting value is unchanged.
chrome = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"
# Single Chrome session shared by every page request in this script.
browser = webdriver.Chrome(chrome)
# Explicit-wait helper bound to the session, with a 2 second timeout.
wait = WebDriverWait(browser, 2)
# Alias for the os module; kept so existing references to `file` keep working.
file = os
# Starting counter value read by the paging loop.
num = 1



def get_source(url):
    """Load *url* in the shared Chrome session and return the page HTML.

    Args:
        url: Address of the page to open.

    Returns:
        The page source reported by the browser, or ``None`` when the page
        could not be loaded or the source is empty.
    """
    # Imported locally so this function does not need an extra file-level import.
    from selenium.common.exceptions import WebDriverException

    print('正在爬取 "%s"' % url)
    try:
        browser.get(url)
        html = browser.page_source
    except (WebDriverException, EOFError) as err:
        # Browser/navigation failures are WebDriverException; the former
        # EOFError-only handler did not cover them. EOFError is still caught
        # so nothing that was handled before starts propagating.
        print('爬取失败 "%s": %s' % (url, err))
        return None
    # Normalise an empty page source to None, as the original did implicitly.
    return html or None

def get_pares(html, num, url):
    """Download the images found in *html* into a per-page folder.

    Args:
        html: Page source of the page being scraped.
        num: Number used for the first saved file; each further file counts
            up from it.
        url: URL whose ``/page-N#comments`` number, plus one, names the
            output folder.

    Raises:
        IndexError: If *url* contains no ``/page-N#comments`` part.
        ValueError: If the captured page number is not an integer.
    """
    url_num = analysis(url)
    page = int(url_num[0]) + 1
    folder = 'F:\\python测试\\data\\煎蛋网妹纸爬取\\%s' % page

    soup = BeautifulSoup(html, 'lxml')
    image = soup.select('img')
    # The pattern runs over the textual form of the tag list and only keeps
    # <img> tags whose src attribute is directly followed by a style attribute.
    find_image = re.findall('<img src="(.*?)" style="', '%s' % image, re.S)

    # makedirs with exist_ok tolerates an already existing folder and creates
    # missing parents, where a plain mkdir raised on a re-run.
    os.makedirs(folder, exist_ok=True)
    for i in find_image:
        # Protocol-relative sources ("//host/path") need a scheme before
        # requests can fetch them.
        if i.startswith('//'):
            i = 'http:' + i
        s_c = requests.get(i)
        print('正在下载：%s' % i)
        # The context manager closes the file even if the write fails.
        with open(folder + '\\' + str(num) + '.jpg', 'wb') as save:
            save.write(s_c.content)
        num += 1



def pares_one(html):
    """Return the absolute URL of the "下一页" (older comments) link in *html*.

    The pagination anchors are selected with BeautifulSoup, rendered to text,
    and searched with a regular expression for the ``previous-comment-page``
    link; ``http:`` is prepended to the first captured href.

    Raises:
        IndexError: If no such link is present in the page.
    """
    parsed = BeautifulSoup(html, 'lxml')
    nav_links = parsed.select('#body #comments .comments .cp-pagenavi a')
    link_pattern = '<a class="previous-comment-page" href="(.*?)" title="Older Comments">下一页<'
    hrefs = re.findall(link_pattern, str(nav_links), re.S)
    # Only the first match is used.
    first_href = hrefs[0]
    return 'http:' + first_href




def analysis(url):
    """Extract the page number text embedded in *url*.

    Returns:
        A list holding, for every ``/page-<x>#comments`` occurrence in the
        string form of *url*, the captured ``<x>`` text; empty when there is
        no occurrence.
    """
    text = '%s' % url
    return [hit.group(1) for hit in re.finditer('/page-(.*?)#comments', text)]


def next(url):
    """Crawl successive pages starting at *url*, saving each page's images.

    Each round loads the current page, looks up the link to the following
    page, downloads the current page's images, and then moves on to that
    link. Crawling stops when the page counter leaves the open range
    (0, 60), when a page cannot be loaded, or when a page has no link to a
    following page.

    The name shadows the ``next`` builtin; it is kept because ``main`` calls
    the function by this name.

    Args:
        url: Address of the first page to crawl.
    """
    # The previous version recursed inside a loop whose condition read a
    # global that never changed, so the limit of 60 was never applied and the
    # call stack grew by one level per page. A local counter plus plain
    # iteration enforces the limit.
    pages_seen = num
    while 0 < pages_seen < 60:
        html = get_source(url)
        if not html:
            break
        try:
            next_url = pares_one(html)
        except IndexError:
            # pares_one indexes its first match, so a page without a
            # following-page link raises IndexError: nothing left to crawl.
            break
        # The module-level start number is passed unchanged, as before.
        get_pares(html, num, next_url)
        url = next_url
        pages_seen += 1

def main():
    """Crawl the image board from its front page, then close the browser."""
    url = 'http://jandan.net/ooxx'
    try:
        next(url)
    finally:
        # Shut the Chrome session down even when crawling ends with an error,
        # so no browser/driver process is left running.
        browser.quit()

# Start the crawl only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()

---源自公众号python那些事

主要运用了 webdriver 和 BeautifulSoup，可以先学会这两个第三方库再去运用。

成品图

邪恶的马赛克~

从零开始学习--Python-爬虫 7月3日

猜你喜欢