A crawler exercise (girl-picture scraper)

A small crawler project that scrapes girl pictures from jandan.net's ooxx board.

The image URLs on the page are obfuscated: they are recovered here with Base64 decoding, and the page's JavaScript also carries a token (a constant) that is involved in unlocking the image URL.
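To get a feel for that decoding step, here is a minimal, self-contained sketch of it. The sample hash below is fabricated purely so the snippet runs on its own; real hashes come from the page's .img-hash elements, and the JS token is not needed for the plain Base64 case shown here.

import base64

def decode_img_hash(img_hash):
    # Pad to a multiple of 4 characters, then Base64-decode into a protocol-relative URL
    missing = len(img_hash) % 4
    if missing:
        img_hash += '=' * (4 - missing)
    return base64.b64decode(img_hash).decode('utf8')

# Fabricated example hash, built here only so the snippet is runnable on its own
sample_hash = base64.b64encode(b'//wx3.sinaimg.cn/mw600/example.jpg').decode('ascii')
print('http:' + decode_img_hash(sample_hash))  # -> http://wx3.sinaimg.cn/mw600/example.jpg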

Without further ado, here is the source:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import hashlib
import base64
from bs4 import BeautifulSoup
import requests
import re
import os
import queue
import threading
import math
from multiprocessing import Pool
import sys
sys.stderr = None  # deliberately silence all error output (see the note at the end of the post)
'''
URL decoding helpers
'''


def parse(imgHash, constant):
    # The constant extracted from the JS file is accepted for compatibility,
    # but this decoder only Base64-decodes the hash, so it is not used here
    return decode_base64(imgHash).decode('utf8')

def md5(src):
    # MD5 helper; not called by the current decoding path
    m = hashlib.md5()
    m.update(src.encode("utf8"))
    return m.hexdigest()


def decode_base64(data):
    # Pad the hash to a multiple of 4 characters before Base64-decoding it
    missing_padding = len(data) % 4
    if missing_padding:
        data += '=' * (4 - missing_padding)
    return base64.b64decode(data)


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
}

'''
Page-crawling thread
'''


class Spider(threading.Thread):

    def __init__(self, pages, proxies, url_manager):
        threading.Thread.__init__(self)
        self.pages = pages
        self.proxies = proxies
        self.url_manager = url_manager

    def get_Page(self, page, proxies, url_manager):
        bs_page = BeautifulSoup(page, "lxml")

        '''
        Find the JS file referenced by the page; the `constant` is extracted from it
        '''
        try:
            model = re.findall(r'.*<script\ssrc=\"\/\/(cdn.jandan.net\/static\/min.*?)\"><\/script>.*', page)
            jsfile_url = "http://" + model[len(model) - 1]  # the page may reference two such files; take the last match
        except Exception as e:
            print(e)
            return  # without the JS file the page's image hashes cannot be decoded
        jsfile = requests.get(jsfile_url, headers=headers, proxies=proxies, timeout=3).text

        constant = re.search(r'.*remove\(\);var\sc=\w+\(e,\"(\w+)\".*', jsfile).group(1)
        '''
        Pass the constant and each img-hash to parse() to recover the image URL
        '''
        for item in bs_page.select('.img-hash'):
            img_url = 'http:' + parse(item.text, constant)
            url_manager.addNewUrl(img_url)

    def run(self):
        for page in self.pages:
            self.get_Page(page, self.proxies, self.url_manager)


'''
Program entry point
'''


def main(amount):
    url_manager = UrlManager()
    proxies = {'http': ''}  # IP proxy support has not been added yet; the program runs fine without it

    current_url = 'http://jandan.net/ooxx'  # URL of the listing page currently being fetched
    '''
    Collect the listing pages, then parse them with multiple threads
    '''
    pages = []  # HTML source of every page to be parsed
    try:
        for i in range(amount):
            current_page = requests.get(current_url, headers=headers).text  # HTML of the current page
            pages.append(current_page)
            current_url = 'http:' + re.search(r'.*Older\sComments\"\shref=\"(.*?)\"\sclass.*', current_page).group(1)  # extract the next page's URL from the "Older Comments" link
    except Exception:
        pass  # stop paging silently once the next-page link can no longer be found

    page_threads = []
    t_amount = 10 if len(pages) > 10 else len(pages)  # number of page-parsing threads
    chunk = math.ceil(len(pages) / t_amount) if t_amount else 0  # pages handled by each thread
    for i in range(t_amount):
        t = Spider(pages[chunk * i:chunk * (i + 1)], proxies, url_manager)
        page_threads.append(t)
    for t in page_threads:
        t.start()
    for t in page_threads:
        t.join()

    img_threads = []
    for i in range(10):  # a fixed pool of 10 threads for downloading images
        t = Download(url_manager)
        img_threads.append(t)
    for t in img_threads:
        t.start()
    for t in img_threads:
        t.join()


L = threading.Lock()  # guards the img/ directory so the count-based file names stay unique

'''
Image download thread
'''


class Download(threading.Thread):
    def __init__(self, url_manager):
        threading.Thread.__init__(self)
        self.url_manager = url_manager
        self.pic_headers = headers.copy()  # copy so that setting Host does not leak into the shared headers dict
        self.pic_headers['Host'] = 'wx3.sinaimg.cn'

    def download_Img(self, url):
        isGif = re.match(r'(.*\.sinaimg\.cn\/)(\w+)(\/.+\.gif)', url)
        if isGif:
            url = isGif.group(1) + 'large' + isGif.group(3)  # request the full-size ('large') variant for GIFs

        extensionName = re.match(r'.*(\.\w+)', url).group(1)  # image file extension

        L.acquire()  # serialise writes so the directory-count file names stay unique
        if not os.path.exists('img'):
            os.mkdir('img')
        with open('img/' + str(len(os.listdir('./img'))) + extensionName, 'wb') as f:
            f.write(requests.get(url, headers=self.pic_headers).content)
        L.release()

    def run(self):
        while not self.url_manager.isEmpty():
            imgUrl = self.url_manager.getNewUrl()
            self.download_Img(imgUrl)
            self.url_manager.addOldUrl(imgUrl)


'''
URL repository: hands out new URLs and records the ones already downloaded
'''


class UrlManager:
    def __init__(self):
        self.url_used = []
        self.url_target = queue.Queue()
        if os.path.exists('url.txt'):
            with open('url.txt', 'r') as f:
                for eachline in f.readlines():
                    self.url_used.append(eachline.strip())  # URLs downloaded on previous runs
        else:
            open("url.txt", 'w').close()  # create an empty record file

    def getNewUrl(self):
        return self.url_target.get()

    def isEmpty(self):
        return self.url_target.empty()

    def addNewUrl(self, newUrl):
        if newUrl not in self.url_used:  # skip URLs that were downloaded in a previous run
            self.url_target.put(newUrl)

    def addOldUrl(self, oldUrl):
        self.url_used.append(oldUrl)
        with open('url.txt', 'a') as f:
            f.write(oldUrl + '\n')


if __name__ == '__main__':
    num_list = list(range(48))  # worker i crawls the first i pages of the board
    res_l = []
    p = Pool()
    for i in num_list:
        res = p.apply_async(main, args=(i,))
        res_l.append(res)
    for idx, k in enumerate(res_l):
        k.get()  # block until this worker has finished
        print('Worker %s finished downloading' % idx)

  

The script combines multithreading and multiprocessing (and silences every error via sys.stderr = None, so there is plenty of room to bolt proper error handling on top). Thanks for reading!
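If you do want to build on the error handling, one possible direction (my own sketch, not part of the original script) is to keep errors instead of discarding them by routing them through the logging module. The file name spider.log and the safe_get wrapper below are hypothetical names chosen for illustration.

import logging
import requests

# Send warnings and errors to a log file instead of silencing stderr
logging.basicConfig(filename='spider.log', level=logging.WARNING,
                    format='%(asctime)s %(threadName)s %(message)s')

def safe_get(url, **kwargs):
    # Hypothetical wrapper around requests.get that logs failures and returns None on error
    try:
        return requests.get(url, timeout=3, **kwargs)
    except requests.RequestException:
        logging.exception('request failed: %s', url)
        return None

Any of the requests.get calls in the script above could then be swapped for a wrapper like this.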


Reposted from www.cnblogs.com/rianley/p/9254595.html