Multi-threaded crawler: scraping the meizitu image site

# -*- coding:utf-8 -*-
import requests
import threading
import re
import time
import os



all_urls = []   # all paginated list-page URLs, built from the URL template
all_img_urls = [] # photo-set detail-page URLs collected from every list page in all_urls
g_lock = threading.Lock() # a lock guarding the shared lists
pic_links = []    # stores every photo set's image addresses (one dict per set)

headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 '
        +'(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        'Host':'www.meizitu.com'
    }

class Spider():
    def __init__(self, target_url, headers):       # target_url: URL template for the paginated list pages
        self.target_url = target_url
        self.headers = headers



    def getUrls(self, start_page, page_num):
        global all_urls
        for i in range(start_page, page_num+1):
            url = self.target_url % i
            all_urls.append(url)


class Producer(threading.Thread):

    def run(self):
        while True:
            g_lock.acquire()    # the shared all_urls list must only be touched while holding the lock
            if len(all_urls) == 0:   # checking inside the lock avoids popping from an empty list
                g_lock.release()
                break
            page_url = all_urls.pop()   # pop() removes and returns the last list-page URL
            g_lock.release()

            try:
                print("Parsing: " + page_url)
                response = requests.get(page_url, headers = headers, timeout = 3)
                # note: this page wraps _blank in single quotes, so the pattern matches them rather than double quotes
                all_pic_link = re.findall('<a target=\'_blank\' href="(.*?)">', response.text, re.S)
                global all_img_urls
                g_lock.acquire()
                all_img_urls += all_pic_link  # collect every photo-set detail URL found on this list page
                print(all_img_urls)
                g_lock.release()
                time.sleep(0.5)
            except Exception:
                pass   # skip list pages that fail to download or parse


# Next, a consumer/observer that keeps watching the list of photo-set detail pages collected above


class Consumer(threading.Thread):
    def run(self):
        global all_img_urls # the shared list of photo-set detail-page URLs
        print("Thread %s is running" % threading.current_thread().name)
        while True:
            g_lock.acquire()
            if len(all_img_urls) == 0:   # checking inside the lock avoids popping from an empty list
                g_lock.release()
                break
            img_url = all_img_urls.pop()
            g_lock.release()

            try:
                response = requests.get(img_url, headers = headers)
                response.encoding = 'gb2312'        # the detail pages are encoded in gb2312, so decode accordingly
                # the set title is the part of <title> before " | 妹子图"; the | must be escaped in the regex
                title = re.search(r'<title>(.*?) \| 妹子图</title>', response.text).group(1)
                all_pic_src = re.findall('<img alt=.*? src="(.*?)"></br>', response.text, re.S)   # every image URL in this set

                pic_dict = {title:all_pic_src}     # one dict per set: key is the title, value is the list of image URLs
                global pic_links       # the shared list holding every set's images
                g_lock.acquire()
                pic_links.append(pic_dict)     # a list of dicts, each mapping a set title to its image URLs
                print(title + " fetched successfully")
                g_lock.release()
            except Exception:
                pass   # skip sets that fail to download or parse
            time.sleep(1)


# Finally, the downloader class
class DownPic(threading.Thread):
    def run(self):
        while True:          # loop forever, watching pic_links for new photo sets; this thread never exits on its own
            global pic_links       # the shared list of {title: [image URLs]} dicts
            # take the lock before touching the shared list
            g_lock.acquire()
            if len(pic_links) == 0:
                g_lock.release()
                time.sleep(0.5)   # nothing queued yet; sleep briefly instead of busy-spinning
                continue
            else:
                pic = pic_links.pop()
                g_lock.release()
                for key, values in pic.items():
                    path = key.rstrip("/")   # strip trailing slashes so the title is safe to use as a directory name
                    pos = "image_save_dir"   # placeholder: replace with the directory where images should be saved
                    save_dir = os.path.join(pos, path)

                    if not os.path.exists(save_dir):
                        # create the directory for this photo set if it does not exist yet
                        os.makedirs(save_dir)
                        print(save_dir, "created")
                    else:
                        print(save_dir, "already exists")

                    for pic_url in values:
                        filename = os.path.join(save_dir, pic_url.split('/')[-1])
                        if os.path.exists(filename):   # skip images that were already downloaded
                            continue
                        else:
                            try:
                                response = requests.get(pic_url, headers=headers)
                                with open(filename, 'wb') as f:
                                    f.write(response.content)
                            except Exception as e:
                                print(e)






if __name__ == '__main__':

    threads = []

    target_url = 'http://www.meizitu.com/a/pure_%d.html'  # URL pattern for the paginated list pages
    spider = Spider(target_url, headers)
    spider.getUrls(1, 2)
    for x in range(2):
        t = Producer()
        t.start()          # start() runs run() in a new thread; calling run() directly would execute it in the current thread
        threads.append(t)

    for tt in threads:
        tt.join()
    print("进行到我这里了")

    for x in range(10):
        ta = Consumer()
        ta.start()

    for x in range(1):
        down = DownPic()
        down.start()
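
A side note on the design: the manual list-plus-lock bookkeeping above works, but the standard library's queue.Queue is thread-safe on its own and removes the need for explicit locking. Below is a minimal sketch, under the assumption that only the first stage (turning list pages into detail-page URLs) is rebuilt; it reuses the URL pattern and the regular expression from the script above, while the helper name parse_list_pages and the small headers dict are illustrative, not part of the original post.

# Sketch: the list-page stage rebuilt on queue.Queue, which handles locking internally.
import queue
import re
import threading

import requests

headers = {'User-Agent': 'Mozilla/5.0', 'Host': 'www.meizitu.com'}

page_queue = queue.Queue()   # list-page URLs waiting to be parsed
img_queue = queue.Queue()    # photo-set detail-page URLs found on those pages


def parse_list_pages():
    while True:
        try:
            page_url = page_queue.get_nowait()   # raises queue.Empty once every page has been taken
        except queue.Empty:
            break
        try:
            response = requests.get(page_url, headers=headers, timeout=3)
            for link in re.findall('<a target=\'_blank\' href="(.*?)">', response.text, re.S):
                img_queue.put(link)   # hand detail-page URLs to the next stage
        except Exception:
            pass   # skip pages that fail to download or parse


if __name__ == '__main__':
    for i in range(1, 3):
        page_queue.put('http://www.meizitu.com/a/pure_%d.html' % i)
    workers = [threading.Thread(target=parse_list_pages) for _ in range(2)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    print(img_queue.qsize(), "detail pages collected")

The consumer and downloader stages could be rebuilt the same way, with blocking img_queue.get() calls replacing the len() checks and explicit locks.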


Reposted from www.cnblogs.com/dairuiquan/p/10191975.html