Crawler 09 - Multithreaded Crawler

import threading
from queue import Queue
import requests
from lxml import etree
import json
import os


# Thread class for crawling page data
class ThreadCrawl(threading.Thread):

    # Set up the thread id and request headers
    def __init__(self, thread_id):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }

    # Override run(); it is invoked automatically once start() is called, no direct call needed
    def run(self):
        print('Starting', self.threadID)
        self.qiushi_spider()
        print('Exiting', self.threadID)

    # Crawl pages from the site
    def qiushi_spider(self):
        # Keep working while there are still page numbers left to crawl
        while not page_queue.empty():
            page = page_queue.get()
            url = 'http://www.qiushibaike.com/8hr/page/' + str(page) + '/'
            print('crawler id:', self.threadID, ', page number:', str(page))
            # retry each page up to 4 times; each request itself times out after 0.5 s
            retries = 4
            while retries > 0:
                retries -= 1
                try:
                    content = requests.get(url, headers=self.headers, timeout=0.5)
                    data_queue.put(content.text)
                    break
                except Exception as e:
                    print('qiushi_spider', e)


# Thread class for parsing page data
class ThreadParser(threading.Thread):

    # thread_id distinguishes the parser threads; file is the shared output file
    def __init__(self, thread_id, file):
        threading.Thread.__init__(self)
        self.threadID = thread_id
        self.file = file

    def run(self):
        print('Starting', self.threadID)
        while not exitFlag_Parser:
            try:
                '''
                Queue.get() removes and returns an item from the head of the queue.
                The optional block argument defaults to True: if the queue is empty
                and block is True, get() suspends the calling thread until an item
                becomes available; if the queue is empty and block is False, get()
                raises the queue.Empty exception immediately.
                '''
                item = data_queue.get(False)
                self.parse_date(item)
                # mark the item as processed so data_queue.join() will not block on it
                data_queue.task_done()
            except Exception:
                # the queue was empty; keep polling until exitFlag_Parser is set
                pass
        print('-' * 160)
        print('Exiting', self.threadID)

    # Parse the page data; item is the full HTML text of one page
    def parse_date(self, item):
        try:
            html = etree.HTML(item)
            result = html.xpath('//div[contains(@id,"qiushi_tag")]')
            for site in result:
                try:
                    img_url = site.xpath('.//img/@src')[0]
                    print('*'*30)
                    title = site.xpath('.//h2')[0].text.strip()
                    content = site.xpath('.//div[@class="content"]/span')[0].text.strip()
                    vote = ''
                    comments = ''
                    try:
                        vote = site.xpath('.//i')[0].text
                        comments = site.xpath('.//i')[1].text
                    except Exception:
                        # vote/comment counts may be missing for some posts
                        pass
                    data = {
                        'imgUrl': img_url,
                        'title': title,
                        'content': content,
                        'vote': vote,
                        'comments': comments,
                    }
                    # serialize writes to the shared output file
                    with mutex:
                        data = json.dumps(data, ensure_ascii=False)
                        print('save...', data)
                        self.file.write(data + '\n')
                except Exception as e:
                    print('site in result', e)
        except Exception as e:
            print('parse_date', e)


def main():
    # Create the data directory if it does not already exist
    if not os.path.exists('data'):
        os.mkdir('data')
    # Open the output file shared by all parser threads
    output = open('data/qiushibaike.json', 'a', encoding='utf-8')
    # Page numbers to crawl
    for page in range(1, 11):
        page_queue.put(page)
    # Start the crawler threads
    crawl_threads = []
    crawl_list = ['crawl-1', 'crawl-2', 'crawl-3']
    for thread_id in crawl_list:
        thread = ThreadCrawl(thread_id)
        thread.start()
        crawl_threads.append(thread)

    # Start the parser threads
    parse_threads = []
    parse_list = ['parse-1', 'parse-2', 'parse-3']
    for thread_id in parse_list:
        thread = ThreadParser(thread_id, output)
        thread.start()
        parse_threads.append(thread)

    # Busy-wait until every page number has been taken off the queue by a crawler thread
    while not page_queue.empty():
        pass
    # Wait for all crawler threads to finish
    for t in crawl_threads:
        t.join()
    # Busy-wait until every fetched page has been taken off the data queue by a parser thread
    while not data_queue.empty():
        pass
    global exitFlag_Parser
    print('-'*80)
    # All pages have been parsed; signal the parser threads to exit
    exitFlag_Parser = True
    # Wait for all parser threads to finish
    for t in parse_threads:
        t.join()

    print('Exiting Main Thread')
    # take the lock before closing so no parser thread is mid-write
    with mutex:
        output.close()


if __name__ == '__main__':
    # Queue of fetched page HTML, filled by the crawler threads and consumed by the parser threads
    data_queue = Queue()
    # Queue of page numbers to fetch
    page_queue = Queue(50)
    # Boolean flag telling the parser threads when to exit
    exitFlag_Parser = False
    # Lock serializing writes to the output file
    mutex = threading.Lock()
    main()
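
The script above is a producer-consumer pipeline: crawler threads pull page numbers from page_queue and push raw HTML into data_queue, parser threads drain data_queue, and a shared lock serializes the file writes. The snippet below is a minimal, self-contained sketch of that same pattern; it swaps the busy-wait loops and the exitFlag_Parser boolean for threading.Event and Queue.join(), and its fetch/parse bodies are placeholders rather than the original site logic.

import threading
from queue import Queue, Empty

page_queue = Queue()           # work items for the crawler threads (page numbers)
data_queue = Queue()           # raw pages handed from crawlers to parsers
exit_flag = threading.Event()  # tells the parser threads when to stop

def crawl_worker():
    # producer: take a page number and push a (placeholder) page body
    while True:
        try:
            page = page_queue.get(False)
        except Empty:
            break
        data_queue.put('html of page %d' % page)   # stands in for requests.get(url).text

def parse_worker():
    # consumer: poll the data queue until the exit flag is set
    while not exit_flag.is_set():
        try:
            item = data_queue.get(timeout=0.1)
        except Empty:
            continue
        print('parsed:', item)                      # stands in for the lxml/XPath extraction
        data_queue.task_done()

for page in range(1, 6):
    page_queue.put(page)

crawlers = [threading.Thread(target=crawl_worker) for _ in range(3)]
parsers = [threading.Thread(target=parse_worker) for _ in range(3)]
for t in crawlers + parsers:
    t.start()

for t in crawlers:
    t.join()           # all pages fetched
data_queue.join()      # blocks until task_done() was called for every item put
exit_flag.set()        # let the parser threads fall out of their loops
for t in parsers:
    t.join()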


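The XPath expressions in parse_date match qiushibaike's markup at the time the post was written and may no longer match the live site. As a site-independent illustration of the same lxml calls, the sketch below runs them against an invented inline HTML snippet (the markup and values are made up for the example).

from lxml import etree

sample = '''
<div id="qiushi_tag_001">
  <h2> some-author </h2>
  <div class="content"><span> joke text here </span></div>
  <i>120</i><i>15</i>
  <img src="http://example.com/avatar.jpg"/>
</div>
'''

html = etree.HTML(sample)
for site in html.xpath('//div[contains(@id,"qiushi_tag")]'):
    print(site.xpath('.//img/@src')[0])                                   # image URL
    print(site.xpath('.//h2')[0].text.strip())                            # author name
    print(site.xpath('.//div[@class="content"]/span')[0].text.strip())    # post content
    print(site.xpath('.//i')[0].text, site.xpath('.//i')[1].text)         # vote and comment counts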

Reposted from blog.csdn.net/qwerLoL123456/article/details/82532807