Crawling news titles and links with multiple threads

The news listing is paginated at URLs like https://news.cnblogs.com/n/page/10/; the last number in the url is the page number.
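
As a quick sanity check of that pattern, a single page can be fetched on its own before building the full crawler; a minimal sketch (the page number 2 and the short User-Agent string here are arbitrary choices, not part of the original script):

import requests

page_url = 'https://news.cnblogs.com/n/page/2/'  # the trailing number selects the page
resp = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'})
print(resp.status_code, len(resp.text))  # expect 200 and a non-empty body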

from concurrent.futures import ThreadPoolExecutor
import threading
import time
from queue import Queue
import logging
import requests
from bs4 import BeautifulSoup

# Logging configuration
FORMAT = "%(asctime)s %(threadName)s %(thread)d %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO)

# Event shared by all worker threads, used to signal shutdown
event = threading.Event()

# URL prefix and User-Agent value
base_url = 'https://news.cnblogs.com'
page_path = '/n/page/'
ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

# Queue objects
urls = Queue()  # URLs waiting to be crawled (a queue of already-crawled URLs is omitted here)
htmls = Queue()  # raw HTML of every fetched page: large and mostly noise, so it is not persisted
outputs = Queue()  # extracted data, i.e. the output queue of results
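
# Note: Queue.get(block=True, timeout=1), as used by all workers below, raises
# queue.Empty if nothing arrives within one second; the worker loops rely on
# that to wake up regularly and re-check the shutdown event instead of blocking
# forever. Roughly:
#   from queue import Empty
#   try:
#       item = urls.get(True, 1)
#   except Empty:
#       pass  # queue stayed empty for a second; loop around and try again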

# 1. Create the urls to crawl, one per page; start is the first page, stop is the last page
def create_urls(start, stop, step=1):
    for i in range(start, stop+1, step):
        url = "{}{}{}/".format(base_url, page_path, i)
        # print(url)
        urls.put(url)  # put the generated url onto the queue of URLs waiting to be crawled
    print('finished creating urls')

# create_urls(1, 10)  # build the urls for page 1 through page 10
# print(urls.qsize())  # the queue size would then be 10

# 2. Take a url from the queue, send the request, and queue up the page HTML
def crawler():  # run concurrently by several worker threads
    while not event.is_set():
        try:
            url = urls.get(True, 1)  # wait at most 1 second for a url
            response = requests.get(url, headers={'User-Agent': ua})
            with response:
                html = response.text  # decoded text of the response body
                htmls.put(html)  # push the page content onto the htmls queue
                print('url:', url)
        # catches queue.Empty raised on timeout, as well as any request errors
        except Exception as e:
            print(e)
            # logging.error(e)
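
# Note: using the response as a context manager ("with response:") closes it when
# the block exits, releasing the underlying connection back to the requests pool.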


# 3. Parse the HTML and extract the useful data
def parse():
    while not event.is_set():
        try:
            html = htmls.get(True, 1)  # wait at most 1 second for a page
            soup = BeautifulSoup(html, 'lxml')  # parse the HTML with the lxml parser

            news = soup.select('h2.news_entry a')  # select the link inside each <h2 class="news_entry"> entry
            for n in news:
                title = n.text
                ref = base_url + n.attrs.get('href')
                print('get_title:', title, 'get_ref:', ref)
                outputs.put((title, ref))  # push the extracted (title, link) pair onto the output queue

        except Exception as e:
            print(e)
            # logging.error(e)
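
# The selector 'h2.news_entry a' assumes each entry on the listing page looks
# roughly like the fragment below (inferred from the selector, not copied from
# the live site), so n.text is the title and the href is a relative link that
# needs base_url prepended:
#   <h2 class="news_entry">
#       <a href="/n/123456/">some news title</a>
#   </h2>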

# 4. Save the results to a file
def save(path):
    with open(path, 'a+', encoding='utf-8') as f:
        while not event.is_set():
            try:
                title, ref = outputs.get(True, 1)  # the queued items are (title, ref) tuples
                print('save_title:', title, 'save_ref:', ref)
                f.write('{}_{}\n'.format(title, ref))
                f.flush()  # flush so each crawled entry reaches the file immediately
            except Exception as e:
                print(e)
                # logging.error(e)
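
# Because the file is opened in 'a+' mode, results from repeated runs keep being
# appended to the same news.txt; delete the file first if a clean run is wanted.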

# Start the workers from a thread pool (at most 10 threads)
executor = ThreadPoolExecutor(max_workers=10)
executor.submit(create_urls, 1, 10)  # seed urls; useful urls found while parsing could also be added to the queue later
executor.submit(parse)
executor.submit(save, 'news.txt')

for i in range(7):
    executor.submit(crawler)
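
# 1 create_urls + 1 parse + 1 save + 7 crawler tasks = 10 submissions in total,
# which fits within max_workers=10, so all of the long-running workers can run
# concurrently instead of waiting in the pool's internal queue.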


while True:
    cmd = input('>>>')
    if cmd.strip() == 'q':  # typing q in the console stops the worker threads within about a second
        event.set()
        executor.shutdown()
        print('closing')
        time.sleep(1)
        break
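
After stopping the crawler with q, the saved results can be inspected straight from news.txt; a small sketch for that (it assumes the title and link are separated by the final underscore, which holds as long as the link itself contains none):

with open('news.txt', encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f if line.strip()]
print(len(lines), 'entries saved')
for line in lines[:3]:
    title, _, ref = line.rpartition('_')
    print(title, '->', ref)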

Reposted from www.cnblogs.com/hongdanni/p/10573858.html