Python + Selenium multithreaded and multiprocess crawlers

This post uses Python + Selenium to scrape exchange announcement data from the Shenzhen Stock Exchange (SZSE). I originally crawled with a single process; recently I reworked the code to crawl with multiple processes and with multiple threads, which is considerably faster. If you are not familiar with Selenium, please learn the basics elsewhere first.
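The notice list is paginated: page 1 lives at index.html and page n at index_{n-1}.html, with 20 announcements per list page. Both scripts below build these URLs by hand; here is a minimal helper capturing the same scheme (the function name page_url is my own, for illustration):

def page_url(page):
    # Page 1 has no numeric suffix; page n maps to index_{n-1}.html
    if page == 1:
        return "http://www.szse.cn/disclosure/notice/index.html"
    return "http://www.szse.cn/disclosure/notice/index_" + str(page - 1) + ".html"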

Multiprocess crawling

# coding=utf-8
'''
Scrape SZSE exchange announcement data with multiple processes.
Titles and announcement bodies are written to separate csv files.
Author: 西兰
Date: 2019-11-30
'''

from selenium import webdriver
import time
import csv
from multiprocessing import Process

def process(start, end, num):
    # start/end: page range for this worker; num: suffix for its output csv files
    driver_path = r"D:\chromedriver.exe"
    # Developer mode (hides the "controlled by automated test software" banner)
    #options = webdriver.ChromeOptions()
    #options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path=driver_path)
    browser.implicitly_wait(1)


    count = 1
    for j in range(start, end):
      # Each list page holds 20 records; reset the counter after a full page
      if (count % 21 == 0):
          count = 1
      if (j == 1):
          url = "http://www.szse.cn/disclosure/notice/index.html"
      else:
          url = "http://www.szse.cn/disclosure/notice/index_" + str(j - 1) + ".html"
      # if (j % 10 == 0):  # Optionally restart the browser every 10 pages
      #     browser.quit()
      #     browser = webdriver.Chrome(executable_path=driver_path)

      for i in range(20):
        browser.get(url)
        browser.maximize_window()
        print("####################################################第",j,"页,第",count,"条记录")
        # 获取列表页handle
        list_page_handle = browser.current_window_handle
        div_content = browser.find_element_by_css_selector('div.g-content-list')
        li_list = div_content.find_elements_by_tag_name('li')
        a_href = li_list[i].find_element_by_tag_name('a').get_attribute('href')
        # Skip entries that link straight to a pdf/doc attachment instead of a detail page
        if (a_href.find('.pdf') > 0 or a_href.find('.doc') > 0 or a_href.find('.DOC') > 0): continue
        print(a_href)
        li_list[i].find_element_by_tag_name('a').click()
        all_handles = browser.window_handles
        for handle in all_handles:
            if (handle != list_page_handle):
                browser.switch_to.window(handle)
        # Title
        title_div = browser.find_element_by_css_selector('div.des-header')
        title_h2 = title_div.find_element_by_tag_name('h2')
        print(title_h2.text)
        data_row_title = [title_h2.text]
        with open('./data/sz_data_title' + str(num) + '.csv', 'a+', newline="", encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row_title)
        # Announcement body
        content_div = browser.find_element_by_id('desContent')
        p_content_list = content_div.find_elements_by_tag_name('p')
        final_text=""
        for p in p_content_list:
            final_text+=p.text.strip()
        print(final_text)
        data_row = [final_text]
        with open('./data/sz_data'+ str(num) +'.csv', 'a+', newline="",encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row)
        time.sleep(1)
        count += 1
        browser.close()
        browser.switch_to.window(list_page_handle)

def main():
    # Start 4 processes; each gets its own page range and output-file suffix
    process_list = []
    p1 = Process(target=process, args=(400,600,1))
    p1.start()
    p2 = Process(target=process, args=(600, 800, 2))
    p2.start()
    p3 = Process(target=process, args=(800, 1000, 3))
    p3.start()
    p4 = Process(target=process, args=(1000, 1129, 4))
    p4.start()
    process_list.append(p1)
    process_list.append(p2)
    process_list.append(p3)
    process_list.append(p4)
    for t in process_list:
        t.join()

if __name__ == '__main__':
    s = time.time()
    main()
    e = time.time()
    print('Total time:', e - s)
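Each process appends to its own pair of csv files (the num suffix), which avoids concurrent writes to the same file. If you want a single file at the end, here is a minimal merge sketch, assuming the workers were numbered 1 to 4; the merged filename sz_data_all.csv is my own choice:

# coding=utf-8
'''Merge the per-worker announcement csv files into one; run after all workers finish.'''
import csv

with open('./data/sz_data_all.csv', 'w', newline="", encoding='utf-8') as out:
    writer = csv.writer(out)
    for num in range(1, 5):  # worker file suffixes 1..4
        with open('./data/sz_data' + str(num) + '.csv', newline="", encoding='utf-8') as f:
            for row in csv.reader(f):
                writer.writerow(row)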

Multithreaded crawling

# coding=utf-8
'''
Scrape SZSE exchange announcement data with multiple threads.
Author: 西兰
Date: 2019-11-30
'''

from selenium import webdriver
import time
import csv
from threading import Thread

def process(start, end, num):
    # start/end: page range for this worker; num: suffix for its output csv files
    driver_path = r"D:\chromedriver.exe"
    # Developer mode (hides the "controlled by automated test software" banner)
    #options = webdriver.ChromeOptions()
    #options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path=driver_path)
    browser.implicitly_wait(1)


    count = 1
    for j in range(start, end):
      # Each list page holds 20 records; reset the counter after a full page
      if (count % 21 == 0):
          count = 1
      if (j == 1):
          url = "http://www.szse.cn/disclosure/notice/index.html"
      else:
          url = "http://www.szse.cn/disclosure/notice/index_" + str(j - 1) + ".html"
      # if (j % 10 == 0):  # Optionally restart the browser every 10 pages
      #     browser.quit()
      #     browser = webdriver.Chrome(executable_path=driver_path)

      for i in range(20):
        browser.get(url)
        browser.maximize_window()
        print("####################################################第",j,"页,第",count,"条记录")
        # 获取列表页handle
        list_page_handle = browser.current_window_handle
        div_content = browser.find_element_by_css_selector('div.g-content-list')
        li_list = div_content.find_elements_by_tag_name('li')
        a_href = li_list[i].find_element_by_tag_name('a').get_attribute('href')
        # Skip entries that link straight to a pdf/doc attachment instead of a detail page
        if (a_href.find('.pdf') > 0 or a_href.find('.doc') > 0 or a_href.find('.DOC') > 0): continue
        print(a_href)
        li_list[i].find_element_by_tag_name('a').click()
        all_handles = browser.window_handles
        for handle in all_handles:
            if (handle != list_page_handle):
                browser.switch_to.window(handle)
        # Title
        title_div = browser.find_element_by_css_selector('div.des-header')
        title_h2 = title_div.find_element_by_tag_name('h2')
        print(title_h2.text)
        data_row_title = [title_h2.text]
        with open('./data/sz_data_title' + str(num) + '.csv', 'a+', newline="", encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row_title)
        # Announcement body
        content_div = browser.find_element_by_id('desContent')
        p_content_list = content_div.find_elements_by_tag_name('p')
        final_text=""
        for p in p_content_list:
            final_text+=p.text.strip()
        print(final_text)
        data_row = [final_text]
        with open('./data/sz_data'+ str(num) +'.csv', 'a+', newline="",encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row)
        time.sleep(1)
        count += 1
        browser.close()
        browser.switch_to.window(list_page_handle)

def main():
    # Start 4 threads; each gets its own page range and output-file suffix
    thread_list = []
    t1 = Thread(target=process, args=(400,600,1))
    t1.start()
    t2 = Thread(target=process, args=(600, 800, 2))
    t2.start()
    t3 = Thread(target=process, args=(800, 1000, 3))
    t3.start()
    t4 = Thread(target=process, args=(1000, 1129, 4))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    for t in thread_list:
        t.join()

if __name__ == '__main__':
    s = time.time()
    main()
    e = time.time()
    print('Total time:', e - s)
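The threaded version performs about as well as the multiprocess one here because the work is I/O-bound: each thread spends most of its time waiting on its own Chrome instance, so Python's GIL is rarely contended. If you prefer not to manage Thread objects by hand, concurrent.futures offers a tidier way to launch the same workers; a minimal sketch reusing the process function above, with the same page splits as main:

from concurrent.futures import ThreadPoolExecutor

def main():
    # Same page ranges and output-file suffixes as the hand-rolled version
    jobs = [(400, 600, 1), (600, 800, 2), (800, 1000, 3), (1000, 1129, 4)]
    with ThreadPoolExecutor(max_workers=4) as pool:
        futures = [pool.submit(process, start, end, num) for start, end, num in jobs]
        for f in futures:
            f.result()  # propagates any exception raised in a worker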

If you enjoy programming, feel free to follow my WeChat official account, and let's improve together!

