For loop vs. multithreading with Selenium
Example one
for loop
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description: Collect the link URLs listed on paginated index pages with
headless Chrome, processing the configured sites one after another.
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """
    Decorator: print how long the wrapped function took to run.

    :param func: the callable to time
    :return: a wrapper that calls ``func``, prints the elapsed time and
        returns the result of ``func`` unchanged
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        elapsed = time.time() - start
        if elapsed < 60:
            print(f'spend time:\t{round(elapsed, 2)}s')
        else:
            minutes, seconds = divmod(elapsed, 60)
            print(f'takes time\t{round(minutes)}min\t{round(seconds, 2)}s')
        return ret
    return inner


class PolicyUrlDownload(object):
    """Policy data download: collect link URLs from paginated list pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: URL template with one ``{}`` placeholder for the page number
        :param pages_num: number of pages to visit (pages 1..pages_num)
        :param output_file: path of the text file the links are appended to
        :param a_xpath: XPath selecting the ``<a>`` elements on each page
        :param headless: run Chrome with the module-level headless options
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """
        Load one list page and append ``<page>\\t<href>`` lines to the output file.

        :param page: 1-based page number written in the first column
        :param url: address of the list page to load
        """
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # find_elements(By.XPATH, ...) works on Selenium 3 and 4; the
            # find_elements_by_xpath shortcut was removed in Selenium 4.3.
            titles = self.driver.find_elements(By.XPATH, self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                # Skip anchors without an href instead of writing "None".
                if href:
                    file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Visit every page in order, then shut the browser down."""
        try:
            for page, url in enumerate(self.url_list, start=1):
                self.start(page, url)
        finally:
            # quit() ends the whole WebDriver session (browser and driver
            # process); close() only closes the current window.
            self.driver.quit()


@timeit
def main(setting):
    """
    Download the links of one configured site.

    :param setting: keyword arguments for ``PolicyUrlDownload``
    """
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## ##################### start the download #### ')

    # Download settings for several paginated sites. The keys must match the
    # parameter names of PolicyUrlDownload.__init__.
    settings = [
        {
            'output_file': 'integrated management of drug supply security .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': 'integrated management of drug supply security .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]

    for setting in settings:
        main(setting)

    print('successfully downloaded, a total time spent', round(time.time() - start_time, 2), 's')
result
Download success, spent a total time of 28.46 seconds
Multithreading
# -*- coding: utf-8 -*-
"""
Datetime: 2019/6/22
Author: Zhang Yafei
Description: Collect the link URLs listed on paginated index pages with
headless Chrome, processing the configured sites concurrently (one thread
and one browser per site).
"""
import functools
import time
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--disable-gpu')


def timeit(func):
    """
    Decorator: print how long the wrapped function took to run.

    :param func: the callable to time
    :return: a wrapper that calls ``func``, prints the elapsed time and
        returns the result of ``func`` unchanged
    """
    @functools.wraps(func)
    def inner(*args, **kwargs):
        start = time.time()
        ret = func(*args, **kwargs)
        elapsed = time.time() - start
        if elapsed < 60:
            print(f'spend time:\t{round(elapsed, 2)}s')
        else:
            minutes, seconds = divmod(elapsed, 60)
            print(f'takes time\t{round(minutes)}min\t{round(seconds, 2)}s')
        return ret
    return inner


class PolicyUrlDownload(object):
    """Policy data download: collect link URLs from paginated list pages."""

    def __init__(self, url, pages_num, output_file, a_xpath, headless: bool = True):
        """
        :param url: URL template with one ``{}`` placeholder for the page number
        :param pages_num: number of pages to visit (pages 1..pages_num)
        :param output_file: path of the text file the links are appended to
        :param a_xpath: XPath selecting the ``<a>`` elements on each page
        :param headless: run Chrome with the module-level headless options
        """
        self.url_list = [url.format(page) for page in range(1, pages_num + 1)]
        self.output_file = output_file
        self.a_xpath = a_xpath
        # Each instance owns its own driver, so each worker thread that
        # builds an instance drives a separate browser.
        if headless:
            self.driver = webdriver.Chrome(options=chrome_options)
        else:
            self.driver = webdriver.Chrome()

    def start(self, page, url):
        """
        Load one list page and append ``<page>\\t<href>`` lines to the output file.

        :param page: 1-based page number written in the first column
        :param url: address of the list page to load
        """
        # NOTE(review): when several settings share one output_file, threads
        # append to it concurrently; confirm interleaved pages are acceptable.
        with open(self.output_file, mode='a', encoding='utf-8') as file:
            print(f"make request to {url}")
            self.driver.get(url)
            # find_elements(By.XPATH, ...) works on Selenium 3 and 4; the
            # find_elements_by_xpath shortcut was removed in Selenium 4.3.
            titles = self.driver.find_elements(By.XPATH, self.a_xpath)
            for title in titles:
                href = title.get_attribute('href')
                # Skip anchors without an href instead of writing "None".
                if href:
                    file.write(f'{page}\t{href}\n')
            print(f'{url} download completed')

    def run(self):
        """Visit every page in order, then shut the browser down."""
        try:
            for page, url in enumerate(self.url_list, start=1):
                self.start(page, url)
        finally:
            # quit() ends the whole WebDriver session (browser and driver
            # process); close() only closes the current window.
            self.driver.quit()


@timeit
def main(setting):
    """
    Download the links of one configured site.

    :param setting: keyword arguments for ``PolicyUrlDownload``
    """
    policy_data = PolicyUrlDownload(**setting)
    policy_data.run()


if __name__ == '__main__':
    start_time = time.time()
    print('######################## ##################### start the download #### ')

    # Download settings for several paginated sites. The keys must match the
    # parameter names of PolicyUrlDownload.__init__.
    settings = [
        {
            'output_file': 'integrated management of drug supply security .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/work/0-{}.html',
            'pages_num': 8,
            'a_xpath': '//div[@id="active0"]/ul/li/a'
        },
        {
            'output_file': 'integrated management of drug supply security .txt',
            'url': 'http://cdsip.nhfpc.gov.cn/policy/0-{}-0.html',
            'pages_num': 9,
            'a_xpath': '//div[@class="infoContent box-body"]/ul/li/a'
        }
    ]

    with ThreadPoolExecutor() as pool:
        # Consume the result iterator: Executor.map only re-raises an
        # exception from a worker when its result is retrieved.
        list(pool.map(main, settings))

    print('successfully downloaded, a total time spent', round(time.time() - start_time, 2), 's')
result
Time spent: 18.04 seconds
Example Two
Sequential execution
# -*- coding: utf-8 -*-
"""Download the pages listed in a ``<page>\\t<url>`` text file, one after another."""
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def _url_file_name(url):
    """Return the file name a URL is stored under: md5 hex digest + '.html'."""
    return md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'


class PolicyPageDownload(object):
    """Policy data download: save the rendered HTML of each URL to a directory."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the URL list file (stored, not read here)
        :param dir_name: directory the HTML files are written to
        :param url_list: URLs this instance downloads
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """
        Download one page and write its source to ``dir_name``.

        :param url: address of the page to download
        :return: None
        """
        print(f'make request to {url}')
        self.driver.get(url)
        response = self.driver.page_source
        file_name = _url_file_name(url)
        with open(os.path.join(self.dir_name, file_name), 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Entry point: download every URL, then shut the browser down."""
        try:
            for url in self.urls:
                self.start(url)
        finally:
            # Always end the WebDriver session, even if a download fails.
            self.driver.quit()


def filter_urls(dir_name, urls):
    """
    Drop URLs whose page already exists as a non-empty file in ``dir_name``.

    Duplicate URLs are collapsed (first occurrence kept) so that no page is
    downloaded twice.

    :param dir_name: directory holding previously downloaded pages
    :param urls: candidate URLs
    :return: list of URLs that still need to be downloaded, in input order
    """
    has_file = {
        name for name in os.listdir(dir_name)
        if os.path.getsize(os.path.join(dir_name, name)) > 0
    }
    unique_urls = list(dict.fromkeys(urls))
    down_urls = [url for url in unique_urls if _url_file_name(url) not in has_file]
    print(f'total {len(unique_urls)}\tdownloaded {len(has_file)}\tneed to download {len(down_urls)}')
    return down_urls


def run(url_list):
    """
    Download a batch of URLs with a dedicated browser.

    Reads the module-level ``setting`` dict (defined under ``__main__``) for
    the ``file`` and ``dir_name`` arguments.

    :param url_list: URLs to download
    """
    policy = PolicyPageDownload(url_list=url_list, **setting)
    policy.run()


def main(file, dir_name):
    """
    Read the URL list from ``file`` and download the missing pages.

    :param file: text file whose lines are ``<page>\\t<url>``
    :param dir_name: output directory (created if missing)
    """
    os.makedirs(dir_name, exist_ok=True)
    urls = []
    with open(file, 'r', encoding='utf-8') as inputfile:
        for line in inputfile:
            fields = line.strip().split('\t')
            # Ignore blank or malformed lines that have no URL column.
            if len(fields) > 1:
                urls.append(fields[1])
    urls = filter_urls(dir_name, urls)
    if not urls:
        # Nothing left to download: do not launch a browser for no work.
        return
    run(urls)


if __name__ == '__main__':
    start_time = time.time()
    # The keys must match the parameter names of main() and
    # PolicyPageDownload.__init__.
    setting = {
        'file': 'integrated management of drug supply security .txt',
        'dir_name': 'integrated management of drug supply security'
    }
    main(**setting)

    print('successfully downloaded, a total time spent', round(time.time() - start_time, 2), 's')
Multithreading
# -*- coding: utf-8 -*-
"""Download the pages listed in a ``<page>\\t<url>`` text file with a thread pool."""
import os
import time
from concurrent.futures import ThreadPoolExecutor
from hashlib import md5

import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service


def _url_file_name(url):
    """Return the file name a URL is stored under: md5 hex digest + '.html'."""
    return md5(bytes(url, encoding='utf-8')).hexdigest() + '.html'


class PolicyPageDownload(object):
    """Policy data download: save the rendered HTML of each URL to a directory."""

    def __init__(self, file, dir_name, url_list):
        """
        :param file: path of the URL list file (stored, not read here)
        :param dir_name: directory the HTML files are written to
        :param url_list: URLs this instance downloads
        """
        self.file = file
        self.dir_name = dir_name
        self.urls = url_list
        self.chrome_options = Options()
        self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument('--disable-gpu')
        # One driver per instance: each worker thread builds its own
        # instance and therefore drives its own browser.
        self.driver = webdriver.Chrome(options=self.chrome_options)

    def start(self, url):
        """
        Download one page and write its source to ``dir_name``.

        :param url: address of the page to download
        :return: None
        """
        print(f'make request to {url}')
        self.driver.get(url)
        response = self.driver.page_source
        file_name = _url_file_name(url)
        with open(os.path.join(self.dir_name, file_name), 'w', encoding='utf-8') as file:
            file.write(response)
        print(f'{url} download completed')

    def run(self):
        """Entry point: download every URL, then shut the browser down."""
        try:
            for url in self.urls:
                self.start(url)
        finally:
            # Always end the WebDriver session, even if a download fails.
            self.driver.quit()


def filter_urls(dir_name, urls):
    """
    Drop URLs whose page already exists as a non-empty file in ``dir_name``.

    Duplicate URLs are collapsed (first occurrence kept) so that two threads
    never download the same page into the same file.

    :param dir_name: directory holding previously downloaded pages
    :param urls: candidate URLs
    :return: list of URLs that still need to be downloaded, in input order
    """
    has_file = {
        name for name in os.listdir(dir_name)
        if os.path.getsize(os.path.join(dir_name, name)) > 0
    }
    unique_urls = list(dict.fromkeys(urls))
    down_urls = [url for url in unique_urls if _url_file_name(url) not in has_file]
    print(f'total {len(unique_urls)}\tdownloaded {len(has_file)}\tneed to download {len(down_urls)}')
    return down_urls


def run(url_list):
    """
    Download a batch of URLs with a dedicated browser (one call per thread).

    Reads the module-level ``setting`` dict (defined under ``__main__``) for
    the ``file`` and ``dir_name`` arguments.

    :param url_list: URLs to download
    """
    policy = PolicyPageDownload(url_list=url_list, **setting)
    policy.run()


def main(file, dir_name, workers=4):
    """
    Read the URL list from ``file`` and download the missing pages in parallel.

    :param file: text file whose lines are ``<page>\\t<url>``
    :param dir_name: output directory (created if missing)
    :param workers: maximum number of threads, and therefore browsers, to use
    """
    os.makedirs(dir_name, exist_ok=True)
    urls = []
    with open(file, 'r', encoding='utf-8') as inputfile:
        for line in inputfile:
            fields = line.strip().split('\t')
            # Ignore blank or malformed lines that have no URL column.
            if len(fields) > 1:
                urls.append(fields[1])
    urls = filter_urls(dir_name, urls)
    if not urls:
        # Nothing left to download: do not launch any browser.
        return

    # Never start more browsers than there are URLs, and hand each worker a
    # plain list of str rather than a NumPy array.
    chunk_count = max(1, min(workers, len(urls)))
    chunks = [chunk.tolist() for chunk in np.array_split(urls, chunk_count)]
    with ThreadPoolExecutor(max_workers=chunk_count) as pool:
        # Consume the result iterator: Executor.map only re-raises an
        # exception from a worker when its result is retrieved.
        list(pool.map(run, chunks))


if __name__ == '__main__':
    start_time = time.time()
    # The keys must match the parameter names of main() and
    # PolicyPageDownload.__init__.
    setting = {
        'file': 'integrated management of drug supply security .txt',
        'dir_name': 'integrated management of drug supply security'
    }
    main(**setting)

    print('successfully downloaded, a total time spent', round(time.time() - start_time, 2), 's')
Results
# 50 URLs, for loop: total time 48.62 seconds
# 150 URLs, for loop: total time 150.22 seconds
# 150 URLs, multithreaded: total time 80.84 seconds
- Conclusion: creating a driver is expensive, so create it once and reuse it as many times as possible. A driver cannot be shared between concurrent threads, however, so each thread must create its own.
- Tip: when using multiple threads, the number of threads should ideally equal the number of CPU cores, and each thread should create its own driver.