# Python crawler: batch-download novels from quanshuwang.com.
# History:
#   - Made object-oriented; added exception handling so a single failure does
#     not abort the run; skip a novel whose .txt file already exists.
#   - 2018-01-11: added multithreading to download several novels at once.

import os  # BUGFIX: was the garbled "import them"; os.path.exists is used below
import threading
import time

import requests
from bs4 import BeautifulSoup


class Book1:
    """Crawler that downloads every novel listed on a quanshuwang index page."""

    def __init__(self, start_url):
        # Many sites reject requests lacking a browser-like User-Agent,
        # so always send one.
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        self.start_url = start_url  # index page listing the novels to crawl

    # start crawler
    def star_url(self):
        """Walk the index page and download every novel it lists.

        Each novel is appended chapter-by-chapter to G:/novel/<name>.txt;
        novels whose file already exists are skipped.  Errors on one novel
        or chapter are reported and the loop continues.
        """
        try:
            books = self.requests(self.start_url)
            # Locate the per-novel boxes on the index page.
            books_url = books.find('div', class_="tab-item clearfix").find_all(
                'div', class_="yd-book-item yd-book-item-pull-left")
        except Exception:
            print('Target novel parsing error!!!!')
            # BUGFIX: without this return, books_url is unbound below and the
            # for-loop raises NameError.
            return

        for a in books_url:
            # BUGFIX: pre-bind name so the except clause below cannot hit an
            # unbound local if the failure happens before name is assigned.
            name = ''
            try:
                book1 = a.find('a')
                book1_href = book1['href']
                book1_gy = self.requests(book1_href)
                # The summary page carries a link to the full chapter list.
                allbook_url = book1_gy.find('div', class_="b-oper").find('a')['href']
                open_url = self.requests(allbook_url)
                # Locate the name of the novel.
                name = open_url.find('div', class_="chapName").find('strong').get_text()
                path = 'G:/novel/' + name + '.txt'
                if os.path.exists(path):
                    print('\n《' + name + '》' + ', already exists\n')
                    continue
                print('"' + name + '"' + ', download starts')
                # Locate the novel directory (list of chapter links).
                chapter = open_url.find('div', class_="clearfix dirconone").find_all('a')
            except Exception:
                print(name + ', Directory read error!')
                continue

            for i in chapter:
                title = ''
                try:
                    title = i.get_text()  # chapter title from the directory
                    href = i['href']      # chapter page URL from the a tag
                    html = self.requests(href)  # parse the chapter page
                    content = html.find('div', class_="mainContenr").get_text()
                    # BUGFIX: use a context manager so the file handle is
                    # always closed (the original leaked one per chapter).
                    with open(path, 'a', encoding='utf-8') as text:
                        text.write('\n' + title + '\n\n\n' + content + '\n\n\n')
                    print(name + ' ' + ' "' + title + '"' + 'download complete')
                except Exception:
                    print(name + ': ' + title + ', the content of the chapter is read incorrectly!')
                    continue
            print('"' + name + '"' + ', download completed' + '\n\n\n')

    # Parse the page
    def requests(self, url):
        """Fetch *url* and return it as a BeautifulSoup tree, or None on error.

        NOTE(review): the method name shadows the requests module attribute on
        instances; kept for backward compatibility with existing callers.
        """
        try:
            content = requests.get(url, headers=self.headers)
            content.encoding = 'gbk'  # the target site serves GBK-encoded pages
            soup = BeautifulSoup(content.text, 'lxml')
            return soup
        except Exception:
            print('Error parsing the web page!!!!')


# Define the multi-threaded execution function
def threads(count):
    """Start *count* downloader threads, staggered 5 seconds apart."""
    for _ in range(count):
        threading.Thread(target=Book1.star_url, args=()).start()
        time.sleep(5)  # stagger thread start-up to avoid hammering the site


if __name__ == '__main__':
    # BUGFIX: guard the entry point so importing this module does not start a
    # crawl.  (The original also rebinds the class name Book1 to the instance;
    # preserved here so threads() still resolves the bound method.)
    Book1 = Book1('http://www.quanshuwang.com/all/lastupdate_5_0_0_0_1_0_1.html')
    threads(10)
Python crawler — multi-threaded batch download of novels
You may also like
Origin http://43.154.161.224:23101/article/api/json?id=326272738&siteId=291194637
Ranking