Python crawler: multi-threaded batch download of novels

 
 
# Additions: 1. Refactored into a class  2. Added exception handling so one bad page does not abort the whole run  3. Check whether the txt file already exists; if it does, skip it and download the next one
# Added: multithreading, so several novels can be downloaded at the same time (2018.1.11)

import requests
from bs4 import BeautifulSoup
import time
import os
import threading

class Book1:
    def __init__(self, start_url):
        self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # Browser request headers (most sites return an error without a User-Agent, so be sure to include it)
        self.start_url = start_url

    # Start the crawler
    def crawl(self):
        try:
            books = self.get_soup(self.start_url)
            # Locate the list of novels on the page and parse it
            books_url = books.find('div', class_="tab-item clearfix").find_all('div', class_="yd-book-item yd-book-item-pull-left")
        except:
            print('Failed to parse the novel list!')
            return
        for a in books_url:
            try:
                book1 = a.find('a')
                book1_href = book1['href']
                book1_gy = self.get_soup(book1_href)  # Open the novel's introduction page
                allbook_url = book1_gy.find('div', class_="b-oper").find('a')['href']
                open_url = self.get_soup(allbook_url)
                name = open_url.find('div', class_="chapName").find('strong').get_text()  # Extract the title of the novel
                path = 'G:/novel/' + name + '.txt'
                exist = os.path.exists(path)
                if exist:
                    print('\n《' + name + '》' + ' already exists\n')
                    continue
                print('"' + name + '"' + ' download starts')
                chapter = open_url.find('div', class_="clearfix dirconone").find_all('a')  # Locate the chapter list
            except:
                print('Failed to read a novel directory, skipping!')
                continue
            for i in chapter:
                try:
                    title = i.get_text()  # Chapter title from the table of contents
                    href = i['href']  # Take the href attribute out of the <a> tag
                    html = self.get_soup(href)
                    # Parse the individual chapter page
                    content = html.find('div', class_="mainContenr").get_text()
                    with open(path, 'a', encoding='utf-8') as text:
                        text.write('\n' + title + '\n\n\n' + content + '\n\n\n')
                    print(name + ' "' + title + '" download complete')
                except:
                    print(name + ': ' + title + ', failed to read the chapter content!')
                    continue
            print('"' + name + '"' + ' download completed' + '\n\n\n')

    # Fetch a page and parse it into a BeautifulSoup object
    def get_soup(self, url):
        try:
            content = requests.get(url, headers=self.headers)
            content.encoding = 'gbk'
            soup = BeautifulSoup(content.text, 'lxml')
            return soup
        except:
            print('Error fetching or parsing the page!')
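
# Note: get_soup() calls requests.get() without a timeout, so one stalled server
# can hang a thread forever. A hardened fetch (a sketch, keeping the same headers)
# could read:
#
#     content = requests.get(url, headers=self.headers, timeout=10)
#     content.raise_for_status()  # surface HTTP errors instead of parsing an error page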

# Multi-threaded launcher: start `count` threads, each running the same crawl
def threads(count):
    for i in range(count):
        threading.Thread(target=book.crawl).start()
        time.sleep(5)  # Stagger thread starts so they begin on different novels
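# Caveat: every thread walks the same novel list, and only the os.path.exists()
# check in crawl() keeps two threads off the same file. Two threads can still pass
# that check at the same instant; the sleep(5) stagger makes this unlikely, and a
# threading.Lock around the check-and-create step would make it safe.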

# Run it multi-threaded: pass in however many threads you want (extra chicken legs for efficiency O(∩_∩)O ha)
book = Book1('http://www.quanshuwang.com/all/lastupdate_5_0_0_0_1_0_1.html')
threads(10)
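
For comparison, the standard library's concurrent.futures gives a tidier way to run the same crawl over a fixed pool of workers. A minimal sketch, assuming the Book1 instance above (threads_pool is a hypothetical helper, not part of the original script):

from concurrent.futures import ThreadPoolExecutor

def threads_pool(count):
    # Each worker runs the same crawl; the exists-check in crawl()
    # is what keeps workers from re-downloading finished novels.
    with ThreadPoolExecutor(max_workers=count) as pool:
        for _ in range(count):
            pool.submit(book.crawl)

# threads_pool(10)  # same effect as threads(10), but the pool blocks until all workers finish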
