Crawling fiction with Python + urllib + lxml

The script below searches m.wenxuemi6.com for a novel by title, walks the paginated chapter directory it finds, downloads every chapter, and writes the text to a local .txt file.

from urllib import parse
from urllib import request
from lxml import etree
import time

class Novel:
    def __init__(self, *args):
        self.name = args[0]   # novel title, also used as the output file name
        self.dict = args[1]   # {directory-page index as a string: chapter text}
        self.txt = ''
        # Keys are numeric strings, so sort numerically ('10' would sort before '2' otherwise)
        for key in sorted(self.dict, key=int):
            self.txt = self.txt + self.dict[key]

    def write(self):
        # Write as utf-8 so chapters with non-ASCII text do not raise on Windows
        with open(self.name + '.txt', 'w', encoding='utf-8') as f:
            f.write(self.txt)
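
# A quick sanity check for the Novel class (the data here is made up for illustration):
# n = Novel('demo', {'0': 'Chapter 1 text\r\n', '1': 'Chapter 2 text\r\n'})
# n.write()  # writes demo.txt to the current directory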

# Fetch a page's HTML source
def get_http_page(url,**kw):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    req = request.Request(url,headers=headers)
    response = request.urlopen(req)
    page = response.read()
    # The site serves gbk by default; callers may override, e.g. encoding='utf-8'
    encoding = kw.get('encoding', 'gbk')
    page = page.decode(encoding)
    return page
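
# Usage sketch: pages on this site decode as gbk by default, while the search
# page is utf-8, so pass the encoding explicitly where needed (the keyword below
# is illustrative):
# page = get_http_page('https://m.wenxuemi6.com/search.php?keyword=abc', encoding='utf-8')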

# Get the chapter directory (one URL per page of the novel's table of contents)
def get_comics_directory(url):
    url_list = []
    page = get_http_page(url,encoding='utf-8')
    html = etree.HTML(page)
    result = html.xpath('/html/body/div[2]/div/div[2]/h3/a')
    element_select = None
    url2 = None
    if len(result):
        # First search hit links to the novel's landing page
        url2 = result[0].get('href')
    if url2:
        page = get_http_page(url2)
        html = etree.HTML(page)
        # The <select> element lists every page of the chapter directory
        element_select = html.xpath('/html/body/div[4]/div[9]/span[2]/select')
        if element_select:
            for option in element_select[0].findall('option'):
                url_list.append('https://m.wenxuemi6.com{}'.format(option.get('value')))
    return url_list
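
# Sketch of how the result is used (the keyword is illustrative):
# url_list = get_comics_directory('https://m.wenxuemi6.com/search.php?keyword=...')
# Each entry is one page of the novel's chapter list, fed to download_txt below.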

def download_txt(url_list, **kw):
    # Default range: every page of the chapter directory
    count = 0
    count_max = len(url_list)
    if kw:
        start = int(kw['start'])
        stop = int(kw['stop'])
        # Only honor the requested range if it is valid
        if 0 <= start < len(url_list) and start < stop <= len(url_list):
            count = start
            count_max = stop
    print('Crawling the directory and chapter addresses, please wait......')
    d = {}
    while count < count_max:
        url = url_list[count]
        page = get_http_page(url)
        html = etree.HTML(page)
        # Chapter links on this directory page
        result = html.xpath('/html/body/div[4]/ul[2]/li/a')
        txt = ''
        if isinstance(result, list):
            for l in result:
                url = 'https://m.wenxuemi6.com{}'.format(l.get('href'))
                print('Downloading chapter from URL: {}'.format(url))
                page = get_http_page(url)
                html = etree.HTML(page)
                t = html.xpath('//*[@id="nr1"]/text()')
                t2 = html.xpath('//*[@id="nr1"]/p')
                txt_title = ''
                txt_title_list = html.xpath('//*[@id="nr_title"]/text()')
                if isinstance(txt_title_list, list) and len(txt_title_list) == 1:
                    txt_title = txt_title_list[0]
                txt = txt + txt_title + '\r\n'
                for l2 in t:
                    txt = txt + l2 + '\r\n'
                # A long chapter is split in two; its second half lives at ..._2.html
                if isinstance(t2, list) and len(t2) == 1:
                    url = 'https://m.wenxuemi6.com{}'.format(l.get('href')[:-5] + '_2.html')
                    print('Downloading chapter from URL: {}'.format(url))
                    page = get_http_page(url)
                    html = etree.HTML(page)
                    t = html.xpath('//*[@id="nr1"]/text()')
                    for l2 in t:
                        txt = txt + l2 + '\r\n'
                # All chapters of this directory page accumulate under one key
                d['{}'.format(count)] = txt
                time.sleep(1)  # be polite between requests
        count = count + 1
    return d
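
# Usage sketch: keys of the returned dict are directory-page indices (as strings),
# each value holding the concatenated chapters of that page:
# d = download_txt(url_list, start=0, stop=1)  # first directory page only
# d = download_txt(url_list)                   # the whole novel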



if __name__ == '__main__':
    txt_name = input("Please enter the title to search for: ")
    url = 'https://m.wenxuemi6.com/search.php?keyword={}'.format(parse.quote(txt_name))
    url_list = get_comics_directory(url)
    # Download only the first page of the novel's chapter directory
    d = download_txt(url_list, start=0, stop=1)
    n1 = Novel(txt_name, d)
    # Write [txt_name].txt to the current directory
    n1.write()

    # Download the full novel (no range, so every directory page is crawled)
    d2 = download_txt(url_list)
    n2 = Novel(txt_name, d2)
    # Write [txt_name].txt to the current directory, replacing the partial file
    n2.write()

 

Origin www.cnblogs.com/Dmail/p/11615049.html