爬取笔趣阁文章

分享一个爬虫的部分代码

import requests
import time
from lxml import etree

def get_session():
    """Create a new requests session with a browser-like User-Agent.

    Returns:
        The object returned by ``requests.session()``, after its default
        headers have been updated with the User-Agent header below.
    """
    sess = requests.session()
    sess.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    })
    return sess


def get_html(url):
    """Fetch *url* and return the response with its encoding set to UTF-8.

    The request is attempted up to two times. The original loop returned
    unconditionally on its first pass, so the second attempt could never
    run; a failed request is now retried once before giving up.

    Args:
        url: Address of the page to download.

    Returns:
        The response object from ``session.get``, with ``encoding`` forced
        to ``'utf-8'``.

    Raises:
        requests.RequestException: If every attempt fails; the error from
            the last attempt is propagated.
    """
    attempts = 2
    for attempt in range(attempts):
        try:
            res = get_session().get(url, timeout=5)
        except requests.RequestException:
            if attempt == attempts - 1:
                # Out of retries: let the caller see the failure.
                raise
            time.sleep(0.2)
            continue
        res.encoding = 'utf-8'
        # Short pause after each successful request, as in the original.
        time.sleep(0.2)
        return res


def get_conxpath(url, i_xpath):
    """Return whatever the XPath expression *i_xpath* selects on the page at *url*.

    Args:
        url: Address of the page to download via ``get_html``.
        i_xpath: XPath expression evaluated against the parsed HTML.

    Returns:
        The result of ``xpath(i_xpath)`` on the parsed document.
    """
    response = get_html(url)
    # Extra pause between download and parsing, on top of get_html's own delay.
    time.sleep(0.3)
    tree = etree.HTML(response.text)
    return tree.xpath(i_xpath)


def write_ap_file(thing, path):
    """Append the text *thing* to the UTF-8 file at *path*.

    The file is opened in ``'a+'`` mode, so it is created when missing and
    existing content is kept.

    Args:
        thing: Text to append.
        path: Location of the target file.
    """
    with open(path, mode='a+', encoding='utf-8') as out_file:
        out_file.write(thing)


def running():
    """Download every page linked from the index page and append it to one file.

    The index page is queried for the link list; each linked page
    (presumably one chapter) is then fetched for its heading and body
    text, which are appended to the output file. Progress is printed
    after each page.

    Fixes relative to the original loop:
      * ``while m <= len(tiny_lists)`` ran one index past the end and
        raised IndexError after the last page; the list is now iterated
        directly.
      * ``doc.replace('\\r', '\\n')`` discarded its result (strings are
        immutable), so carriage returns were never converted.
      * The output path is a raw string so the backslash is not read as
        an (invalid) escape sequence; the path value itself is unchanged.
    """
    tiny_list_url = 'http://www.xbiquge.la/0/119/'
    tiny_lists = get_conxpath(tiny_list_url, '//*[@id="list"]/dl/dd/a/@href')
    total = len(tiny_lists)
    # Count from 1 so the progress line matches the original "m in total" output.
    for m, href in enumerate(tiny_lists, start=1):
        tiny_url = 'http://www.xbiquge.la' + href
        # NOTE(review): the same page is downloaded twice (body, then title).
        docs = get_conxpath(tiny_url, '//*[@id="content"]/text()')
        title = get_conxpath(tiny_url, '//*[@id="wrapper"]/div/div/div/h1/text()')
        body = ''.join(doc.replace('\r', '\n') for doc in docs)
        end_text = '\n\n' + ''.join(title) + '\n\n' + body
        write_ap_file(end_text, r'G:\儒道至圣.TXT')
        print('{} in {}'.format(m, total))


# Start the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    running()

因为应对各种错误的代码过于繁杂,这里就不展示了^_^

[声明:禁止商业用途]

发布了3 篇原创文章 · 获赞 0 · 访问量 89

猜你喜欢

转载自blog.csdn.net/herry_g/article/details/104094407
今日推荐