Crawling a full novel from a biquge-style site with XPath — a complete walkthrough (note: "biquge"/笔趣阁 was machine-translated above as "currency maggot ge")

Disclaimer: The applications involved in this article are only for learning and communication, and shall not be used for any commercial purposes. The data comes from public content on the Internet, and no private and authorized information (personal information, etc.) has been obtained. Any legal disputes arising from this have nothing to do with me! It is forbidden to use the technology in this article or the source code of the Github project associated with this article for any purpose.

Most novel websites are structured like this (especially the many biquge-style mirror sites). If you get blocked, search for anti-blocking techniques; the code below is for reference only.

Target website: https://www.xxxbiquge (.com) 2022/5/24 Currently still available.

Required tools: python;

Third-party libraries: parsel, requests (install them yourself, e.g. with pip)

import requests
import re
import parsel
import os

Step one: fetch the HTML source of the novel's index page.

# url = input("请输入需要爬取的小说:")
url = 'https://www.xxxbiquge.com/5/5900/'
headers = {
    "User-Agent": # 用自己的  或者挂代理
}
resp = requests.get(url, headers=headers)

Step two: from the index-page source, extract each chapter's URL (which must be joined with the site's base URL) and the novel's title (used to name the output folder).

# Grab the novel title from the index page
select0 = parsel.Selector(resp.text)  # parse the index page HTML
novel_name = select0.xpath("/html/body/div[1]/div[5]/div[1]/div[2]/div[1]/div[1]/h1/text()").get()
# print(novel_name)

# Find the chapter list / content via re, xpath, or css selectors

# Extract (relative chapter URL, chapter title) pairs with a regex
url_child = re.findall('<a style="" href="(.*?)">(.*?)</a>', resp.text)

for novel in url_child:
    # print(url)
    novel_name_child = novel[1]  # chapter title text
    novel_url = "https://www.xxxbiquge.com" + novel[0]  # join relative href into an absolute URL

Step three: visit each chapter page and extract the chapter text.

    # (still inside the for-loop started in the previous snippet)
    # Fetch the chapter page and parse out its title and body text
    resp_child = requests.get(novel_url, headers=headers).text
    # print(resp_child)
    select = parsel.Selector(resp_child)
    novel_title = select.xpath('/html/body/div[1]/div[5]/div/div/div[2]/h1/text()').get()
    # Strip characters that are illegal in (Windows) file names before using the title as a file name
    novel_title = re.sub('[\\\/:?!<>]', '', novel_title)
    novel_comtent_list = select.css('#content::text').getall()  # all text nodes of the chapter body
    novel_comtent = '\n'.join(novel_comtent_list)
    # print(novel_comtent)

Step four: create the output directory with os.

# Sanitize the novel name so it is a valid directory name
novel_name = re.sub('[\\\/:?!<>]', '', novel_name)
if not os.path.exists(f"novel\\" + novel_name):
    os.mkdir(f"novel\\" + novel_name)  # NOTE: os.mkdir fails if the parent "novel" directory does not already exist

Finally: save each chapter to a file — and remember to close the response when done.

    # (inside the loop) Append each chapter to its own .txt file
    with open(f'novel\\{novel_name}\\{novel_title}.txt', mode='a', encoding='utf-8') as f:
        f.write(novel_title)
        f.write("\n")
        f.write(novel_comtent)
        print("开始打印:" + novel_title)

print("打印完成")



resp.close()

Full code:

import os
import re

import parsel
import requests

# Base site and the index page of the novel to download.
BASE_URL = "https://www.xxxbiquge.com"
INDEX_URL = BASE_URL + "/5/5900/"

# A real browser User-Agent. The original listing left this value empty,
# which is a syntax error — fill in your own UA or route through a proxy.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36"
    ),
}

# Characters illegal in Windows file/directory names (the original class
# missed * " | and was not a raw string).
_ILLEGAL_NAME_CHARS = re.compile(r'[\\/:*?"<>|!]')


def sanitize_name(name):
    """Strip characters that cannot appear in a Windows file name."""
    return _ILLEGAL_NAME_CHARS.sub('', name)


def fetch_html(url, session):
    """GET *url* with the shared session and return the body as text.

    Raises requests.HTTPError on 4xx/5xx instead of silently parsing an
    error page.
    """
    resp = session.get(url, headers=HEADERS)
    resp.raise_for_status()
    return resp.text


def main():
    """Download every chapter of the novel into novel/<title>/<chapter>.txt."""
    # One Session reuses the TCP connection across all chapter requests
    # and is closed automatically (replaces the stray resp.close()).
    with requests.Session() as session:
        index_html = fetch_html(INDEX_URL, session)

        selector = parsel.Selector(index_html)
        novel_name = selector.xpath(
            "/html/body/div[1]/div[5]/div[1]/div[2]/div[1]/div[1]/h1/text()"
        ).get()
        if not novel_name:
            # Original code crashed with TypeError inside re.sub when the
            # xpath returned None; fail with a clear message instead.
            raise RuntimeError("could not extract the novel title - page layout changed?")
        novel_name = sanitize_name(novel_name)

        # (relative chapter URL, chapter title) pairs from the index page.
        chapters = re.findall(r'<a style="" href="(.*?)">(.*?)</a>', index_html)

        # makedirs(exist_ok=True) also creates the parent "novel" directory,
        # which the original os.mkdir call would trip over on first run.
        out_dir = os.path.join("novel", novel_name)
        os.makedirs(out_dir, exist_ok=True)

        for href, _chapter_name in chapters:
            chapter_html = fetch_html(BASE_URL + href, session)
            chapter_sel = parsel.Selector(chapter_html)
            novel_title = chapter_sel.xpath(
                '/html/body/div[1]/div[5]/div/div/div[2]/h1/text()'
            ).get()
            if not novel_title:
                continue  # regex matched a non-chapter link; skip it
            novel_title = sanitize_name(novel_title)

            content_lines = chapter_sel.css('#content::text').getall()
            novel_content = '\n'.join(content_lines)

            # mode='w' (was 'a'): re-running the script overwrites the file
            # instead of appending a duplicate copy of the chapter.
            path = os.path.join(out_dir, f'{novel_title}.txt')
            with open(path, mode='w', encoding='utf-8') as f:
                f.write(novel_title)
                f.write("\n")
                f.write(novel_content)
            print("开始打印:" + novel_title)

    print("打印完成")


if __name__ == "__main__":
    main()

Guess you like

Origin blog.csdn.net/qq_25976859/article/details/124940363