Crawling novels from a website with Python (web crawler)

This code is for learning and exchange purposes only!

Web crawler:

A crawler is an automated program that fetches and extracts data from the Internet. It simulates the behavior of a human user, browsing web pages, scraping the information on them, and saving or processing the data it needs.
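As a minimal illustration of that idea (separate from the novel downloader below, and using example.com purely as a placeholder URL), a crawler can be as small as fetching one page and pulling a single piece of information out of it:

import requests
from bs4 import BeautifulSoup

# Minimal crawler sketch: fetch one page and extract its title
response = requests.get('https://example.com', headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.title.text)  # prints the text of the page's <title> tag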

The code demonstrated below crawls a novel from the site chapter by chapter.

First, let's look at a screenshot of the script in action:

After the download finishes, all chapters end up appended to a single text file, which is very convenient; there is no pile of one-file-per-chapter output to merge afterwards.

The code is as follows:

To crawl a different novel, just change the novel's ID number (the ids value) in the code.

import os
import requests
from bs4 import BeautifulSoup
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

ids = "16585"  # novel ID, taken from the novel's index-page URL on the site
url = f'http://www.qiuyelou.net/{ids}/'

def get_soup(url):
    # Send a request, fetch the page, and return it as a parsed BeautifulSoup object
    response = requests.get(url, headers=headers)
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def download_chapter(chap_url, title):
    # Download a single chapter, retrying up to 3 times on request errors
    retries = 3
    while retries > 0:
        try:
            chapter_response = requests.get(chap_url, headers=headers)
            chapter_response.encoding = "utf-8"
            chapter_soup = BeautifulSoup(chapter_response.text, 'html.parser')

            # Extract the chapter title and body text
            chapter_title = chapter_soup.find('div', class_='title').h1.text
            idclass = chap_url.split("/")[-1].replace(".html", "")
            chapter_content = chapter_soup.find('div', id=f'content{idclass}').text

            # Append the chapter title and content to the novel's text file
            os.makedirs("缓存", exist_ok=True)  # make sure the cache directory exists
            with open(f"缓存/{title}.txt", 'a', encoding='utf-8') as f:
                f.write(chapter_title + '\n\n')
                f.write(chapter_content + '\n\n')
            print('Downloaded:', chapter_title)
            break
        except requests.exceptions.RequestException:
            print('Request failed, retrying...')
            retries -= 1
            time.sleep(1)
    else:
        print('Could not download chapter:', chap_url)

soup = get_soup(url)
title = soup.find('div', class_='title').h1.text
print('Downloading novel:', title)

chapter_urls = [f'http://www.qiuyelou.net/{ids}/{chapter["href"]}' for chapter in soup.select('dd a')]
print(chapter_urls)

for chap_url in chapter_urls:
    download_chapter(chap_url, title)

print('Novel download complete!')
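
If you want to grab several novels in one run, one possible refactor (my own sketch, not part of the original post) is to wrap the per-novel steps in a function and call it once per ID; the second ID below is a made-up placeholder, and the short pause between chapters is just to be gentler on the server:

def download_novel(novel_id):
    # Crawl one novel by its ID, reusing get_soup() and download_chapter() from above
    index_url = f'http://www.qiuyelou.net/{novel_id}/'
    soup = get_soup(index_url)
    title = soup.find('div', class_='title').h1.text
    print('Downloading novel:', title)
    for chapter in soup.select('dd a'):
        chap_url = f'http://www.qiuyelou.net/{novel_id}/{chapter["href"]}'
        download_chapter(chap_url, title)
        time.sleep(0.5)  # small pause between chapters to avoid hammering the server

# "16586" is a hypothetical ID for illustration; use IDs taken from the site
for novel_id in ["16585", "16586"]:
    download_novel(novel_id)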


Origin: blog.csdn.net/oiadkt/article/details/131655362