Python实现某网站爬取小说(爬虫)

该代码仅供学习交流!!

爬虫:

爬虫是一种自动化程序,用于从互联网上获取和提取数据。它模拟人类用户的行为,在网页上浏览和抓取信息,并将所需数据保存或处理。

下面演示的代码用于爬取某小说网站上的小说。

先看运行效果图

 下载完成后,所有章节会直接合并保存为一个文件,阅读起来非常方便,不必面对一堆零散的单章文件。

代码如下:

如需爬取指定的小说,只需把代码中的 ids 修改为对应小说的编号即可;下载的内容会以小说标题命名,保存在当前目录下的“缓存”文件夹中。

import os
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# HTTP headers sent with every request made by this script; the User-Agent
# string presents the script as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Id of the novel on the site; change this value to download another novel.
ids = "16585"
# URL of the novel's index page, from which the title and chapter links are read.
url = f'http://www.qiuyelou.net/{ids}/'

def get_soup(url, timeout=10):
    """Fetch *url* and return the page parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the requested document.
    response.raise_for_status()
    # Force UTF-8 decoding rather than relying on the encoding requests guesses.
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, 'html.parser')

def download_chapter(chap_url, title, retries=3, timeout=10):
    """Download one chapter and append it to the novel's text file.

    The chapter heading and body are appended to ``缓存/<title>.txt`` so that
    successive calls build up a single file for the whole novel.

    Args:
        chap_url: URL of the chapter page.
        title: Novel title, used as the output file name.
        retries: Maximum number of download attempts for this chapter.
        timeout: Seconds to wait for the server on each attempt.

    Returns:
        None. Progress and failures are reported with ``print``; a chapter
        that cannot be downloaded is skipped rather than raising.
    """
    output_path = f"缓存/{title}.txt"
    # Append mode creates the file but not its parent directory.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # The content container's id is "content" followed by the page's file
    # name without the ".html" suffix.
    idclass = chap_url.split("/")[-1].replace(".html", "")

    for _ in range(retries):
        try:
            chapter_response = requests.get(chap_url, headers=headers, timeout=timeout)
            # Treat HTTP error statuses as failed attempts so they are retried.
            chapter_response.raise_for_status()
        except requests.exceptions.RequestException:
            print('请求失败,重试...')
            time.sleep(1)
            continue

        chapter_response.encoding = "utf-8"
        chapter_soup = BeautifulSoup(chapter_response.text, 'html.parser')

        # Locate the heading and body defensively: an unexpected page layout
        # must not abort the whole download with an AttributeError.
        title_box = chapter_soup.find('div', class_='title')
        heading = title_box.h1 if title_box is not None else None
        content_box = chapter_soup.find('div', id=f'content{idclass}')
        if heading is None or content_box is None:
            print('页面解析失败,重试...')
            time.sleep(1)
            continue

        chapter_title = heading.text
        chapter_content = content_box.text

        # Write only after both parts were extracted, so a failed attempt
        # never leaves a partial chapter in the file.
        with open(output_path, 'a', encoding='utf-8') as f:
            f.write(chapter_title + '\n\n')
            f.write(chapter_content + '\n\n')
        print('已下载:', chapter_title)
        break
    else:
        # Every attempt failed (or retries was 0).
        print('无法下载章节:', chap_url)

# Fetch the novel's index page; the <h1> inside div.title holds the novel
# title, which is also used as the output file name.
soup = get_soup(url)
title = soup.find('div', class_='title').h1.text
print('正在下载小说:', title)

# Resolve every chapter link against the index URL. For plain relative hrefs
# this yields the same address as prefixing the index URL by hand, and it also
# handles root-relative or absolute hrefs. Anchors without an href are skipped.
chapter_urls = [urljoin(url, chapter["href"]) for chapter in soup.select('dd a[href]')]
print(chapter_urls)

# Chapters are downloaded sequentially so they are appended in list order.
for chap_url in chapter_urls:
    download_chapter(chap_url, title)

print('小说下载完成!')

猜你喜欢

转载自blog.csdn.net/oiadkt/article/details/131655362