Python Web Scraping: My Second Scraper (Downloading a Novel)

Preface

First of all, thanks to https://blog.csdn.net/c406495762/article/details/78123502. With the help of that tutorial I finished my second scraper. This time the target is a novel, and the tools are the Requests and BeautifulSoup libraries.
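
Everything below follows one basic pattern: fetch a page with Requests, then parse the returned HTML with BeautifulSoup. A minimal sketch of that pattern, using a placeholder URL rather than the real site:

import requests
from bs4 import BeautifulSoup

# Placeholder URL; the actual scraper below targets http://www.biqukan.com/1_1094/
resp = requests.get('http://example.com/')
soup = BeautifulSoup(resp.text, 'html.parser')
# find_all returns every matching tag; print each link's text and URL
for link in soup.find_all('a'):
    print(link.string, link.get('href'))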

Getting Started

1. As usual, start with the encoding declaration

# -*- coding:UTF-8 -*-

2. The packages we need

import requests
import sys
from bs4 import BeautifulSoup

3. Constructor that stores the URLs and other state

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles
        self.urls = []  # chapter URLs
        self.nums = 0  # number of chapters

4. Method that collects the chapter links

    def get_download_url(self):
        # Fetch the novel's index page and collect every chapter link
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html, "html.parser")
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), "html.parser")
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # drop the unwanted leading entries and count the chapters
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))
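
Before moving on, it can help to sanity-check the parsing. Once these methods are wrapped in the downloader class (see the complete code in step 8), a quick snippet like the one below, which is not part of the original post, prints the chapter count and the first few collected links:

dl = downloader()
dl.get_download_url()
print(dl.nums, 'chapters found')
for name, url in list(zip(dl.names, dl.urls))[:3]:
    print(name, url)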

5. Method that fetches a chapter's content

    def get_contents(self, target):
        # Fetch a single chapter page and extract its text
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, "html.parser")
        texts = bf.find_all('div', class_='showtxt')
        # The site indents each paragraph with eight non-breaking spaces; turn them into blank lines
        texts = texts[0].text.replace('\xa0'*8, '\n\n')
        return texts
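
The replace call is what turns the page's indentation into readable paragraphs: the code assumes every paragraph on the chapter page is prefixed with eight non-breaking spaces ('\xa0'), so swapping that run for two newlines splits the text cleanly. A tiny illustration with made-up text:

raw = '\xa0' * 8 + 'First paragraph.' + '\xa0' * 8 + 'Second paragraph.'
print(raw.replace('\xa0' * 8, '\n\n'))
# Each sentence is printed as its own paragraph, separated by a blank line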

6. Method that writes the fetched content to a file

    def writer(self, name, path, text):
        # Append the chapter title and text to the output file
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')
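
Because the file is opened in append mode ('a'), each call adds one more chapter to the end of the file instead of overwriting it. A quick illustration with hypothetical values, again using the downloader class from the complete code:

dl = downloader()
dl.writer('Chapter 1', 'test.txt', 'Some chapter text...')
dl.writer('Chapter 2', 'test.txt', 'More chapter text...')
# test.txt now contains both chapters, one after the other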

7. Main entry point

if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    print('《一念永恒》download started:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
        sys.stdout.write("  Downloaded: %.3f%%" % ((i + 1) / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('《一念永恒》download finished')
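
The progress line relies on the trailing carriage return ('\r') so that each update overwrites the previous one on the same console line. A minimal sketch of the same idea, with a sleep standing in for the download:

import sys
import time

total = 5
for i in range(total):
    time.sleep(0.1)  # stand-in for downloading one chapter
    sys.stdout.write("  Downloaded: %.3f%%" % ((i + 1) / total * 100) + '\r')
    sys.stdout.flush()
print()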

8. Complete code listing

# -*- coding:UTF-8 -*-

import requests
import sys
from bs4 import BeautifulSoup

class downloader(object):

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles
        self.urls = []  # chapter URLs
        self.nums = 0  # number of chapters


    def get_download_url(self):
        # Fetch the novel's index page and collect every chapter link
        req = requests.get(url=self.target)
        html = req.text
        div_bf = BeautifulSoup(html, "html.parser")
        div = div_bf.find_all('div', class_='listmain')
        a_bf = BeautifulSoup(str(div[0]), "html.parser")
        a = a_bf.find_all('a')
        self.nums = len(a[15:])  # drop the unwanted leading entries and count the chapters
        for each in a[15:]:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))


    def get_contents(self, target):
        # Fetch a single chapter page and extract its text
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, "html.parser")
        texts = bf.find_all('div', class_='showtxt')
        # The site indents each paragraph with eight non-breaking spaces; turn them into blank lines
        texts = texts[0].text.replace('\xa0'*8, '\n\n')
        return texts


    def writer(self, name, path, text):
        # Append the chapter title and text to the output file
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')

if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    print('《一念永恒》download started:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
        sys.stdout.write("  Downloaded: %.3f%%" % ((i + 1) / dl.nums * 100) + '\r')
        sys.stdout.flush()
    print('《一念永恒》download finished')

Afterword

With the help of the tutorial above I have finished my second scraper and learned a good deal more about the Requests and BeautifulSoup libraries. I will keep practicing and write a scraper that is entirely my own.

Reposted from blog.csdn.net/zyw644451/article/details/79746503