Python Web Scraping Basics

Target URL: https://www.23hh.com/book/0/189/
Goal: retrieve the novel's chapter index and the content of each chapter.
Libraries needed: requests, BeautifulSoup, and re. requests sends the HTTP requests; BeautifulSoup and re parse the returned HTML and extract the data we want.
Install the third-party packages with pip install requests and pip install beautifulsoup4 (re is part of the standard library).
Looking at the page source, the chapter links sit inside <dd> elements, each wrapping an <a> tag whose text is the chapter title.
1. Create a testcraw package
2. Create craw_site.py to fetch the chapter index and the link of each chapter

import requests
from bs4 import BeautifulSoup
import re


def getSoup(website):
    res = requests.get(url=website)  # send the request
    res.raise_for_status()  # raises requests.HTTPError if the status code is not OK
    res.encoding = res.apparent_encoding  # avoid garbled Chinese text
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup  # return the parsed BeautifulSoup object


def result(website):
    chapter, siteLst = [], []
    try:
        soup = getSoup(website)
    except requests.HTTPError as e:
        return e
    else:
        for i in soup.find_all('dd'):  # filter the tags with BeautifulSoup's find_all
            for j in i.find_all('a'):
                k = j.string
                if k is None:  # skip links that have no plain-text title
                    continue
                if re.match(r'[\u4e00-\u9fa5]+章', k):  # keep only chapter titles via a regex
                    chapter.append(k)
                    siteLst.append(website + j.attrs['href'][12:])  # build the full link of each chapter
        lst = list(zip(chapter, siteLst))
        del lst[:12]  # drop the first 12 entries (extra links at the top of the index that are not wanted)
        return lst  # return the (chapter title, link) pairs
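Before wiring in the database, it is worth sanity-checking the chapter index on its own. A minimal usage sketch (the actual titles and links depend on the live page):

from testcraw.craw_site import result

# print the first few (chapter title, link) pairs
for chapter, link in result('https://www.23hh.com/book/0/189/')[:5]:
    print(chapter, link)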


3. Create mysql_helper.py to save the data

import pymysql


class MysqlTool(object):
    """Small helper that wraps the pymysql connection, cursor, and insert handling."""

    def getConn(self):
        # connect to the local MySQL server and the fictions database
        conn = None
        try:
            conn = pymysql.connect(host='localhost',
                                   user='root',
                                   password='5180',
                                   port=3306,
                                   db='fictions')
        except Exception as e:
            print('\033[31m{}\033[0m'.format(e))
        return conn

    def closeConn(self, conn):
        try:
            if conn is not None:
                conn.commit()
                conn.close()
        except Exception as e:
            print('\033[31m{}\033[0m'.format(e))

    def getCursor(self, conn):
        cur = None
        try:
            if conn is not None:
                cur = conn.cursor()
        except Exception as e:
            print('\033[31m{}\033[0m'.format(e))
        return cur

    def closeCursor(self, cur):
        try:
            if cur is not None:
                cur.close()
        except Exception as e:
            print('\033[31m{}\033[0m'.format(e))

    def insert(self, cur, chapter='', content=''):
        # insert one chapter; the commit happens later in closeConn()
        sql = 'insert into perfect_world(chapter, content) values(%s, %s);'
        count = cur.execute(sql, (chapter, content))
        if count > 0:
            print('{} scraped successfully'.format(chapter))

Create the fictions database and the perfect_world table that the insert statement above expects.
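One way to create them from Python is sketched below; only the chapter and content column names are dictated by MysqlTool.insert, while the id column and the column types are assumptions you can adjust:

import pymysql

# NOTE: the id primary key and the column types are assumptions
conn = pymysql.connect(host='localhost', user='root', password='5180', port=3306)
with conn.cursor() as cur:
    cur.execute('CREATE DATABASE IF NOT EXISTS fictions CHARACTER SET utf8mb4;')
    cur.execute('USE fictions;')
    cur.execute(
        'CREATE TABLE IF NOT EXISTS perfect_world ('
        '  id INT AUTO_INCREMENT PRIMARY KEY,'
        '  chapter VARCHAR(255),'
        '  content MEDIUMTEXT'
        ') CHARACTER SET utf8mb4;'
    )
conn.commit()
conn.close()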

4. Create subpage.py to fetch the body text of each chapter page

import requests
from bs4 import BeautifulSoup
from testcraw.craw_site import result
from testcraw.mysql_helper import MysqlTool
import re


def test(website):
    for item in result(website):  # iterate over the (chapter title, link) pairs
        chapter, content = item[0], ''
        site = item[1]
        res = requests.get(url=site, timeout=60)
        res.raise_for_status()
        res.encoding = res.apparent_encoding  # avoid garbled Chinese text
        soup = BeautifulSoup(res.text, 'html.parser')
        for tag in soup.find_all(attrs={'id': 'content'}):  # the chapter body lives in the element with id="content"
            for line in tag.stripped_strings:
                content += (line + '\n')
        # strip the site's own boilerplate sentence from the chapter text
        content = re.sub(pattern='纯文字在线阅读本站域名手机同步阅读请访问', repl='', string=content, count=1)
        mt = MysqlTool()
        conn = mt.getConn()
        cur = mt.getCursor(conn)
        mt.insert(cur, chapter, content)
        mt.closeCursor(cur)
        mt.closeConn(conn)


if __name__ == '__main__':
    test('https://www.23hh.com/book/0/189/')

The scraped chapters end up in the perfect_world table; the original post shows screenshots of a few stored rows.

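To spot-check the stored data yourself, you can query the table directly. A minimal sketch, assuming the same connection settings as in mysql_helper.py:

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='5180',
                       port=3306, db='fictions')
with conn.cursor() as cur:
    # show the first few chapters with a short preview of their content
    cur.execute('SELECT chapter, LEFT(content, 60) FROM perfect_world LIMIT 5;')
    for chapter, preview in cur.fetchall():
        print(chapter, preview)
conn.close()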


Reposted from blog.csdn.net/yeyu_xing/article/details/107828494