Scraping Web Data with Regex (Part 2)

# Scrape web data with regular expressions and store the results in a local MySQL database via pymysql
import re
import random
import time

import pymysql
from urllib.request import Request, urlopen


class QSBKDataTool(object):
    # origin_data is a list of raw tuples straight from re.findall(), one per post:
    # (nick_name, age, href, content, smile_num, comment_num), e.g.
    # [('\n猩猩眨呀眨\n', '24', '/article/120510346', '\n\n\n昨晚同学聚会...<br/>...\n\n', '5785', '43')]
    # nick_name and content still carry '\n' noise, and content carries <br/> tags.
    remove_n = re.compile(r'\n', re.S)
    remove_br = re.compile(r'<br/>|<br>', re.S)
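    # Note: re.S (DOTALL) only changes how '.' matches; neither pattern above
    # uses '.', so the flag is harmless here but not actually needed.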

    @classmethod
    def process_data(cls, origin_data):
        result_data = []
        for data in origin_data:
            # clean the nickname (data[0])
            nick_name = data[0]
            nick_name = re.sub(cls.remove_n, '', nick_name)  # same effect as nick_name.replace('\n', '')

            # clean the content (data[3])
            content = data[3]
            content = re.sub(cls.remove_n, '', content)
            content = re.sub(cls.remove_br, '', content)

            result_data.append((nick_name, data[1], data[2], content, data[4], data[5]))
        return result_data

    @classmethod
    def process_next(cls, data):
        # data is the findall() result for the next-page link: [(href, label)]
        next_page_str = data[0][1]
        next_page_str = re.sub(cls.remove_n, '', next_page_str)
        return (data[0][0], next_page_str)
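
    # A minimal sketch of what process_data does to one raw tuple (the values
    # below are illustrative, not real scraped data):
    #
    #   raw = [('\nuser\n', '24', '/article/120510346', '\nline one<br/>line two\n', '5785', '43')]
    #   QSBKDataTool.process_data(raw)
    #   # -> [('user', '24', '/article/120510346', 'line oneline two', '5785', '43')]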


class QSBKDBTool(object):

    db = None
    cursor = None

    @classmethod
    def connect_db(cls):
        cls.db = pymysql.connect(host='localhost', user='root', passwd='123456', db='qsbk', port=3306, charset='utf8')
        cls.cursor = cls.db.cursor()
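
    # The original post does not show the schema; a plausible DDL inferred
    # from the INSERT statements below (column types are assumptions):
    #
    #   CREATE TABLE qsbk (
    #       q_id          VARCHAR(20) PRIMARY KEY,
    #       q_name        VARCHAR(100),
    #       q_age         VARCHAR(10),
    #       q_href        VARCHAR(200),
    #       q_content     TEXT,
    #       q_smail_num   VARCHAR(20),
    #       q_comment_num VARCHAR(20)
    #   ) DEFAULT CHARSET=utf8;
    #
    #   CREATE TABLE detail (
    #       id      INT AUTO_INCREMENT PRIMARY KEY,
    #       comment TEXT,
    #       q_id    VARCHAR(20)
    #   ) DEFAULT CHARSET=utf8;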

    @classmethod
    def save_list_data(cls, list_data):
        # iterate over list_data and INSERT each row
        for q_name, q_age, q_href, q_content, q_smail_num, q_comment_num in list_data:
            # the table uses the article id as its primary key, e.g. '/article/120510346' -> '120510346'
            q_id = q_href.split('/')[2]
            insert_sql = "INSERT INTO qsbk (`q_id`, `q_name`, `q_age`, `q_href`, `q_content`, `q_smail_num`, `q_comment_num`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
            try:
                cls.cursor.execute(insert_sql, (q_id, q_name, q_age, q_href, q_content, q_smail_num, q_comment_num))
                cls.db.commit()
            except Exception as e:
                print('duplicate primary key or emoji in content, skipping...', e)
                cls.db.rollback()
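                # Note: MySQL's 'utf8' charset stores at most 3 bytes per character,
                # so 4-byte emoji fail to insert. Connecting with charset='utf8mb4'
                # (and declaring the tables utf8mb4) would avoid most of these errors.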

    @classmethod
    def save_detail_data(cls,q_id,detail_data):
        if detail_data:
            for comment in detail_data:
                insert_sql = "INSERT INTO detail (comment, q_id) VALUES (%s,%s)"
                try:
                    cls.cursor.execute(insert_sql, (comment, q_id))
                    cls.db.commit()
                except Exception as e:
                    print('duplicate key or emoji in detail content, skipping...', e)
                    cls.db.rollback()

    @classmethod
    def connect_close(cls):
        cls.cursor.close()
        cls.db.close()


class QSBKDetailSpider(object):
    """
    解析详情页
    """
    # plain UA strings; the header name is supplied in the Request() call below
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]
    def __init__(self, url):
        self.url = url

    def get_page_detail(self):
        user_agent = random.choice(self.user_agent_list)
        request = Request(self.url, headers={'User-Agent': user_agent})

        # the page source may contain emoji
        try:
            response = urlopen(request)
            # even when decode() succeeds on emoji, the later DB insert can fail;
            # sometimes decode() itself raises
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode() failed, reason: {}, url: {}'.format(e, self.url))
                # could not get this page's source
                return None
        except Exception as e:
            print('urlopen() failed, reason: {}, url: {}'.format(e, self.url))
            return None
        else:
            return origin_html

    def parse_page_detail(self, origin_html):
        if origin_html is not None:
            comment = re.findall(re.compile(r'<a.*?class="userlogin".*?<span class="body">(.*?)</span>', re.S), origin_html)
            # the last match is not an actual comment, so drop it;
            # pop() returns the removed element
            if comment:
                comment.pop()
            return comment
        else:
            print('detail page source is empty')
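
    # A quick sanity check of the comment pattern above (the HTML shape is
    # assumed from the original regex, not verified against the live site):
    #
    #   snippet = ('<a href="#" class="userlogin">u1</a><span class="body">nice</span>'
    #              '<a href="#" class="userlogin">u2</a><span class="body">haha</span>')
    #   pattern = re.compile(r'<a.*?class="userlogin".*?<span class="body">(.*?)</span>', re.S)
    #   re.findall(pattern, snippet)  # -> ['nice', 'haha']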

class QSBKSpider(object):
    user_agent_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    ]

    def __init__(self):
        self.base_url = 'https://www.qiushibaike.com/hot/page/'

    def get_page_list(self, page_num):
        """
        获取列表页数据
        :param page_num: 页码
        :return: 数据
        """
        url = self.base_url + str(page_num)
        user_agent = random.choice(self.user_agent_list)
        request = Request(url, headers={'User-Agent': user_agent})

        # the page source may contain emoji
        try:
            response = urlopen(request)
            # even when decode() succeeds on emoji, the later DB insert can fail;
            # sometimes decode() itself raises
            try:
                origin_html = response.read().decode()
            except Exception as e:
                print('decode() failed, reason: {}, url: {}'.format(e, url))
                # could not get this page's source
                return None
        except Exception as e:
            print('urlopen() failed, reason: {}, url: {}'.format(e, url))
            return None
        else:
            return origin_html

    def parser_page_list(self, origin_html):
        """
        解析列表页的数据
        :param origin_html: 某一页的网页源代码
        :return: 解析并处理后的数据
        """
        if origin_html != None:
            pattern = re.compile(r'<div class="author clearfix">.*?<h2>(.*?)</h2>.*?<div class="articleGender.*?">(.*?)</div>.*?<a.*?href="(.*?)".*?>.*?<div class="content">.*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(.*?)</i>.*?<i class="number">(.*?)</i>', re.S)
            origin_data = re.findall(pattern, origin_html)
            result_data = QSBKDataTool.process_data(origin_data)

            # save the processed list-page data (result_data) to the database
            QSBKDBTool.save_list_data(result_data)

            # crawl and store each item's detail page
            self.get_detail_url(result_data)

            # handle pagination
            next_page_pattern = re.compile(r'.*<span class="page-numbers">.*?<a href="(.*?)".*?>.*?<span.*?>(.*?)</span>', re.S)
            res = re.findall(next_page_pattern, origin_html)
            # if there is a next page, repeat the whole cycle; otherwise stop
            next_data = QSBKDataTool.process_next(res)
            if next_data[1] == '下一页':  # literal label of the "next page" link on the site
                relation_url = next_data[0]
                number = re.search(r'(\d+)', relation_url).group()
                time.sleep(3)  # throttle before requesting the next page
                html = self.get_page_list(number)
                self.parser_page_list(html)
            else:
                print('this was the last page')

        else:
            print('origin_html is None; skipping this page')
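
    # Note: pagination here is recursive (parser_page_list -> get_page_list ->
    # parser_page_list), so crawling very many pages could approach Python's
    # default recursion limit (~1000). An iterative loop is a safer shape:
    #
    #   page = 1
    #   while True:
    #       html = spider.get_page_list(page)
    #       ...  # parse, store, break when there is no next page
    #       page += 1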

    def get_detail_url(self, data):
        for data_tuple in data:
            detail_url = 'https://www.qiushibaike.com' + data_tuple[2]

            # e.g. '/article/120510346' -> '120510346'
            q_id = data_tuple[2].split('/')[2]

            detail_spider = QSBKDetailSpider(detail_url)
            detail_html = detail_spider.get_page_detail()
            res = detail_spider.parse_page_detail(detail_html)
            QSBKDBTool.save_detail_data(q_id, res)


if __name__ == "__main__":
    QSBKDBTool.connect_db()
    qsbk = QSBKSpider()
    origin_html = qsbk.get_page_list(1)
    qsbk.parser_page_list(origin_html)
    QSBKDBTool.connect_close()


Reposted from blog.csdn.net/qq_42336542/article/details/80697152