# 爬虫 - 博客爬取并入库 (Web crawler: scrape a blog and store the results in a database)


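'''
Scrape basic article information from Cui Qingcai's personal blog (41 pages in total).
https://cuiqingcai.com/page/1
Target fields: title, link, view count, comment count and like count.
The record stored in the database currently holds title, author, publish time,
view count and comment count.
'''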
import re
import requests
import logging
from lxml import etree
import pymysql

# Emit timestamped records at INFO level and above for the whole script.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

class DBconnect(object):
    """Small wrapper around a pymysql connection to the local ``spider`` database.

    ``conn`` and ``cursor`` stay public attributes so callers can run their
    own statements (e.g. DDL) on the same connection.
    """

    def __init__(self):
        """Open the connection and create a cursor on it."""
        # NOTE(review): user/password are hard-coded and empty; supply real
        # credentials (ideally from configuration) before running.
        self.conn = pymysql.connect(host='localhost', port=3306, user='', password='', db='spider')
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the underlying connection."""
        self.cursor.close()
        self.conn.close()

    @staticmethod
    def _quote_column(name):
        """Return *name* as a backtick-quoted MySQL identifier."""
        return '`' + str(name).replace('`', '``') + '`'

    def _insert(self, table, row, with_columns):
        """Run one parameterized ``insert ignore`` for the mapping *row*.

        Values are passed to the driver as query parameters instead of being
        pasted into the SQL text, so quotes, ``%`` signs and ``None`` inside
        the data cannot break (or inject into) the statement.
        """
        values = tuple(row.values())
        if not values:
            # Nothing to insert for an empty row.
            return
        columns = ''
        if with_columns:
            columns = ' (' + ', '.join(self._quote_column(key) for key in row) + ')'
        placeholders = ', '.join(['%s'] * len(values))
        # Identifiers cannot be bound as parameters; ``table`` is interpolated
        # as given and must come from trusted code, never from scraped input.
        sql = 'insert ignore into ' + table + columns + ' values (' + placeholders + ')'
        logging.debug(sql)
        self.cursor.execute(sql, values)

    def save(self, table, data):
        """Insert *data* into *table* and commit; roll back on any failure.

        Args:
            table: Name of the target table.
            data: Either a single dict (its keys are used as column names) or
                a list/tuple of dicts (each dict's values are inserted
                positionally, in the table's column order).

        Errors are logged rather than raised, so one bad batch does not stop
        the caller; the whole batch is rolled back in that case.
        """
        try:
            if isinstance(data, dict):
                self._insert(table, data, with_columns=True)
            elif isinstance(data, (list, tuple)):
                for row in data:
                    self._insert(table, row, with_columns=False)
            else:
                logging.warning('save() ignored unsupported data type %s', type(data))
            self.conn.commit()
        except Exception:
            # Boundary handler: keep the crawl alive, but record the traceback.
            logging.exception('insert into %s failed', table)
            self.conn.rollback()

class BlogSpider():
    """Crawl the article listing pages of cuiqingcai.com and store them in MySQL."""

    def __init__(self):
        """Set the listing URL prefix and the number of listing pages."""
        self.base_url = 'https://cuiqingcai.com/page/'
        self.total_page = 41

    def parse_url(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.RequestException: on connection errors, timeouts, or an
                HTTP error status.
        """
        # NOTE(review): verify=False disables TLS certificate verification and
        # is kept only to preserve existing behaviour; drop it if the site's
        # certificate validates.
        # The timeout keeps a stalled server from hanging the crawl forever.
        res = requests.get(url, verify=False, timeout=10)
        res.raise_for_status()
        return res.text

    @staticmethod
    def _first(nodes):
        """Return the first XPath result, or None when nothing matched."""
        return nodes[0] if nodes else None

    @staticmethod
    def _first_int(nodes):
        """Return the first run of digits in the first XPath result as an int.

        Returns None when nothing matched or the text contains no digits.
        """
        if not nodes:
            return None
        match = re.search(r'\d+', nodes[0])
        return int(match.group()) if match else None

    def parse_content(self, html):
        """Extract one record per article from a listing page.

        Args:
            html: HTML source of a listing page.

        Returns:
            A list of dicts with the keys ``title``, ``author``,
            ``publish_time``, ``page_view`` and ``comment`` (in that order).
            A field that is missing from the markup is None instead of
            aborting the whole page.
        """
        if not html:
            return []
        tree = etree.HTML(html)
        if tree is None:
            return []
        data_list = []
        for article in tree.xpath("//div[@class='content']/article"):
            # Only the stored fields are extracted. Category, synopsis,
            # picture and like count are not saved, so a missing one can no
            # longer abort the crawl.
            # Key order matters: rows are inserted positionally by
            # DBconnect.save when given a list.
            data_list.append({
                'title': self._first(article.xpath("./header/h2/a/text()")),
                'author': self._first(article.xpath('./p/span[1]/a/text()')),
                'publish_time': self._first(article.xpath("./p/span[2]/text()")),
                'page_view': self._first_int(article.xpath("./p/span[3]/text()")),
                'comment': self._first_int(article.xpath("./p/span[4]/a/text()")),
            })
        return data_list

    def save_data(self, data_list):
        """Create the ``blogs`` table if needed and insert *data_list* into it.

        Args:
            data_list: Records as produced by ``parse_content``.
        """
        # The unique key is part of the table definition. Adding it with a
        # separate ALTER TABLE on every call created one more duplicate index
        # per page.
        # NOTE(review): publish_time is the only unique key, so with
        # "insert ignore" two posts sharing the same publish_time string keep
        # only the first one — confirm this is intended.
        sql = """
        create table if not exists blogs(
        title varchar(100) not null,
        author varchar(30) not null,
        publish_time varchar(30) not null,
        page_view int(6) not null,
        comment int(6) not null,
        unique key (publish_time)
        );
        """
        db = DBconnect()
        try:
            db.cursor.execute(sql)
            db.save(table='blogs', data=data_list)
        finally:
            # One connection is opened per call; release it so a full crawl
            # does not leak a connection per page.
            db.cursor.close()
            db.conn.close()

    def run(self):
        """Fetch, parse and store every listing page from 1 to ``total_page``.

        A page that cannot be downloaded is logged and skipped.

        Returns:
            ``{'status_code': '200'}`` once all pages have been processed.
        """
        # +1 because range() excludes its upper bound; the last page was
        # previously skipped.
        for page in range(1, self.total_page + 1):
            url = self.base_url + str(page)
            try:
                str_html = self.parse_url(url)
            except requests.RequestException:
                logging.exception('failed to fetch %s', url)
                continue
            data_list = self.parse_content(str_html)
            logging.info('page %s: parsed %s articles', page, len(data_list))
            self.save_data(data_list)
        return {'status_code': '200'}

if __name__ == '__main__':
    # Script entry point: crawl all listing pages and store the results.
    BlogSpider().run()
# 爬虫 - 博客爬取并入库

# 猜你喜欢 (You may also like)