Python 爬取古诗文并存入 MySQL

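  • 使用 requests 库发送请求,并用正则表达式提取数据(见下方代码)。存入数据库时报错:ERROR 1054 (42S22): Unknown column 'title' in 'field list'。原因是我写的 SQL 有问题:sql = "insert into poem(title,author,content,create_time) values({},{},{},{})".format(title, author, content, crate_time),插入的字符串值没有加引号,MySQL 会把它们当作列名来解析。
    应该写成 sql = "insert into poem(title,author,content,create_time) values('{}','{}','{}','{}')".format(title, author, content, crate_time),即把插入的值放入引号中。更稳妥的做法是使用参数化查询:cursor.execute("insert into poem(title,author,content,create_time) values(%s,%s,%s,%s)", (title, author, content, crate_time)),由驱动负责加引号和转义,内容中含有引号时也不会出错。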
import datetime
import re
import pymysql
import requests

# Page that the spider downloads and parses.
url = "https://www.gushiwen.org/"

# HTTP headers sent with the request; only a browser-like User-Agent is set.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) "
        "AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    ),
}


class Spiderpoem(object):
    """Download a page, extract poems with regular expressions, store them in MySQL.

    The database connection ``conn`` and cursor ``cs1`` are class attributes:
    they are created once, when the class body is executed, and are shared by
    every instance. ``main`` closes both when it finishes.
    """

    # NOTE(review): connecting here means importing this module already opens
    # a MySQL connection with hard-coded credentials. Kept as-is so that
    # ``Spiderpoem.conn`` / ``Spiderpoem.cs1`` remain available to callers.
    conn = pymysql.Connect(host="localhost", port=3306, user="root", password='mysql', database='poem_data',
                           charset="utf8")
    cs1 = conn.cursor()

    def get_requests(self, url, headers=None, timeout=10):
        """Send a GET request and return the response body.

        Args:
            url: Address to fetch.
            headers: Optional mapping of HTTP headers to send.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error. Without it a stalled server would
                block the spider forever.

        Returns:
            The response text when the status code is 200, otherwise ``None``.
        """
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code == 200:
            return resp.text
        return None

    def get_parse(self, response):
        """Parse the page and yield one dict per poem.

        Args:
            response: HTML text of the page, or a falsy value when the
                request failed.

        Yields:
            Dicts with the keys ``"title"``, ``"author"`` and ``"content"``.
            ``"author"`` is the two captured link texts joined by ``"."``;
            ``"content"`` has all HTML tags and whitespace removed.
            Nothing is yielded when ``response`` is falsy.
        """
        if not response:
            # reg_con would hand back an error *string* here; zipping over its
            # characters used to end in an IndexError. Stop early instead.
            return

        re_data = {
            "title": r'<div\sclass="sons">.*?<b>(.*?)</b>.*?</div>',
            "author": r'<p>.*?class="source">.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>.*?</p>',
            "content": r'<div\sclass="contson".*?>(.*?)</div>'
        }
        titles = self.reg_con(re_data["title"], response)
        # Each author match is a 2-tuple: the texts of the two <a> elements.
        authors = self.reg_con(re_data["author"], response)
        poems_list = self.reg_con(re_data["content"], response)

        # Drop markup and every whitespace character from the poem body.
        contents = [re.sub(r'<.*?>|\s', "", item).strip() for item in poems_list]

        # zip stops at the shortest list, so unmatched trailing items are ignored.
        for title, author, content in zip(titles, authors, contents):
            yield {
                "title": title,
                "author": ".".join(author),
                "content": content
            }

    def reg_con(self, params, response):
        """Return all matches of a regular expression in the page text.

        Args:
            params: Pattern string; it is compiled with ``re.DOTALL`` so that
                ``.`` also matches newlines.
            response: Text to search.

        Returns:
            The ``re.findall`` result list. When ``response`` is falsy the
            string ``"请求错误"`` is returned instead (kept for backward
            compatibility), so callers should check ``response`` first.
        """
        if not response:
            return "请求错误"
        pattern = re.compile(params, re.DOTALL)  # re.DOTALL is the same flag as re.S
        return pattern.findall(response)

    @classmethod
    def save_data(cls, poem):
        """Insert one poem into the ``poem`` table and commit.

        Args:
            poem: Mapping with the keys ``"title"``, ``"author"`` and
                ``"content"``; missing keys are stored as NULL.

        Raises:
            pymysql.MySQLError: If the insert or commit fails; the
                transaction is rolled back before re-raising.
        """
        create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Placeholders let the driver quote and escape the values. Building
        # the statement with str.format broke on any quote character in the
        # scraped text and allowed SQL injection.
        sql = "insert into poem(title,author,content,create_time) values(%s,%s,%s,%s)"
        values = (poem.get("title"), poem.get("author"), poem.get("content"), create_time)
        try:
            count = cls.cs1.execute(sql, values)
            print(count)
            cls.conn.commit()
        except pymysql.MySQLError:
            cls.conn.rollback()
            raise

    def main(self):
        """Fetch the module-level ``url``, save every parsed poem, then close the DB handles."""
        try:
            resp = self.get_requests(url, headers)
            for it in self.get_parse(resp):
                self.save_data(it)
        finally:
            # Release the cursor and connection even if a step above failed.
            self.cs1.close()
            self.conn.close()


# Run the spider only when this file is executed as a script, not on import.
if __name__ == "__main__":
    spider = Spiderpoem()
    spider.main()

发布了127 篇原创文章 · 获赞 25 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/weixin_44224529/article/details/103841355
今日推荐