Scraping riddles with Python and storing them in a MySQL database

1. Environment
cat /etc/redhat-release 
CentOS Linux release 8.1.1911 (Core) 
mysql -V
mysql  Ver 15.1 Distrib 10.3.17-MariaDB, for Linux (x86_64) using readline 5.1
python3 -V
Python 3.6.8

2. Configure MySQL's character set for Chinese, otherwise the stored text will be garbled
vi /etc/my.cnf.d/client.cnf 
[client]
default-character-set = utf8
vi /etc/my.cnf.d/mariadb-server.cnf
[mysqld]
character-set-server = utf8
collation-server = utf8_general_ci
systemctl restart mariadb
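
To confirm the new settings took effect after the restart, you can query the server's character-set variables. The following is a minimal sketch from Python, assuming pymysql is installed and the same root/123456 credentials used by the crawler later in this post:

import pymysql

# Minimal check (assumption: root/123456 credentials, as used by the crawler below).
db = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8')
cursor = db.cursor()
cursor.execute("SHOW VARIABLES LIKE 'character_set%'")
for name, value in cursor.fetchall():
    print(name, '=', value)
cursor.close()
db.close()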

3. Create the database and table, then test-insert one record.
create database python default character set utf8 collate utf8_general_ci;
use python;
CREATE TABLE cmiyu (
 id INT(11) PRIMARY KEY AUTO_INCREMENT,
 miyu VARCHAR(255),
 midi VARCHAR(255)
 )default charset=utf8;

insert into cmiyu(miyu,midi) values('拳头大小似红桃,日日夜夜输养料,时时刻刻嘣嘣跳。 (打一人体器官)','心脏');
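
To verify end to end that Chinese text survives the round trip, read the test row back from Python. This is a minimal sketch using the same credentials as the script below; if the riddle prints correctly, the UTF-8 setup works:

import pymysql

# Read the test row back; if the riddle prints without mojibake,
# the UTF-8 configuration is working end to end.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='python', charset='utf8')
cursor = db.cursor()
cursor.execute("SELECT id, miyu, midi FROM cmiyu")
for row in cursor.fetchall():
    print(row)
cursor.close()
db.close()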

4. The full implementation follows

import re

import requests
import urllib3
from bs4 import BeautifulSoup
import pandas as pd
import pymysql

# Suppress the InsecureRequestWarning triggered by verify=False below.
urllib3.disable_warnings()

def get_data(url):
    """Fetch a page and return its text with the detected encoding applied."""
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    r = requests.get(url, headers=headers, verify=False)
    r.raise_for_status()
    # Use the charset detected from the body, not the (often missing) header.
    r.encoding = r.apparent_encoding
    return r.text

def parse_data(html):
    """Parse a list page, follow each riddle link, and return (riddle, answer) pairs."""
    info = []
    body = BeautifulSoup(html, 'html.parser')
    # Each riddle on the list page links to a detail page under /zwmy/
    # (the /etmy/ paths hold the children's-riddle category instead).
    context = body.find_all(href=re.compile(r'/zwmy/[\d]'))
    for lis in context:
        url1 = 'http://wap.cmiyu.com' + lis["href"]
        detail = BeautifulSoup(get_data(url1), 'html.parser')
        # The answer sits in a nested <article> element on the detail page.
        answer = detail.select("body > article > dl > dd > br > article")[0].get_text()
        info.append([lis.get_text(), answer])
    return info

def mysql_insert(sql, parm):
    # Keyword arguments keep this working on pymysql 1.0+, where the old
    # positional connect() signature was removed.
    db = pymysql.connect(host='localhost', user='root', password='123456',
                         database='python', charset='utf8')
    cursor = db.cursor()
    cursor.executemany(sql, parm)
    db.commit()
    cursor.close()
    db.close()

def save_data(data):
    # Optional: append the scraped pairs to a CSV file instead of MySQL.
    filename = 'aa.csv'
    dataframe = pd.DataFrame(data)
    dataframe.to_csv(filename, encoding='utf-8', mode='a', index=False, sep=',', header=False)

def main():
    pagenum = 63
    for page in range(1, pagenum):
        pernum = page / pagenum * 100
        print('Crawling page %d, %.2f%% done.' % (page, pernum))
        # List pages for this category; for children's riddles switch to the /etmy/ URLs.
        url = 'http://wap.cmiyu.com/zwmy/my33' + str(page) + '.html'
        html = get_data(url)
        data = parse_data(html)
        insert_sql = "insert into cmiyu(miyu,midi) values(%s,%s)"
        mysql_insert(insert_sql, data)
        # save_data(data)  # uncomment to also keep a CSV copy

if __name__ == '__main__':
    print('Crawler started')
    main()
    print('Crawler finished')
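
After a run finishes, a quick count shows how many riddles were stored. Again, this is a sketch using the same connection parameters as the script:

import pymysql

# Count the rows accumulated by the crawler.
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='python', charset='utf8')
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM cmiyu")
print('rows stored:', cursor.fetchone()[0])
cursor.close()
db.close()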

Reposted from blog.csdn.net/qq_37594711/article/details/106585008