1、环境
cat /etc/redhat-release
CentOS Linux release 8.1.1911 (Core)
mysql -V
mysql Ver 15.1 Distrib 10.3.17-MariaDB, for Linux (x86_64) using readline 5.1
python3 -V
Python 3.6.8
2、设置 MariaDB(mysql)的字符编码为 UTF-8,否则插入中文数据时会出现乱码
vi /etc/my.cnf.d/client.cnf
[client]
default-character-set = utf8
vi /etc/my.cnf.d/mariadb-server.cnf
[mysqld]
character-set-server = utf8
collation-server = utf8_general_ci
systemctl restart mariadb
3、新建数据库、表,测试插入一条记录。
create database python default character set utf8 collate utf8_general_ci;
use python;
CREATE TABLE cmiyu (
id INT(11) PRIMARY KEY AUTO_INCREMENT,
miyu VARCHAR(255),
midi VARCHAR(255)
)default charset=utf8;
insert into cmiyu(miyu,midi) values('拳头大小似红桃,日日夜夜输养料,时时刻刻嘣嘣跳。 (打一人体器官)','心脏');
4、代码实现如下
import requests
from requests.packages import urllib3
import re
import bs4 as soup
import pandas as pd
import pymysql
urllib3.disable_warnings()
def get_data(url):
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
r = requests.get(url,headers=headers,verify=False)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
def parse_data(html):
info = []
body = soup.BeautifulSoup(html,'html.parser')
# context = body.find_all(href=re.compile('/etmy/[\d]'))
context = body.find_all(href=re.compile('/zwmy/[\d]'))
for lis in (context):
content = []
url1 = 'http://wap.cmiyu.com' + lis["href"]
temp1 = soup.BeautifulSoup(get_data(url1), 'html.parser')
temp1 = temp1.select("body > article > dl > dd > br > article")[0].get_text()
content.extend([lis.getText(),temp1])
info.append(content)
return info
def mysql_insert(sql,parm):
db = pymysql.connect("localhost", "root", "123456", "python", charset='utf8' )
cursor = db.cursor()
cursor.executemany(sql,parm)
db.commit()
cursor.close()
db.close()
def save_data(data):
filename = 'aa.csv'
dataframe = pd.DataFrame(data)
dataframe.to_csv(filename,encoding='utf-8',mode='a',index=False,sep=',',header=False)
def main():
pagenum = 63
for page in range(1,pagenum):
pernum=page/pagenum*100
print('正在爬取第'+str(page)+'页,'+'已经完成%.2f' % pernum + '%。')
# url = r'http://wap.cmiyu.com/etmy/mytid%7D'+str(page)+'.html'
url = r'http://wap.cmiyu.com/zwmy/my33'+str(page)+'.html'
html = get_data(url)
data = parse_data(html)
insert_sql = "insert into cmiyu(miyu,midi) values(%s,%s)"
mysql_insert(insert_sql,data)
# save_data(data)
if __name__ == '__main__':
print ('爬虫启动成功')
main()
print ('爬虫执行结束')