KNN应用

# 预测年收入大于50K的人
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# 先导入文件
salary = pd.read_csv('./day9_data/adults.txt')
salary.head()

# 工作类型   # 教育程度  # 教育年限     # 职位        # 
# workclass  education    education_num  occupation   hours_per_week
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
age workclass final_weight education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country salary
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
salary.dtypes
- 输出 age int64 workclass object final_weight int64 education object education_num int64 marital_status object occupation object relationship object race object sex object capital_gain int64 capital_loss int64 hours_per_week int64 native_country object salary object dtype: object
salary.shape
- 输出 (32561, 15)
salary.columns
- 输出 Index([‘age’, ‘workclass’, ‘final_weight’, ‘education’, ‘education_num’, ‘marital_status’, ‘occupation’, ‘relationship’, ‘race’, ‘sex’, ‘capital_gain’, ‘capital_loss’, ‘hours_per_week’, ‘native_country’, ‘salary’], dtype=’object’)
# Prediction target: the `salary` column of the adults data set.
target = salary['salary']

# Feature columns used for the classifier.
# .copy() makes `data` an independent frame: the string columns are replaced
# with integer codes further down, and assigning into a plain column slice of
# `salary` triggers pandas' SettingWithCopyWarning.
data = salary[['age', 'workclass', 'education', 'education_num', 'occupation', 'sex', 'hours_per_week', 'native_country','race']].copy()
# Both the features and the target are now available.

# 因为有字符串类型,机器学习不支持
# 转换成映射
knn = KNeighborsClassifier(n_neighbors=15)
data.dtypes
- 输出 age int64 workclass int64 education int64 education_num int64 occupation int64 sex int64 hours_per_week int64 native_country int64 race int64 dtype: object
# Replace every string-valued feature with a positive integer code, because
# the estimator cannot work on strings.
cols = ['workclass', 'education', 'occupation', 'sex', 'native_country', 'race']
encoded = {}
for col in cols:
    # unique() lists the distinct values in order of first appearance;
    # numbering them from 1 keeps 0 unused as a code.
    codes = {value: code for code, value in enumerate(data[col].unique(), start=1)}
    # A dict lookup per row replaces the original full scan of the unique
    # array for every single item.
    encoded[col] = data[col].map(codes)
# assign() returns a new frame with the columns replaced in their original
# positions, so nothing is written into a slice of the source frame.
data = data.assign(**encoded)
data.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
age workclass education education_num occupation sex hours_per_week native_country race
0 39 1 1 13 1 1 40 1 1
1 50 2 1 13 2 1 13 1 1
2 38 3 2 9 3 1 40 1 1
3 53 3 3 7 3 1 40 1 2
4 28 3 1 13 4 2 40 2 2
# 分割数据
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.005)
# 实例化
knn = KNeighborsClassifier(n_neighbors=15)
# 开始训练数据
knn.fit(X_train, y_train)
- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=15, p=2, weights=’uniform’)
# 开始预测
y_ = knn.predict(X_test)
# 准确率
knn.score(X_test,y_test)
- 输出 0.803680981595092
# 保存训练的模型 脸部识别的算法是被打包的 数学建模
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; the standalone joblib package (a scikit-learn dependency) replaces it.
try:
    import joblib
except ImportError:  # very old scikit-learn installs without standalone joblib
    from sklearn.externals import joblib

# Persisted models are commonly saved with a .m suffix
joblib.dump(knn, './50K.m')
- 输出 [‘./50K.m’]
# 有打包 就肯定有加载算法
# CV2
# 咱们sklearn自带加载算法的方法
knn_50k = joblib.load('./50K.m')
knn_50k.score(X_test,y_test)
- 输出 0.803680981595092

# 使用算法,识别是否是乳腺癌
分布式存储的原理

分布式:基于一主多从,多台从服务器监听主服务器,主服务器开放3306端口,开放对从服务器IP地址的支持,
都是基于binlog(二进制日志)进行主从数据同步
主服务器 都是innodb 负责插入 (.frm 表的结构,表的索引)(.ibd储存的是数据)master

    从服务器slave 要设置主服务的ip 主服务器的端口, 一个主服务器的mysql账号
        需要主服务的 二进制传输文件编号 文件端口号
        新式类 经典类


        就是一台主机让多台从机允许监听,然后从服务器不停拷贝主服务器中的数据
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# 引入数据
# sep 的默认值是逗号 大部分的csv都是以‘,’分割的
canser = pd.read_csv('./day9_data/cancer.csv',sep='\t')
canser.head()

# Diagnosis 诊断 M是恶性(malignant)  B是良性(benign)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
ID Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_mean radius_max texture_max perimeter_max area_max smoothness_max compactness_max concavity_max concave_max symmetry_max fractal_max
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns

canser.shape
- 输出 (569, 32)

canser.dtypes
- 输出 ID int64 Diagnosis object radius_mean float64 texture_mean float64 perimeter_mean float64 area_mean float64 smoothness_mean float64 compactness_mean float64 concavity_mean float64 concave_mean float64 symmetry_mean float64 fractal_mean float64 radius_sd float64 texture_sd float64 perimeter_sd float64 area_sd float64 smoothness_sd float64 compactness_sd float64 concavity_sd float64 concave_sd float64 symmetry_sd float64 fractal_sd float64 radius_max float64 texture_max float64 perimeter_max float64 area_max float64 smoothness_max float64 compactness_max float64 concavity_max float64 concave_max float64 symmetry_max float64 fractal_max float64 dtype: object
# Label column of the cancer data set.
target = canser['Diagnosis']
# Everything from the third column on is used as a feature (this skips ID and
# Diagnosis). .copy() detaches the features from `canser`, because the
# columns are overwritten during normalization later and assigning into a
# slice raises pandas' SettingWithCopyWarning.
data = canser.iloc[:, 2:].copy()
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)
# Instantiate the classifier
knn = KNeighborsClassifier(n_neighbors=10)
# Train on the training split
knn.fit(X_train, y_train)
- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights=’uniform’)
y_ = knn.predict(X_test)
#打分
knn.score(X_test, y_test)
- 输出 0.9473684210526315
# Build a cross-tabulation of predicted labels against true labels.
# pandas provides this as crosstab().
# margins=True adds an extra "All" row and column holding the totals.
pd.crosstab(index=y_, columns=y_test, margins=True, rownames=['Predict'], colnames=['True'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
True B M All
Predict
B 36 2 38
M 1 18 19
All 37 20 57
# How can the prediction accuracy be improved?
# Clean the data and apply min-max normalization:
# (item - min) / (max - min)
nd = np.array([1, 2, 3, 4, 5])
nd.max()  # 5
nd.min()  # 1

col = 'radius_mean'
# Compute the column's extremes once; the original recomputed both for every
# single item while mapping over the column.
d_min = data[col].min()
d_max = data[col].max()


def convert2normed(item):
    """Min-max scale ``item`` using the extremes of the selected column.

    Works on a scalar or on a whole Series, since it is plain arithmetic.
    """
    return (item - d_min) / (d_max - d_min)


# Overwrite the column with its normalized values
data[col] = convert2normed(data[col])
# 将所有的字段全部进行归一化

cols = data.columns
cols
  • 输出

    Index([‘radius_mean’, ‘texture_mean’, ‘perimeter_mean’, ‘area_mean’,
    ‘smoothness_mean’, ‘compactness_mean’, ‘concavity_mean’, ‘concave_mean’,
    ‘symmetry_mean’, ‘fractal_mean’, ‘radius_sd’, ‘texture_sd’,
    ‘perimeter_sd’, ‘area_sd’, ‘smoothness_sd’, ‘compactness_sd’,
    ‘concavity_sd’, ‘concave_sd’, ‘symmetry_sd’, ‘fractal_sd’, ‘radius_max’,
    ‘texture_max’, ‘perimeter_max’, ‘area_max’, ‘smoothness_max’,
    ‘compactness_max’, ‘concavity_max’, ‘concave_max’, ‘symmetry_max’,
    ‘fractal_max’],
    dtype=’object’)

# Min-max scale every column listed in `cols` in one vectorized step instead
# of recomputing each column's min and max for every item.
# NOTE(review): the extremes come from the full data set, before the
# train/test split below, so test rows influence the scaling.
col_min = data[cols].min()
col_max = data[cols].max()
# Work on a copy so the assignment never writes into a slice of another frame.
data = data.copy()
data[cols] = (data[cols] - col_min) / (col_max - col_min)
# 重新分割
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)
  • 输出

    0.9824561403508771

利用分类绘制鸢尾花散点图

# 数据集
import sklearn.datasets as datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
data = iris['data']
target = iris['target']
target_names = iris['target_names']
data.shape
# 在机器学习的时候。4个属性代表4种维度
  • 输出

    (150, 4)

sepal = data[:, :2]
# 这是花萼的长度
sepal_length = sepal[:, 0]
#这是花萼的宽度
sepal_width = sepal[:,1]
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
cmap = ListedColormap(['red', 'orange', 'pink'])
plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)
<matplotlib.collections.PathCollection at 0xbb56c18>

这里写图片描述

# Columns 2 and 3 of the iris feature matrix are kept as the petal measurements.
petal = data[:, 2:]
# Petal length (first of the two petal columns)
petal_length = petal[:, 0]
# Petal width (second of the two petal columns)
petal_width = petal[:,1]
plt.scatter(petal_length, petal_width, c=target)
<matplotlib.collections.PathCollection at 0xaf43630>

这里写图片描述

# 首先我们还是要分类
# 我们自己制造数据
# 让数据点填满整张图片
# 我们要画的点怎么区分颜色
# KNN
knn = KNeighborsClassifier()
knn.fit(sepal, target)
  • 输出

    KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
    metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    weights=’uniform’)

# Generate synthetic points over a fixed rectangular range.
import numpy as np
# 200 evenly spaced values along the first axis, 150 along the second
x = np.linspace(start=4, stop=8.2, num=200)
y = np.linspace(start=1.8, stop=4.5, num=150)
# meshgrid expands the two axes into coordinate grids of shape (len(y), len(x))
xx, yy = np.meshgrid(x, y, indexing='xy')
display(xx.shape, yy.shape)
  • 输出
    (150, 200)
    (150, 200)
xx = xx.reshape(-1)
yy = yy.reshape(-1)
xy = np.c_[xx, yy]
plt.scatter(xy[:,0], xy[:,1])
<matplotlib.collections.PathCollection at 0xba0ee10>

这里写图片描述

# 预测,将我们生成的3万个点(数据)进行预测
X_test = xy

y_ = knn.predict(X_test)
# 这是刚刚30000个点的分类图
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')
<matplotlib.collections.PathCollection at 0xbaaec88>

这里写图片描述

# 将鸢尾花150个点也画进来
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')
plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)
<matplotlib.collections.PathCollection at 0xb45f0b8>

这里写图片描述

knn做回归

回归用于对于趋势的预测

台风预测

# 每年都要,太平洋中心就会形成台风,移动轨迹从海洋到陆地
# 中央气象台,预测,采集点,100km,风在转移的时候风速会下降
# 比如说20级,福建16级,红色预警
# 咱们要想要拿到风的运行轨迹,需要一个函数,预测风力,风力的函数
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt
# data is X, target is Y: 50 samples of sin(x) on [0, 10].
data = np.linspace(0, 10, 50)
target = np.sin(data)

# Add Gaussian noise to every 5th sample so the curve is not perfectly smooth.
# The noise length is taken from the slice itself, so it stays in step with
# the sample count instead of relying on a hard-coded 10.
target[::5] += np.random.randn(target[::5].size) * 0.35
plt.scatter(data,target)
<matplotlib.collections.PathCollection at 0x9f30cc0>

这里写图片描述

data
target
# 让机器学习带有噪点的函数数据集
knn = KNeighborsRegressor()
knn.fit(data.reshape(data.size,1),target)
  • 输出

    KNeighborsRegressor(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
    metric_params=None, n_jobs=1, n_neighbors=5, p=2,
    weights=’uniform’)

X_test = np.linspace(0, 10, 5000)

# 求解的是y

y_ = knn.predict(X_test.reshape(X_test.size, 1))
y_
  • 输出

    array([ 0.42095792, 0.42095792, 0.42095792, …, -0.17098925,
    -0.17098925, -0.17098925])

# 在计算的过程中还是有点误差值的
plt.plot(X_test, y_, c='red')
plt.scatter(data, target)
<matplotlib.collections.PathCollection at 0xa41c240>

这里写图片描述

自己看理论: cursor proxySQL(mysql的中间件) mysql8.0 与 5.7的区别

设计思想:

innodb,myisam 的区别:
事务的特性(原子性,一致性,持久性,隔离性) 三范式:分表思想
索引:主键,唯一,联合(联合索引也是普通索引),全文索引(fulltext key) (增加like的效率 ,只对myisam有效)
两张表:给你需求,让你写sql(连表查询,子查询)

存储过程:

类似于函数:写一条特别复杂的sql,连100表

函数:

触发器:

保证数据的完整性

人脸自动补全

  • 给个上半部分的脸,把下半部分给自动补全
  • 回归问题
  • 为什么是回归问题?如果是分类 70亿
  • 因为脸部有轮廓线
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np
# 再导入几种算法
# LinearRegression 线性回归
# Ridge 山岭 岭回归
# Lasso 套索 套索回归
from sklearn.linear_model import LinearRegression, Ridge, Lasso
# y = wx
# w = y * x^-1
x = np.linspace(0,10, 100)
y = 3 * x 
# 回归要求的就是w这个系数, 如果有误差值,再添加
plt.plot(x, y)
[<matplotlib.lines.Line2D at 0xc92b780>]

这里写图片描述

# 引入人脸的数据集
import sklearn.datasets as datasets
# 提取数据集
face = datasets.fetch_olivetti_faces()
face
data = face['data']
data.shape
  • 输出

    (400, 4096)

import math
math.sqrt(data.shape[1])
  • 输出

    64.0

# Split each flattened face into an upper and a lower half by slicing its
# pixel vector in the middle (derived from the data instead of a literal 2048).
half = data.shape[1] // 2
# Upper half of every face
face_up = data[:, :half]
# Lower half of every face
face_down = data[:, half:]
# Pick one face at random; the upper bound follows the number of samples
# rather than a hard-coded 400.
index = np.random.randint(0, data.shape[0], size=1)
plt.imshow(data[index].reshape(64, 64),cmap='gray')
<matplotlib.image.AxesImage at 0xc7f5da0>

这里写图片描述


axes = plt.subplot(121)
axes.imshow(face_up[index].reshape(32, 64), cmap='gray')
axes1 = plt.subplot(122)
axes1.imshow(face_down[index].reshape(32, 64), cmap='gray')
<matplotlib.image.AxesImage at 0xb9ecf28>

这里写图片描述

# 分割数据
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(face_up, face_down, test_size=0.02)
# Fit four regressors on the same split so their predictions can be compared.
# The four models are kept in a dict keyed by a short display name.
estimator = {
    'KNN': KNeighborsRegressor(),
    'LinearRG': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
}
# Train each model and store its predictions under the model's name.
result_ = dict()
# The loop variable is named `model` so it does not overwrite the `estimator`
# dict; with the original name the cell failed when run a second time.
for key, model in estimator.items():
    # Train
    model.fit(X_train, y_train)
    # Predict the lower halves for the test faces
    y_ = model.predict(X_test)
    # Keep this model's predictions
    result_[key] = y_
# 测试knn预测的结果
knn_y = result_['KNN']
face_true = np.concatenate([X_test[0], y_test[0]]).reshape(64, 64)
plt.imshow(face_true, cmap='gray')
<matplotlib.image.AxesImage at 0x114a1fd0>

这里写图片描述

result_ 
axes = plt.subplot(121)
axes.imshow(face_true, cmap='gray')
# 取KNN的预测图
face_knn_predict = np.concatenate([X_test[0], knn_y[0]]).reshape(64, 64)
axes1 = plt.subplot(122)
axes1.imshow(face_knn_predict, cmap='gray')
<matplotlib.image.AxesImage at 0x116c6dd8>

这里写图片描述

# For each test face show one row: the true face, the upper half given to the
# models, and the completed face from every model in result_.
n_rows = min(8, len(X_test))   # at most 8 faces, never more than are available
n_cols = 2 + len(result_)      # true face + upper half + one column per model
plt.figure(figsize=(2 * n_cols, 2 * n_rows))
for i in range(n_rows):
    row_start = n_cols * i

    # Column 1: the true face (given upper half + true lower half)
    axes = plt.subplot(n_rows, n_cols, 1 + row_start)
    axes.axis('off')
    face_true = np.concatenate([X_test[i], y_test[i]]).reshape(64, 64)
    axes.imshow(face_true, cmap='gray')
    if i == 0:
        axes.set_title('True')

    # Column 2: the upper half only. A local name is used here because the
    # original rebound the global `face_up` feature matrix to this one image.
    axes = plt.subplot(n_rows, n_cols, 2 + row_start)
    axes.axis('off')
    upper_half = X_test[i].reshape(32, 64)
    axes.imshow(upper_half, cmap='gray')
    if i == 0:
        axes.set_title('UP')

    # Remaining columns: the given upper half joined with each model's
    # predicted lower half
    for j, key in enumerate(result_):
        predictions = result_[key]
        face_predict = np.concatenate([X_test[i], predictions[i]]).reshape(64, 64)
        axes = plt.subplot(n_rows, n_cols, 3 + row_start + j)
        axes.imshow(face_predict, cmap='gray')
        axes.axis('off')
        if i == 0:
            axes.set_title(key)

这里写图片描述

猜你喜欢

转载自blog.csdn.net/beichen0518/article/details/80780721
kNN
今日推荐