# 预测年收入大于50K的人

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# 先导入文件
salary = pd.read_csv('./day9_data/adults.txt')
salary.head()

# 工作类型   # 教育程度  # 教育年限     # 职位        # 
# workclass  education    education_num  occupation   hours_per_week

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	age	workclass	final_weight	education	education_num	marital_status	occupation	relationship	race	sex	capital_gain	hours_per_week	native_country	salary
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

salary.dtypes

- 输出 age int64 workclass object final_weight int64 education object education_num int64 marital_status object occupation object relationship object race object sex object capital_gain int64 capital_loss int64 hours_per_week int64 native_country object salary object dtype: object

salary.shape

- 输出 (32561, 15)

salary.columns

- 输出 Index([‘age’, ‘workclass’, ‘final_weight’, ‘education’, ‘education_num’, ‘marital_status’, ‘occupation’, ‘relationship’, ‘race’, ‘sex’, ‘capital_gain’, ‘capital_loss’, ‘hours_per_week’, ‘native_country’, ‘salary’], dtype=’object’)

# Label column used as the classification target.
target = salary['salary']

# Feature columns. The explicit .copy() makes `data` an independent frame, so the
# in-place column re-encoding done further down neither raises
# SettingWithCopyWarning nor depends on view-vs-copy semantics of `salary`.
data = salary[['age', 'workclass', 'education', 'education_num', 'occupation', 'sex', 'hours_per_week', 'native_country','race']].copy()

# 数据有了，目标有了

# 因为有字符串类型，机器学习不支持
# 转换成映射
knn = KNeighborsClassifier(n_neighbors=15)

data.dtypes

- 输出 age int64 workclass int64 education int64 education_num int64 occupation int64 sex int64 hours_per_week int64 native_country int64 race int64 dtype: object

# unique去重得到所有的工作类型
# Distinct work classes, in order of first appearance.
unique_ = data['workclass'].unique()

# Value -> 1-based code, built once. The original scanned `unique_` with
# np.argwhere for every single row, which is needlessly slow on ~32k rows.
_workclass_codes = {value: code for code, value in enumerate(unique_, start=1)}


def convertstr2int(item):
    """Return the 1-based integer code assigned to the work class `item`."""
    return _workclass_codes[item]


data['workclass'] = data['workclass'].map(convertstr2int)

# Remaining string-valued columns that must be turned into integers for KNN.
cols = ['education','occupation', 'sex',  'native_country', 'race']
for col in cols:
    # Distinct values of this column, in order of first appearance.
    unique_ = data[col].unique()
    # Codes start at 1 (not 0), matching the encoding used for 'workclass'.
    # A dict lookup replaces the per-row np.argwhere scan of the original.
    codes = {value: code for code, value in enumerate(unique_, start=1)}
    # Overwrite the column with its integer codes.
    data[col] = data[col].map(codes)

data.head()

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	age	workclass	education	education_num	occupation	sex	hours_per_week	native_country	race
0	39	1	1	13	1	1	40	1	1
1	50	2	1	13	2	1	13	1	1
2	38	3	2	9	3	1	40	1	1
3	53	3	3	7	3	1	40	1	2
4	28	3	1	13	4	2	40	2	2

# 分割数据
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.005)

# 实例化
knn = KNeighborsClassifier(n_neighbors=15)

# 开始训练数据
knn.fit(X_train, y_train)

- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=15, p=2, weights=’uniform’)

# 开始预测
y_ = knn.predict(X_test)
# 准确率
knn.score(X_test,y_test)

- 输出 0.803680981595092

# 保存训练的模型
# 例如人脸识别的算法，就是被打包好的数学模型

# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn;
# the standalone `joblib` package (installed as a scikit-learn dependency) is the
# supported import. Fall back to the vendored copy only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted classifier to disk; this tutorial uses a '.m' suffix for model files.
joblib.dump(knn, './50K.m')

- 输出 [‘./50K.m’]

# 有打包 就肯定有加载算法
# CV2
# 咱们sklearn自带加载算法的方法
knn_50k = joblib.load('./50K.m')

knn_50k.score(X_test,y_test)

- 输出 0.803680981595092

# 使用算法，识别是否是乳腺癌

分布式存储的原理

分布式：基于一主多从，多台从服务器监听主服务器，主服务器开放3306端口，开放对从服务器IP地址的支持，
主从复制都是基于 binlog（二进制日志）来传输数据变更的
主服务器 都是innodb 负责插入 （.frm 表的结构，表的索引）（.ibd储存的是数据）master

    从服务器slave 要设置主服务的ip 主服务器的端口， 一个主服务器的mysql账号
        需要主服务的 二进制传输文件编号 文件端口号
        新式类 经典类


        就是一台主机让多台从机允许监听，然后从服务器不停拷贝主服务器中的数据

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# 引入数据
# sep 的默认值是逗号 大部分的csv都是以‘，’分割的
canser = pd.read_csv('./day9_data/cancer.csv',sep='\t')
canser.head()

# Diagnosis: M = malignant (恶性), B = benign (良性)

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

	ID	Diagnosis	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave_mean	…	radius_max	texture_max	perimeter_max	area_max	smoothness_max	compactness_max	concavity_max	concave_max	symmetry_max	fractal_max
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	…	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	…	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	…	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	…	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	…	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

5 rows × 32 columns

canser.shape

- 输出 (569, 32)


canser.dtypes

- 输出 ID int64 Diagnosis object radius_mean float64 texture_mean float64 perimeter_mean float64 area_mean float64 smoothness_mean float64 compactness_mean float64 concavity_mean float64 concave_mean float64 symmetry_mean float64 fractal_mean float64 radius_sd float64 texture_sd float64 perimeter_sd float64 area_sd float64 smoothness_sd float64 compactness_sd float64 concavity_sd float64 concave_sd float64 symmetry_sd float64 fractal_sd float64 radius_max float64 texture_max float64 perimeter_max float64 area_max float64 smoothness_max float64 compactness_max float64 concavity_max float64 concave_max float64 symmetry_max float64 fractal_max float64 dtype: object

# Label column: the diagnosis code of each sample.
target = canser['Diagnosis']
# Every column after ID and Diagnosis is a numeric feature. Copy the slice so the
# in-place normalisation applied later works on an independent frame instead of
# a possible view of `canser` (avoids SettingWithCopyWarning).
data = canser.iloc[:,2:].copy()

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)

# 实例化
knn = KNeighborsClassifier(n_neighbors=10)
# 数据训练
knn.fit(X_train, y_train)

- 输出 KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’, metric_params=None, n_jobs=1, n_neighbors=10, p=2, weights=’uniform’)

y_ = knn.predict(X_test)
#打分
knn.score(X_test, y_test)

- 输出 0.9473684210526315

# 制作一个交叉表 cross join
# pandas 中也有交叉表 crosstab()
# margins 外边 可以用来增加一组统计的数据值
pd.crosstab(index=y_, columns=y_test, margins=True, rownames=['Predict'], colnames=['True'])

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }

True	B	M	All
Predict
B	36	2	38
M	1	18	19
All	37	20	57

# 怎么提高预测的准确率
# 那么我们需要对数据清洗，数据归一化
# (item-min)/(max-min)

# Tiny example for min-max scaling: (item - min) / (max - min).
nd = np.array([1, 2, 3, 4, 5])
# The original wrote `nd.max() = 5`, which is a SyntaxError (cannot assign to a
# function call); the expected values are shown as comments instead.
nd.max()  # 5
nd.min()  # 1

col = 'radius_mean'
# Compute the column extremes once. The original recomputed max() and min()
# inside the mapped function, i.e. once per row.
d_min = data[col].min()
d_max = data[col].max()


def convert2normed(item):
    """Min-max scale `item` (a scalar or a whole column) using d_min / d_max."""
    return (item - d_min) / (d_max - d_min)


# Overwrite the column with its scaled values, computed in one vectorised step.
data[col] = convert2normed(data[col])

# 将所有的字段全部进行归一化

cols = data.columns
cols

输出

Index([‘radius_mean’, ‘texture_mean’, ‘perimeter_mean’, ‘area_mean’,
‘smoothness_mean’, ‘compactness_mean’, ‘concavity_mean’, ‘concave_mean’,
‘symmetry_mean’, ‘fractal_mean’, ‘radius_sd’, ‘texture_sd’,
‘perimeter_sd’, ‘area_sd’, ‘smoothness_sd’, ‘compactness_sd’,
‘concavity_sd’, ‘concave_sd’, ‘symmetry_sd’, ‘fractal_sd’, ‘radius_max’,
‘texture_max’, ‘perimeter_max’, ‘area_max’, ‘smoothness_max’,
‘compactness_max’, ‘concavity_max’, ‘concave_max’, ‘symmetry_max’,
‘fractal_max’],
dtype=’object’)

# Min-max scale every feature column: (value - min) / (max - min).
for col in cols:
    # Extremes are computed once per column; the original recomputed them for
    # every element through Series.map, which is quadratic in the row count.
    d_min = data[col].min()
    d_max = data[col].max()
    data[col] = (data[col] - d_min) / (d_max - d_min)

# 重新分割
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.1)

knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

输出

0.9824561403508771

利用分类绘制鸢尾花散点图

# 数据集
import sklearn.datasets as datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

data = iris['data']
target = iris['target']
target_names = iris['target_names']

data.shape
# 在机器学习的时候。4个属性代表4种维度

输出

(150, 4)

# The first two feature columns hold the sepal measurements.
sepal = data[:, 0:2]
# Column 0 is the sepal length, column 1 the sepal width.
sepal_length, sepal_width = sepal[:, 0], sepal[:, 1]

import matplotlib.pyplot as plt

from matplotlib.colors import ListedColormap

cmap = ListedColormap(['red', 'orange', 'pink'])

plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)

<matplotlib.collections.PathCollection at 0xbb56c18>

这里写图片描述

# The last two feature columns hold the petal measurements.
petal = data[:, 2:4]
# Column 0 of the slice is the petal length, column 1 the petal width.
petal_length, petal_width = petal[:, 0], petal[:, 1]

plt.scatter(petal_length, petal_width, c=target)

<matplotlib.collections.PathCollection at 0xaf43630>

这里写图片描述

# 首先我们还是要分类
# 我们自己制造数据
# 让数据点填满整张图片

# 我们要画的点怎么区分颜色
# KNN
knn = KNeighborsClassifier()
knn.fit(sepal, target)

输出

KNeighborsClassifier(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights=’uniform’)

# 制造数据
# 选取一个范围
import numpy as np

# Sample the plotting area: 200 values along x and 150 along y.
x = np.linspace(4, 8.2, 200)
y = np.linspace(1.8, 4.5, 150)

# meshgrid expands the two axes into coordinate matrices covering the whole area.
xx,yy = np.meshgrid(x, y)

# `display` is only injected by IPython/Jupyter; print works in a notebook and
# in a plain script and shows the same two shapes.
print(xx.shape)
print(yy.shape)

输出
(150, 200)
(150, 200)

xx = xx.reshape(-1)
yy = yy.reshape(-1)

xy = np.c_[xx, yy]

plt.scatter(xy[:,0], xy[:,1])

<matplotlib.collections.PathCollection at 0xba0ee10>

这里写图片描述

# 预测，将我们生成的3万个点（数据）进行预测
X_test = xy

y_ = knn.predict(X_test)

# 这是刚刚30000个点的分类图
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')

<matplotlib.collections.PathCollection at 0xbaaec88>

这里写图片描述

# 将鸢尾花150个点也画进来
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_, cmap='rainbow')
plt.scatter(sepal_length, sepal_width, c=target, cmap=cmap)

<matplotlib.collections.PathCollection at 0xb45f0b8>

这里写图片描述

knn做回归（注意：KNN 回归并不是线性回归）

回归用于对于趋势的预测

台风预测

# 每年都要，太平洋中心就会形成台风，移动轨迹从海洋到陆地
# 中央气象台，预测，采集点，100km，风在转移的时候风速会下降
# 比如说20级，福建16级，红色预警
# 咱们要想要拿到风的运行轨迹，需要一个函数，预测风力，风力的函数

from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import matplotlib.pyplot as plt

# 50 evenly spaced sample positions on [0, 10].
data = np.linspace(0, 10, 50)
# Ground-truth curve: y = sin(x).
target = np.sin(data)

# Perturb every 5th sample with Gaussian noise so the curve is not perfectly
# smooth. The noise length is derived from the slice instead of the hard-coded
# 10, so it stays correct if the number of samples or the step changes.
noisy = target[::5]
target[::5] += np.random.randn(noisy.size) * 0.35
plt.scatter(data,target)

<matplotlib.collections.PathCollection at 0x9f30cc0>

这里写图片描述

data

target

# 让机器学习带有噪点的函数数据集
knn = KNeighborsRegressor()
knn.fit(data.reshape(data.size,1),target)

输出

KNeighborsRegressor(algorithm=’auto’, leaf_size=30, metric=’minkowski’,
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights=’uniform’)

X_test = np.linspace(0, 10, 5000)

# 求解的是y

y_ = knn.predict(X_test.reshape(X_test.size, 1))
y_

输出

array([ 0.42095792, 0.42095792, 0.42095792, …, -0.17098925,
-0.17098925, -0.17098925])

# 在计算的过程中还是有点误差值的
plt.plot(X_test, y_, c='red')
plt.scatter(data, target)

<matplotlib.collections.PathCollection at 0xa41c240>

这里写图片描述

自己看理论： cursor、ProxySQL（MySQL 的中间件）、MySQL 8.0 与 5.7 的区别

设计思想：

innodb,myisam 的区别：
事务的特性（ACID：原子性、一致性、隔离性、持久性）；三范式：分表思想
索引：主键，唯一，联合（联合索引也是普通索引），全文索引(fulltext key)（用 MATCH ... AGAINST 代替低效的 like '%xx%' 查询；MySQL 5.6 之前只有 MyISAM 支持，5.6 起 InnoDB 也支持）
两张表：给你需求，让你写sql(连表查询，子查询)

存储过程：

类似于函数：写一条特别复杂的sql,连100表

函数：

触发器：

保证数据的完整性

人脸自动补全

给个上半部分的脸，把下半部分给自动补全
回归问题
为什么是回归问题？如果是分类 70亿
因为脸部有轮廓线

from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
import numpy as np

# 再导入几种算法
# LinearRegression 线性回归
# Ridge 山岭 岭回归
# Lasso: lasso ("套索") regression, i.e. L1-regularised linear regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# y = wx
# w = y * x^-1
x = np.linspace(0,10, 100)
y = 3 * x 
# 回归要求的就是w这个系数， 如果有误差值，再添加
plt.plot(x, y)

[<matplotlib.lines.Line2D at 0xc92b780>]

这里写图片描述

# 引入人脸的数据集
import sklearn.datasets as datasets

# 提取数据集
face = datasets.fetch_olivetti_faces()
face

data = face['data']
data.shape

输出

(400, 4096)

import math
math.sqrt(data.shape[1])

输出

64.0

# 人脸， 上下分开， 切片
# 先切上半部分的脸
# Each row is one flattened image; split its pixels into a first (upper) and
# second (lower) half. The split point is derived from the row length instead
# of the hard-coded 2048.
half = data.shape[1] // 2
face_up = data[:, :half]
# Lower half of every face.
face_down = data[:, half:]

# Pick one random sample. The upper bound comes from the number of rows instead
# of the hard-coded 400; `index` stays a length-1 array as before.
index = np.random.randint(0, data.shape[0], size=1)
plt.imshow(data[index].reshape(64, 64),cmap='gray')

<matplotlib.image.AxesImage at 0xc7f5da0>

这里写图片描述


axes = plt.subplot(121)
axes.imshow(face_up[index].reshape(32, 64), cmap='gray')
axes1 = plt.subplot(122)
axes1.imshow(face_down[index].reshape(32, 64), cmap='gray')

<matplotlib.image.AxesImage at 0xb9ecf28>

这里写图片描述

# 分割数据
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(face_up, face_down, test_size=0.02)

# 让四个算法同时计算，比较哪个更准确
# estimator 估计器

# 将4中算法保存在字典中
# The four regressors to compare, keyed by the label used in the plots.
estimator = {
    'KNN': KNeighborsRegressor(),
    'LinearRG':LinearRegression(),
    'Ridge': Ridge(),
    'Lasso':Lasso()
}

# Fit each model on the upper halves and store its predicted lower halves.
# The loop variable is named `model`: the original reused the name `estimator`,
# which replaced the dict with the last fitted model and made the cell fail
# when run a second time.
result_ = dict()
for key, model in estimator.items():
    # Train on the upper halves of the faces.
    model.fit(X_train, y_train)
    # Predict the lower halves for the held-out faces.
    y_ = model.predict(X_test)
    # Keep every model's prediction under its label.
    result_[key] = y_

# 测试knn预测的结果
knn_y = result_['KNN']
face_true = np.concatenate([X_test[0], y_test[0]]).reshape(64, 64)
plt.imshow(face_true, cmap='gray')

<matplotlib.image.AxesImage at 0x114a1fd0>

这里写图片描述

result_

axes = plt.subplot(121)
axes.imshow(face_true, cmap='gray')
# 取KNN的预测图
face_knn_predict = np.concatenate([X_test[0], knn_y[0]]).reshape(64, 64)
axes1 = plt.subplot(122)
axes1.imshow(face_knn_predict, cmap='gray')

<matplotlib.image.AxesImage at 0x116c6dd8>

这里写图片描述

# 将所有的算法图片全部展示， 原图， 上半部分脸也展示， 6张图片
# One row per test face; columns are: true face, upper half, then one column
# per model in `result_`. Grid size is derived from the data instead of the
# hard-coded 8 rows x 6 columns.
n_rows = len(X_test)
n_cols = 2 + len(result_)
plt.figure(figsize=(n_cols * 2, n_rows * 2))
for i in range(n_rows):
    # Column 1: the real face (upper half + true lower half).
    axes = plt.subplot(n_rows, n_cols, 1 + n_cols * i)
    axes.axis('off')
    face_true = np.concatenate([X_test[i], y_test[i]]).reshape(64, 64)
    axes.imshow(face_true, cmap='gray')

    if i == 0:
        axes.set_title('True')
    # Column 2: the upper half that was given to the models.
    axes = plt.subplot(n_rows, n_cols, 2 + n_cols * i)
    axes.axis('off')
    # Use a local name: the original assigned to `face_up`, overwriting the
    # module-level array of all upper halves with a single 32x64 image.
    upper_half = X_test[i].reshape(32, 64)
    axes.imshow(upper_half, cmap='gray')
    if i == 0:
        axes.set_title('UP')
    # Remaining columns: upper half + each model's predicted lower half.
    for j, key in enumerate(result_):
        predicted = result_[key]
        face_predict = np.concatenate([X_test[i], predicted[i]]).reshape(64, 64)
        axes = plt.subplot(n_rows, n_cols, 3 + n_cols * i + j)
        axes.imshow(face_predict, cmap='gray')
        axes.axis('off')
        if i == 0:
            axes.set_title(key)

这里写图片描述

KNN应用

利用分类绘制鸢尾花散点图

knn做线性回归

设计思想：

存储过程：

函数：

触发器：

人脸自动补全

猜你喜欢

	age	workclass	education	education_num	occupation	sex	hours_per_week	native_country	race
0	39	1	1	13	1	1	40	1	1
1	50	2	1	13	2	1	13	1	1
2	38	3	2	9	3	1	40	1	1
3	53	3	3	7	3	1	40	1	2
4	28	3	1	13	4	2	40	2	2

	age	workclass	education	education_num	occupation	sex	hours_per_week	native_country	race
0	39	1	1	13	1	1	40	1	1
1	50	2	1	13	2	1	13	1	1
2	38	3	2	9	3	1	40	1	1
3	53	3	3	7	3	1	40	1	2
4	28	3	1	13	4	2	40	2	2

	age	workclass	education	education_num	occupation	sex	hours_per_week	native_country	race
0	39	1	1	13	1	1	40	1	1
1	50	2	1	13	2	1	13	1	1
2	38	3	2	9	3	1	40	1	1
3	53	3	3	7	3	1	40	1	2
4	28	3	1	13	4	2	40	2	2