算法图片识别——手写数字识别[保存算法joblib.dump/np.save]

导包

import matplotlib.pyplot as plt
%matplotlib inline


import numpy as np

import pandas as pd
from pandas import Series,DataFrame

导入单个数字

# Load a single sample image (digit 8, file number 400) and show its raw
# pixel array.
sample_path = './data/8/8_400.bmp'
digit = plt.imread(sample_path)
digit
Out：
array([[255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255],
……
# Render the loaded sample as a small (1x1 inch) grayscale thumbnail.
plt.figure(figsize=(1,1))
plt.imshow(digit,cmap = 'gray')

查看数据集结构：一共10个文件夹（对应数字0~9），每个文件夹有500张图片，总共5000张图片

全部导入，特征数据存入data，目标数据存入target

# Recall the fit(X, y) call from the earlier lesson:
#   data   -----> X (features)
#   target -----> y (labels)
data = []
target = []
for label in range(10):
    # Each digit folder holds 500 images, numbered 1..500.
    paths = ['./data/%d/%d_%d.bmp' % (label, label, n) for n in range(1, 501)]
    data.extend(plt.imread(path) for path in paths)
    target.extend([label] * 500)

# Number of images loaded.
len(data)
Out：5000

# Number of labels collected; should match the number of images.
len(target)
Out：5000

# Pick a random sample position in [0, 5000) and display that image
# together with its label, to spot-check that data and target line up.
index = np.random.randint(0,5000,size = 1)[0]
index
plt.figure(figsize=(1,1))
plt.imshow(data[index],cmap = 'gray')

print(target[index])

使用KNN分类算法

from sklearn.neighbors import KNeighborsClassifier

# KNN classifier configured to consult the 5 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=5)

# Train the classifier on the raw list of images.
# NOTE: this call fails, because every sample is still a 2-D image and the
# data is therefore 3-D overall:
# ValueError: Found array with dim 3. Estimator expected <= 2.
knn.fit(data,target)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-35-bc8bcb80be80> in <module>()
      1 # 使用算法，进行训练
----> 2 knn.fit(data,target)

ValueError: Found array with dim 3. Estimator expected <= 2.


# Check what kind of container the images are stored in.
type(data)
Out： list

转化成ndarray

# Stack the list of images into one ndarray with a leading sample axis:
# [[sample 1], [sample 2], ...]
X = np.asarray(data)
X.shape
Out： (5000, 28, 28)

# Pixels per 28x28 image, i.e. the length of one flattened sample.
28*28
Out： 784


# Labels as an ndarray, in the same order as the images.
y = np.asarray(target)
y
Out：array([0, 0, 0, ..., 9, 9, 9])

数据的形状改变，数据没变

# Flatten every 28x28 image into a 784-long feature vector.
# Only the shape changes; the pixel values stay the same.
X = np.reshape(X, (5000, 784))

# Folding one row back to 28x28 shows the image is still intact.
plt.figure(figsize=(1, 1))
restored = X[1600].reshape(28, 28)
plt.imshow(restored)

打乱顺序

# Toy demo: np.random.shuffle reorders the array in place.
nd = np.asarray([0, 1, 2, 3, 4, 9])
np.random.shuffle(nd)
nd
Out： array([1, 9, 4, 2, 0, 3])

# Build the row positions 0..4999, then shuffle them in place to obtain a
# random permutation of the sample positions.
index = np.arange(0, 5000)
index
np.random.shuffle(index)
index
Out: array([ 608, 2911, 2475, ..., 4248, 4205, 2117])

# Reorder samples and labels with the same permutation so that every image
# keeps its own label.
X, y = X[index], y[index]

# 5000 samples, each with 784 attributes.
# 784 unknowns: f() = x0*w0 + x1*w1 + ... + x783*w783
X.shape
Out:(5000, 784)

举例子

# A small linear system written in matrix form:
#   2x + 3y + 4z = 10
#    x - 2y + 3z = 8
#   3x -  y +  z = 7
# Each row of X1 holds one equation's coefficients; y1 holds the
# right-hand sides.
coefficients = [[2, 3, 4], [1, -2, 3], [3, -1, 1]]
X1 = np.array(coefficients)
X1

y1 = np.array([10, 8, 7])
display(X1, y1)

Out：
array([[ 2,  3,  4],
       [ 1, -2,  3],
       [ 3, -1,  1]])
array([10,  8,  7])

划分训练和测试数据

# Split the 5000 samples in two: the first 4950 for training, the last 50
# held out for testing.
X_train, y_train = X[:4950], y[:4950]
knn.fit(X_train, y_train)
Out：
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

# Predict the labels of the last 50 samples (the ones not used for training).
y_ = knn.predict(X[-50:])
y_
Out：
array([1, 9, 9, 7, 1, 9, 1, 9, 3, 5, 7, 4, 3, 1, 0, 5, 0, 4, 0, 6, 3, 8,
       9, 7, 1, 3, 1, 9, 3, 4, 0, 5, 5, 2, 4, 0, 7, 8, 0, 6, 1, 7, 0, 8,
       1, 3, 3, 8, 8, 4])

# True labels of the same last 50 samples, for comparison with y_.
y[-50:]
Out：
array([3, 1, 5, 4, 4, 2, 0, 6, 0, 3, 4, 6, 2, 4, 2, 7, 3, 1, 9, 3, 0, 7,
       5, 9, 9, 9, 3, 1, 8, 3, 3, 1, 6, 5, 2, 6, 4, 6, 9, 8, 6, 2, 5, 1,
       5, 3, 5, 4, 9, 1])

展示后50张数字图片的真实值与预测值

# Draw the last 50 samples in a 10x5 grid, titling each image with its true
# label and the label that was predicted for it.
plt.figure(figsize=(5 * 1, 10 * 2))
held_out = zip(X[4950:], y[4950:], y_)
for slot, (pixels, truth, guess) in enumerate(held_out, start=1):
    panel = plt.subplot(10, 5, slot)
    panel.imshow(pixels.reshape(28, 28))
    # Title layout -> True:<label> / Predict:<label>
    panel.set_title('True:%d\nPredict:%d' % (truth, guess))
    panel.axis('off')

评分

# score() ----> runs predict() ----> compares the predictions with the true labels
knn.score(X[-50:],y[-50:])
Out：
0.96

保存算法

# Save the trained estimator so it can be reused later.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so prefer the standalone `joblib` package and fall back to the
# bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# "model": the fitted estimator, i.e. the algorithm being saved.
joblib.dump(knn,'./digits.m')
Out：['./digits.m']

将特征数据和目标数据保存为numpy的.npy文件，方便日后调用

# X holds the 5000 images and y holds their 5000 target values;
# numpy can write both straight to .npy files.
for npy_path, array in (('./digits.npy', X), ('./digits_target.npy', y)):
    np.save(npy_path, array)

各种调参

# Retry with 10 neighbours on the same 4950/50 split.
knn = KNeighborsClassifier(n_neighbors=10).fit(X[:4950], y[:4950])

knn.score(X[-50:], y[-50:])
Out:
0.94

# Retry with 50 neighbours on the same 4950/50 split.
knn = KNeighborsClassifier(n_neighbors=50).fit(X[:4950], y[:4950])

knn.score(X[-50:], y[-50:])
Out:
0.94

# Retry with 500 neighbours on the same 4950/50 split.
knn = KNeighborsClassifier(n_neighbors=500).fit(X[:4950], y[:4950])

knn.score(X[-50:], y[-50:])
Out:
0.84

# Retry with a single nearest neighbour on the same 4950/50 split.
knn = KNeighborsClassifier(n_neighbors=1).fit(X[:4950], y[:4950])

knn.score(X[-50:], y[-50:])
Out:
0.96

算法图片识别——手写数字识别[保存算法joblib.dump/np.save]

导包

猜你喜欢