掌控MNIST数据集

MNIST数据集号称是神经网络界的“Hello World”，凡入门CNN，必先把MNIST玩得666才行。

MNIST有四个文件：训练集，训练标签，测试集，测试标签。
（共11M左右）
下载链接: http://yann.lecun.com/exdb/mnist/

数据集给出的是图像和标签（数字）。那么，如果想看到图像，该怎么做？如果想不依靠任何深度学习框架来玩MNIST数据集，又该怎么做？
废话不多说，给代码看:

#!/usr/bin/env python
# encoding: utf-8
"""
@version: 1.0
@author: levio
@contact: [email protected]
@file: format_decoder.py
@time: 2018/5/15 14:18
"""
import struct
from PIL import Image
import numpy as np
import gzip
import os.path as osp


class Format_decoder():
    """Convert gzip-compressed MNIST IDX files into PNG, ``.npy`` and text files.

    Every public method is a static method that takes the path of a ``.gz``
    file, so it can be called on the class itself or on an instance.
    """

    # Big-endian header layouts shared by the parsers below.
    _IMAGE_HEADER = '>IIII'  # magic, image count, rows, columns
    _LABEL_HEADER = '>II'    # magic, label count

    def __init__(self):
        pass

    @staticmethod
    def _read_gzip(filename):
        """Return the fully decompressed contents of ``filename`` as bytes."""
        # The context manager closes the handle even if decompression fails.
        with gzip.GzipFile(filename) as g_file:
            return g_file.read()

    @staticmethod
    def _output_path(filename, separator, suffix):
        """Build the output path that sits next to ``filename``.

        Only the base name is split on ``separator``; splitting the whole
        path would truncate it at the first dot or dash of a directory name.
        """
        directory, base_name = osp.split(filename)
        return osp.join(directory, base_name.split(separator)[0] + suffix)

    @staticmethod
    def _parse_images(buf):
        """Read the image header and check that the pixel payload is complete.

        Returns:
            A tuple ``(images, rows, columns, offset)`` where ``offset`` is
            the position of the first pixel byte in ``buf``.

        Raises:
            struct.error: if the header or the pixel payload is truncated.
        """
        header = Format_decoder._IMAGE_HEADER
        _magic, images, rows, columns = struct.unpack_from(header, buf, 0)
        offset = struct.calcsize(header)
        needed = images * rows * columns
        available = len(buf) - offset
        if available < needed:
            # Same exception type a truncated header raises, so callers only
            # have one error to handle for a damaged file.
            raise struct.error(
                'image data truncated: expected %d pixel bytes, found %d'
                % (needed, available))
        return images, rows, columns, offset

    @staticmethod
    def save_images_into_file(filename):
        """Save every image of a gzip IDX image file as ``imgs/<index>.png``.

        Args:
            filename: path of the gzip-compressed image file.

        Raises:
            struct.error: if the file is truncated.
        """
        import os  # local import: the file itself only imports os.path

        buf = Format_decoder._read_gzip(filename)
        images, rows, columns, offset = Format_decoder._parse_images(buf)
        # Image.save does not create missing directories.
        os.makedirs('imgs', exist_ok=True)
        image_size = rows * columns
        for i in range(images):
            start = offset + i * image_size
            # One byte per pixel, row by row: exactly the layout the raw
            # 'L' (8-bit greyscale) decoder expects. Size is (width, height).
            image = Image.frombytes(
                'L', (columns, rows), buf[start:start + image_size])
            print('save ' + str(i) + 'image')
            image.save('imgs/' + str(i) + '.png')

    @staticmethod
    def reform_data_into_npy(filename):
        """Convert a gzip IDX image file into a ``.npy`` file beside it.

        The saved array has dtype ``uint8`` and shape
        ``(images, rows, columns)``. Nothing is done when the output file
        already exists.

        Args:
            filename: path of the gzip-compressed image file.

        Raises:
            struct.error: if the file is truncated.
        """
        saveFilename = Format_decoder._output_path(filename, '.', '.npy')
        if osp.exists(saveFilename):
            print(saveFilename+' has already existed')
            return
        buf = Format_decoder._read_gzip(filename)
        images, rows, columns, offset = Format_decoder._parse_images(buf)
        # Interpret the payload in place instead of copying it pixel by pixel
        # through a PIL image; the bytes are already row-major uint8 values.
        all_arrays = np.frombuffer(
            buf, dtype=np.uint8, count=images * rows * columns, offset=offset,
        ).reshape(images, rows, columns)
        print('processing successfully!')
        print(np.shape(all_arrays))
        np.save(saveFilename, all_arrays)

    @staticmethod
    def read_label(filename):
        """Write the labels of a gzip IDX label file as one comma-separated line.

        The output is ``<prefix>-label.txt`` beside the input, where
        ``<prefix>`` is the part of the base name before its first dash.
        Nothing is done when the output file already exists.

        Args:
            filename: path of the gzip-compressed label file.

        Raises:
            struct.error: if the header or the label payload is truncated.
        """
        saveFilename = Format_decoder._output_path(filename, '-', '-label.txt')
        if osp.exists(saveFilename):
            print(saveFilename+' has already existed')
            return
        buf = Format_decoder._read_gzip(filename)
        header = Format_decoder._LABEL_HEADER
        _magic, labels = struct.unpack_from(header, buf, 0)
        # A single unpack decodes all label bytes and rejects a short payload.
        labelArr = struct.unpack_from(
            '>%dB' % labels, buf, struct.calcsize(header))
        with open(saveFilename, 'w') as save:
            save.write(','.join(map(str, labelArr)))
            save.write('\n')
        print('save labels success')


if __name__ == '__main__':
    # (image archive, label archive) pairs, processed in order: the test
    # split first, then the training split. Paths are relative to the
    # current working directory.
    dataset_files = (
        ('t10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz'),
        ('train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz'),
    )
    for images_gz, labels_gz in dataset_files:
        Format_decoder.reform_data_into_npy(images_gz)
        Format_decoder.read_label(labels_gz)

请按照代码里读取的位置放置MNIST数据集：脚本使用的是相对路径，因此四个.gz文件需要放在运行脚本时的当前工作目录下。

上述代码实现了把MNIST数据集转换成.npy文件（NumPy数组文件），之后可以通过np.load('xxx.npy')读取数组，十分方便。
同时，数组还可以用Image模块，实现图像显示，图像保存等等。这样就算完全花式掌控了MNIST数据集。

猜你喜欢