Converting the MNIST dataset into image files and CSV files

The MNIST datasets are usually distributed by the official website as compressed binary archives. Sometimes we need the data in a different form:
(1) extracted as individual image files into folders, or
(2) saved as a CSV file.

(1) Save as image files (on Windows)

# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 17:36:14 2020
unzip_mnist

"""

import struct
from array import array
import numpy as np
import os
from PIL import Image
import cv2
# Locations of the four raw IDX input files.
trainimg = './fashion_mnist_origin/train-images.idx3-ubyte'
trainlabel = './fashion_mnist_origin/train-labels.idx1-ubyte'
testimg = './fashion_mnist_origin/t10k-images.idx3-ubyte'
testlabel = './fashion_mnist_origin/t10k-labels.idx1-ubyte'
# Root folders that will receive the extracted images.
trainfolder = './fashion_mnist_origin/train'
testfolder = './fashion_mnist_origin/test'
# exist_ok=True makes creation idempotent and avoids the race between a
# separate "does it exist?" check and the makedirs call.
os.makedirs(trainfolder, exist_ok=True)
os.makedirs(testfolder, exist_ok=True)
 
# Open the four IDX files in binary read mode ('rb'). The with-statement
# closes every handle even if an open or read fails part-way through, so no
# explicit close() calls are needed.
with open(trainimg, 'rb') as trimg, open(testimg, 'rb') as teimg, \
        open(trainlabel, 'rb') as trlab, open(testlabel, 'rb') as telab:
    # Consume the headers so the following read() starts at the payload:
    # four big-endian uint32 fields (16 bytes) for the image files and two
    # (8 bytes) for the label files. The parsed values are discarded;
    # struct.unpack raises struct.error if a header is shorter than expected.
    struct.unpack(">IIII", trimg.read(16))
    struct.unpack(">IIII", teimg.read(16))
    struct.unpack(">II", trlab.read(8))
    struct.unpack(">II", telab.read(8))
    # array is a compact, homogeneous array type whose element type is fixed
    # at creation; typecode "B" is unsigned byte ("b" would be signed byte).
    trimage = array("B", trimg.read())
    teimage = array("B", teimg.read())
    # Labels are read as unsigned bytes as well (the files are idx1-ubyte):
    # with a signed typecode a byte >= 128 would become a negative number and,
    # used as a list index, silently select a folder from the end of the list.
    trlabel = array("B", trlab.read())
    telabel = array("B", telab.read())
# Create ten sub-folders, named "0" to "9", under both the training and the
# test output folder; each one receives the images carrying that label.
trainfolders = [os.path.join(trainfolder, str(i)) for i in range(10)]
testfolders = [os.path.join(testfolder, str(i)) for i in range(10)]
# exist_ok=True replaces the separate existence check, and the loop variable
# no longer shadows the builtin dir().
for class_folder in trainfolders + testfolders:
    os.makedirs(class_folder, exist_ok=True)
# Save the training images: one 28x28 PNG per sample, named by its index and
# placed in the sub-folder that matches its label.
for (i, label) in enumerate(trlabel):
    filename = os.path.join(trainfolders[label], str(i) + ".png")
    print("writing " + filename)
    # View the i-th run of 28*28 bytes in the flat pixel buffer as a 28x28
    # uint8 image. frombuffer raises ValueError if the buffer is too short.
    image = np.frombuffer(
        trimage, dtype=np.uint8, count=28 * 28, offset=i * 28 * 28
    ).reshape(28, 28)
    # cv2.imwrite opens and writes the file itself, so no separate open() is
    # needed. It reports failure by returning False; surface that instead of
    # silently skipping the image.
    if not cv2.imwrite(filename, image):
        raise OSError("failed to write " + filename)
# Save the test images: one 28x28 PNG per sample, named by its index and
# placed in the sub-folder that matches its label.
for (i, label) in enumerate(telabel):
    filename = os.path.join(testfolders[label], str(i) + ".png")
    print("writing " + filename)
    # View the i-th run of 28*28 bytes in the flat pixel buffer as a 28x28
    # uint8 image. frombuffer raises ValueError if the buffer is too short.
    image = np.frombuffer(
        teimage, dtype=np.uint8, count=28 * 28, offset=i * 28 * 28
    ).reshape(28, 28)
    # cv2.imwrite opens and writes the file itself, so no separate open() is
    # needed. It reports failure by returning False; surface that instead of
    # silently skipping the image.
    if not cv2.imwrite(filename, image):
        raise OSError("failed to write " + filename)

Adapted, with minor modifications, from the original author's post:
https://blog.csdn.net/SongGu1996/article/details/98849274

(2) Save as a CSV file

def convert(imgf, labelf, outf, n):
    """Write the first *n* records of an IDX image/label file pair as CSV.

    Each output line holds one record: the label followed by the 28*28 pixel
    values of the matching image, all as comma-separated decimal integers.

    Args:
        imgf: Path of the image file (16-byte header, then 28*28 bytes per
            image).
        labelf: Path of the label file (8-byte header, then one byte per
            label).
        outf: Path of the CSV file to create or overwrite.
        n: Number of records to convert.

    Raises:
        ValueError: If either input file holds fewer than *n* records.
        OSError: If a file cannot be opened, read or written.
    """
    image_size = 28 * 28
    # Inputs are opened before the output so that a missing input does not
    # leave an empty CSV behind; the with-statement closes all three files
    # even when an error interrupts the conversion.
    with open(imgf, "rb") as images, open(labelf, "rb") as labels, \
            open(outf, "w") as out:
        # Skip the headers; only the payload is converted.
        images.read(16)
        labels.read(8)

        for index in range(n):
            # One read per record instead of one read per byte.
            label = labels.read(1)
            pixels = images.read(image_size)
            if len(label) != 1 or len(pixels) != image_size:
                raise ValueError(
                    "input files contain fewer than %d records "
                    "(ran out of data at record %d)" % (n, index)
                )
            # Iterating over bytes yields ints, so the label byte and the
            # pixel bytes can be formatted in a single pass. Rows are written
            # as they are read rather than buffered in memory.
            out.write(",".join(str(value) for value in label + pixels) + "\n")
 
# Convert the training split (60000 records) and then the test split (10000
# records), each to its own CSV file.
for images_path, labels_path, csv_path, record_count in (
    (
        "mnist_origin/train-images.idx3-ubyte",
        "mnist_origin/train-labels.idx1-ubyte",
        "mnist_train.csv",
        60000,
    ),
    (
        "mnist_origin/t10k-images.idx3-ubyte",
        "mnist_origin/t10k-labels.idx1-ubyte",
        "mnist_test.csv",
        10000,
    ),
):
    convert(images_path, labels_path, csv_path, record_count)

print("Convert Finished!")
Published 2 original articles · 2 likes · 368 views

Guess you like

Origin blog.csdn.net/weixin_42660446/article/details/104655468