《Kaggle Histopathologic Cancer Detection癌症图像分类比赛》之Keras/Generator实现

项目地址：https://www.kaggle.com/c/histopathologic-cancer-detection/overview
本文记录了自己使用纯Keras以及Keras标准的Generator的数据准备方式:
其他实现方式见:
Kaggle Histopathologic Cancer Detection之PyTorch实现
Kaggle Histopathologic Cancer Detection之Keras实现
Kaggle Histopathologic Cancer Detection之Tensorflow2.0实现
# -*- coding: utf-8 -*-
import numpy as np
import os,sys,csv,math
import cv2 as cv
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import myimageutil as iu
import tensorflow as tf

"""
====================================================================================
<<1.初步了解掌握数据的情况>>
====================================================================================

用pandas简单处理一下CSV并画出来看一下

这里我借用了kaggle的这篇kernel里的plot的代码，有兴趣的童鞋可以读一下，
https://www.kaggle.com/qitvision/a-complete-ml-pipeline-fast-ai

"""
# Dataset locations: the label CSV and both image folders live under ROOT_PATH.
ROOT_PATH = 'D:/ai_data/histopathologic-cancer-detection'
CSV_PATH = ROOT_PATH + '/train_labels.csv'
TRAIN_PATH = ROOT_PATH + '/train'
TEST_PATH = ROOT_PATH + '/test'

print(">>>看一下根目录下有哪些东西：")
print(os.listdir(ROOT_PATH))

# pandas calls its table type a DataFrame (as in Scala); "df" for short.
df = pd.read_csv(CSV_PATH)

# Quick look at the labels: table size, class balance and the first rows.
print(">>>这个数据集的大小：", df.shape, sep="\n")
print(">>>这个数据集的样本分布：", df['label'].value_counts(), sep="\n")
print(">>>看一下数据：", df.head(), sep="\n")

# The first walkthrough took the file list straight from the CSV; here the
# folders are listed as well so the two can be checked against each other.
print(">>>list一下训练图片文件夹里的图片：")
from glob import glob
train_file_paths = glob(TRAIN_PATH + '/*.tif')
test_file_paths = glob(TEST_PATH + '/*.tif')
print("train_file_paths size:", len(train_file_paths))
print("test_file_paths size:", len(test_file_paths))

import re
def check_valid():
    """Check that the training images on disk match the ids in the label CSV.

    Reads the module-level ``train_file_paths`` and ``df``. Prints a
    confirmation when both sides contain exactly the same ids; otherwise
    prints the ids found on only one side and terminates the script with a
    non-zero exit status.

    Raises:
        AssertionError: if the number of image files differs from the number
            of rows in the CSV.
        SystemExit: if the counts agree but the id sets differ.
    """
    if len(train_file_paths) != len(df['id']):
        # Raised explicitly (not via ``assert``) so the check is not stripped
        # when Python runs with -O.
        raise AssertionError('图片数量不一致')

    # Each id is the 40-character lowercase alphanumeric run in the file path.
    ids_from_filepath = {
        ''.join(re.findall(r'[a-z0-9]{40}', filepath))
        for filepath in train_file_paths
    }
    # Symmetric difference: ids present on one side only. Empty means equal.
    dif = list(ids_from_filepath ^ set(df['id']))
    if not dif:
        print("文件名匹配正常")
        return

    print("匹配异常,下列文件名有差异：")
    print(dif)
    # sys.exit is always available (the bare ``exit`` helper comes from the
    # ``site`` module) and a non-zero status reports the failure to the caller.
    sys.exit(1)
check_valid()

# Optional visual check (disabled): plot positive and negative samples.
# The labels of this dataset are based on the centre 32x32 pixels of each
# image, so the plot marks that region; the classifier does not have to crop
# to it.
# print(">>>数据没问题的话接下来看一下正负数据样例的图片：")
# iu.plotSamples(df,TRAIN_PATH)

# Split the labelled image paths 9:1 into a training part and a held-out part.
train, val = train_test_split(train_file_paths, test_size=0.1, shuffle=True)

# Uncomment to experiment on a small subset:
# train = train[:640]
# val = val[:64]

"""
====================================================================================
<<2.图片处理和扩增>>
====================================================================================

图片处理主要是要匹配CNN的输入大小，扩增是为了降低过拟合风险
无论是图片处理还是扩增都有很多方法，比较常用的是用imgaug或者tf.image进行数据扩增，其实OpenCV也什么都能干
imgaug堪称python里最强图片扩增工具，方法多，叠加方便，一个图像数据扩增100倍轻轻松松:
https://github.com/aleju/imgaug

使用tensorflow自带的tf.image进行augmentation，特点是能结合tf.dataset无缝使用：
http://androidkt.com/tensorflow-image-augmentation-using-tf-image/

这边我们使用imgaug进行处理，最后使用keras generator准备数据
代码参考kaggle的另一篇kernel：
https://www.kaggle.com/CVxTz/cnn-starter-nasnet-mobile-0-9709-lb
"""

from imgaug import augmenters as iaa
import imgaug as ia
from random import shuffle
from keras.applications.nasnet import preprocess_input

# Lookup table from image id to label, built from the `id` and `label`
# columns of the label CSV.
id_label_map = dict(zip(df.id.values, df.label.values))

def get_id_from_file_path(file_path):
    """Return the image id encoded in the name of a ``.tif`` file.

    The id is the file name with its directory part and a trailing ``.tif``
    extension removed.

    Args:
        file_path: Path to an image file.

    Returns:
        The bare file name without a trailing ``.tif``. Names that do not end
        in ``.tif`` are returned unchanged.
    """
    # basename() honours every separator that is valid on the current
    # platform (both '/' and '\\' on Windows), which splitting on
    # os.path.sep alone does not.
    file_name = os.path.basename(file_path)
    # Strip the extension only at the end of the name; str.replace would also
    # delete a '.tif' occurring in the middle of it.
    suffix = '.tif'
    if file_name.endswith(suffix):
        file_name = file_name[:-len(suffix)]
    return file_name

def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: A sliceable sequence with a length.
        size: Maximum number of items per slice.

    Returns:
        An iterator over the slices, in order; the final slice holds whatever
        is left and may be shorter than ``size``.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def get_seq():
    """Assemble the imgaug pipeline used to augment image batches.

    The pipeline has four top-level stages that are run in random order: a
    horizontal flip, a vertical flip, an occasional mild affine warp, and a
    group from which between zero and five weaker augmenters are picked (also
    in random order).

    Returns:
        The ``iaa.Sequential`` augmenter.
    """
    def sometimes(aug):
        # Wrap an augmenter so that it is applied with probability 0.5.
        return iaa.Sometimes(0.5, aug)

    # --- stages that are always part of the pipeline ---
    flip_horizontal = iaa.Fliplr(0.5)  # mirror 50% of the images left-right
    flip_vertical = iaa.Flipud(0.2)  # mirror 20% of the images top-bottom
    mild_affine = sometimes(iaa.Affine(
        scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},  # 90-110% of the size, per axis
        translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},  # shift by -10% to +10%, per axis
        rotate=(-10, 10),  # rotate by -10 to +10 degrees
        shear=(-5, 5),  # shear by -5 to +5 degrees
        order=[0, 1],  # nearest-neighbour or bilinear interpolation
        cval=(0, 255),  # fill value between 0 and 255, used when mode is constant
        mode=ia.ALL,  # any of the available warping modes
    ))

    # --- weaker augmenters; only some of these are executed per image ---
    superpixels = sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200)))
    blur = iaa.OneOf([
        iaa.GaussianBlur((0, 1.0)),  # sigma between 0 and 1.0
        iaa.AverageBlur(k=(3, 5)),  # local means, kernel size between 3 and 5
        iaa.MedianBlur(k=(3, 5)),  # local medians, kernel size between 3 and 5
    ])
    sharpen = iaa.Sharpen(alpha=(0, 1.0), lightness=(0.9, 1.1))
    emboss = iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0))
    # Detect either all edges or directed edges, then blend the result with
    # the original image through a simplex-noise mask.
    edge_overlay = iaa.SimplexNoiseAlpha(iaa.OneOf([
        iaa.EdgeDetect(alpha=(0.5, 1.0)),
        iaa.DirectedEdgeDetect(alpha=(0.5, 1.0), direction=(0.0, 1.0)),
    ]))
    gaussian_noise = iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.01*255), per_channel=0.5)
    dropout = iaa.OneOf([
        iaa.Dropout((0.01, 0.05), per_channel=0.5),  # drop 1-5% of the pixels
        iaa.CoarseDropout((0.01, 0.03), size_percent=(0.01, 0.02), per_channel=0.2),
    ])
    invert = iaa.Invert(0.01, per_channel=True)  # invert colour channels
    add_offset = iaa.Add((-2, 2), per_channel=0.5)  # add a value between -2 and 2
    hue_saturation = iaa.AddToHueAndSaturation((-1, 1))
    # Either scale the brightness of the whole image (sometimes per channel)
    # or change it in sub-areas selected by a frequency-noise mask.
    brightness = iaa.OneOf([
        iaa.Multiply((0.9, 1.1), per_channel=0.5),
        iaa.FrequencyNoiseAlpha(
            exponent=(-1, 0),
            first=iaa.Multiply((0.9, 1.1), per_channel=True),
            second=iaa.ContrastNormalization((0.9, 1.1)),
        ),
    ])
    elastic = sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25))  # move pixels locally
    piecewise = sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05)))  # move parts of the image
    perspective = sometimes(iaa.PerspectiveTransform(scale=(0.01, 0.1)))

    # Run 0 to 5 of the weaker augmenters per image; running all of them
    # would often be far too strong.
    weaker_group = iaa.SomeOf(
        (0, 5),
        [
            superpixels,
            blur,
            sharpen,
            emboss,
            edge_overlay,
            gaussian_noise,
            dropout,
            invert,
            add_offset,
            hue_saturation,
            brightness,
            elastic,
            piecewise,
            perspective,
        ],
        random_order=True,
    )

    return iaa.Sequential(
        [flip_horizontal, flip_vertical, mild_affine, weaker_group],
        random_order=True,
    )

def data_gen(list_files, id_label_map, batch_size, augment=False):
    """Yield ``(images, labels)`` batches forever, e.g. for ``fit_generator``.

    Every pass over the data visits the files in a fresh random order. Images
    are read with OpenCV, optionally augmented with the ``get_seq()``
    pipeline, and finally passed through ``preprocess_input``.

    Args:
        list_files: Image file paths. The list itself is left untouched.
        id_label_map: Mapping from image id (as returned by
            ``get_id_from_file_path``) to label.
        batch_size: Maximum number of images per batch; the last batch of a
            pass may be smaller.
        augment: When true, augment each batch before preprocessing.

    Yields:
        A tuple ``(X, Y)`` of NumPy arrays holding the preprocessed images and
        their labels.

    Raises:
        IOError: if an image file cannot be read.
        KeyError: if an image id is missing from ``id_label_map``.
    """
    # Imported here and called through the module: the file binds the bare
    # name `shuffle` twice (sklearn.utils, then random), and only the
    # in-place random.shuffle has the semantics this function needs.
    import random

    # Work on a private copy so shuffling never reorders the caller's list.
    files = list(list_files)
    # The augmentation pipeline is only built when it will actually be used.
    seq = get_seq() if augment else None
    while True:
        random.shuffle(files)
        for batch in chunker(files, batch_size):
            X = []
            for path in batch:
                image = cv.imread(path)
                if image is None:
                    # cv.imread reports failure by returning None instead of
                    # raising; fail here with the offending path.
                    raise IOError("could not read image: {}".format(path))
                X.append(image)
            Y = [id_label_map[get_id_from_file_path(path)] for path in batch]
            if augment:
                X = seq.augment_images(X)
            X = [preprocess_input(x) for x in X]

            yield np.array(X), np.array(Y)

# 测试一下这个generator方法

# for x in range(2):
#     batch_images = next(data_gen(train_file_paths,id_label_map,BATCH_SIZE,augment=False))
#     print(batch_images[0].shape)
#     print(batch_images[1].shape)
#     print('>>>>>>>>>>>>>>>>>>>>>>>>')


"""
====================================================================================
<<3.建模>>
====================================================================================

使用Keras和比较新的NASNet来建立模型，方法和walkthrough里的一模一样

"""
from keras.layers import concatenate, Activation, GlobalAveragePooling2D, Flatten
from keras.layers import Dense, Input, Dropout, MaxPooling2D, Concatenate, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.models import Model
from keras.applications.nasnet import NASNetMobile
from keras.optimizers import Adam
from keras.losses import mae, sparse_categorical_crossentropy, binary_crossentropy

# model.summary()

def get_model():
    """Build, compile and summarise the NASNetMobile-based binary classifier.

    A NASNetMobile without its top layers processes 96x96x3 images. Its
    output is reduced in three ways (global max pooling, global average
    pooling and flattening), the results are concatenated, passed through
    dropout and fed to a single sigmoid unit. The model is compiled with Adam
    (learning rate 1e-4) and binary cross-entropy, tracking accuracy, and its
    summary is printed.

    Returns:
        The compiled Keras ``Model``.
    """
    image_input = Input((96, 96, 3))
    backbone = NASNetMobile(include_top=False, input_tensor=image_input)  # , weights=None
    features = backbone(image_input)

    # Three reductions of the same feature map, joined along the last axis.
    reductions = [
        GlobalMaxPooling2D()(features),
        GlobalAveragePooling2D()(features),
        Flatten()(features),
    ]
    head = Concatenate(axis=-1)(reductions)
    head = Dropout(0.5)(head)
    prediction = Dense(1, activation="sigmoid", name="3_")(head)

    classifier = Model(image_input, prediction)
    classifier.compile(
        optimizer=Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False),
        loss=binary_crossentropy,
        metrics=['acc'],
    )
    classifier.summary()
    return classifier


model = get_model()

"""
====================================================================================
<<3.训练>>
====================================================================================

这边使用Keras generator的训练方法来训练网络

"""
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau,Callback,CSVLogger,TensorBoard
import time

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Which stages of the script to run.
if_train = True
if_inference = True

# Output artefacts.
h5_path = "model3_b32_e2_b64_e6.h5"  # model checkpoint (weights)
csv_logger_path = 'logger5.csv'  # per-epoch metrics log
tb_log_dir = './logs5'  # TensorBoard log directory
submission_filename = "submission_5.csv"  # predictions written by inference

# Two-stage training schedule: batch size and number of epochs per stage.
batch_size1 = 32
epoch1 = 2
batch_size2 = 64
epoch2 = 6
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


def train_model(train,val,h5_path,csv_logger_path,tb_log_dir,batch_size1,epoch1,batch_size2,epoch2,if_aug=True,if_load_weights=False,test_run=False):
    """Train the module-level ``model`` in two consecutive stages.

    Both stages use the same data and callbacks and differ only in batch size
    and number of epochs. The checkpoint callback keeps the weights with the
    highest ``val_acc`` in ``h5_path``.

    Args:
        train: File paths of the training images.
        val: File paths of the validation images.
        h5_path: Checkpoint file; written during training and, when
            ``if_load_weights`` is true, loaded before training starts.
        csv_logger_path: CSV file the per-epoch metrics are appended to.
        tb_log_dir: TensorBoard log directory; created if missing.
        batch_size1: Batch size of the first stage.
        epoch1: Number of epochs of the first stage.
        batch_size2: Batch size of the second stage.
        epoch2: Number of epochs of the second stage.
        if_aug: Whether the training batches are augmented.
        if_load_weights: Whether to start from the weights in ``h5_path``.
        test_run: When true, train and validate on at most the first 256
            files of each split.

    Returns:
        A tuple ``(history1, history2)`` with the objects returned by the two
        ``fit_generator`` calls, e.g. for plotting accuracy and loss curves.
    """
    class TimeHistory(Callback):
        """Collect the wall-clock duration of every epoch in ``self.times``."""

        # ``logs=None`` rather than ``logs={}``: a mutable default would be
        # shared between all calls.
        def on_train_begin(self, logs=None):
            self.times = []

        def on_epoch_begin(self, epoch, logs=None):
            self.epoch_time_start = time.time()

        def on_epoch_end(self, epoch, logs=None):
            self.times.append(time.time() - self.epoch_time_start)

    time_callback1 = TimeHistory()
    time_callback2 = TimeHistory()

    csv_callback = CSVLogger(csv_logger_path, separator=',', append=True)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(tb_log_dir, exist_ok=True)

    # NOTE(review): this is the tf.keras TensorBoard callback, whereas the
    # other callbacks (and the TensorBoard class the file imports but never
    # uses) come from the standalone keras package - confirm that the two
    # are compatible in the installed versions.
    tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tb_log_dir,
                                                 update_freq='batch',
                                                 histogram_freq=0,
                                                 write_graph=True,
                                                 write_images=True,
                                                 embeddings_freq=0,
                                                 embeddings_layer_names=None,
                                                 embeddings_metadata=None)
    checkpoint = ModelCheckpoint(h5_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

    if if_load_weights:
        model.load_weights(h5_path)

    train_filepaths = train
    val_filepaths = val

    if test_run:
        # Slice the splits that were passed in. The previous code sliced the
        # module-level list of *all* training files, which ignored `train`
        # and could put validation images into the training subset.
        train_filepaths = train_filepaths[:256]
        val_filepaths = val_filepaths[:256]

    def fit_stage(batch_size, epochs, time_callback):
        # One training stage; fresh generators are created for every stage.
        return model.fit_generator(
            data_gen(train_filepaths, id_label_map, batch_size, augment=if_aug),
            validation_data=data_gen(val_filepaths, id_label_map, batch_size),
            epochs=epochs, verbose=1,
            callbacks=[checkpoint, time_callback, csv_callback, tb_callback],
            steps_per_epoch=math.ceil(len(train_filepaths)/batch_size),
            validation_steps=math.ceil(len(val_filepaths)/batch_size),
            workers=1,
            use_multiprocessing=False)

    history1 = fit_stage(batch_size1, epoch1, time_callback1)
    history2 = fit_stage(batch_size2, epoch2, time_callback2)

    print("time_callback1:")
    print(time_callback1.times)
    print("time_callback2:")
    print(time_callback2.times)

    return history1, history2


if if_train:
    train_model(train,val,h5_path,csv_logger_path,tb_log_dir,batch_size1,epoch1,batch_size2,epoch2)

"""
====================================================================================
<<4.预测>>
====================================================================================

这边使用Keras predict_generator的方法来进行预测

"""


# Load the checkpoint stored at h5_path into the module-level model.
# NOTE(review): this statement is not guarded by if_inference (or if_train),
# so it runs on every execution of the script - confirm that is intended.
model.load_weights(h5_path)

def data_gen_test(list_files, batch_size):
    """Yield batches of preprocessed images, in file order, forever.

    Args:
        list_files: Image file paths.
        batch_size: Maximum number of images per batch; the last batch of a
            pass over the files may be smaller.

    Yields:
        A NumPy array with the images of one batch, each read with OpenCV and
        passed through ``preprocess_input``.
    """
    while True:
        # No shuffling here: inference() pairs predictions with file ids by
        # position, so the batches have to follow the order of list_files.
        for paths in chunker(list_files, batch_size):
            images = (cv.imread(path) for path in paths)
            yield np.array([preprocess_input(image) for image in images])

def inference(test_file_paths,h5_path,submission_filename,test_run=False):
    """Predict labels for the test images and write a submission CSV.

    Uses the module-level ``model`` with the weights stored in ``h5_path``.
    The CSV has the columns ``id`` (from the file names) and ``label`` (the
    first output value of each prediction, formatted with 8 decimals).

    Args:
        test_file_paths: File paths of the images to predict.
        h5_path: Checkpoint file whose weights are used for the predictions.
        submission_filename: Path of the CSV file to write.
        test_run: When true, only the first 17 files are predicted.
    """
    test_batch_size = 128

    if test_run:
        test_file_paths = test_file_paths[:17]

    # Honour the h5_path argument (it used to be ignored) so the predictions
    # always come from the checkpoint the caller asked for.
    model.load_weights(h5_path)

    test_generator = data_gen_test(test_file_paths,test_batch_size)
    predicts = model.predict_generator(test_generator,
                                       steps=math.ceil(len(test_file_paths)/test_batch_size),
                                       max_queue_size=10,
                                       workers=1,
                                       use_multiprocessing=False,
                                       verbose=1)

    # data_gen_test keeps the file order, so ids and predictions line up.
    fileids = [get_id_from_file_path(path) for path in test_file_paths]
    pres = ["{:.8f}".format(x[0]) for x in predicts]

    print("len fileids:")
    print(len(fileids))
    print("pres:")
    print(len(pres))

    # Named `submission` so it does not shadow the module-level label frame.
    submission = pd.DataFrame({'id': fileids, 'label': pres})
    submission.to_csv(submission_filename, index=False)
    print(submission.head())


if if_inference:
    inference(test_file_paths,h5_path,submission_filename)
《Kaggle Histopathologic Cancer Detection癌症图像分类比赛》之Keras/Generator实现

猜你喜欢