《Kaggle Histopathologic Cancer Detection比赛》之Tensorflow2.0/Keras Eager Execution实现

Kaggle项目地址：https://www.kaggle.com/c/histopathologic-cancer-detection/overview
本文记录了一个使用Tensorflow2.0/Keras Eager Execution的实现，数据预处理采用了Tensorflow标准的Dataset的方式：
其他实现方式参考:
Kaggle Histopathologic Cancer Detection之PyTorch实现
Kaggle Histopathologic Cancer Detection之Keras实现
Kaggle Histopathologic Cancer Detection之Keras/Generator实现
# -*- coding: utf-8 -*-
import tensorflow as tf
AUTOTUNE = tf.data.experimental.AUTOTUNE 


# tf.enable_eager_execution()

import numpy as np
import os,sys,csv
import cv2 as cv
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import myimageutil as iu



"""
====================================================================================
<<1.初步了解掌握数据的情况>>
====================================================================================

用pandas简单处理一下CSV并画出来看一下

这里我借用了kaggle的这篇kernel里的plot的代码，有兴趣的童鞋可以读一下，
https://www.kaggle.com/qitvision/a-complete-ml-pipeline-fast-ai

"""
# Dataset locations; the CSV and both image folders live under ROOT_PATH.
ROOT_PATH = 'D:/ai_data/histopathologic-cancer-detection'
CSV_PATH = ROOT_PATH + '/train_labels.csv'
TRAIN_PATH = ROOT_PATH + '/train'
TEST_PATH = ROOT_PATH + '/test'

# Show what the dataset root contains.
print(">>>看一下根目录下有哪些东西：")
print(os.listdir(ROOT_PATH))

# Load the label CSV into a pandas DataFrame (referred to as `df` below).
df = pd.read_csv(CSV_PATH)

# Quick look at the data: size, class balance, and the first rows.
print(">>>这个数据集的大小：")
print(df.shape)

print(">>>这个数据集的样本分布：")
print(df.label.value_counts())

print(">>>看一下数据：")
print(df.head())

# The file lists are taken from the folders rather than from the CSV, so the
# two can later be cross-checked against each other.
print(">>>list一下训练图片文件夹里的图片：")
from glob import glob
train_file_paths = glob('{}/*.tif'.format(TRAIN_PATH))
test_file_paths = glob('{}/*.tif'.format(TEST_PATH))
print("train_file_paths size:", len(train_file_paths))
print("test_file_paths size:", len(test_file_paths))

import re
def check_valid(file_paths=None, expected_ids=None):
    """Check that the image files on disk match the ids listed in the CSV.

    The id of a file is taken as the 40-character lowercase alphanumeric
    run(s) found in its path.

    Args:
        file_paths: Image file paths to check. Defaults to the module-level
            ``train_file_paths``.
        expected_ids: Ids the files must correspond to. Defaults to
            ``df['id']``.

    Returns:
        None when both id sets are identical.

    Raises:
        AssertionError: If the number of files differs from the number of ids.
        SystemExit: With status 1 when the id sets differ.
    """
    if file_paths is None:
        file_paths = train_file_paths
    if expected_ids is None:
        expected_ids = df['id']

    # Raised explicitly (instead of `assert`) so the check survives `python -O`.
    if len(file_paths) != len(expected_ids):
        raise AssertionError('图片数量不一致')

    id_pattern = re.compile(r'[a-z0-9]{40}')
    ids_from_filepath = {''.join(id_pattern.findall(filepath)) for filepath in file_paths}
    # Symmetric difference: ids present on only one side (disk or CSV).
    dif = list(ids_from_filepath ^ set(expected_ids))
    if not dif:
        print("文件名匹配正常")
        return

    print("匹配异常,下列文件名有差异：")
    print(dif)
    # Non-zero status marks the run as failed; the bare `exit()` helper reports
    # success (status 0) and is only available when the `site` module is loaded.
    sys.exit(1)
# Abort early if the files on disk and the CSV ids do not line up.
check_valid()

# Optional: plot positive/negative samples once the data checks out.
# print(">>>数据没问题的话接下来看一下正负数据样例的图片：")
# iu.plotSamples(df,TRAIN_PATH)  # labels are based on the central 32x32 pixels, which the plot highlights; classification does not have to crop that region

# Hold out 10% of the training files for validation (9:1 split, shuffled).
train, val = train_test_split(train_file_paths, test_size=0.1, shuffle=True)

# Lookup table from image id to its label, built from the CSV columns.
id_label_map = dict(zip(df.id.values, df.label.values))

def get_paths_labels(pathlist, label_map=None):
    """Pair every image path with the label registered for its id.

    The id is the 40-character lowercase alphanumeric run(s) found in the path.

    Args:
        pathlist: Iterable of image file paths.
        label_map: Optional mapping from image id to label. Defaults to the
            module-level ``id_label_map``.

    Returns:
        A tuple ``(paths, labels)`` of two equally long lists, in input order.

    Raises:
        KeyError: If the id extracted from a path is not in the mapping.
    """
    if label_map is None:
        label_map = id_label_map

    id_pattern = re.compile(r'[a-z0-9]{40}')
    paths = []
    labels = []
    for path in pathlist:
        image_id = ''.join(id_pattern.findall(path))
        labels.append(label_map[image_id])
        paths.append(path)
    return paths, labels

# Resolve (paths, labels) for the training split and then the validation split.
(train_paths, train_labels), (val_paths, val_labels) = map(get_paths_labels, (train, val))

# exit()

"""
====================================================================================
<<2.图片处理和扩增>>
====================================================================================

图片处理主要是要匹配CNN的输入大小，扩增是为了降低过拟合风险
无论是图片处理还是扩增都有太多方法了，比较常用的是imgaug或者tf.image进行数据扩增，其实openCV什么都能干
imgaug堪称python里最强图片扩增工具，方法多，叠加方便，一个图像数据扩增100倍轻轻松松:
https://github.com/aleju/imgaug

使用tensorflow自带的tf.image进行augmentation，特点是能结合tf.dataset无缝使用：
http://androidkt.com/tensorflow-image-augmentation-using-tf-image/

这边我们使用OpenCV读取图片，再用tf.image缩放并做归一化，最后生成tf.data.Dataset进行训练
"""

# Number of samples per batch; read by prepare_train_ds when batching the dataset.
BATCH_SIZE = 32



#我们还是使用之前的方法读取tif文件，tensorflow本身不支持读取tif，所以只能用py_func调用外部函数来读取
def image_aug_cv(filepath,label):
    """Load one image with OpenCV, resize it to 224x224 and normalise it.

    Called through ``tf.py_function``, so ``filepath`` arrives as an eager
    string tensor.

    Args:
        filepath: Eager string tensor holding the image path.
        label: Label for the image; returned unchanged.

    Returns:
        A tuple ``(image, label)`` where ``image`` is the resized image passed
        through ``aug_image``.

    Raises:
        IOError: If OpenCV cannot read or decode the file.
    """
    path = filepath.numpy().decode()
    # Flag 1 asks OpenCV for a 3-channel colour image.
    # NOTE(review): OpenCV returns channels in BGR order; confirm this is what
    # the model is meant to receive.
    image_decoded = cv.imread(path, 1)
    if image_decoded is None:
        # cv.imread reports failure by returning None instead of raising; fail
        # here with the path rather than with an obscure error in the resize.
        raise IOError("cv.imread could not read image: {}".format(path))

    image_resized = tf.image.resize(image_decoded, [224, 224])
    return aug_image(image_resized), label

def aug_image(image):
    """Return ``image`` divided by 255.0.

    Args:
        image: Any value supporting division by a float (number, array, tensor).

    Returns:
        The element-wise quotient ``image / 255.0``.
    """
    max_pixel_value = 255.0
    normalized = image / max_pixel_value
    return normalized

def prepare_train_ds(filepaths,labels):
    """Build a shuffled, batched ``tf.data.Dataset`` of (image, label) pairs.

    Args:
        filepaths: Sequence of image file paths.
        labels: Labels aligned with ``filepaths``.

    Returns:
        A dataset yielding batches of ``BATCH_SIZE`` decoded images with their
        labels. There is no ``repeat()``, so one iteration is one pass over
        the data.
    """
    paths_ds = tf.data.Dataset.from_tensor_slices(filepaths)
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)
    paths_labels_ds = tf.data.Dataset.zip((paths_ds, labels_ds))

    # Shuffle the (path, label) pairs before decoding, so the large shuffle
    # buffer holds only strings and labels, not image tensors.
    images_labels_ds = paths_labels_ds.shuffle(buffer_size=300000)

    # Decoding goes through tf.py_function because image_aug_cv reads the file
    # with OpenCV, which needs an eager Python call.
    images_labels_ds = images_labels_ds.map(
        lambda filename, label: tf.py_function(func=image_aug_cv,
                                               inp=[filename, label],
                                               Tout=[tf.float32, tf.float32]),
        num_parallel_calls=AUTOTUNE)
    images_labels_ds = images_labels_ds.batch(BATCH_SIZE)

    # prefetch() after batch() buffers whole batches. A fixed buffer of 200
    # batches of 224x224x3 float32 images can hold several GB, so the buffer
    # size is left to tf.data.
    images_labels_ds = images_labels_ds.prefetch(buffer_size=AUTOTUNE)

    return images_labels_ds


# Labels are converted to float32 column vectors of shape (N, 1) before being
# handed to the dataset builder.
train_label_column = np.asarray(train_labels, dtype='float32').reshape(-1, 1)
val_label_column = np.asarray(val_labels, dtype='float32').reshape(-1, 1)

train_ds = prepare_train_ds(train_paths, train_label_column)
val_ds = prepare_train_ds(val_paths, val_label_column)



"""
====================================================================================
<<3.建模>>
====================================================================================

使用keras和比较新的NASNet来建立模型,方法和walkthrough里的一模一样

"""
from tensorflow.keras.layers import concatenate, Activation, GlobalAveragePooling2D, Flatten
from tensorflow.keras.layers import Dense, Input, Dropout, MaxPooling2D, Concatenate, GlobalMaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.applications.nasnet import NASNetMobile
# from tensorflow.keras.optimizers import Adam

# Backbone: NASNetMobile without its classification head, for 224x224x3 input.
nasnet = NASNetMobile(include_top=False, input_shape=(224, 224, 3))

# Head: concatenate global-max, global-average and flattened backbone features,
# apply dropout, and finish with a single sigmoid unit for binary output.
x1 = GlobalMaxPooling2D()(nasnet.output)
x2 = GlobalAveragePooling2D()(nasnet.output)
x3 = Flatten()(nasnet.output)
out = Concatenate(axis=-1)([x1, x2, x3])
out = Dropout(0.5)(out)
predictions = Dense(1, activation="sigmoid",name = 'predictions')(out)
model = Model(inputs=nasnet.input, outputs=predictions)

# Train every layer, backbone included.
model.trainable = True
# To freeze everything except the head instead:
# for layer in model.layers[:-3]:
#   layer.trainable = False

# `learning_rate` replaces the deprecated `lr` alias. `epsilon=None` and
# `decay=0.0` are dropped because they only restated the defaults and newer
# Keras optimizers reject them.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
loss_func = tf.keras.losses.BinaryCrossentropy()

# model.summary()

# Running metrics updated by train_step / val_step.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.BinaryAccuracy(name='val_accuracy')

"""
====================================================================================
<<4.训练>>
====================================================================================

这边使用官方标准的tensorflow 2.0 Eager Execution的训练方法来训练网络

"""

# @tf.function
def train_step(images, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        images: Batch of input images fed to ``model``.
        labels: Ground-truth labels for the batch.
    """
    with tf.GradientTape() as tape:
        # training=True is required here: without it Dropout is a no-op and
        # BatchNormalization runs with its inference statistics.
        predictions = model(images, training=True)
        loss = loss_func(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)
  

# @tf.function
def val_step(images, labels):
    """Evaluate one validation batch and update the validation metrics.

    Args:
        images: Batch of input images fed to ``model``.
        labels: Ground-truth labels for the batch.
    """
    # training=False keeps Dropout and BatchNormalization in inference mode,
    # whatever the global Keras learning phase is.
    predictions = model(images, training=False)
    loss = loss_func(labels, predictions)
    val_loss(loss)
    val_accuracy(labels, predictions)
  

EPOCHS = 20

import datetime

template = 'Epoch {}, Loss: {}, Accuracy: {}, val Loss: {}, val Accuracy: {}'

for epoch in range(EPOCHS):
    # Reset the running metrics so each epoch reports its own values instead
    # of an average over every epoch so far. `reset_state` is the current
    # Keras name, `reset_states` the one used by older releases.
    for metric in (train_loss, train_accuracy, val_loss, val_accuracy):
        reset = getattr(metric, "reset_state", None) or metric.reset_states
        reset()

    for images, labels in train_ds:
        train_step(images, labels)

    # Distinct loop names keep the module-level `val_labels` list intact.
    for val_batch_images, val_batch_labels in val_ds:
        val_step(val_batch_images, val_batch_labels)

    print (template.format(epoch+1,
                           train_loss.result(),
                           train_accuracy.result()*100,
                           val_loss.result(),
                           val_accuracy.result()*100))
    print(datetime.datetime.now())
《Kaggle Histopathologic Cancer Detection比赛》之Tensorflow2.0/Keras Eager Execution实现

猜你喜欢