Image data cleaning

foreword

Data is critical to the effectiveness of deep learning models. A large amount of collected data usually needs to be cleaned before it is labeled, and cleaning it entirely by hand is very slow. Building tools that first clean the data automatically in batches, followed by a manual review pass, can therefore greatly improve efficiency.

tool function

According to the collected requirements, the tool mainly realizes the following functions:

Collect dataset statistics (total occupied space, number of images, number of corrupted images);
Remove corrupted images;
Remove blurred images;
Remove similar images;
Classify motor vehicles by color;
Classify images as day or night.

statistics information

# Get the dataset's storage size, image count, and number of corrupted images
def get_data_info(dir_path):
    """Collect basic statistics about the images under ``dir_path``.

    The directory is walked recursively and only files accepted by
    ``is_image(file_name)`` are considered.

    Args:
        dir_path: Root directory of the dataset.

    Returns:
        A ``(size_mb, number, bad_number)`` tuple: the total size of the image
        files in bytes divided by 1024 * 1024, the number of image files, and
        the number of image files that Pillow could not open or fully decode
        (``OSError`` from ``Image.open`` / ``load``).
    """
    size = 0
    number = 0
    bad_number = 0
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            if not is_image(file_name):
                continue
            file_path = os.path.join(root, file_name)
            size += os.path.getsize(file_path)
            number += 1
            try:
                # The context manager closes the file handle even when
                # decoding fails part-way through.
                with Image.open(file_path) as img:
                    img.load()
            except OSError:
                bad_number += 1
    return size / 1024 / 1024, number, bad_number

Remove corrupted images

# Remove corrupted images
def filter_bad(dir_path):
    """Move images that cannot be decoded out of ``dir_path``.

    Every file accepted by ``is_image`` under ``dir_path`` (recursively) is
    opened and fully loaded with Pillow. Files that raise ``OSError`` are moved
    to a ``filter_bad`` directory created next to ``dir_path``.

    Args:
        dir_path: Root directory of the dataset.

    Returns:
        The number of files that were moved.
    """
    # abspath drops a trailing separator and resolves ".", so the quarantine
    # directory is always a sibling of dir_path and never inside the tree
    # that is being walked.
    parent_dir = os.path.dirname(os.path.abspath(dir_path))
    filter_dir = os.path.join(parent_dir, 'filter_bad')
    os.makedirs(filter_dir, exist_ok=True)
    filter_number = 0
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            if not is_image(file_name):
                continue
            file_path = os.path.join(root, file_name)
            try:
                # Leaving the with-block closes the file handle before the
                # except clause runs; an open handle blocks the move on Windows.
                with Image.open(file_path) as img:
                    img.load()
            except OSError:
                shutil.move(file_path, filter_dir)
                filter_number += 1
    return filter_number

Remove blurry pictures

First, the sharpness of the picture has to be measured. The variance of the output of the Laplacian operator provided by OpenCV is used as the sharpness value: the smaller the value, the lower the sharpness and the more blurred the picture (100 is commonly used as the cut-off value).

# Remove blurred images
def filter_blurred(dir_path, threshold=100):
    """Move blurred images out of ``dir_path``.

    Sharpness is measured as the variance of the Laplacian of the decoded
    image; images whose value is below ``threshold`` are moved to a
    ``filter_blurred`` directory created next to ``dir_path``.

    Args:
        dir_path: Root directory of the dataset (walked recursively).
        threshold: Sharpness cut-off; images scoring strictly below it are
            treated as blurred. Defaults to 100.

    Returns:
        The number of files that were moved.
    """
    # abspath drops a trailing separator and resolves ".", so the output
    # directory is never created inside the tree that is being walked.
    parent_dir = os.path.dirname(os.path.abspath(dir_path))
    filter_dir = os.path.join(parent_dir, 'filter_blurred')
    os.makedirs(filter_dir, exist_ok=True)
    filter_number = 0
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            if not is_image(file_name):
                continue
            file_path = os.path.join(root, file_name)
            # The file is read with NumPy and decoded from memory instead of
            # using cv2.imread (presumably so that non-ASCII paths work).
            data = np.fromfile(file_path, dtype=np.uint8)
            # imdecode rejects an empty buffer and returns None for data it
            # cannot decode; both cases are treated as "unreadable".
            img = cv2.imdecode(data, cv2.IMREAD_UNCHANGED) if data.size else None
            if img is None:
                # Unreadable files are not a blur decision; leave them in
                # place for filter_bad to handle.
                continue
            image_var = cv2.Laplacian(img, cv2.CV_64F).var()
            if image_var < threshold:
                shutil.move(file_path, filter_dir)
                filter_number += 1
    return filter_number

There are many methods of image blur detection, you can refer to: https://www.cnblogs.com/greentomlee/p/9379471.html

remove similar images

For some picture data obtained by sampling video frames, the similarity of consecutive pictures will be very high, and the picture data with high similarity needs to be eliminated.
First of all, we need to calculate the similarity between two pictures. There are usually the following methods for calculating the similarity:

Comparing the histograms of the pictures;
Comparing hash values of the pictures by Hamming distance;
Computing the cosine distance between the pictures;
Comparing structural similarity metrics of the pictures.

The results of the four methods may vary.
Reference: https://blog.csdn.net/weixin_35132022/article/details/112514520
The code below uses Python and OpenCV to compute the similarity of two pictures from their histograms. To remove similar pictures, the tool iterates over the pictures and compares each one with the four pictures that follow it (the number of following pictures to compare can be adjusted as needed); if the similarity exceeds the threshold, the later picture is removed.

# Compute the similarity of two images
def calc_similarity(img1_path, img2_path, threshold=0.98):
    """Tell whether two image files are near-duplicates.

    A 256-bin histogram of one channel of each image is min-max normalized
    and the two histograms are compared with the correlation method.

    Args:
        img1_path: Path of the first image.
        img2_path: Path of the second image.
        threshold: Correlation above which the images count as similar.
            Defaults to 0.98; adjust as needed.

    Returns:
        True if the correlation is strictly greater than ``threshold``.
        False otherwise, including when either file cannot be decoded.
    """
    hist1 = _channel_histogram(img1_path)
    hist2 = _channel_histogram(img2_path)
    if hist1 is None or hist2 is None:
        # An unreadable file cannot be judged similar to anything.
        return False
    similarity1 = cv2.compareHist(hist1, hist2, cv2.HISTCMP_CORREL)
    print('similarity:', similarity1)
    return similarity1 > threshold


def _channel_histogram(img_path):
    """Return the normalized 256-bin histogram of one channel, or None if unreadable."""
    data = np.fromfile(img_path, dtype=np.uint8)
    # imdecode rejects an empty buffer and returns None for undecodable data.
    img = cv2.imdecode(data, cv2.IMREAD_UNCHANGED) if data.size else None
    if img is None:
        return None
    # Channel 1 is used for multi-channel images (green in OpenCV's BGR
    # order); single-channel images only have channel 0.
    channel = 1 if img.ndim == 3 and img.shape[2] > 1 else 0
    hist = cv2.calcHist([img], [channel], None, [256], [0, 256])
    return cv2.normalize(hist, hist, 0, 1, cv2.NORM_MINMAX, -1)

# Remove highly similar images
def filter_similar(dir_path, window=4):
    """Move near-duplicate images out of ``dir_path``.

    Within each directory, every image that has not already been marked is
    compared (via ``calc_similarity``) with the ``window`` images that follow
    it; followers judged similar are marked and finally moved to a
    ``filter_similar`` directory created next to ``dir_path``.

    Args:
        dir_path: Root directory of the dataset (walked recursively).
        window: How many following images each image is compared with.
            Defaults to 4.

    Returns:
        The number of files that were moved.
    """
    # abspath drops a trailing separator and resolves ".", so the output
    # directory is never created inside the tree that is being walked.
    parent_dir = os.path.dirname(os.path.abspath(dir_path))
    filter_dir = os.path.join(parent_dir, 'filter_similar')
    os.makedirs(filter_dir, exist_ok=True)
    filter_number = 0
    for root, _dirs, files in os.walk(dir_path):
        # os.walk yields names in arbitrary order; sort them so that
        # neighbours in the list are neighbours by file name (for sampled
        # video frames this presumably matches frame order).
        img_files = sorted(file_name for file_name in files if is_image(file_name))
        # A set keeps each file at most once, so nothing is counted or moved
        # twice, and membership tests are O(1).
        filtered = set()
        # Every image may act as the base; slicing simply yields fewer
        # followers near the end of the list.
        for index, base_name in enumerate(img_files):
            if base_name in filtered:
                continue
            img1_path = os.path.join(root, base_name)
            for other_name in img_files[index + 1:index + 1 + window]:
                if other_name in filtered:
                    continue
                if calc_similarity(img1_path, os.path.join(root, other_name)):
                    filtered.add(other_name)
        for item in img_files:
            if item in filtered:
                shutil.move(os.path.join(root, item), filter_dir)
                filter_number += 1
    return filter_number

Motor Vehicle Color Classification

Method 1: traditional algorithm (unsatisfactory result)

Use opencv library functions for processing.

1. Convert the image to the HSV color space;
2. Use the cv2.inRange() function to keep only the pixels that fall inside each color range;
3. Binarize the filtered result;
4. Apply morphological erosion and dilation (cv2.erode(), cv2.dilate());
5. Compute the area of the white regions.
Reference: https://www.jb51.net/article/172797.htm

# Define the HSV color dictionary
def get_color_list():
    """Build the table of HSV ranges used for color recognition.

    Returns:
        A ``collections.defaultdict(list)`` mapping each color name to a
        two-element list ``[lower, upper]`` of NumPy arrays holding the lower
        and upper (H, S, V) bounds of that color.
    """
    hsv_bounds = (
        ('black', [0, 0, 0], [180, 255, 46]),
        # The gray range ([0, 0, 46] .. [180, 43, 220]) is disabled.
        ('white', [0, 0, 221], [180, 30, 255]),
        # Red is described by two ranges, one at each end of the hue axis.
        ('red', [156, 43, 46], [180, 255, 255]),
        ('red2', [0, 43, 46], [10, 255, 255]),
        ('orange', [11, 43, 46], [25, 255, 255]),
        ('yellow', [26, 43, 46], [34, 255, 255]),
        ('green', [35, 43, 46], [77, 255, 255]),
        ('cyan', [78, 43, 46], [99, 255, 255]),
        ('blue', [100, 43, 46], [124, 255, 255]),
        ('purple', [125, 43, 46], [155, 255, 255]),
    )
    color_ranges = collections.defaultdict(list)
    for name, lower, upper in hsv_bounds:
        color_ranges[name] = [np.array(lower), np.array(upper)]
    return color_ranges

# Color recognition
def get_color(image, debug_dir=None):
    """Guess the dominant color of an image file from HSV range masks.

    For every color range returned by ``get_color_list`` a mask is built with
    ``cv2.inRange``, cleaned with two erosions followed by two dilations, and
    the total area of the external contours of the result is measured. The
    color with the largest area wins. The two red ranges (``'red'`` and
    ``'red2'``) are merged into a single ``'red'`` mask first, so red is not
    split into two competing candidates and ``'red2'`` is never returned.

    Args:
        image: Path of the image file.
        debug_dir: Optional directory into which the raw mask of each color
            is written as ``<color><extension of image>``. Nothing is written
            when it is None (the default); pass ``'.'`` to write into the
            current working directory.

    Returns:
        The name of the color with the largest masked area, or None if the
        file cannot be decoded.
    """
    print('go in get_color')
    data = np.fromfile(image, dtype=np.uint8)
    # IMREAD_COLOR always yields a 3-channel 8-bit BGR image, which is what
    # COLOR_BGR2HSV needs; imdecode rejects an empty buffer and returns None
    # for undecodable data.
    img_array = cv2.imdecode(data, cv2.IMREAD_COLOR) if data.size else None
    if img_array is None:
        return None
    kernel_4 = np.ones((4, 4), np.uint8)  # 4x4 structuring element
    hsv = cv2.cvtColor(img_array, cv2.COLOR_BGR2HSV)
    color_dict = get_color_list()
    print(color_dict)

    # In each mask, pixels inside the color range are white (255) and all
    # other pixels are black (0).
    masks = {}
    for key in color_dict:
        mask = cv2.inRange(hsv, color_dict[key][0], color_dict[key][1])
        name = 'red' if key == 'red2' else key
        if name in masks:
            mask = cv2.bitwise_or(masks[name], mask)
        masks[name] = mask

    maxsum = -100
    color = None
    for name, mask in masks.items():
        if debug_dir is not None:
            mask_name = name + os.path.splitext(image)[-1]
            cv2.imwrite(os.path.join(debug_dir, mask_name), mask)
        erosion = cv2.erode(mask, kernel_4, iterations=1)
        erosion = cv2.erode(erosion, kernel_4, iterations=1)
        dilation = cv2.dilate(erosion, kernel_4, iterations=1)
        dilation = cv2.dilate(dilation, kernel_4, iterations=1)
        binary = cv2.threshold(dilation, 127, 255, cv2.THRESH_BINARY)[1]
        # findContours returns (image, contours, hierarchy) in OpenCV 3 and
        # (contours, hierarchy) in OpenCV 4; [-2] is the contour list in both.
        cnts = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        area = sum(cv2.contourArea(c) for c in cnts)
        if area > maxsum:
            maxsum = area
            color = name
    return color

Method 2: Deep Learning Model

Using the trained color classification model for ROI images of motor vehicles, the effect is much better.

# Classify motor-vehicle ROI images by color
def classify_vehcolor(dir_path):
    """Sort vehicle ROI images into one folder per color.

    A ``color_results`` directory is created next to ``dir_path`` with one
    sub-directory per value of ``dict_color``. ``classify_color`` is called on
    every sub-directory found while walking ``dir_path`` and on every walked
    directory that directly contains image files; each item of the mapping it
    returns is used as ``source path -> color folder name`` and the source is
    moved into that folder.

    Args:
        dir_path: Root directory containing the ROI images.

    Returns:
        The number of files that were moved successfully.
    """
    # abspath drops a trailing separator and resolves ".", so the result
    # directory is never created inside the tree that is being walked.
    parent_dir = os.path.dirname(os.path.abspath(dir_path))
    result_dir = os.path.join(parent_dir, 'color_results')
    os.makedirs(result_dir, exist_ok=True)
    for color in dict_color.values():
        os.makedirs(os.path.join(result_dir, color), exist_ok=True)

    def move_results(result_dic):
        """Move every classified file into its color folder; return the number moved."""
        moved = 0
        for src_path, color_name in result_dic.items():
            dst_path = os.path.join(result_dir, color_name)
            try:
                shutil.move(src_path, dst_path)
            except Exception as exc:
                # Stay best-effort (the contract of classify_color is not
                # visible here), but report the failure instead of hiding it.
                print('failed to move', src_path, 'to', dst_path, ':', exc)
            else:
                moved += 1
        return moved

    classify_number = 0
    for root, dirs, files in os.walk(dir_path):
        # NOTE(review): every sub-directory is classified here and is visited
        # again as `root` later in the walk; the original call order is kept
        # because it is unclear whether classify_color recurses - confirm.
        for sub_dir in dirs:
            classify_number += move_results(classify_color(os.path.join(root, sub_dir)))
        if any(is_image(file_name) for file_name in files):
            classify_number += move_results(classify_color(root))
    return classify_number

Day/night classification

That is, classify whether the scene in the picture was captured during the day or at night. Here the average brightness of the picture is used for a rough classification. In practice the accuracy is not high, but for now this method is used for a preliminary cleaning pass; a better algorithm can be substituted later.

# Classify images as day or night based on their average brightness
def classify_day_or_night(dir_path, threshold=95):
    """Sort images into day and night folders by mean gray level.

    A ``day_night_results`` directory is created next to ``dir_path`` with the
    sub-directories ``白天`` (day) and ``黑夜`` (night). Each image is decoded,
    converted to grayscale, and moved to the day folder when its mean gray
    value is strictly greater than ``threshold``, otherwise to the night
    folder. Files that cannot be decoded are left in place.

    Args:
        dir_path: Root directory of the dataset (walked recursively).
        threshold: Mean gray level separating day from night. Defaults to 95.

    Returns:
        The number of files that were moved successfully.
    """
    # abspath drops a trailing separator and resolves ".", so the result
    # directory is never created inside the tree that is being walked.
    parent_dir = os.path.dirname(os.path.abspath(dir_path))
    result_dir = os.path.join(parent_dir, 'day_night_results')
    os.makedirs(result_dir, exist_ok=True)
    item_list = ['白天', '黑夜']  # folder names: "day", "night"
    for item in item_list:
        os.makedirs(os.path.join(result_dir, item), exist_ok=True)
    classify_number = 0
    for root, _dirs, files in os.walk(dir_path):
        for file_name in files:
            if not is_image(file_name):
                continue
            file_path = os.path.join(root, file_name)
            data = np.fromfile(file_path, dtype=np.uint8)
            # IMREAD_COLOR always yields a 3-channel 8-bit BGR image, so the
            # gray conversion also works for grayscale sources; imdecode
            # rejects an empty buffer and returns None for undecodable data.
            bgr_img = cv2.imdecode(data, cv2.IMREAD_COLOR) if data.size else None
            if bgr_img is None:
                continue
            img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2GRAY)
            brightness_value = img.mean()
            print('brightness_value', brightness_value)
            key = '白天' if brightness_value > threshold else '黑夜'
            dst_path = os.path.join(result_dir, key)
            try:
                shutil.move(file_path, dst_path)
            except OSError as exc:
                # shutil.Error is an OSError subclass; stay best-effort but
                # report the failure instead of hiding it.
                print('failed to move', file_path, 'to', dst_path, ':', exc)
            else:
                classify_number += 1
    return classify_number

Tool interface display

[Image: screenshot of the tool interface]