【darknet训练细节】一个隐藏的超参数：scale

整体源码如下，该源码是利用yolo训练检测网络时，对输入数据作前处理的源代码，该代码段中包含了从外部传入的超参数（jitter），也有内部写死的超参数（scale）。下面将对该代码做详细的解析。

/*
 * Build one training batch of n images for a detection network.
 *
 * For every sample the source image is loaded at its native size, its aspect
 * ratio is randomly jittered, it is resampled onto a w x h canvas pre-filled
 * with 0.5, colour-distorted, optionally flipped, and its labels are written
 * with the matching geometric correction.
 *
 * n                         number of samples to produce (rows of d.X and d.y)
 * paths, m                  passed to get_random_paths together with n
 *                           (presumably m is the length of paths -- TODO confirm)
 * w, h                      size of the canvas each sample is placed on
 * boxes                     label slots per image; each row of d.y has 5*boxes floats
 * classes                   forwarded to fill_truth_detection
 * jitter                    fraction of the original width/height used as the
 *                           half-range of the random aspect-ratio perturbation
 * hue, saturation, exposure forwarded to random_distort_image
 *
 * Returns the batch by value. The pixel buffers referenced from d.X.vals are
 * the buffers of the per-sample canvases and are not freed here.
 */
data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue, float saturation, float exposure)
{
    char **random_paths = get_random_paths(paths, n, m);
    int i;
    data d = {0};
    d.shallow = 0;

    /* X: n row pointers; each row is later pointed at a canvas of h*w*3 floats. */
    d.X.rows = n;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*3;

    /* y: n rows of 5*boxes floats. */
    d.y = make_matrix(n, 5*boxes);
    for(i = 0; i < n; ++i){
        /* w = h = 0 is passed, so the image is loaded without a target size. */
        image orig = load_image_color(random_paths[i], 0, 0);
        image sized = make_image(w, h, orig.c);
        /* Whatever the placed image does not cover keeps this 0.5 value. */
        fill_image(sized, .5);

        /* Half-ranges of the random perturbation applied to width and height. */
        float dw = jitter * orig.w;
        float dh = jitter * orig.h;

        /* Jittered aspect ratio (width / height). */
        float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
        /* The random scale below is disabled; scale is hard-coded to 1. */
        //float scale = rand_uniform(.25, 2);
        float scale = 1;

        float nw, nh;

        /* The longer side (after jitter) becomes scale * canvas side; the
         * other side follows from the jittered aspect ratio. */
        if(new_ar < 1){
            nh = scale * h;
            nw = nh * new_ar;
        } else {
            nw = scale * w;
            nh = nw / new_ar;
        }

        /* Random offset of the placed image inside the canvas. */
        float dx = rand_uniform(0, w - nw);
        float dy = rand_uniform(0, h - nh);

        /* place_image takes int size/offset parameters, so the float
         * arguments are converted to int at the call. */
        place_image(orig, nw, nh, dx, dy, sized);

        random_distort_image(sized, hue, saturation, exposure);

        /* Flip with probability 1/2 (flip_image presumably mirrors
         * horizontally -- TODO confirm). */
        int flip = rand()%2;
        if(flip) flip_image(sized);
        /* The batch row takes over the canvas buffer; sized is not freed. */
        d.X.vals[i] = sized.data;


        /* Labels receive the flip flag plus offset and scale normalised by
         * the canvas size (exact meaning is defined by fill_truth_detection). */
        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);

        free_image(orig);
    }
    /* Only the array of path pointers is released here. */
    free(random_paths);
    return d;
}

1.新建data数据，data是包含了图片数据和其对应的标签文件，下述代码初始化了一个data数据结构，并根据初始化大小分配了内存空间。

    /* Start from an all-zero batch descriptor. */
    data d = {0};
    d.shallow = 0;

    /* X: n row pointers (zeroed by calloc); cols records h*w*3 floats per row. */
    d.X.rows = n;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.X.cols = h*w*3;

    /* y: n rows, each with room for 5 floats per box slot. */
    d.y = make_matrix(n, 5*boxes);

上述代码段中用到的make_matrix函数用来分配指定行列数的矩阵内存空间。本例中用它为n张图片各分配一行标签空间，每行可存放boxes个box；因为每个box包含x、y、w、h、id五个值，所以列数要乘以5。

/*
 * Allocate a zero-filled rows x cols matrix of floats.
 *
 * The row-pointer array and every row are obtained with calloc; allocation
 * results are not checked. The matrix is returned by value.
 */
matrix make_matrix(int rows, int cols)
{
    matrix result;
    int r = 0;

    result.rows = rows;
    result.cols = cols;
    result.vals = calloc(result.rows, sizeof(float *));

    /* One separately allocated, zeroed row per index. */
    while (r < result.rows) {
        result.vals[r] = calloc(result.cols, sizeof(float));
        ++r;
    }
    return result;
}

2.导入图片数据，如下所示函数将图片读入，创建内存空间。

/* Load the source image; 0, 0 means no target size is requested. */
image orig = load_image_color(random_paths[i], 0, 0);
/* Allocate the w x h canvas with the same channel count as the source. */
image sized = make_image(w, h, orig.c);
/* Pre-fill the canvas with 0.5. */
fill_image(sized, .5);

上述代码中先用load_image_color将图片导入。从如下源码可以看出，load_image只有在传入的w和h都不为0、并且与图片的实际尺寸不一致时才会做resize；这里传入的w和h都是0，所以图片在导入时保持原始尺寸，并没有被resize。接着用make_image分配w×h大小的空间，再用fill_image把image中的每个值初始化为0.5。

/*
 * Load an image with three channels.
 * Thin wrapper that forwards to load_image with c fixed to 3.
 */
image load_image_color(char *filename, int w, int h)
{
    const int channels = 3;

    return load_image(filename, w, h, channels);
}
-------------------------------------------------------------
/*
 * Load an image from filename with c channels, optionally resizing it.
 *
 * The decoder is selected at compile time: load_image_cv when OPENCV is
 * defined, load_image_stb otherwise. A resize to w x h happens only when
 * both w and h are non-zero and the loaded size differs from the request;
 * in that case the originally loaded image is freed.
 */
image load_image(char *filename, int w, int h, int c)
{
#ifdef OPENCV
    image loaded = load_image_cv(filename, c);
#else
    image loaded = load_image_stb(filename, c);
#endif

    /* No target size requested: hand back the image as decoded. */
    if (!h || !w) return loaded;

    /* Already the requested size: nothing to do. */
    if (h == loaded.h && w == loaded.w) return loaded;

    image scaled = resize_image(loaded, w, h);
    free_image(loaded);
    return scaled;
}
------------------------------------------------------------
/*
 * Create a w x h image with c channels and a zero-initialised pixel buffer
 * of h*w*c floats. The allocation result is not checked.
 */
image make_image(int w, int h, int c)
{
    image img = make_empty_image(w, h, c);

    img.data = calloc(h*w*c, sizeof(float));
    return img;
}
------------------------------------------------------------
/*
 * Set every float in the image buffer (h*w*c elements) to s.
 * The image struct is passed by value; the write goes through m.data.
 */
void fill_image(image m, float s)
{
    const int total = m.h*m.w*m.c;
    int idx = 0;

    while (idx < total) {
        m.data[idx] = s;
        ++idx;
    }
}

3.设置超参数，设置部分超参数，对后续图片进行处理，部分超参数由外部cfg设置，部分超参数内部写死（比如scale参数）。

/* Half-ranges of the random perturbation applied to width and height. */
float dw = jitter * orig.w;
float dh = jitter * orig.h;

/* Jittered aspect ratio (width / height). */
float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
/* The random scale below is disabled; scale is hard-coded to 1. */
//float scale = rand_uniform(.25, 2);
float scale = 1;

float nw, nh;

/* The longer side (after jitter) becomes scale * canvas side; the other
 * side follows from the jittered aspect ratio. */
if(new_ar < 1){
      nh = scale * h;
      nw = nh * new_ar;
} else {
     nw = scale * w;
     nh = nw / new_ar;
}

/* Random offset of the placed image inside the canvas. */
float dx = rand_uniform(0, w - nw);
float dy = rand_uniform(0, h - nh);
/* Resample orig to nw x nh and write it into sized at offset (dx, dy). */
place_image(orig, nw, nh, dx, dy, sized);
/* Colour distortion driven by the hue/saturation/exposure parameters. */
random_distort_image(sized, hue, saturation, exposure);

如上述代码所示，jitter用来对宽高比做随机抖动；scale是内部写死的缩放系数（当前固定为1），它作用在抖动后较长的那条边上：该边被设为scale乘以网络输入的对应边长，另一条边再按抖动后的宽高比计算得到。其中place_image函数保证了不论上面的scale缩放到多大，最终写入的内容都不会超出sized的范围。

/*
 * Resample im to w x h and write the result into canvas at offset (dx, dy).
 *
 * Each destination pixel (col, row) is mapped back to the source position
 * (col/w * im.w, row/h * im.h) and sampled with bilinear_interpolate.
 * Writes go through set_pixel, which ignores coordinates outside the canvas.
 */
void place_image(image im, int w, int h, int dx, int dy, image canvas)
{
    int col, row, chan;

    for (chan = 0; chan < im.c; ++chan) {
        for (row = 0; row < h; ++row) {
            /* The source row depends only on the destination row. */
            const float src_y = ((float)row / h) * im.h;

            for (col = 0; col < w; ++col) {
                const float src_x = ((float)col / w) * im.w;
                const float sample = bilinear_interpolate(im, src_x, src_y, chan);

                set_pixel(canvas, col + dx, row + dy, chan, sample);
            }
        }
    }
}
----------------------------------------------------------------------------
/*
 * Sample channel c of im at the fractional position (x, y).
 *
 * The four pixels around (x, y) are read with get_pixel_extend and blended
 * with weights given by the fractional parts of x and y.
 */
static float bilinear_interpolate(image im, float x, float y, int c)
{
    const int left = (int) floorf(x);
    const int top  = (int) floorf(y);

    /* Fractional distance from the top-left neighbour, and its complement. */
    const float fx = x - left;
    const float fy = y - top;
    const float inv_fx = 1 - fx;
    const float inv_fy = 1 - fy;

    const float top_left     = get_pixel_extend(im, left,     top,     c);
    const float bottom_left  = get_pixel_extend(im, left,     top + 1, c);
    const float top_right    = get_pixel_extend(im, left + 1, top,     c);
    const float bottom_right = get_pixel_extend(im, left + 1, top + 1, c);

    return (inv_fy * inv_fx) * top_left +
           (fy     * inv_fx) * bottom_left +
           (inv_fy * fx)     * top_right +
           (fy     * fx)     * bottom_right;
}
----------------------------------------------------------------------------
/*
 * Store val at (x, y) in channel c of m.
 *
 * Coordinates or channel indices outside the image are silently ignored.
 * The buffer is indexed channel by channel, then row by row.
 */
static void set_pixel(image m, int x, int y, int c, float val)
{
    const int in_bounds = x >= 0 && y >= 0 && c >= 0 &&
                          x < m.w && y < m.h && c < m.c;

    if (!in_bounds) return;
    assert(x < m.w && y < m.h && c < m.c);
    m.data[(c*m.h + y)*m.w + x] = val;
}

仔细剖析place_image函数：它遍历缩放后图像（w×h×通道数）的每个像素，用双线性插值从原图im中采样，再写入canvas中偏移(dx, dy)的位置。如果缩放后的图比canvas小，周边没有被覆盖的区域就保留fill_image填入的0.5（也就是灰色）；如果缩放后的图比canvas大，超出canvas的部分就直接丢弃（通过set_pixel中的条件判断可以得出该结论）。
4.备注：data、image等数据结构的定义如下。

/* Batch container returned by load_data_detection. */
typedef struct{
    int w, h;           /* not written by load_data_detection; purpose not visible here -- TODO confirm */
    matrix X;           /* inputs: one row per sample, each row pointing at a pixel buffer */
    matrix y;           /* labels: one row per sample (5*boxes floats in load_data_detection) */
    int shallow;        /* set to 0 by load_data_detection; presumably marks non-owning copies -- TODO confirm */
    int *num_boxes;     /* not used by the loader shown in this file */
    box **boxes;        /* not used by the loader shown in this file */
} data;


/* Bounding box. */
typedef struct{
    float x, y, w, h;// x, y: box center; w, h: box width and height (all expressed as ratios)
} box;


/* Row-pointer matrix of floats. */
typedef struct matrix{
    int rows, cols;     // number of rows and columns
    float **vals;       // matrix contents, stored as a two-dimensional array (one pointer per row)
} matrix;


/* Float image; set_pixel addresses data as data[c*h*w + y*w + x]. */
typedef struct {
    int h;          /* height in pixels */
    int w;          /* width in pixels */
    int c;          /* number of channels */
    float *data;    /* pixel buffer of h*w*c floats, stored channel by channel */
} image;

yuanCruise 博客专家

发布了233 篇原创文章 · 获赞 187 · 访问量 40万+

他的留言板关注

【darknet训练细节】一个隐藏的超参数：scale

猜你喜欢