sklearn 样本均衡 class_weight='balanced'的实现方法

想了解这个的原因，是因为Gbdt不能设置这个参数，但是GBDT是最常用的分类器了...

查了一会儿还想了下原理，想知道原理上为啥gbdt不能设置class_weight... 然后发现虽然sklearn没有这个选项，但是LightGBM是有的啊...所以应该是可以实现，但是Sklearn只是没有实现而已。。。

scale_pos_weight, default=1.0, type=double

weight of positive class in binary classification task

连接在这里

https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.md

然后，class_weight='balanced'的实现，根据我跟sklearn svc的源码，原理应该就是改变sample weight

有结论之后可以不看了，然后下面比较无聊，是跟代码过程，记录是因为学到几个函数的用法

LabelEncoder 等

计算class weight的代码：

def compute_class_weight(class_weight, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, 'balanced' or None
        If 'balanced', class weights will be given by
        ``n_samples / (n_classes * np.bincount(y))``.
        If a dictionary is given, keys are classes and values
        are corresponding class weights.
        If None is given, the class weights will be uniform.

    classes : ndarray
        Array of the classes occurring in the data, as given by
        ``np.unique(y_org)`` with ``y_org`` the original class labels.

    y : array-like, shape (n_samples,)
        Array of original class labels per sample;

    Returns
    -------
    class_weight_vect : ndarray, shape (n_classes,)
        Array with class_weight_vect[i] the weight for i-th class

    References
    ----------
    The "balanced" heuristic is inspired by
    Logistic Regression in Rare Events Data, King, Zen, 2001.
    """
    # Import error caused by circular imports.
    from ..preprocessing import LabelEncoder

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can "
                         "be in y")
    if class_weight is None or len(class_weight) == 0:
        # uniform class weights
        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
    elif class_weight in ['auto', 'balanced']:
        # Find the weight of each class as present in y.
        le = LabelEncoder()
        y_ind = le.fit_transform(y)
        if not all(np.in1d(classes, le.classes_)):
            raise ValueError("classes should have valid labels that are in y")

        # inversely proportional to the number of samples in the class
        if class_weight == 'auto':
            recip_freq = 1. / bincount(y_ind)
            weight = recip_freq[le.transform(classes)] / np.mean(recip_freq)
            warnings.warn("The class_weight='auto' heuristic is deprecated in"
                          " 0.17 in favor of a new heuristic "
                          "class_weight='balanced'. 'auto' will be removed in"
                          " 0.19", DeprecationWarning)
        else:
            recip_freq = len(y) / (len(le.classes_) *
                                   bincount(y_ind).astype(np.float64))
            weight = recip_freq[le.transform(classes)]
    else:
        # user-defined dictionary
        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
        if not isinstance(class_weight, dict):
            raise ValueError("class_weight must be dict, 'balanced', or None,"
                             " got: %r" % class_weight)
        for c in class_weight:
            i = np.searchsorted(classes, c)
            if i >= len(classes) or classes[i] != c:
                raise ValueError("Class label %d not present." % c)
            else:
                weight[i] = class_weight[c]

    return weight

简单解释一下，LabelEncoder就是把label变成[0, label_cnt]区间的数，bincount就是对各个label的样本个数计数，所以recip_freq就是存的各个类别的平均每个样本的权重（假设所有的类别的总的权重一样），然后实际是计算样本权重的时候调用的上面这个计算class weight的函数，具体见下面代码：

def compute_sample_weight(class_weight, y, indices=None):
    """Estimate sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, "balanced", or None, optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one. For
        multi-output problems, a list of dicts can be provided in the same
        order as the columns of y.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data:
        ``n_samples / (n_classes * np.bincount(y))``.

        For multi-output, the weights of each column of y will be multiplied.

    y : array-like, shape = [n_samples] or [n_samples, n_outputs]
        Array of original class labels per sample.

    indices : array-like, shape (n_subsample,), or None
        Array of indices to be used in a subsample. Can be of length less than
        n_samples in the case of a subsample, or equal to n_samples in the
        case of a bootstrap subsample with repeated indices. If None, the
        sample weight will be calculated over the full sample. Only "auto" is
        supported for class_weight if this is provided.

    Returns
    -------
    sample_weight_vect : ndarray, shape (n_samples,)
        Array with sample weights as applied to the original y
    """

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    if isinstance(class_weight, six.string_types):
        if class_weight not in ['balanced', 'auto']:
            raise ValueError('The only valid preset for class_weight is '
                             '"balanced". Given "%s".' % class_weight)
    elif (indices is not None and
          not isinstance(class_weight, six.string_types)):
        raise ValueError('The only valid class_weight for subsampling is '
                         '"balanced". Given "%s".' % class_weight)
    elif n_outputs > 1:
        if (not hasattr(class_weight, "__iter__") or
                isinstance(class_weight, dict)):
            raise ValueError("For multi-output, class_weight should be a "
                             "list of dicts, or a valid string.")
        if len(class_weight) != n_outputs:
            raise ValueError("For multi-output, number of elements in "
                             "class_weight should match number of outputs.")

    expanded_class_weight = []
    for k in range(n_outputs):

        y_full = y[:, k]
        classes_full = np.unique(y_full)
        classes_missing = None

        if class_weight in ['balanced', 'auto'] or n_outputs == 1:
            class_weight_k = class_weight
        else:
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y[indices, k]
            classes_subsample = np.unique(y_subsample)

            weight_k = np.choose(np.searchsorted(classes_subsample,
                                                 classes_full),
                                 compute_class_weight(class_weight_k,
                                                      classes_subsample,
                                                      y_subsample),
                                 mode='clip')

            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(class_weight_k,
                                            classes_full,
                                            y_full)

        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero
            weight_k[in1d(y_full, list(classes_missing))] = 0.

        expanded_class_weight.append(weight_k)

    expanded_class_weight = np.prod(expanded_class_weight,
                                    axis=0,
                                    dtype=np.float64)

    return expanded_class_weight

sklearn 样本均衡 class_weight='balanced'的实现方法

猜你喜欢