Weka 学习(三):缺失值处理

/**
 * @author hao.wei
 */
@Service
public class MissingHandleBizImpl implements MissingHandleBiz {

    private static final Logger logger = LoggerFactory.getLogger(MissingHandleBizImpl.class);
    /** 缺失值用该属性的平均值填充*/
    @Override
    public Instances missingValuesFilledWithAvg(Instances instances, String incompatible) {
        try {
            // 属性个数(列)
            int dim = instances.numAttributes();
            // 实例个数(行)
            int num = instances.numInstances();
            logger.info("开始将平均值填充入缺失值...");
            double[] meanV = new double[dim];
            for (int line = 0; line < meanV.length; line++) {
                // 第i列平均值
                meanV[line] = 0;
                // 实例个数
                int count = 0;
                for (int row = 0; row < num; row++) {
                    // 计算第i列平均值(缺失值 和 不合条件的值除外)
                    if (!instances.instance(row).isMissing(line) && !instances.instance(row).toString(line).trim().contains(incompatible)) {
                        meanV[line] += instances.instance(row).value(line);
                        count++;
                    }
                }
                meanV[line] = meanV[line] / count;
                logger.info("属性[{}]的平均值为[{}]", instances.attribute(line).name(), meanV[line]);
                for (int row = 0; row < num; row++) {
                    // 平均值填充缺失值 和 不符合条件的值
                    if (instances.instance(row).isMissing(line) || instances.instance(row).toString(line).contains(incompatible)) {
                        instances.instance(row).setValue(line, meanV[line]);
                    }
                }
            }
        } catch (Exception e) {
            logger.error("将平均值填充入缺失值发生系统异常,错误信息:", e);
        }
        logger.info("平均值填充如缺失值结束...");

        return instances;
    }

    /** 移除掉包含特殊值的属性的实例*/
    @Override
    public Instances removeMismatchConditionData(Instances instances, String attribute, String incompatible) {
        try {
            logger.info("删除[{}]属性包含[{}]的实例", attribute, incompatible);
            // 属性个数(列)
            int dim = instances.numAttributes();
            // 实例个数(行)
            int num = instances.numInstances();
            for (int i = 0; i < dim; i++) {
                // 属性名称和需要处理的属性名相同
                if (instances.attribute(i).name().equals(attribute)) {
                    for (int j = 0; j < num; j++) {
                        // 实例的该属性值包含不合条件值 删除该条实例(行)
                        if (instances.instance(j).isMissing(i)|| instances.instance(j).toString(i).contains(incompatible)) {
                            logger.info("删除的实例属性值为[{}]", instances.instance(j).toStringNoWeight());
                            instances.remove(j);
                            j--;
                            num--;
                        }
                    }
                }
            }
        } catch (Exception e) {
            logger.error("删除[{}]属性包含[{}]的实例发生系统异常,错误信息[{}]", attribute, incompatible, e);
        }
        return instances;
    }
}

猜你喜欢

转载自my.oschina.net/u/3701483/blog/2244608