Python 数据分层抽样工具类

import  pandas as pd
import random as rd
import numpy as np
import math as ma


def typeicalSampling(group, typeicalFracDict):
    """Randomly sample one stratum at the fraction configured for it.

    Args:
        group: Object exposing a ``name`` attribute (the stratum key) and a
            ``sample(frac=...)`` method, e.g. the per-group DataFrame that
            ``groupby(...).apply`` hands to its callback.
        typeicalFracDict: Mapping from stratum key to sampling fraction.

    Returns:
        Whatever ``group.sample`` returns for the looked-up fraction.

    Raises:
        KeyError: If ``group.name`` is not a key of ``typeicalFracDict``.
    """
    stratum_fraction = typeicalFracDict[group.name]
    sampled_rows = group.sample(frac=stratum_fraction)
    return sampled_rows
def group_sample(data_set, lable, typeicalFracDict):
    """Draw a stratified random sample from ``data_set``.

    Rows are grouped by the column ``lable`` and every group is sampled
    independently at the fraction configured for it. The sampled groups are
    concatenated in group-key order; original row indices are kept.

    The groups are iterated and concatenated explicitly instead of going
    through ``groupby(...).apply``: newer pandas releases deprecate (and later
    drop) passing the grouping column to the applied function, which would
    remove the stratum column from the result.

    Args:
        data_set: DataFrame to sample from.
        lable: Name of the column that defines the strata.
        typeicalFracDict: Mapping from stratum value to sampling fraction.

    Returns:
        DataFrame with the sampled rows of every stratum, all original
        columns included. Empty (same columns) when ``data_set`` has no groups.

    Raises:
        KeyError: If a stratum present in ``data_set`` has no entry in
            ``typeicalFracDict``.
    """
    sampled_groups = [
        stratum.sample(frac=typeicalFracDict[stratum_key])
        for stratum_key, stratum in data_set.groupby(lable)
    ]
    if not sampled_groups:
        # pd.concat raises ValueError on an empty list, so return an empty
        # frame that still carries the input's columns.
        return data_set.iloc[0:0].copy()
    return pd.concat(sampled_groups)

# Demo: draw a stratified sample from a small hand-written table.
data_set = data = pd.DataFrame(
    {
        'id': [
            3566841, 6541227, 3512441, 3512441, 3512441,
            3512441, 3512441, 3512441, 3512441, 3512441,
        ],
        'sex': [
            'male', 'Female', 'Female', 'male', 'Female',
            'Female', 'male', 'Female', 'male', 'Female',
        ],
        'level': [
            'high', 'low', 'middle', 'high', 'low',
            'middle', 'high', 'low', 'middle', 'middle',
        ],
    }
)

# Stratify on the 'sex' column, sampling 'male' rows at a fraction of 0.8
# and 'Female' rows at a fraction of 0.5.
label = 'sex'
typicalFracDict = {'male': 0.8, 'Female': 0.5}

result = group_sample(data_set, label, typicalFracDict)
print(result)

        id     sex   level
7  3512441  Female     low
1  6541227  Female     low
2  3512441  Female  middle
8  3512441    male  middle
6  3512441    male    high
0  3566841    male    high

资源来自网上，自己稍微整理了一下。

猜你喜欢

转载自blog.csdn.net/qq_14865711/article/details/83616416