Python数据抽样

1、数据的导入查看

import pandas as pd
# Read only the first 5 rows. The path is handed to pandas directly (rather
# than a handle from open()) so pandas opens and closes the file itself and
# no file handle is left open.
content = pd.read_csv("E:/test.csv", encoding="utf-8", nrows=5)
print(content)
# Equivalent view: print(content[0:5]) -- the slice is half-open, so it shows rows 0-4.

import pandas as pd
# content = pd.read_csv("E:/test.csv",nrows=2)  ## would read only the first 2 rows
#print(content)
data = pd.read_excel("E:/test_data.xlsx",nrows=3) ## author's note: nrows=3 had no effect and the whole sheet was printed -- presumably an older pandas; TODO confirm on the installed version
##print(data[:3])   prints the first 3 rows; a bare [3] cannot be used for that
print(data)

import pandas as pd
da = pd.read_excel('E:/test_data.xlsx')  # sheet_name=... can pick sheets by name or by 0-based position
# data = da.head()  # first 5 rows by default
# Selecting specific rows/columns. DataFrame.ix was removed in pandas 1.0,
# so positional selection uses .iloc and label selection uses .loc.
# data = da.iloc[[1, 2], [0, 1]].values  # values of rows 2-3, columns 1-2 (array only, no labels)
# data = da.iloc[[1, 2], [0, 1]]         # same selection, keeps the column names
# data = da.iloc[0:2, 1:4]               # slices need no nested lists; da.iloc[1, 2] is row 2, column 3
data = da.loc[:, ['school', 'sex']]  # all rows, only the named columns

data = da.values  # rebinds data to every cell of the sheet as an array
print("获取到所有的值:\n{0}".format(data))  # formatted output

2、数据抽样

## 1 简单随机抽样

import pandas as pd
import numpy as np
data= pd.read_excel('E:/test_data.xlsx')
np.random.seed(seed=2)  # seed NumPy's global RNG before drawing the samples
d1=data.sample(n=10,replace=True)  # 10 rows, drawn with replacement
d2=data.sample(frac=0.01,replace=True)  # 1% of the rows, drawn with replacement
print(d1,'\n',d2)

## 2 分层抽样

import pandas as pd
import numpy as np
data= pd.read_excel('E:/test_data.xlsx')
df = pd.DataFrame(data)  ## author's note: this conversion is optional
print(df)
ds=data.groupby("sex")  # group the rows by the "sex" column
print(ds.groups)  # show which row labels fall into each group

# Sampling fraction for each stratum; typicalSampling looks the fraction up
# by group name.
typicalFracDict = {
    "F": 0.01,
    "M": 0.01,
}

def typicalSampling(group, typicalFracDict):
    """Sample one stratum at the fraction configured for it.

    Args:
        group: Object exposing ``.name`` and ``.sample(frac=...)``; in this
            file it is the per-group frame handed over by
            ``groupby(...).apply``.
        typicalFracDict: Mapping from group name to the fraction of that
            group's rows to draw.

    Returns:
        Whatever ``group.sample(frac=...)`` returns for the looked-up
        fraction.

    Raises:
        KeyError: If ``group.name`` has no entry in ``typicalFracDict``.
    """
    return group.sample(frac=typicalFracDict[group.name])

# Run typicalSampling on every "sex" group; typicalFracDict is forwarded to
# it as the second argument.
result = df.groupby( "sex",group_keys=True).apply(typicalSampling, typicalFracDict)
print(result)

## 3 系统抽样

## 先了解向量形式转为数组（数据框）

### author's note: tried many approaches but could not get the output printed without the index
import pandas as pd
data = pd.read_excel('E:/test_data.xlsx')  # NOTE(review): data is not used in the rest of this snippet
da=[[1,2,3],[4,5,6]]
#daa=pd.DataFrame(da).values ## no index names; returns only the data, as an array
#daa=pd.DataFrame(da) ## default index starts at 0
#daa=pd.DataFrame(da,columns=['a','b','c'],index=['row1','row2']) ##  custom column and index labels
daa=pd.DataFrame(da)   # build a DataFrame from the nested list
print(daa)

## 系统抽样

import random
import pandas as pd

def loadDataSet(fileName):
    """Read a tab-delimited text file into a list of rows.

    Each line is stripped of leading/trailing whitespace and then split on
    tab characters. Fields are kept as strings; no numeric conversion is
    performed.

    Args:
        fileName: Path of the text file to read (opened with the platform's
            default text encoding).

    Returns:
        A list with one entry per line of the file, each entry being the
        list of field strings on that line.
    """
    # The context manager closes the handle even if reading fails part-way.
    with open(fileName) as fr:
        return [line.strip().split('\t') for line in fr]

def RandomSampling(dataMat, number):
    """Draw ``number`` distinct items from ``dataMat`` without replacement.

    Args:
        dataMat: Sequence to sample from.
        number: How many items to draw.

    Returns:
        A new list holding the sampled items, or ``None`` (after printing a
        message) when ``random.sample`` rejects the request with
        ``ValueError``, e.g. because ``number`` exceeds ``len(dataMat)``.
    """
    try:
        return random.sample(dataMat, number)
    except ValueError:
        # Only the invalid-sample-size error is handled here; unrelated
        # failures (such as a non-integer ``number``) propagate instead of
        # being reported with a misleading message.
        print('sample larger than population')
        return None


def RepetitionRandomSampling(dataMat, number):
    """Draw ``number`` items from ``dataMat`` with replacement.

    Every draw picks a uniformly random index via ``random.randint``, so the
    same item may appear more than once in the result.

    Args:
        dataMat: Indexable sequence to sample from.
        number: How many draws to make.

    Returns:
        A list of ``number`` items taken from ``dataMat``.
    """
    return [
        dataMat[random.randint(0, len(dataMat) - 1)]
        for _ in range(number)
    ]


def SystematicSampling(dataMat, number):
    """Pick ``number`` evenly spaced items from ``dataMat``.

    The step is ``len(dataMat) // number``; the items at indices
    ``0, step, 2 * step, ...`` are returned, always starting at the first
    item.

    Args:
        dataMat: Indexable sequence to sample from.
        number: How many items to pick.

    Returns:
        A list of ``number`` items when the step is at least 1. An empty
        list when ``number`` is 0. Otherwise (more items requested than
        available) the request is delegated to ``RandomSampling`` and its
        result is returned.
    """
    if number == 0:
        # Nothing requested; also avoids dividing by zero below.
        return []

    # Integer division keeps the step exact without a float round-trip.
    step = len(dataMat) // number
    if step > 0:
        return [dataMat[i * step] for i in range(number)]

    return RandomSampling(dataMat, number)


# Script entry point: load the tab-delimited file, take a systematic sample
# and print it as a DataFrame.
if __name__ == '__main__':

    dataMat = loadDataSet('E:/tt.txt') ## author's note: tt.txt is test.xlsx converted to txt
    #    print RandomSampling(dataMat,7)
    #    RepetitionSampling(dataMat,4)
    #print(SystematicSampling(dataMat, 60))
    result=SystematicSampling(dataMat, 30)  ##  sample size 30
    result=pd.DataFrame(result)  # wrap the sampled rows in a DataFrame for printing
    print(result)

猜你喜欢