data.table使用总结

数据读取


# Read the two CSV inputs as data.tables; the first line of each file holds
# the column names. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
df1 <- fread("E:/Research/df1.csv", header = TRUE)
df2 <- fread("E:/Research/df2.csv", header = TRUE)

选择行列

# Four alternative ways to pull out one column. Each line overwrites `df`,
# so they are meant to be used one at a time, not run in sequence.
df <- df[, Variable]                    # bare column name: plain vector
df <- df[, .(Variable)]                 # .() wrapper: one-column data.table
df <- df[, "Variable", with = FALSE]    # character name: one-column data.table
df <- df[["Variable"]]                  # list-style extraction: plain vector
# `Variable` stands for a column name. NOTE(review): the original note says
# a column number works as well; that holds for the `[[ ]]` and
# `with = FALSE` forms, confirm for the others on your data.table version.

数据清洗

缺失值处理

缺失值删除

# Keep only the rows whose ID is present. Missing values are tested with
# is.na(); comparing against the string "NA" only appeared to work because
# data.table treats an NA result in `i` as FALSE.
df <- df[!is.na(ID)]

缺失值替换

# Columns whose missing entries are to be replaced by 0.
SparseVariables <- c("Variable1", "Variable2", "Variable3")

# set() assigns by reference, so each column is patched in place without
# copying the table.
for (col in SparseVariables) {
  missing_rows <- which(is.na(df[[col]]))
  set(df, i = missing_rows, j = col, value = 0)
}

转换数据类型

# Columns to coerce to numeric.
NumericVariables <- c("Var1", "Var2")

# .SDcols restricts .SD to the listed columns; the parenthesised LHS makes
# `:=` write the converted values back to those same columns by reference.
df[
  ,
  (NumericVariables) := lapply(.SD, as.numeric),
  .SDcols = NumericVariables
]

生成新的列

对所有行生成

# Three equivalent spellings of `:=` for adding columns by reference.

# One new column.
df[, `:=`(Var1 = Var2 + Var3)]

# Several new columns at once, functional form.
df[, `:=`(Var1 = Var2 + Var3, Var4 = Var5 + Var6)]

# The same two columns using a character vector on the left and a list on the
# right. The second target is Var4, matching the line above; the earlier
# 'Var5' silently overwrote an input column instead of creating a new one.
df[, c("Var1", "Var4") := list(Var2 + Var3, Var5 + Var6)]

对满足条件的行生成

# Set Var1_flag to 1 only on the rows where Var1 equals "0"; on all other rows
# a newly created Var1_flag is left as NA.
df[Var1 == "0", Var1_flag := 1]

筛选

按照列的值去筛选

# Keep the rows of df whose Var1 equals "1" and store them as df2.
df2 <- df[Var1 == "1", ]

合并两个表

# Inner join on ID: for every row of df_2 look up matching rows in df_1, and
# drop the df_2 rows that have no match (nomatch = 0).
df_final <- df_1[
  df_2,
  on = "ID",
  nomatch = 0L
]

长数据和宽数据的转换

宽数据转换为长数据

# Reshape wide to long: ID stays as the identifier, Var1..Var3 are stacked.
# The former column names go into `Var`, their values into `Var_value`.
# (The value.name string was previously unterminated, which is a parse error.)
df_long <- melt(
  df_wide,
  id.vars = "ID",
  measure.vars = c("Var1", "Var2", "Var3"),
  variable.name = "Var",
  value.name = "Var_value"
)

长数据转换为宽数据

# Reshape long to wide: one row per ID/Var1/Var2 combination and one column
# per distinct value of Var3. Cells are filled from ValueName, and duplicates
# within a cell are combined with sum().
df_wide <- dcast(
  df_long,
  ID + Var1 + Var2 ~ Var3,
  value.var = "ValueName",
  fun.aggregate = sum
)

排序

# Sort df_cluster by `label` (ascending, by reference) and mark that column
# as the table's key.
setkeyv(df_cluster, "label")

分组统计

# Per-ID mean of |Po - Pr|. The first bracket adds `diff` to df3 by reference,
# the second aggregates it per ID. Native `[][]` chaining replaces the
# magrittr `%$%` pipe, which was only used to reach `.` and required a
# package this file never loads.
df_PPEG <- df3[, diff := abs(Po - Pr)][
  , .(PPGE = mean(diff, na.rm = TRUE)), by = ID
]

# Standard deviation of glucose per ID and Day, then the per-ID mean of those
# daily values. Missing readings are ignored at both steps.
df_SDGB <- df2[, .(SDGB = sd(glucose, na.rm = TRUE)), by = c("ID", "Day")][
  , .(meanSDGB = mean(SDGB, na.rm = TRUE)), by = ID
]

猜你喜欢

转载自blog.csdn.net/Alleine/article/details/103622953