R language (5) -- data conversion (preprocessing)

Data Type Conversion - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Checking types with is.*

methods(is)  # list all is.* type-checking functions

Determine whether it is a data frame

is.data.frame(data) 

Converting

Vectors can be converted into a variety of data types.

matrix to data frame

data <- as.data.frame(data)

Convert data frame to matrix

as.matrix()

convert to factor

as.factor()

Remove names (e.g. column names) when they are not needed

names(data) <- NULL  # unname(data) also strips names

convert to vector

unlist()
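
A minimal sketch tying these conversions together, using the built-in iris data frame:

m <- as.matrix(iris[, 1:4])      # data frame to numeric matrix
d <- as.data.frame(m)            # matrix back to data frame
f <- as.factor(iris$Species)     # convert to factor
v <- unlist(iris[1, 1:4])        # one row of a data frame to a plain named vector
is.data.frame(d)                 # TRUE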

Subsetting - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Take specific rows and columns of a data frame

data1 <- data[c(1:50), c(1:30)]  # extract a contiguous block of rows and columns

data2 <- data[c(1, 3, 4, 5), c(1, 12, 15)]  # extract non-contiguous rows and columns

Filter with logical values

data3 <- data[which(data$factor == 7), ]  # rows where factor equals 7

data4 <- data[which(data$factor > 7 & data$factor <= 100), ]

subset function

data4 <- subset(data, factor > 7 & factor <= 100)  # same result with subset(); conditions are evaluated inside data
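
For example, with the built-in mtcars data, subset() can also pick columns through its select argument:

subset(mtcars, mpg > 20 & cyl == 4)                  # rows with mpg > 20 and 4 cylinders
subset(mtcars, mpg > 20, select = c(mpg, cyl, wt))   # same filter, keep only three columns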

Sampling with sample()

Random sampling, with or without replacement.

sample(x, num, replace = TRUE)  # x is the data to sample from, num is the number of draws, replace = TRUE samples with replacement
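
To sample rows of a data frame, a common pattern is to sample row indices first; a small sketch with the built-in mtcars data:

set.seed(1)                        # make the draw reproducible
idx <- sample(nrow(mtcars), 10)    # 10 row indices, without replacement
mtcars[idx, ]
sample(1:6, 10, replace = TRUE)    # with replacement, e.g. 10 dice rolls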

Merge - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

cbind(dataframe, factor)  # bind new columns onto a data frame

rbind()  # bind rows; the new rows must have the same columns as the original data

merge(x, y, by = "")  # by names the key column(s) to merge on
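
A minimal sketch with two made-up data frames (df1 and df2 are illustrative names):

df1 <- data.frame(id = 1:3, x = c("a", "b", "c"))
df2 <- data.frame(id = c(2, 3, 4), y = c(10, 20, 30))
cbind(df1, flag = c(TRUE, FALSE, TRUE))    # add a column
rbind(df1, data.frame(id = 4, x = "d"))    # new row with the same columns
merge(df1, df2, by = "id")                 # join the two tables on id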

Flip - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Transpose rows and columns: t()

tdata <- t(data)

Reverse a single vector: rev()

rev(vector)

e.g. reverse the row order

women[rev(rownames(women)), ]

Revise - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Deduplicate data

duplicated(data)  # returns a logical vector marking duplicated rows

data[!duplicated(data), ]  # keep only the non-duplicated rows

unique(data)  # drop duplicated rows in one step

Modify a column with transform()

transform(women, height = height*2.54)  # overwrite an existing column

transform(women, cm = height*2.54)  # add a new column

Sort - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Sort on a single condition

sort()  # numbers sort ascending by default; character strings sort alphabetically

rev(sort())  # reverse the sorted result; sort(x, decreasing = TRUE) does the same

# sort() only works on vectors, not data frames, but sorting the row names is a workaround

mtcars[sort(rownames(mtcars)), ]

order() 

# returns the ordering indices rather than the sorted values

mtcars[order(mtcars$mpg), ]

Sort by multiple criteria

mtcars[order(mtcars$mpg, mtcars$disp), ]
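
To mix directions, a numeric key can be negated, for example:

mtcars[order(-mtcars$cyl, mtcars$mpg), ]  # cyl descending, then mpg ascending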

Commonly used data conversion packages - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

reshape2

wide to long

melt(data, id.vars = c("col1", "col2"))  # reshape wide data into long format; id.vars names the identifier columns to keep

Long to wide

dcast(aql, month + day ~ variable)  # each level of the variable column becomes a column name; month and day stay on the left as id columns, and the values fill the corresponding cells
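
The aql object above is the long form of the built-in airquality data; a self-contained sketch, assuming the reshape2 package is installed:

library(reshape2)
aq <- airquality
names(aq) <- tolower(names(aq))
aql <- melt(aq, id.vars = c("month", "day"))   # wide to long
aqw <- dcast(aql, month + day ~ variable)      # long back to wide
head(aqw)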

tidyr

Tidy data: each observation and each variable determine exactly one value

wide to long

gather()

Long to wide

spread()
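
A minimal sketch of the two functions, assuming the tidyr package and a small made-up data frame named wide:

library(tidyr)
wide <- data.frame(id = 1:2, a = c(1, 2), b = c(3, 4))
long <- gather(wide, key = "variable", value = "value", a, b)   # wide to long
spread(long, key = "variable", value = "value")                 # long back to wide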

Split one column into multiple columns

separate()

e.g

df  <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))

separate(df, col = x, into = c("A", "B"))  # the separator (here ".") is detected automatically by default, or specify it with sep

Merge multiple columns into one column

unite()

e.g.

unite(df, col = "AB", A, B, sep = "-")  # join columns A and B into one AB column, separated by "-"

dplyr

filter

* :: calls a function from a specific package, which avoids conflicts when several packages export functions with the same name

filter()

e.g

dplyr::filter(iris, Sepal.Length > 7)  # keep the irises whose sepal length is greater than 7

remove duplicate rows

dplyr::distinct(data) 

Slice out arbitrary rows

dplyr::slice(iris, 10:15)  # take rows 10 to 15

sampling

dplyr::sample_n(iris, 10)  # randomly select ten rows

dplyr::sample_frac(iris, 0.1)  # randomly sample a fraction (here 10%) of the rows

Sorting

dplyr::arrange(iris, Sepal.Length)  # sort by sepal length

dplyr::arrange(iris, desc(Sepal.Length))  # sort in descending order

Subset

select()

Summary statistics

summarise(iris, avg = mean(Sepal.Length))  # mean sepal length

group

dplyr::group_by(iris, Species)

iris %>% group_by(Species)
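
Grouping is usually combined with summarise() in a pipeline, for example:

library(dplyr)
iris %>%
  group_by(Species) %>%
  summarise(avg = mean(Sepal.Length))   # mean sepal length per species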

add variable

dplyr::mutate(iris, new = Sepal.Length + Petal.Length)

multi-table operation

dplyr::left_join()  # left join: keep all rows of the left table

dplyr::right_join()  # right join: keep all rows of the right table

dplyr::inner_join()  # inner join: keep only the matching rows (the intersection)

dplyr::full_join()  # full join: keep all rows from both tables (the union)

dplyr::semi_join()  # semi join: filter the left table to the rows that have a match in the right table

dplyr::anti_join()  # anti join: keep the rows of the left table without a match (the complement)
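
A small sketch of the joins with two made-up tables (a and b are illustrative names), assuming dplyr is loaded:

library(dplyr)
a <- data.frame(id = c(1, 2, 3), x = c("A", "B", "C"))
b <- data.frame(id = c(2, 3, 4), y = c(10, 20, 30))
left_join(a, b, by = "id")    # all rows of a, y filled in where id matches
inner_join(a, b, by = "id")   # only ids 2 and 3
semi_join(a, b, by = "id")    # rows of a with a match in b, columns of a only
anti_join(a, b, by = "id")    # rows of a without a match in b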

Multi-dataset operations

intersect()  # intersection of two data sets

union()  # union of two data sets (union_all() keeps duplicate rows)

setdiff()  # set difference: rows in the first data set but not the second
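
For example, with dplyr loaded these work row-wise on data frames that share the same columns (s1 and s2 are made-up names):

library(dplyr)
s1 <- data.frame(v = c(1, 2, 3))
s2 <- data.frame(v = c(2, 3, 4))
intersect(s1, s2)   # rows in both: 2, 3
union(s1, s2)       # distinct rows from either: 1, 2, 3, 4
setdiff(s1, s2)     # rows in s1 but not in s2: 1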

Pipe character (chain operator %>%) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Passes the output of one function to the next function as its input.

In RStudio, the Ctrl + Shift + M shortcut inserts the pipe operator.
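
For example, the pipe turns nested calls into left-to-right steps:

library(dplyr)
summarise(group_by(filter(iris, Sepal.Length > 5), Species), n = n())   # nested form
iris %>% filter(Sepal.Length > 5) %>% group_by(Species) %>% summarise(n = n())   # same thing, piped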

Mathematical calculations on data frames - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Row sums: rowSums()

rs <- rowSums(WorldPhones)

total <- cbind(WorldPhones, Total = rs)  # append the row totals as a new column

Column means: colMeans()

cm <- colMeans(WorldPhones)

apply() is more general, so it is usually the one to reach for

apply(WorldPhones, MARGIN = 1, FUN = sum)  # row sums again

MARGIN = 1 applies the function across rows, MARGIN = 2 across columns

lapply() returns a list

sapply() returns a vector/matrix

tapply() works on grouped data: split by a factor, then apply the function within each group

tapply(state.name, state.division, FUN = length)  # number of states in each census division
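
For comparison, lapply() and sapply() applied to the same columns differ only in the shape of the result:

lapply(mtcars[, 1:3], mean)   # named list of column means
sapply(mtcars[, 1:3], mean)   # same values, simplified to a named vector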

Data centering and standardization - - - - - - - - - - - - - - - - - - - - - - - - - - - - -  

Purpose: remove the effect of different units and scales so that the values become comparable.

Centering

Subtract the mean of the data set from each value.

x - mean(x)

Standardization

After centering, divide by the standard deviation of the data set.

(x - mean(x)) / sd(x)

Centering + standardization

scale(x, center = TRUE, scale = TRUE)
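
A quick check that scale() matches the manual formula, using a made-up numeric vector x:

x <- c(2, 4, 6, 8)
(x - mean(x)) / sd(x)                    # manual standardization
scale(x, center = TRUE, scale = TRUE)    # same values, returned as a one-column matrix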


Origin: blog.csdn.net/Scabbards_/article/details/130441677