Five Major Functions for Data Transformation

filter

# Keep only the rows that satisfy every condition; comma-separated
# conditions are combined with AND.
flights %>%
  filter(month == 1, day == 1)

# Two equivalent ways to keep the flights from November or December.
flights %>%
  filter(month == 11 | month == 12)
flights %>%
  filter(month %in% c(11, 12))
# Do not write `month == 11 | 12`: it parses, but it is evaluated as
# `(month == 11) | 12`, and the non-zero 12 is coerced to TRUE, so the
# condition holds for every row instead of selecting the two months.

# Count missing vs. non-missing departure times: flag every row with
# is.na(), keep only that flag column, then tabulate FALSE/TRUE.
flights %>%
  transmute(temp = is.na(dep_time)) %>%
  table()

arrange

# Sort rows by arrival delay in ascending order (smallest first).
flights %>%
  arrange(arr_delay)

# Wrap the column in desc() to sort in descending order (largest first).
flights %>%
  arrange(desc(arr_delay))

select

# Pick individual columns by name.
flights %>%
  select(year, month, day)

# Pick the contiguous run of columns from year through day (inclusive).
flights %>%
  select(year:day)

# Drop the run of columns from year through day and keep everything else.
flights %>%
  select(-(year:day))

## Rename a column. The syntax is new_name = old_name, so `tailnum`
## becomes `tail_num`.
flights %>%
  rename(tail_num = tailnum)

## Reorder columns: select() returns them in the order they are listed.
## NOTE(review): `stocks` is not defined in this snippet; presumably a data
## frame with `year`, `half` and `return` columns -- confirm before running.
select(stocks, year, half, return)

mutate

# Add derived columns. A column created earlier in the same call
# (gain, hours) can be used by a later expression (gain_per_hour).
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours
  )

# transmute() keeps only the columns named in the call.
# %/% is integer division and %% is the remainder, so dividing dep_time
# by 100 splits it into an hour part and a minute part
# (presumably dep_time is stored as an HHMM number -- confirm).
flights %>%
  transmute(
    dep_time,
    hour = dep_time %/% 100,
    minute = dep_time %% 100
  )

summarise

# Average departure delay for each (year, month, day) group.
# na.rm = TRUE removes missing delays before averaging; without it a
# group containing any NA would produce NA. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
flights %>%
  group_by(year, month, day) %>%
  summarise(average_delay = mean(dep_delay, na.rm = TRUE))

# Logical subsetting inside summarise(): aggregate only the values of a
# column that meet a condition, without filtering the rows of the table.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    # mean arrival delay over all non-missing values in the group
    avg_delay1 = mean(arr_delay, na.rm = TRUE),
    # mean over the positive arrival delays only
    avg_delay2 = mean(arr_delay[arr_delay > 0], na.rm = TRUE)
  )

Other useful functions

## Cumulative aggregates
x <- seq_len(10)
cumsum(x)   # running total
cummean(x)  # running mean

## Ranking
y <- c(10, 20, 20, NA, 30, 40)

# Smallest value gets rank 1; ties share the lowest rank and the next
# rank is skipped; NA stays NA.  Result: 1 2 2 NA 4 5
y %>% min_rank()

# With desc() the order is reversed: the largest value gets rank 1 and
# the smallest value gets the highest rank.  Result: 5 3 3 NA 2 1
y %>%
  desc() %>%
  min_rank()

# Compare three ranking functions on a column that contains ties.
# `rankme` was used here without being defined; the values below are the
# ones shown in the printed output that follows.
rankme <- tibble::tibble(x = c(10, 5, 1, 5, 5))

rankme <- mutate(rankme,
  x_row_number = row_number(x), # ties broken by position: 2, 3, 4
  x_min_rank = min_rank(x),     # ties share the lowest rank, then a gap
  x_dense_rank = dense_rank(x)  # ties share a rank, no gap afterwards
)
arrange(rankme, x)
#> # A tibble: 5 x 4
#>       x x_row_number x_min_rank x_dense_rank
#>   <dbl>        <int>      <int>        <int>
#> 1     1            1          1            1
#> 2     5            2          2            2
#> 3     5            3          2            2
#> 4     5            4          2            2
#> 5    10            5          5            3

## Show the extreme elements of each group:
## for every day, keep the flights whose dep_time has the lowest and the
## highest rank, i.e. the latest and the earliest departure times.
flights %>%
  filter(!is.na(dep_delay) & !is.na(arr_delay)) %>%
  select(year, month, day, dep_time) %>%
  group_by(year, month, day) %>%
  # rank 1 = largest dep_time within the day, because of desc()
  mutate(r = min_rank(desc(dep_time))) %>%
  # range(r) returns c(min(r), max(r)) for the current group
  filter(r %in% range(r))

## Number of distinct values
x <- c("a", "a", "b", "c")
x %>% n_distinct()  # counts the unique values: "a", "b", "c"

猜你喜欢

转载自blog.csdn.net/weixin_51674826/article/details/117045730