making something like lag work with group_by

cs0815 :

I tried to reproduce my pipeline with simplified data and code as follows:

library(magrittr)
library(dplyr)
library(lubridate)

# Simplified reproduction: twelve rows (six month-start dates for each of two
# categories), rolled up by month and category, then lagged.
df <- data.frame(
  date = rep(
    as.Date(c(
      "2015-1-1", "2015-2-1", "2015-3-1",
      "2015-4-1", "2015-5-1", "2015-6-1"
    )),
    times = 2
  ),
  value = as.numeric(1:12),
  category = factor(rep(c("cat1", "cat2"), each = 6))
) %>%
  # Roll the dates up to the start of their month and take the minimum value
  # for every (month, category) pair.
  group_by(date = floor_date(date, unit = "monthly"), category) %>%
  summarise(value = min(value)) %>%
  # lag() is evaluated inside whatever grouping summarise() leaves in place.
  mutate(
    month_minus_1 = lag(value, n = 1),
    month_minus_2 = lag(value, n = 2)
  ) %>%
  arrange(category, value)
df

I added floor_date to indicate that I will roll up some figures by month and apply some statistic (here min). How can I apply lag within each category? The results the code above produces are wrong:

date category value month_minus_1 month_minus_2
2015-01-01  cat1    1   NA  NA
2015-02-01  cat1    2   NA  NA
2015-03-01  cat1    3   NA  NA
2015-04-01  cat1    4   NA  NA
2015-05-01  cat1    5   NA  NA
2015-06-01  cat1    6   NA  NA
2015-01-01  cat2    7   1   NA
2015-02-01  cat2    8   2   NA
2015-03-01  cat2    9   3   NA
2015-04-01  cat2    10  4   NA
2015-05-01  cat2    11  5   NA
2015-06-01  cat2    12  6   NA

Expected output:

 date category value month_minus_1 month_minus_2
    2015-01-01  cat1    1   NA  NA
    2015-02-01  cat1    2   1   NA
    2015-03-01  cat1    3   2   1
    2015-04-01  cat1    4   3   2
    2015-05-01  cat1    5   4   3
    2015-06-01  cat1    6   5   4
    2015-01-01  cat2    7   NA  NA
    2015-02-01  cat2    8   7   NA
    2015-03-01  cat2    9   8   7
    2015-04-01  cat2    10  9   8
    2015-05-01  cat2    11  10  9
    2015-06-01  cat2    12  11  10
wibeasley :

The short answer is that date should not be inside dplyr::group_by().

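dplyr::group_by() creates separate mini-data.frames that functions like dplyr::lag() can't see outside of. Grouping by date and category gave you twelve single-row groups; summarise() then dropped the last grouping variable (category), so lag() ran inside each date group, which holds just one cat1 row and one cat2 row. That is why the cat1 rows got NA and the cat2 rows picked up the cat1 value for the same date.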

library(magrittr)
library(dplyr)
library(lubridate)

# Lag `value` by one and two rows within each category. Only `category` is a
# grouping variable; `date` is used purely to order the rows for lag().
data.frame(
  date = rep(
    as.Date(c(
      "2015-01-01", "2015-02-01", "2015-03-01",
      "2015-04-01", "2015-05-01", "2015-06-01"
    )),
    times = 2
  ),
  value = as.numeric(1:12),
  category = factor(rep(c("cat1", "cat2"), each = 6))
) %>%
  group_by(category) %>%
  mutate(
    month_minus_1 = lag(value, n = 1, order_by = date),
    month_minus_2 = lag(value, n = 2, order_by = date)
  ) %>%
  ungroup()

Results:

# A tibble: 12 x 5
   date       value category month_minus_1 month_minus_2
   <date>     <dbl> <fct>            <dbl>         <dbl>
 1 2015-01-01     1 cat1                NA            NA
 2 2015-02-01     2 cat1                 1            NA
 3 2015-03-01     3 cat1                 2             1
 4 2015-04-01     4 cat1                 3             2
 5 2015-05-01     5 cat1                 4             3
 6 2015-06-01     6 cat1                 5             4
 7 2015-01-01     7 cat2                NA            NA
 8 2015-02-01     8 cat2                 7            NA
 9 2015-03-01     9 cat2                 8             7
10 2015-04-01    10 cat2                 9             8
11 2015-05-01    11 cat2                10             9
12 2015-06-01    12 cat2                11            10

I see that you want to summarize something in your real scenario (not the simplified scenario you present here). I'd do something like this, where the month floor and the statistic are calculated first, and the data is then regrouped by category alone for the lag. That first dplyr::ungroup() isn't required, but I like how it communicates the intent better.

# Roll the raw rows up to one row per category and month, then compute the
# one- and two-month lags of the rolled-up statistic within each category.
data.frame(
  date = as.Date(c(
    "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01",
    "2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"
  )),
  value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  category = as.factor(c(
    "cat1", "cat1", "cat1", "cat1", "cat1", "cat1",
    "cat2", "cat2", "cat2", "cat2", "cat2", "cat2"
  ))
) %>%
  dplyr::mutate(
    # "month" is the unit name lubridate documents for flooring to the first
    # day of the month.
    month_floor = floor_date(date, unit = "month")
  ) %>%
  group_by(category, month_floor) %>%
  summarize(
    value_mean = mean(value) # Or the rollup statistic you're referring to.
  ) %>%
  ungroup() %>% # Not strictly required; makes the regrouping below explicit.
  # Group by category only, so lag() can see every month of a category.
  group_by(category) %>%
  mutate(
    month_minus_1 = lag(value_mean, n = 1, order_by = month_floor),
    month_minus_2 = lag(value_mean, n = 2, order_by = month_floor)
  ) %>%
  ungroup()

Guess you like

Origin http://10.200.1.11:23101/article/api/json?id=397715&siteId=1