7

I have the following dataset:

Enter image description here

I want to impute the NA values in the n_j column, from the second row onwards, using the previous row's values in the formula n_j - (d_j + c_j).

To create the data:

    # Example data: 8 time intervals with integer columns d_j, c_j and n_j.
    # n_j is known only for the first row; every other n_j is NA.
    df <- structure(
      list(
        time_intervals = structure(
          1:8,
          levels = c(
            "[0,12)", "[12,24)", "[24,36)", "[36,48)",
            "[48,60)", "[60,72)", "[72,84)", "[84,96]"
          ),
          class = "factor"
        ),
        d_j = c(16L, 10L, 1L, 3L, 2L, 2L, 0L, 2L),
        c_j = c(4L, 4L, 0L, 1L, 2L, 0L, 1L, 0L),
        n_j = c(48L, NA, NA, NA, NA, NA, NA, NA)
      ),
      row.names = c(NA, -8L),
      class = c("tbl_df", "tbl", "data.frame")
    )

I managed to do this with a for loop:

# Repeatedly apply one vectorised imputation step to df.
# A missing n_j is replaced by the previous row's n_j - (d_j + c_j). With the
# example data only the first remaining NA has a known predecessor on each
# pass, so the step is repeated once per row to reach the last row.
# seq_len() is used instead of 1:nrow(df) so that an empty data frame yields
# zero iterations rather than iterating over c(1, 0).
for (i in seq_len(nrow(df))) {
  df <- df |>
    mutate(
      n_j = ifelse(is.na(n_j), lag(n_j) - (lag(d_j) + lag(c_j)), n_j)
    )
}

Is there a way to do this using purrr::map or other Tidyverse functions?

1
  • 1
    Note: Imputing is not a typo - "5. (transitive, statistics) To replace missing data with substituted values." Commented Dec 25, 2023 at 9:12

6 Answers 6

13

A one-liner with dplyr

We can fill the NAs in the original column with coalesce(). The replacement values come from negating the lagged d_j and c_j. Finally, a cumulative sum yields the desired output. This should be fairly efficient, because it relies on vectorized subtraction and the very fast cumsum.

# Replace each missing n_j with that row's decrement (minus the previous row's
# d_j and c_j), then take a running total so the known first n_j carries down.
df |>
  mutate(
    n_j = cumsum(coalesce(n_j, -lag(d_j) - lag(c_j)))
  )
# A tibble: 8 × 4

Output:

  time_intervals   d_j   c_j   n_j
  <fct>          <int> <int> <int>
1 [0,12)            16     4    48
2 [12,24)           10     4    28
3 [24,36)            1     0    14
4 [36,48)            3     1    13
5 [48,60)            2     2     9
6 [60,72)            2     0     5
7 [72,84)            0     1     3
8 [84,96]            2     0     2
10

Use:

 # Keep the single known n_j, then subtract from it the running total of
 # d_j + c_j over all earlier rows (the last running total is not needed).
 start <- df[["n_j"]][1]
 transform(df, n_j = c(start, start - head(cumsum(d_j + c_j), -1)))

Output:

  time_intervals d_j c_j n_j
1         [0,12)  16   4  48
2        [12,24)  10   4  28
3        [24,36)   1   0  14
4        [36,48)   3   1  13
5        [48,60)   2   2   9
6        [60,72)   2   0   5
7        [72,84)   0   1   3
8        [84,96]   2   0   2
3
  • Thanks for the answer but I was looking for something in the tidyverse Commented Dec 22, 2023 at 22:07
  • 4
    @AbdullahAbdelaziz just change transform to mutate, i.e. df %>% mutate(n_j = c(n_j[1], n_j[1] - cumsum(d_j + c_j)[-n()])); there is not much difference
    – Onyambu
    Commented Dec 22, 2023 at 22:11
  • 6
    @AbdullahAbdelaziz, why would you eschew a one-liner answer like this? There are certainly times when dplyr-based answers can be more elegant, easier, faster, etc ... this is not one of them. I suggest against blindly assuming anything non-tidyverse is too hard, not good enough, or something else.
    – r2evans
    Commented Dec 23, 2023 at 0:08
6

One solution using accumulate2 from purrr is:

library(tidyverse)

df %>%
  mutate(
    # Fold over the (d_j, c_j) pairs, starting from the first n_j; each step
    # subtracts one row's d_j + c_j from the running value. The fold returns
    # one more value than there are rows, so the final one is dropped.
    n_j = head(
      accumulate2(
        d_j,
        c_j,
        \(prev, dj, cj) prev - (dj + cj),
        .init = first(n_j)
      ),
      -1
    )
  )

Output

  time_intervals   d_j   c_j   n_j
  <fct>          <int> <int> <int>
1 [0,12)            16     4    48
2 [12,24)           10     4    28
3 [24,36)            1     0    14
4 [36,48)            3     1    13
5 [48,60)            2     2     9
6 [60,72)            2     0     5
7 [72,84)            0     1     3
8 [84,96]            2     0     2
4

If you want "recursive" or "iterative" things, you can try Reduce like here

transform(
  df,
  n_j = {
    # Running subtraction of d_j + c_j, seeded with the first n_j. The result
    # has one more element than there are rows, so the last one is removed.
    running <- Reduce(
      function(prev, step) prev - step,
      d_j + c_j,
      init = n_j[1],
      accumulate = TRUE
    )
    running[-length(running)]
  }
)

which gives

  time_intervals d_j c_j n_j
1         [0,12)  16   4  48
2        [12,24)  10   4  28
3        [24,36)   1   0  14
4        [36,48)   3   1  13
5        [48,60)   2   2   9
6        [60,72)   2   0   5
7        [72,84)   0   1   3
8        [84,96]   2   0   2
4

Another solution using accumulate:

library(purrr)
library(dplyr)

df %>%
  mutate(
    # Walk rows 1..(n - 1): keep an already-known n_j for the next row,
    # otherwise derive it from the running value minus this row's d_j + c_j.
    # Columns are read from the data passed to mutate() rather than from the
    # global `df`, so the result stays correct when the piped data differs from
    # that object. A plain if/else replaces ifelse() because each test is a
    # single value. `.keep = "unused"` is no longer given: it would now drop
    # d_j and c_j, which the step function reads directly.
    n_j = accumulate(
      seq_len(n() - 1),
      \(prev, i) {
        if (is.na(n_j[i + 1])) {
          prev - (d_j[i] + c_j[i])
        } else {
          n_j[i + 1]
        }
      },
      .init = first(n_j)
    )
  )

Output:

 time_intervals   d_j   c_j   n_j
  <fct>          <int> <int> <int>
1 [0,12)            16     4    48
2 [12,24)           10     4    28
3 [24,36)            1     0    14
4 [36,48)            3     1    13
5 [48,60)            2     2     9
6 [60,72)            2     0     5
7 [72,84)            0     1     3
8 [84,96]            2     0     2
-2

Use the lag function:

# Single pass over df: a missing n_j becomes the previous row's n_j minus the
# current row's d_j + c_j. Only NAs whose previous n_j is already known are
# filled; rows further down remain NA.
# NOTE(review): the question's loop subtracts the previous row's d_j + c_j.
df |>
  mutate(n_j = ifelse(is.na(n_j), lag(n_j, 1) - (d_j + c_j), n_j))

If I understand correctly, this will give you what you wanted: the n_j from the previous row (otherwise the result is NA) minus the sum of d_j and c_j of the current row.

1
  • 2
    It will do this only for the second row Commented Dec 22, 2023 at 22:07

Not the answer you're looking for? Browse other questions tagged or ask your own question.