Generate pseudorandom dataframes with specific parameters

This function is a small utility that creates a dataframe of pseudorandom values with a set number of groups and a specific mean and standard deviation per group. Note that the total number of rows in the dataframe will be n * n_grps.

Usage

generate_df(n = 10L, n_grps = 1L, with_seed = NULL, mean = c(10), sd = mean/10)

Arguments

n: An integer indicating the number of rows per group, defaults to 10
n_grps: An integer indicating the number of groups, defaults to 1
with_seed: A seed to make the randomization reproducible
mean: A numeric vector giving the mean of the randomly generated values for each group; its length must equal n_grps, defaults to 10
sd: A numeric vector giving the standard deviation of the randomly generated values for each group; its length must equal n_grps, defaults to mean/10

Value

A tibble/dataframe with n * n_grps rows and the columns row_id, id, grp, and values

Examples

generate_df()
#> # A tibble: 10 × 4
#>    row_id id    grp   values
#>     <int> <chr> <chr>  <dbl>
#>  1      1 Q08   grp-1   8.91
#>  2      2 H06   grp-1  11.0 
#>  3      3 R10   grp-1   8.72
#>  4      4 U04   grp-1   8.82
#>  5      5 P07   grp-1   8.71
#>  6      6 Q09   grp-1   9.39
#>  7      7 V03   grp-1  10.1 
#>  8      8 D09   grp-1   9.29
#>  9      9 E10   grp-1   9.53
#> 10     10 I04   grp-1   7.54
generate_df(n = 100L, n_grps = 5L, with_seed = NULL, mean = seq(10, 50, length.out = 5))
#> # A tibble: 500 × 4
#>    row_id id    grp   values
#>     <int> <chr> <chr>  <dbl>
#>  1      1 A062  grp-1  10.4 
#>  2      2 E082  grp-1  11.2 
#>  3      3 E002  grp-1  10.1 
#>  4      4 B095  grp-1  11.4 
#>  5      5 L067  grp-1   7.94
#>  6      6 F079  grp-1   8.63
#>  7      7 B057  grp-1  10.2 
#>  8      8 T091  grp-1   8.79
#>  9      9 O020  grp-1  11.6 
#> 10     10 U042  grp-1  10.6 
#> # ℹ 490 more rows

library(dplyr)
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union
generate_df(
  100L,
  n_grps = 5,
  mean = seq(10, 50, length.out = 5)
) |>
  group_by(grp) |>
  summarise(
    mean = mean(values), # mean is approx mean
    sd = sd(values),     # sd is approx sd
    n = n(),             # each grp is of length n
    # showing that the sd default of mean/10 works
    `mean/sd` = round(mean / sd, 1)
  )
#> # A tibble: 5 × 5
#>   grp    mean    sd     n `mean/sd`
#>   <chr> <dbl> <dbl> <int>     <dbl>
#> 1 grp-1  9.88  1.21   100       8.2
#> 2 grp-2 19.9   2.28   100       8.7
#> 3 grp-3 29.7   3.21   100       9.2
#> 4 grp-4 40.4   4.19   100       9.6
#> 5 grp-5 50.0   5.22   100       9.6