
In cheapr, ‘cheap’ means fast and memory-efficient, and that’s exactly the philosophy that cheapr aims to follow.
You can install cheapr like so:
install.packages("cheapr")

or you can install the development version of cheapr:

remotes::install_github("NicChr/cheapr")

Some common operations that cheapr can do much faster and more efficiently include:
- Counting, finding, removing and replacing NA and scalar values
- Creating factors
- Creating multiple sequences in a vectorised way
- Sub-setting vectors and data frames efficiently
- Safe, flexible and fast greatest common divisor and lowest common multiple
- Lags/leads
- Lightweight integer64 support
- In-memory Math (no copies, vectors updated by reference)
- Summary statistics of data frame variables
- Binning of continuous data
Let’s first load the required packages
library(cheapr)
library(bench)

## NA

Because R mostly uses vectors and vectorised operations, there are few scalar-optimised operations.
cheapr provides tools to efficiently count, find, replace and remove scalars.
# Setup data with NA values
set.seed(42)
x <- sample(1:5, 30, TRUE)
x <- na_insert(x, n = 7)
cheapr_table(x, order = TRUE) # Fast table()
#> 1 2 3 4 5 <NA>
#> 6 6 3 4 4 7

### NA functions
na_count(x)
#> [1] 7
na_rm(x)
#> [1] 1 5 1 2 4 2 1 4 5 4 2 3 1 1 3 4 5 5 2 3 2 1 2
na_find(x)
#> [1] 4 8 11 15 22 24 26
na_replace(x, -99)
#> [1] 1 5 1 -99 2 4 2 -99 1 4 -99 5 4 2 -99 3 1 1 3 4 5 -99 5 -99 2 -99 3
#> [28] 2 1 2

### Scalar functions
val_count(x, 3)
#> [1] 3
val_rm(x, 3)
#> [1] 1 5 1 NA 2 4 2 NA 1 4 NA 5 4 2 NA 1 1 4 5 NA 5 NA 2 NA 2 1 2
val_find(x, 3)
#> [1] 16 19 27
val_replace(x, 3, 99)
#> [1] 1 5 1 NA 2 4 2 NA 1 4 NA 5 4 2 NA 99 1 1 99 4 5 NA 5 NA 2 NA 99 2 1 2

### Scalar based case-match
val_match(
x,
1 ~ "one",
2 ~ "two",
3 ~ "three",
.default = ">3"
)
#> [1] "one" ">3" "one" ">3" "two" ">3" "two" ">3" "one" ">3" ">3" ">3" ">3"
#> [14] "two" ">3" "three" "one" "one" "three" ">3" ">3" ">3" ">3" ">3" "two" ">3"
#> [27] "three" "two" "one" "two"m <- matrix(na_insert(rnorm(10^6), prop = 1/4), ncol = 10^3)
# Number of NA values by row
mark(row_na_counts(m),
rowSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 row_na_counts(m) 455µs 472.2µs 1946. 13.09KB 0
#> 2 rowSums(is.na(m)) 3.38ms 3.68ms 259. 3.85MB 27.9
# Number of NA values by col
mark(col_na_counts(m),
colSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 col_na_counts(m) 1.33ms 1.41ms 666. 13.09KB 0
#> 2 colSums(is.na(m)) 1.74ms 2.06ms 471. 3.82MB 45.4

is_na is a multi-threaded alternative to is.na.
x <- rnorm(10^6) |>
na_insert(10^5)
options(cheapr.cores = 4)
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 943µs 1.21ms 782. 3.81MB 130.
#> 2 is_na(x) 370µs 496.4µs 1837. 3.82MB 202.
options(cheapr.cores = 1)
mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 946µs 1.16ms 834. 3.81MB 121.
#> 2 is_na(x) 771µs 914.6µs 1055. 3.81MB 139.
### posixlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
tz = "UTC") |>
na_insert(10^5)
mark(is.na(hours), is_na(hours))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(hours) 1.04s 1.04s 0.965 61.05MB 0.965
#> 2 is_na(hours) 4.64ms 5.02ms 169. 7.65MB 19.9

It differs from is.na in two regards:

- A list element is regarded as NA when either that element is an NA value or it is a list containing only NA values.
- For data frames, is_na returns a logical vector where TRUE defines an empty row of only NA values.

# List example
is.na(list(NA, list(NA, NA), 10))
#> [1] TRUE FALSE FALSE
is_na(list(NA, list(NA, NA), 10))
#> [1] TRUE TRUE FALSE
# Data frame example
df <- new_df(x = c(1, NA, 3),
y = c(NA, NA, NA))
df
#> x y
#> 1 1 NA
#> 2 NA NA
#> 3 3 NA
is_na(df)
#> [1] FALSE TRUE FALSE
# The below identity should hold
identical(is_na(df), row_na_counts(df) == ncol(df))
#> [1] TRUE

is_na and all the NA handling functions fall back on calling is.na() if no suitable method is found. This means that custom objects like vctrs rcrds and more are supported.
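For instance, a minimal sketch of that fallback (an illustration, not from the cheapr docs, assuming the vctrs package is installed):

library(vctrs)

# A record vector with two fields; vctrs supplies its own is.na() method
r <- new_rcrd(list(a = c(1, NA, 3), b = c(NA, NA, 5)))

is.na(r)    # vctrs' definition of missingness for records
is_na(r)    # expected to agree, via the is.na() fallback
na_count(r) # likewise expected to use the same fallback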
## overview

Inspired by the excellent skimr package, overview() is a cheaper alternative designed for larger data.
df <- new_df(
x = sample.int(100, 10^6, TRUE),
y = as_factor(sample(LETTERS, 10^6, TRUE)),
z = rnorm(10^6)
)
overview(df)
#> obs: 1000000
#> cols: 3
#>
#> ----- Numeric -----
#> col n_missng p_complt n_unique mean p0 p25 p50 p75 p100 iqr sd hist
#> 1 x 0 1 100 50.52 1 25 51 76 100 51 28.88 ▇▇▇▇▇
#> 2 z 0 1 1000000 -0.00038 -4.58 -0.67 -0.00062 0.68 5.08 1.35 1 ▁▃▇▂▁
#>
#> ----- Categorical -----
#> col n_missng p_complt n_unique n_levels min max
#> 1 y 0 1 26 26 A Z
mark(overview(df, hist = FALSE))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 overview(df, hist = FALSE) 75.6ms 76.5ms 13.0 0B 0

## sset

sset(iris, 1:5)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5.0 3.6 1.4 0.2 setosa
sset(iris, 1:5, j = "Species")
#> Species
#> 1 setosa
#> 2 setosa
#> 3 setosa
#> 4 setosa
#> 5 setosa
# sset always returns a data frame when input is a data frame
sset(iris, 1, 1) # data frame
#> Sepal.Length
#> 1 5.1
iris[1, 1] # not a data frame
#> [1] 5.1
x <- sample.int(10^6, 10^4, TRUE)
y <- sample.int(10^6, 10^4, TRUE)
mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(x, x %in_% y) 87.6µs 117µs 7823. 109KB 10.9
#> 2 sset(x, x %in% y) 154.8µs 234µs 3783. 286KB 23.8
#> 3 x[x %in% y] 150.4µs 231µs 3903. 325KB 26.0

sset uses an internal range-based subset when i is an ALTREP integer sequence of the form m:n.
mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, 0:10^5) 302.8µs 442.85µs 2168. 1.53MB 38.7
#> 2 df[0:10^5, , drop = FALSE] 6.91ms 7.28ms 131. 4.83MB 6.68

It also accepts negative indexes.
mark(sset(df, -10^4:0),
df[-10^4:0, , drop = FALSE],
check = FALSE) # The only difference is the row names
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, -10^4:0) 2.68ms 3ms 326. 15.1MB 97.5
#> 2 df[-10^4:0, , drop = FALSE] 26.57ms 26.6ms 37.6 72.5MB 527.

The biggest difference between sset and [ is the way logical vectors are handled. The two main differences when i is a logical vector are:

- NA values are ignored; only the locations of TRUE values are used.
- i must be the same length as x and is not recycled.

# Examples with NAs
x <- c(1, 5, NA, NA, -5)
x[x > 0]
#> [1] 1 5 NA NA
sset(x, x > 0)
#> [1] 1 5
# Example with length(i) < length(x)
sset(x, TRUE)
#> Error in sset.default(x, TRUE): `length(i)` must match `length(x)` when `i` is a logical vector
# This is equivalent
x[TRUE]
#> [1] 1 5 NA NA -5
# to..
sset(x)
#> [1] 1 5 NA NA -5

## lag_()

set.seed(37)
lag_(1:10, 3) # Lag(3)
#> [1] NA NA NA 1 2 3 4 5 6 7
lag_(1:10, -3) # Lead(3)
#> [1] 4 5 6 7 8 9 10 NA NA NA
# Using an example from data.table
library(data.table)
#> data.table 1.17.2 using 9 threads (see ?getDTthreads). Latest news: r-datatable.com
#>
#> Attaching package: 'data.table'
#>
#> The following object is masked from 'package:cheapr':
#>
#> address
dt <- data.table(year=2010:2014, v1=runif(5), v2=1:5, v3=letters[1:5])
# Similar to data.table::shift()
lag_(dt, 1) # Lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
lag_(dt, -1) # Lead
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2011 0.07883715 2 b
#> 2: 2012 0.64879698 3 c
#> 3: 2013 0.49685336 4 d
#> 4: 2014 0.71878731 5 e
#> 5: NA NA NA <NA>With lag_ we can update variables by reference,
including entire data frames
# At the moment, shift() cannot do this
lag_(dt, set = TRUE)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d
dt # Was updated by reference
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d

lag2_ is a more generalised variant that supports vectors of lags, custom ordering and run lengths (run lengths are sketched further below).
lag2_(dt, order = 5:1) # Reverse order lag (same as lead)
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, -1) # Same as above
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2011 0.07883715 2 b
#> 3: 2012 0.64879698 3 c
#> 4: 2013 0.49685336 4 d
#> 5: NA NA NA <NA>
lag2_(dt, c(1, -1)) # Alternating lead/lag
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: NA NA NA <NA>
#> 2: 2011 0.07883715 2 b
#> 3: 2010 0.54964085 1 a
#> 4: 2013 0.49685336 4 d
#> 5: 2012 0.64879698 3 c
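As a sketch of the run-lengths idea (a hypothetical illustration, not from the cheapr docs, assuming the argument is named run_lengths):

# Treat 1:6 as two consecutive runs of length 3 and lag within each run,
# so the lag is expected to restart at element 4
lag2_(1:6, 1, run_lengths = c(3, 3))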
lag2_(dt, c(-1, 0, 0, 0, 0)) # Lead only the first row
#> year v1 v2 v3
#> <int> <num> <int> <char>
#> 1: 2010 0.54964085 1 a
#> 2: 2010 0.54964085 1 a
#> 3: 2011 0.07883715 2 b
#> 4: 2012 0.64879698 3 c
#> 5: 2013 0.49685336 4 d

gcd2(5, 25)
#> [1] 5
scm2(5, 6)
#> [1] 30
gcd(seq(5, 25, by = 5))
#> [1] 5
scm(seq(5, 25, by = 5))
#> [1] 300
x <- seq(1L, 1000000L, 1L)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 700ns 900ns 762787. 0B 76.3
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 31.6ms 32.6ms 30.1 0B 0

As an example, to create 3 sequences with different increments, the usual approach might be to use lapply to loop through the increment values together with seq().
# Base R
increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2
#> [28] 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9
#> [55] 5.0

In cheapr you can use seq_(), which accepts vector arguments.
seq_(start, end, increments)
#> [1] 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2
#> [28] 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9
#> [55] 5.0

Use add_id = TRUE to label the individual sequences.
seq_(start, end, increments, add_id = TRUE)
#> 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 1.0 2.0 3.0 4.0 5.0 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3
#> 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
#> 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0

If you know the sizes of your sequences beforehand, use sequence_().
seq_sizes <- c(3, 5, 10)
sequence_(seq_sizes, from = 0, by = 1/3, add_id = TRUE)
#> 1 1 1 2 2 2 2 2 3 3 3
#> 0.0000000 0.3333333 0.6666667 0.0000000 0.3333333 0.6666667 1.0000000 1.3333333 0.0000000 0.3333333 0.6666667
#> 3 3 3 3 3 3 3
#> 1.0000000 1.3333333 1.6666667 2.0000000 2.3333333 2.6666667 3.0000000

You can also calculate the sequence sizes using seq_size().
seq_size(start, end, increments)
#> [1] 5 9 41

cheapr provides a full set of common math functions that can transform numeric vectors in-place (no copies).
(x <- seq(0, 5, by = 0.5))
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
# x is modified in-place
set_add(x, 10);x
#> [1] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0
#> [1] 10.0 10.5 11.0 11.5 12.0 12.5 13.0 13.5 14.0 14.5 15.0
set_subtract(x, 10);x
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
set_multiply(x, 10);x
#> [1] 0 5 10 15 20 25 30 35 40 45 50
#> [1] 0 5 10 15 20 25 30 35 40 45 50
set_divide(x, 10);x
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
set_change_sign(x);x
#> [1] 0.0 -0.5 -1.0 -1.5 -2.0 -2.5 -3.0 -3.5 -4.0 -4.5 -5.0
#> [1] 0.0 -0.5 -1.0 -1.5 -2.0 -2.5 -3.0 -3.5 -4.0 -4.5 -5.0
set_abs(x);x
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
#> [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0
set_round(x);x
#> [1] 0 0 1 2 2 2 3 4 4 4 5
#> [1] 0 0 1 2 2 2 3 4 4 4 5
set_log(x);x
#> [1] -Inf -Inf 0.0000000 0.6931472 0.6931472 0.6931472 1.0986123 1.3862944 1.3862944 1.3862944
#> [11] 1.6094379
#> [1] -Inf -Inf 0.0000000 0.6931472 0.6931472 0.6931472 1.0986123 1.3862944 1.3862944 1.3862944
#> [11] 1.6094379

These in-place functions are not always faster than using normal R math functions. This becomes apparent when performing multiple operations which R can process simultaneously.
x <- rnorm(10^6)
mark(
x * 10 * 20 + 1 - 1 ,
set_subtract(set_add(set_multiply(set_multiply(x, 10), 20), 1), 1)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl>
#> 1 x * 10 * 20 + 1 - 1 2.35ms 2.64ms 368. 7.63MB 37.6
#> 2 set_subtract(set_add(set_multiply(set_multiply(x, 10), 20), 1), 1) 3.21ms 3.43ms 275. 0B 0

## .args

cheapr now provides .args as a means of providing a list of arguments instead of .... This is designed to replace the use of do.call(). In practice this means that users can either supply objects directly to the dots ... or as a list of objects.
# The below lines are equivalent
cheapr_c(1, 2, 3)
#> [1] 1 2 3
cheapr_c(.args = list(1, 2, 3))
#> [1] 1 2 3

A very common scenario is having a list of objects that you would like to combine into a vector. Normally one would call do.call(c, x), but it is much more efficient to use the .args argument in cheapr_c().
x <- rep(list(0), 10^5)
mark(
do.call(c, x),
cheapr_c(.args = x)
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 do.call(c, x) 2.93ms 3.73ms 232. 781KB 116.
#> 2 cheapr_c(.args = x) 909.7µs 992.5µs 929. 781KB 4.22
# Matches the speed of `unlist()` without removing attributes
unlist(list(Sys.Date()), recursive = FALSE)
#> [1] 20233
cheapr_c(.args = list(Sys.Date()))
#> [1] "2025-05-25"Fast base-R style recycling using recycle()
recycle(letters, pi)
#> [[1]]
#> [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"
#>
#> [[2]]
#> [1] 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593
#> [13] 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593
#> [25] 3.141593 3.141593
# Data frame rows are recycled
recycle(vector = 1:10, data = cars)
#> $vector
#> [1] 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6 7 8 9 10 1 2 3 4 5 6
#> [37] 7 8 9 10 1 2 3 4 5 6 7 8 9 10
#>
#> $data
#> speed dist
#> 1 4 2
#> 2 4 10
#> 3 7 4
#> 4 7 22
#> 5 8 16
#> 6 9 10
#> 7 10 18
#> 8 10 26
#> 9 10 34
#> 10 11 17
#> 11 11 28
#> 12 12 14
#> 13 12 20
#> 14 12 24
#> 15 12 28
#> 16 13 26
#> 17 13 34
#> 18 13 34
#> 19 13 46
#> 20 14 26
#> 21 14 36
#> 22 14 60
#> 23 14 80
#> 24 15 20
#> 25 15 26
#> 26 15 54
#> 27 16 32
#> 28 16 40
#> 29 17 32
#> 30 17 40
#> 31 17 50
#> 32 18 42
#> 33 18 56
#> 34 18 76
#> 35 18 84
#> 36 19 36
#> 37 19 46
#> 38 19 68
#> 39 20 32
#> 40 20 48
#> 41 20 52
#> 42 20 56
#> 43 20 64
#> 44 22 66
#> 45 23 54
#> 46 24 70
#> 47 24 92
#> 48 24 93
#> 49 24 120
#> 50 25 85
# Using .args
recycle(.args = list(letters, pi))
#> [[1]]
#> [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z"
#>
#> [[2]]
#> [1] 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593
#> [13] 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593 3.141593
#> [25] 3.141593 3.141593

Sizes are recycled to the common maximum length, except when any vector has length 0 (excluding NULL, which is ignored), in which case they are all recycled to length 0.
recycle(a = 1:3, b = 1:10, c = iris, d = numeric())
#> $a
#> integer(0)
#>
#> $b
#> integer(0)
#>
#> $c
#> [1] Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <0 rows> (or 0-length row.names)
#>
#> $d
#> numeric(0)

cheapr provides some helpers in the form of shallow_copy, semi_copy and deep_copy.
mark(shallow_copy(iris))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 shallow_copy(iris) 300ns 400ns 1795783. 6.34KB 0
mark(deep_copy(iris))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 deep_copy(iris) 700ns 1.1µs 455454. 9.34KB 45.5
mark(semi_copy(iris))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 semi_copy(iris) 600ns 1.1µs 399343. 9.36KB 0

- shallow_copy: Shallow-copies list elements and attributes. When given an atomic vector it fully copies the vector, so it is mostly useful for lists.
- deep_copy: Fully (deep) copies everything, including attributes.
- semi_copy: Like deep_copy it deep-copies everything except attributes, which it shallow-copies. In practice this turns out to be more efficient.
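A quick sketch of the practical difference (an illustration, not from the cheapr docs, assuming cheapr::address() returns a comparable memory address):

d <- new_df(x = c(1, 2, 3, 4, 5))

# Columns of a shallow copy point at the same memory as the original...
identical(cheapr::address(d$x), cheapr::address(shallow_copy(d)$x))
# ...while a deep copy allocates new memory for them
identical(cheapr::address(d$x), cheapr::address(deep_copy(d)$x))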
### semi_copy() vs deep_copy()
df <- new_df(x = integer(10^6))
attr(df, "my_attr") <- integer(10^6)
# Take note of the memory allocation
mark(
semi_copy(df), # Only deep copies the data
deep_copy(df) # Deep copies "my_attr" as well
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 semi_copy(df) 636.9µs 682.6µs 1381. 3.81MB 68.3
#> 2 deep_copy(df) 1.16ms 1.4ms 665. 7.63MB 68.3

With cheapr you can add and remove attributes flexibly using attrs_add().
To remove all attributes, use attrs_rm().
To remove specific attributes, use
attrs_add(attr = NULL).
(x <- attrs_add(1:10, .length = 10, .type = "integer"))
#> [1] 1 2 3 4 5 6 7 8 9 10
#> attr(,".length")
#> [1] 10
#> attr(,".type")
#> [1] "integer"
attrs_add(x, .type = NULL) # Remove specific attribute '.type'
#> [1] 1 2 3 4 5 6 7 8 9 10
#> attr(,".length")
#> [1] 10
attrs_rm(x) # Clear all attributes
#> [1] 1 2 3 4 5 6 7 8 9 10
# With .args
y <- 11:20
attrs_add(y, .args = attributes(x))
#> [1] 11 12 13 14 15 16 17 18 19 20
#> attr(,".length")
#> [1] 10
#> attr(,".type")
#> [1] "integer"Both functions allow setting attributes in-place. This turns out to be very useful in avoiding implicit copies that R performs when it detects that the data has been modified.
This must be used with care to not overwrite an existing object’s attributes. Therefore it is best-practice to only use in-place attribute manipulation on fresh objects, i.e objects that you can ensure are newly created.
add_length_class <- function(x) {
  attr(x, ".length") <- length(x)
  attr(x, ".class") <- class(x)
  x
}

add_length_class_in_place <- function(x) {
  attrs_add(
    x, .length = length(x), .class = class(x),
    .set = TRUE
  )
}
# Notice the memory allocations
# we expect only 3.81 MB to be allocated
mark(
add_length_class(integer(10^6)),
add_length_class_in_place(integer(10^6)),
iterations = 1
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 add_length_class(integer(10^6)) 3.65ms 3.65ms 274. 3.81MB 0
#> 2 add_length_class_in_place(integer(10^6)) 2.01ms 2.01ms 498. 3.81MB 0
mark(
add_length_class(integer(10^6)),
add_length_class_in_place(integer(10^6)),
iterations = 1
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 add_length_class(integer(10^6)) 1.47ms 1.47ms 683. 7.63MB 0
#> 2 add_length_class_in_place(integer(10^6)) 885.8µs 885.8µs 1129. 3.81MB 0
# R detected that the vector we created had been modified (because it was)
# and created a copy
# When we add the attributes in-place to our fresh object, no copies are
# made

x <- rep(TRUE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 946.2µs 1.11ms 808. 3.82MB 52.4
#> 2 base_which 1.44ms 1.68ms 573. 7.63MB 62.6
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 118µs 124µs 7093. 0B 0
#> 2 base_which 228µs 256µs 3587. 3.81MB 128.
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 610µs 723.65µs 1182. 1.91MB 21.2
#> 2 base_which 986µs 1.17ms 828. 7.63MB 71.7
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.32ms 1.4ms 666. 3.81MB 30.5
#> 2 base_which 1.74ms 1.96ms 489. 9.54MB 61.8
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 751.3µs 843.85µs 1084. 1.89MB 26.3
#> 2 base_which 4.08ms 4.21ms 227. 5.7MB 13.8

x <- sample(seq(-10^3, 10^3, 0.01))
y <- do.call(paste0, expand.grid(letters, letters, letters, letters))
mark(cheapr_factor = factor_(x),
base_factor = factor(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 9.89ms 10.4ms 93.0 4.59MB 2.73
#> 2 base_factor 314.43ms 314.4ms 3.18 27.84MB 3.18
mark(cheapr_factor = factor_(x, order = FALSE),
base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.79ms 5.1ms 183. 1.53MB 4.52
#> 2 base_factor 517.44ms 517.4ms 1.93 22.79MB 0
mark(cheapr_factor = factor_(y),
base_factor = factor(y))
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 191.37ms 199.66ms 4.94 5.23MB 0
#> 2 base_factor 2.76s 2.76s 0.362 54.35MB 0.362
mark(cheapr_factor = factor_(y, order = FALSE),
base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 8.46ms 9.47ms 101. 3.49MB 7.19
#> 2 base_factor 54.79ms 56.2ms 17.7 39.89MB 29.5

x <- sample.int(10^6, 10^5, TRUE)
y <- sample.int(10^6, 10^5, TRUE)
mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
base_intersect = intersect(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_intersect 2.61ms 2.84ms 340. 1.19MB 4.45
#> 2 base_intersect 4.86ms 5.2ms 182. 6.41MB 17.3
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_setdiff 2.79ms 2.96ms 313. 1.79MB 6.76
#> 2 base_setdiff 5.08ms 5.44ms 172. 6.96MB 13.9

## %in_% and %!in_%

mark(cheapr = x %in_% y,
base = x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.72ms 1.84ms 492. 781.34KB 4.38
#> 2 base 2.27ms 2.54ms 380. 2.53MB 13.0
mark(cheapr = x %!in_% y,
base = !x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.66ms 1.83ms 508. 792.3KB 6.95
#> 2 base 2.33ms 2.68ms 358. 2.91MB 12.9

## as_discrete

as_discrete is a cheaper alternative to cut.
x <- rnorm(10^6)
b <- seq(0, max(x), 0.2)
mark(cheapr_cut = as_discrete(x, b, left = FALSE),
base_cut = cut(x, b))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_cut 14.2ms 14.9ms 64.7 3.92MB 4.47
#> 2 base_cut 27.3ms 30.8ms 33.1 15.32MB 18.4

## cheapr_if_else

A cheap alternative to ifelse.
mark(
cheapr_if_else(x >= 0, "pos", "neg"),
ifelse(x >= 0, "pos", "neg"),
data.table::fifelse(x >= 0, "pos", "neg")
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 "cheapr_if_else(x >= 0, \"pos\", \"neg\")" 10.01ms 12.1ms 76.9 11.4MB 13.8
#> 2 "ifelse(x >= 0, \"pos\", \"neg\")" 138.48ms 142.3ms 7.00 53.4MB 7.00
#> 3 "data.table::fifelse(x >= 0, \"pos\", \"neg\")" 9.94ms 10.6ms 80.4 11.4MB 15.7casecheapr’s version of a case-when statement, with mostly the same
arguments as dplyr::case_when but similar efficiency as
data.table::fcase
mark(case(
x >= 0 ~ "pos",
x < 0 ~ "neg",
.default = "Unknown"
),
data.table::fcase(
x >= 0, "pos",
x < 0, "neg",
rep_len(TRUE, length(x)), "Unknown"
))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl>
#> 1 "case(x >= 0 ~ \"pos\", x < 0 ~ \"neg\", .default = \"Unknown\")" 20.4ms 22.1ms 45.1 28.8MB 50.1
#> 2 "data.table::fcase(x >= 0, \"pos\", x < 0, \"neg\", rep_len(TRUE, … 18.9ms 20.1ms 49.3 26.7MB 31.4val_match is an even cheaper special variant of
case when all LHS expressions are length-1 vectors, i.e
scalars
x <- round(rnorm(10^6))
mark(
val_match(x, 1 ~ Inf, 2 ~ -Inf, .default = NaN),
case(x == 1 ~ Inf,
x == 2 ~ -Inf,
.default = NaN),
data.table::fcase(x == 1, Inf,
x == 2, -Inf,
rep_len(TRUE, length(x)), NaN)
)
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:t> <dbl> <bch:byt> <dbl>
#> 1 val_match(x, 1 ~ Inf, 2 ~ -Inf, .default = NaN) 4.24ms 4.66ms 206. 8.79MB 41.1
#> 2 case(x == 1 ~ Inf, x == 2 ~ -Inf, .default = NaN) 16.67ms 17.21ms 55.9 27.63MB 45.8
#> 3 data.table::fcase(x == 1, Inf, x == 2, -Inf, rep_len(TRUE, lengt… 14.21ms 15.76ms 62.6 30.52MB 33.2

get_breaks is a very fast function for generating pretty, equal-width breaks. It is similar to base::pretty, though somewhat less flexible, with simpler arguments.
x <- with_local_seed(rnorm(10^5), 112)
# approximately 10 breaks
get_breaks(x, 10)
#> [1] -6 -4 -2 0 2 4 6
pretty(x, 10)
#> [1] -6 -5 -4 -3 -2 -1 0 1 2 3 4 5
mark(
get_breaks(x, 20),
pretty(x, 20),
check = FALSE
)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 get_breaks(x, 20) 61µs 63µs 14291. 0B 0
#> 2 pretty(x, 20) 407µs 708µs 1369. 1.91MB 23.6
# Not pretty but equal width breaks
get_breaks(x, 5, pretty = FALSE)
#> [1] -5.0135893 -3.2004889 -1.3873886 0.4257118 2.2388121 4.0519125
diff(get_breaks(x, 5, pretty = FALSE)) # Widths
#> [1] 1.8131 1.8131 1.8131 1.8131 1.8131

It can accept both data and a length-two vector representing a range, meaning it can easily be used in ggplot2 and base R plots.
library(ggplot2)
gg <- airquality |>
ggplot(aes(x = Ozone, y = Wind)) +
geom_point() +
geom_smooth(se = FALSE)
# Add our breaks
gg +
scale_x_continuous(breaks = get_breaks)
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#> Warning: Removed 37 rows containing non-finite outside the scale range (`stat_smooth()`).
#> Warning: Removed 37 rows containing missing values or values outside the scale range (`geom_point()`).
# More breaks
# get_breaks accepts a range too
gg +
scale_x_continuous(breaks = \(x) get_breaks(range(x), 20))
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#> Warning: Removed 37 rows containing non-finite outside the scale range (`stat_smooth()`).
#> Removed 37 rows containing missing values or values outside the scale range (`geom_point()`).
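For base graphics, a minimal sketch along the same lines (an illustration, not from the cheapr docs):

# Draw the scatter plot without the default x axis, then place the ticks
# at pretty breaks computed from the range of the data
plot(airquality$Ozone, airquality$Wind, xaxt = "n",
     xlab = "Ozone", ylab = "Wind")
axis(1, at = get_breaks(range(airquality$Ozone, na.rm = TRUE), 10))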