Rdatatable · ben-schwen · Oct 28, 2025 · Oct 29, 2025 · Oct 30, 2025 · Oct 30, 2025
@@ -34,6 +34,14 @@
 
 7. Fixed compilation failure like "error: unknown type name 'siginfo_t'" in v1.18.0 in some strict environments, e.g., FreeBSD, where the header file declaring the POSIX function `waitid` does not transitively include the header file defining the `siginfo_t` type, [#7516](https://github.com/rdatatable/data.table/issues/7516). Thanks to @jszhao for the report and @aitap for the fix.
 
+8. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation.
+
+    This rewrite also introduces several new optimizations:
+      - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336)
+      - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))`) [#5032](https://github.com/Rdatatable/data.table/issues/5032)
+      - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934)
+      - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815)
+
 ### Notes
 
 1. {data.table} now depends on R 3.5.0 (2018).

@@ -380,7 +380,39 @@ utf8_check = function(test_str) identical(test_str, enc2native(test_str))
 test = function(num, x, y=TRUE,
                 error=NULL, warning=NULL, message=NULL, output=NULL, notOutput=NULL, ignore.warning=NULL,
                 options=NULL, env=NULL,
-                context=NULL, requires_utf8=FALSE) {
+                context=NULL, requires_utf8=FALSE, optimize=NULL) {
+  # if optimization is provided, test across multiple optimization levels
+  if (!is.null(optimize)) {
+    if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L))
+      stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) # nocov
+    cl = match.call()
+    if ("datatable.optimize" %in% names(cl$options))
+      stopf("Trying to set optimization level through both options= and optimize=") # nocov
+    cl$optimize = NULL  # Remove optimization levels from the recursive call
+
+    # Check if y was explicitly provided (not just the default)
+    y_provided = !missing(y)
+    vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment())
+    vector_params = vector_params[lengths(vector_params) > 0L]
+    compare = !y_provided && length(optimize)>1L && !length(vector_params)
+    # When optimize has multiple levels, vector params are recycled across levels.
+    if (length(optimize) > 1L && "warning" %in% names(vector_params) && length(vector_params$warning) > 1L)
+      warningf("warning= with multiple values is recycled across optimize levels, not treated as multiple warnings in one run")
+
+    for (i in seq_along(optimize)) {
+      cl$num = num + (i - 1L) * 1e-6
+      opt_level = list(datatable.optimize = optimize[i])
+      cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level
+      for (param in names(vector_params)) {
+        val = vector_params[[param]]
+        cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels
+      }
+
+      if (compare && i == 1L) cl$y = eval(cl$x, parent.frame())
+      eval(cl, parent.frame()) # actual test call
+    }
+    return(invisible())
+  }
   if (!is.null(env)) {
     old = Sys.getenv(names(env), names=TRUE, unset=NA)
     to_unset = !lengths(env)

@@ -190,24 +190,14 @@ DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
 test(301.1, nrow(DT[,sum(B),by=C])==100010)
 
 # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too.
-local({
-  old = options(datatable.optimize=0L); on.exit(options(old))
-  set.seed(1)
-  DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
-  test(637.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
-  test(637.2, key(DT[J(43L), a:=99L]), NULL)
-  setkey(DT, a)
-  test(637.3, key(DT[, a:=99L, by=a]), NULL)
-})
-local({
-  options(datatable.optimize=2L); on.exit(options(old))
-  set.seed(1)
-  DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
-  test(638.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
-  test(638.2, key(DT[J(43L), a:=99L]), NULL)
-  setkey(DT,a)
-  test(638.3, key(DT[, a:=99L, by=a]), NULL)
-})
+set.seed(1)
+DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a")
+opt = c(0L,2L)
+test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a"))
+test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL)
+setkey(DT, a)
+test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL)
+# test 637 now covers the former tests 637 (optimize=0) and 638 (optimize=2) by running each case at both optimization levels
 
 # Test X[Y] slowdown, #2216
 # Many minutes in 1.8.2!  Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes