From ebd152d585a02593ca67f3b8b7d87e3f0fa0c99c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 28 Oct 2025 18:09:00 +0100 Subject: [PATCH 01/78] modular optimization paths - init --- R/data.table.R | 610 +++++++++++++++++++++++++++++-------------------- 1 file changed, 360 insertions(+), 250 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index db74384c4b..255c9aeac0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -147,6 +147,350 @@ replace_dot_alias = function(e) { } } +# Transform lapply(.SD, fun) or Map(fun, .SD) into list(fun(col1), fun(col2), ...) +.massageSD = function(jsub, sdvars, SDenv, funi) { + txt = as.list(jsub)[-1L] + if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 + # support Map instead of lapply #5336 + fun = if (jsub %iscall% "Map") txt[[1L]] else txt[[2L]] + if (fun %iscall% "function") { + # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT + # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means + # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) + # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) + thisfun = paste0("..FUN", funi) # Fix for #985 + assign(thisfun, eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD + lockBinding(thisfun, SDenv) + txt[[1L]] = as.name(thisfun) + } else { + if (is.character(fun)) fun = as.name(fun) + txt[[1L]] = fun + } + ans = vector("list", length(sdvars)+1L) + ans[[1L]] = as.name("list") + for (ii in seq_along(sdvars)) { + txt[[2L]] = as.name(sdvars[ii]) + ans[[ii+1L]] = as.call(txt) + } + jsub = as.call(ans) # important no names here + jvnames = sdvars # but here instead + list(jsub=jsub, jvnames=jvnames, funi=funi+1L) + # It may seem inefficient to construct a potentially long expression. But, consider calling + # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it + # is called, involving small memory allocations. + # The R level lapply calls as.list which needs a shallow copy. + # lapply also does a setAttib of names (duplicating the same names over and over again + # for each group) which is terrible for our needs. We replace all that with a + # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol + # lookup), and the eval() inside dogroups hardly has to do anything. All this results in + # overhead minimised. We don't need to worry about the env passed to the eval in a possible + # lapply replacement, or how to pass ... efficiently to it. + # Plus we optimize lapply first, so that mean() can be optimized too as well, next. +} + +# Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD) +# return NULL for no optimization possible +.optimize_sd_subset = function(jsub, sdvars, SDenv, envir) { + if (!is.call(jsub) || length(jsub) < 2L || !is.name(jsub[[2L]]) || jsub[[2L]] != ".SD") return(NULL) + + # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 + subopt = length(jsub) == 3L && + (jsub %iscall% "[" || + (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), SDenv$.SDall, envir))) && + (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") + headopt = jsub %iscall% c("head", "tail") + firstopt = jsub %iscall% c("first", "last") # fix for #2030 + if (subopt || headopt || firstopt) { + if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + jsub_new = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) + return(list(jsub=jsub_new, jvnames=sdvars)) + } + + NULL +} + +# Optimize c(...) expressions +.optimize_c_expr = function(jsub, jvnames, sdvars, SDenv, funi, envir) { + if (!jsub %iscall% "c" || length(jsub) <= 1L) { + return(list(jsub=jsub, jvnames=jvnames, funi=funi, optimized=FALSE)) + } + # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. + # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains + # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* + # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* + # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. + # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output + # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. + # For now, we optimise all functions mentioned in 'optfuns' below. + optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") + is_valid = TRUE + any_SD = FALSE + jsubl = as.list.default(jsub) + oldjvnames = jvnames + jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. + # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! + for (i_ in 2L:length(jsubl)) { + this = jsub[[i_]] + # Case 1: Plain name (.SD or .N) + if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names + if (this == ".SD") { # optimise '.SD' alone + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, as.name) + jvnames = c(jvnames, sdvars) + } else if (this == ".N") { + # don't optimise .I in c(.SD, .I), it's length can be > 1 + # only c(.SD, list(.I)) should be optimised!! .N is always length 1. + jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) + } else { + # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + is_valid = FALSE + break + } + } + # Case 2: Call expression + else if (is.call(this)) { + # Case 2a: lapply(.SD, ...) or Map(fun, .SD) + is_lapply = this[[1L]] == "lapply" && length(this) >= 2L && this[[2L]] == ".SD" + is_map = this[[1L]] == "Map" && length(this) >= 3L && this[[3L]] == ".SD" + if ((is_lapply || is_map) && length(sdvars)) { + any_SD = TRUE + massage_result = .massageSD(this, sdvars, SDenv, funi) + funi = massage_result$funi + jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.) + jn__ = massage_result$jvnames + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named arguments of c() to column names of .SD + # e.g. c(mean=lapply(.SD, mean)) or c(mean=Map(mean, .SD)) + jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1)) + } + jvnames = c(jvnames, jn__) + } + # Case 2b: list(...) + else if (this[[1L]] == "list") { + # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen + if (length(this) > 1L) { + jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + njl__ = names(jl__) %||% rep("", length(jl__)) + njl__nonblank = nzchar(names(jl__)) + if (length(jl__) > 1L) { + jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) + } else { + jn__ = names(jsubl)[i_] + } + jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + } else { + jn__ = names(jl__) %||% rep("", length(jl__)) + } + idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && + jvnames = c(jvnames, jn__) + jsubl[[i_]] = jl__ + } + } + # Case 2c: Single-value functions like mean, sum, etc. + else if (this %iscall% optfuns && length(this)>1L) { + jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) + } + # Case 2d: .SD[1] or similar subsetting + else if (length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && + this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { + # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. + any_SD = TRUE + jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) + jvnames = c(jvnames, sdvars) + } + # Case 2e: Complex .SD usage - can't optimize + else if (any(all.vars(this) == ".SD")) { + # TODO, TO DO: revisit complex cases (as illustrated below) + # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] + # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. + is_valid = FALSE + break + } + # Case 2f: Other cases - skip optimization + else { + # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? + is_valid = FALSE + break + } + } + # Case 3: Other types - can't optimize + else { + is_valid = FALSE + break + } + } + + # Return result + if (!is_valid || !any_SD) { + # Can't optimize - return original + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) + } else { + # Optimization successful + setattr(jsubl, 'names', NULL) + jsub_new = as.call(unlist(jsubl, use.names=FALSE)) + jsub_new[[1L]] = quote(list) + return(list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE)) + } +} + +# Optimize lapply(.SD, ...) expressions +# This function transforms lapply(.SD, fun) into list(fun(col1), fun(col2), ...) +# Returns: list(jsub=call/name, jvnames=character) +.optimize_lapply = function(jsub, jvnames, sdvars, SDenv, verbose, envir) { + oldjsub = jsub + funi = 1L # Fix for #985 + + # Try different optimization patterns in order + + # Pattern 1: Plain .SD -> list(col1, col2, ...) + if (is.name(jsub) && jsub == ".SD") { + jsub = as.call(c(quote(list), lapply(sdvars, as.name))) + jvnames = sdvars + } + # Pattern 2: .SD subsetting like .SD[1], head(.SD), first(.SD) + else if (!is.null(result <- .optimize_sd_subset(jsub, sdvars, SDenv, envir))) { + jsub = result$jsub + jvnames = result$jvnames + } + # Pattern 3a: lapply(.SD, fun) + else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L && jsub[[2L]] == ".SD" && length(sdvars)) { + massage_result = .massageSD(jsub, sdvars, SDenv, funi) + jsub = massage_result$jsub + jvnames = massage_result$jvnames + funi = massage_result$funi + } + # Pattern 3b: Map(fun, .SD) + else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { + massage_result = .massageSD(jsub, sdvars, SDenv, funi) + jsub = massage_result$jsub + jvnames = massage_result$jvnames + funi = massage_result$funi + } + # Pattern 4: c(...) with .SD components + else if (is.call(jsub)) { + c_result = .optimize_c_expr(jsub, jvnames, sdvars, SDenv, funi, envir) + if (c_result$optimized) { + jsub = c_result$jsub + jvnames = c_result$jvnames + funi = c_result$funi + } + } + + # Verbose output + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) + else + catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) + } + + list(jsub=jsub, jvnames=jvnames) +} + +# Optimize expressions using GForce (C-level optimizations) +# This function replaces functions like mean() with gmean() for fast C implementations +.optimize_gforce = function(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { + GForce = FALSE + + # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with + # nomatch=NULL even now.. but not switching it on yet, will deal it separately. + if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { + if (!length(ansvars) && !use.I) { + GForce = FALSE + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { + GForce = TRUE + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) + } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE + } else { + # Apply GForce + if (jsub %iscall% "list") { + GForce = TRUE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} + } + } else + GForce = .gforce_ok(jsub, SDenv$.SDall, envir) + if (GForce) { + if (jsub %iscall% "list") + for (ii in seq_along(jsub)[-1L]) { + if (is.N(jsub[[ii]])) next; # For #334 + jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) + } + else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L + jsub = .gforce_jsub(jsub, names_x, envir) + } + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); + } + } + + list(GForce=GForce, jsub=jsub) +} + +# Old mean() optimization fallback when GForce is not used +.optimize_mean = function(jsub, SDenv, verbose, GForce) { + if (!GForce && !is.name(jsub)) { + # Still do the old speedup for mean, for now + nomeanopt=FALSE # to be set by .optmean() using <<- inside it + oldjsub = jsub + if (jsub %iscall% "list") { + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() + # jsub[[1]]=="list" so the first item of todo will always be FALSE + todo = sapply(jsub, `%iscall%`, 'mean') + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) + } + } else if (jsub %iscall% "mean") { + jsub = .optmean(jsub) + } + if (nomeanopt) { + warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) + } + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) + else + catf("Old mean optimization is on, left j unchanged.\n") + } + assign("Cfastmean", Cfastmean, SDenv) + # Old comments still here for now ... + # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. + # Maybe change to : + # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower + # when fastmean can do trim. + } + + jsub +} + +# attempts to optimize j expressions using lapply, GForce, and mean optimizations +.attempt_optimize = function(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { + # Step 1: Apply lapply(.SD) optimization + lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir) + jsub = lapply_result$jsub + jvnames = lapply_result$jvnames + + # Step 2: Apply GForce optimization + gforce_result = .optimize_gforce(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) + GForce = gforce_result$GForce + jsub = gforce_result$jsub + + # Step 3: Apply old mean optimization (fallback when GForce is not used) + jsub = .optimize_mean(jsub, SDenv, verbose, GForce) + + list(GForce=GForce, jsub=jsub, jvnames=jvnames) +} + "[.data.table" = function(x, i, j, by, keyby, with=TRUE, nomatch=NA, mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0.0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL, env=NULL, showProgress=getOption("datatable.showProgress", interactive())) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could @@ -1612,252 +1956,18 @@ replace_dot_alias = function(e) { SDenv$.NGRP = length(f__) lockBinding(".NGRP", SDenv) - GForce = FALSE - if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit - # Optimization to reduce overhead of calling lapply over and over for each group - oldjsub = jsub - funi = 1L # Fix for #985 - # converted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then. - .massageSD = function(jsub) { - txt = as.list(jsub)[-1L] - if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 - fun = txt[[2L]] - if (fun %iscall% "function") { - # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT - # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means - # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) - # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) - thisfun = paste0("..FUN", funi) # Fix for #985 - assign(thisfun,eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD - lockBinding(thisfun,SDenv) - txt[[1L]] = as.name(thisfun) - } else { - if (is.character(fun)) fun = as.name(fun) - txt[[1L]] = fun - } - ans = vector("list", length(sdvars)+1L) - ans[[1L]] = as.name("list") - for (ii in seq_along(sdvars)) { - txt[[2L]] = as.name(sdvars[ii]) - ans[[ii+1L]] = as.call(txt) - } - jsub = as.call(ans) # important no names here - jvnames = sdvars # but here instead - list(jsub, jvnames) - # It may seem inefficient to construct a potentially long expression. But, consider calling - # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it - # is called, involving small memory allocations. - # The R level lapply calls as.list which needs a shallow copy. - # lapply also does a setAttib of names (duplicating the same names over and over again - # for each group) which is terrible for our needs. We replace all that with a - # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol - # lookup), and the eval() inside dogroups hardly has to do anything. All this results in - # overhead minimised. We don't need to worry about the env passed to the eval in a possible - # lapply replacement, or how to pass ... efficiently to it. - # Plus we optimize lapply first, so that mean() can be optimized too as well, next. - } - if (is.name(jsub)) { - if (jsub == ".SD") { - jsub = as.call(c(quote(list), lapply(sdvars, as.name))) - jvnames = sdvars - } - } else if (is.name(jsub[[1L]])) { # Else expect problems with - # g[[ only applies to atomic input, for now, was causing #4159. be sure to eval with enclos=parent.frame() for #4612 - subopt = length(jsub) == 3L && - (jsub %iscall% "[" || - (jsub %iscall% "[[" && is.name(jsub[[2L]]) && eval(call('is.atomic', jsub[[2L]]), x, parent.frame()))) && - (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") - headopt = jsub %iscall% c("head", "tail") - firstopt = jsub %iscall% c("first", "last") # fix for #2030 - if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && - (subopt || headopt || firstopt)) { - if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 - # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - jsub = as.call(c(quote(list), lapply(sdvars, function(x) { jsub[[2L]] = as.name(x); jsub }))) - jvnames = sdvars - } else if (jsub %iscall% "lapply" && jsub[[2L]]==".SD" && length(xcols)) { - deparse_ans = .massageSD(jsub) - jsub = deparse_ans[[1L]] - jvnames = deparse_ans[[2L]] - } else if (jsub %iscall% "c" && length(jsub) > 1L) { - # TODO, TO DO: raise the checks for 'jvnames' earlier (where jvnames is set by checking 'jsub') and set 'jvnames' already. - # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. - # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains - # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* - # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* - # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. - # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output - # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. - # For now, we optimise all functions mentioned in 'optfuns' below. - optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") - is_valid = TRUE - any_SD = FALSE - jsubl = as.list.default(jsub) - oldjvnames = jvnames - jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. - # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! - for (i_ in 2L:length(jsubl)) { - this = jsub[[i_]] - if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names - if (this == ".SD") { # optimise '.SD' alone - any_SD = TRUE - jsubl[[i_]] = lapply(sdvars, as.name) - jvnames = c(jvnames, sdvars) - } else if (this == ".N") { - # don't optimise .I in c(.SD, .I), it's length can be > 1 - # only c(.SD, list(.I)) should be optimised!! .N is always length 1. - jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) - } else { - # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) - is_valid=FALSE - break - } - } else if (is.call(this)) { - if (this[[1L]] == "lapply" && this[[2L]] == ".SD" && length(xcols)) { - any_SD = TRUE - deparse_ans = .massageSD(this) - funi = funi + 1L # Fix for #985 - jsubl[[i_]] = as.list(deparse_ans[[1L]][-1L]) # just keep the '.' from list(.) - jn__ = deparse_ans[[2L]] - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named arguments of c() to column names of .SD - # e.g. c(mean=lapply(.SD, mean)) - jn__ = paste(names(jsubl)[i_], jn__, sep=".") # sep="." for consistency with c(A=list(a=1,b=1)) - } - jvnames = c(jvnames, jn__) - } else if (this[[1L]] == "list") { - # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen - if (length(this) > 1L) { - jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* - njl__ = names(jl__) %||% rep("", length(jl__)) - njl__nonblank = nzchar(names(jl__)) - if (length(jl__) > 1L) { - jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) - } else { - jn__ = names(jsubl)[i_] - } - jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") - } else { - jn__ = names(jl__) %||% rep("", length(jl__)) - } - idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) - if (any(idx)) - jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && - jvnames = c(jvnames, jn__) - jsubl[[i_]] = jl__ - } - } else if (this %iscall% optfuns && length(this)>1L) { - jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) - } else if ( length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && - this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) { - # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - any_SD = TRUE - jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) - jvnames = c(jvnames, sdvars) - } else if (any(all.vars(this) == ".SD")) { - # TODO, TO DO: revisit complex cases (as illustrated below) - # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] - # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. - is_valid=FALSE - break - } else { # just to be sure that any other case (I've overlooked) runs smoothly, without optimisation - # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? - is_valid=FALSE - break - } - } else { - is_valid = FALSE - break - } - } - if (!is_valid || !any_SD) { # restore if c(...) doesn't contain lapply(.SD, ..) or if it's just invalid - jvnames = oldjvnames # reset jvnames - jsub = oldjsub # reset jsub - jsubl = as.list.default(jsubl) # reset jsubl - } else { - setattr(jsubl, 'names', NULL) - jsub = as.call(unlist(jsubl, use.names=FALSE)) - jsub[[1L]] = quote(list) - } - } - } + # Call extracted GForce optimization function + if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { + gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) + GForce = gforce_result$GForce + jsub = gforce_result$jsub + jvnames = gforce_result$jvnames + } else { + GForce = FALSE if (verbose) { - if (!identical(oldjsub, jsub)) - catf("lapply optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub,width.cutoff=200L, nlines=1L)) - else - catf("lapply optimization is on, j unchanged as '%s'\n", deparse(jsub,width.cutoff=200L, nlines=1L)) - } - # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with - # nomatch=NULL even now.. but not switching it on yet, will deal it separately. - if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { - if (!length(ansvars) && !use.I) { - GForce = FALSE - if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { - GForce = TRUE - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) - } - } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N - GForce = FALSE - } else { - # Apply GForce - if (jsub %iscall% "list") { - GForce = TRUE - for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { - if (!.gforce_ok(jsub[[ii]], SDenv$.SDall)) {GForce = FALSE; break} - } - } else - GForce = .gforce_ok(jsub, SDenv$.SDall) - if (GForce) { - if (jsub %iscall% "list") - for (ii in seq_along(jsub)[-1L]) { - if (is.N(jsub[[ii]])) next; # For #334 - jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x) - } - else { - # adding argument to ghead/gtail if none is supplied to g-optimized head/tail - if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L - jsub = .gforce_jsub(jsub, names_x) - } - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) - } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); - } - } - if (!GForce && !is.name(jsub)) { - # Still do the old speedup for mean, for now - nomeanopt=FALSE # to be set by .optmean() using <<- inside it - oldjsub = jsub - if (jsub %iscall% "list") { - # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() - # jsub[[1]]=="list" so the first item of todo will always be FALSE - todo = sapply(jsub, `%iscall%`, 'mean') - if (any(todo)) { - w = which(todo) - jsub[w] = lapply(jsub[w], .optmean) - } - } else if (jsub %iscall% "mean") { - jsub = .optmean(jsub) - } - if (nomeanopt) { - warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) - } - if (verbose) { - if (!identical(oldjsub, jsub)) - catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) - else - catf("Old mean optimization is on, left j unchanged.\n") - } - assign("Cfastmean", Cfastmean, SDenv) - # Old comments still here for now ... - # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. - # Maybe change to : - # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower - # when fastmean can do trim. + if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") + else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } - } else if (verbose) { - if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") - else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } if (byjoin) { groups = i @@ -3181,11 +3291,11 @@ is_constantish = function(q, check_singleton=FALSE) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) } -`.g[_ok` = function(q, x) { +`.g[_ok` = function(q, x, envir=parent.frame(3L)) { length(q) == 3L && is_constantish(q[[3L]], check_singleton = TRUE) && (q[[1L]] != "[[" || eval(call('is.atomic', q[[2L]]), envir=x)) && - !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], parent.frame(3L))) && length(q3)==1L && q3>0L + !(as.character(q[[3L]]) %chin% names(x)) && is.numeric(q3 <- eval(q[[3L]], envir)) && length(q3)==1L && q3>0L } .gweighted.mean_ok = function(q, x) { #3977 q = match.call(gweighted.mean, q) @@ -3209,7 +3319,7 @@ is_constantish = function(q, check_singleton=FALSE) { # is robust to unnamed expr. Note that NA names are not possible here. .arg_is_narm = function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") -.gforce_ok = function(q, x) { +.gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 q1 = .get_gcall(q) if (is.null(q1)) return(FALSE) @@ -3219,19 +3329,19 @@ is_constantish = function(q, check_singleton=FALSE) { "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), "tail" = , "head" = .ghead_ok(q), - "[[" = , "[" = `.g[_ok`(q, x), + "[[" = , "[" = `.g[_ok`(q, x, envir), FALSE ) } -.gforce_jsub = function(q, names_x) { +.gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. q[[1L]] = as.name(paste0("g", call_name)) # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok # do not evaluate vars present as columns in x if (length(q) >= 3L) { for (i in 3:length(q)) { - if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], parent.frame(2L)) # tests 1187.2 & 1187.4 + if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 } } q From 71b21ab4538e5a93570239917ba2a4bfa5ed3119 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 29 Oct 2025 09:17:11 +0100 Subject: [PATCH 02/78] make linter happy --- R/data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 255c9aeac0..aa184f5c4c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -210,7 +210,7 @@ replace_dot_alias = function(e) { NULL } -# Optimize c(...) expressions +# Optimize c(...) expressions .optimize_c_expr = function(jsub, jvnames, sdvars, SDenv, funi, envir) { if (!jsub %iscall% "c" || length(jsub) <= 1L) { return(list(jsub=jsub, jvnames=jvnames, funi=funi, optimized=FALSE)) @@ -328,13 +328,13 @@ replace_dot_alias = function(e) { # Return result if (!is_valid || !any_SD) { # Can't optimize - return original - return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) + list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE) } else { # Optimization successful setattr(jsubl, 'names', NULL) jsub_new = as.call(unlist(jsubl, use.names=FALSE)) jsub_new[[1L]] = quote(list) - return(list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE)) + list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE) } } From 8a9e72734e63c3b8b0efde67755a9b94368476be Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Thu, 30 Oct 2025 16:02:25 +0100 Subject: [PATCH 03/78] move tests --- R/test.data.table.R | 32 +- inst/tests/benchmark.Rraw | 25 +- inst/tests/tests.Rraw | 896 ++++++-------------------------------- man/test.Rd | 3 +- 4 files changed, 167 insertions(+), 789 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 6e264c871f..bfc61412a3 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,7 +361,37 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,levels=NULL) { + # if levels is provided, test across multiple optimization levels + if (!is.null(levels)) { + cl = match.call() + cl$levels = NULL # Remove levels from the recursive call + + vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") + # Check if y was explicitly provided (not just the default) + y_provided = !missing(y) + compare = !y_provided && length(levels)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + + for (i in seq_along(levels)) { + cl$num = num + (i - 1L) * 1e-6 + opt_level = list(datatable.optimize = levels[i]) + cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level + for (p in vector_params) { + val = get(p, envir=environment()) + if (length(val) > 0L) { + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels + } else if (p %in% names(cl)) { + cl[[p]] = NULL + } + } + + if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) + eval(cl, parent.frame()) + } + return(invisible()) + } + + # print(match.call()) if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 62075dcf85..52a91c723f 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -190,24 +190,13 @@ DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep="")) test(301.1, nrow(DT[,sum(B),by=C])==100010) # Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too. -local({ - old = options(datatable.optimize=0L); on.exit(options(old)) - set.seed(1) - DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") - test(637.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) - test(637.2, key(DT[J(43L), a:=99L]), NULL) - setkey(DT, a) - test(637.3, key(DT[, a:=99L, by=a]), NULL) -}) -local({ - options(datatable.optimize=2L); on.exit(options(old)) - set.seed(1) - DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") - test(638.1, DT[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) - test(638.2, key(DT[J(43L), a:=99L]), NULL) - setkey(DT,a) - test(638.3, key(DT[, a:=99L, by=a]), NULL) -}) +set.seed(1) +DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") +opt = c(0L,2L) +test(637.1, levels=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, levels=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +setkey(DT, a) +test(637.3, levels=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7be4af2358..5540b212d3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1851,53 +1851,7 @@ x = sample(LETTERS,1000,replace=TRUE) test(610.3, chorder(x), base::order(x, method="radix")) test(610.4, unique(x[chgroup(x)]), unique(x)) -# := by group -options(datatable.optimize=0L) -DT = data.table(a=1:3,b=(1:9)/10) -test(611.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) -setkey(DT,a) -test(611.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) -# Combining := by group with i -test(611.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) -options(datatable.optimize=2L) -DT = data.table(a=1:3,b=(1:9)/10) -test(612.1, DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) -setkey(DT,a) -test(612.2, DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) -# Combining := by group with i -test(612.3, DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(612.4, DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) -# Assign to subset ok (NA initialized in the other items) ok : -test(613, DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614, DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) - -# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 -ans = copy(DT)[,r:=NA_real_] -options(datatable.optimize=0L) -test(618.1, copy(DT)[a>3,r:=sum(b)], ans) -test(618.2, copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3, copy(DT)[NA,r:=sum(b)], ans) -test(618.4, copy(DT)[0,r:=sum(b)], ans) -test(618.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) -options(datatable.optimize=2L) -test(619.1, copy(DT)[a>3,r:=sum(b)], ans) -test(619.2, copy(DT)[J(-1),r:=sum(b)], ans) -test(619.3, copy(DT)[NA,r:=sum(b)], ans) -test(619.4, copy(DT)[0,r:=sum(b)], ans) -test(619.5, copy(DT)[NULL,r:=sum(b)], null.data.table()) - -DT = data.table(x=letters, key="x") -test(621, copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622, copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") - -set.seed(2) -DT = data.table(a=rnorm(5)*10, b=1:5) -options(datatable.optimize=0L) -test(623.1, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) -options(datatable.optimize=2L) -test(623.2, copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +# tests 611-623 moved to optimize.Rraw # Tests on POSIXct attributes @@ -2018,20 +1972,10 @@ setnames(ans2,"x","V1") setnames(ans2,"y","V2") test(654, ans1, ans2) -options(datatable.optimize = 0L) -test(656.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') -test(656.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -test(656.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 1L) -test(657.1, DT[ , mean(x), by=grp1, verbose=TRUE], output='(GForce FALSE)') -test(657.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -test(657.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 2L) -test(658.1, DT[ , mean(x), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -test(658.2, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -test(658.3, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output="GForce optimized j to.*gmean") -tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) -test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. +# test 656-658 moved to optimize.Rraw +# test is not testing what it should since #2671 +# tt = capture.output(DT[,list(mean(x),mean(y)),by=list(grp1,grp2),verbose=TRUE]) +# test(659, !length(grep("Wrote less rows", tt))) # first group is one row with this seed. Ensure we treat this as aggregate case rather than allocate too many rows. # Test .N for logical i subset DT = data.table(a=1:10, b=rnorm(10)) @@ -2059,33 +2003,7 @@ test(667, DT[a<3,sum(b),by=paste("a")], error='Otherwise, by=eval(paste("a")) sh test(668, DT[a<3,sum(b),by=eval(paste("a"))], DT[a<3,sum(b),by=a]) test(669, DT[a<3,sum(b),by=c(2)], error="must evaluate to 'character'") -# Test := keyby does setkey, #2065 -options(datatable.optimize=0L) -DT = data.table(x=1:2, y=1:6) -ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1, DT[,z:=sum(y),keyby=x], ans) -DT = data.table(x=1:2, y=1:6) -test(670.2, DT[,z:=sum(y),keyby="x"], ans) -DT = data.table(x=1:2, y=1:6) -test(670.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), - warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -DT = data.table(x=1:2, y=1:6) -test(670.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -DT = data.table(x=1:2, y=1:6) -test(670.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") -options(datatable.optimize=2L) -DT = data.table(x=1:2, y=1:6) -ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(671.1, DT[,z:=sum(y),keyby=x], ans) -DT = data.table(x=1:2, y=1:6) -test(671.2, DT[,z:=sum(y),keyby="x"], ans) -DT = data.table(x=1:2, y=1:6) -test(671.3, DT[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), - warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -DT = data.table(x=1:2, y=1:6) -test(671.4, DT[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -DT = data.table(x=1:2, y=1:6) -test(671.5, DT[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +# tests 670+671 moved to optimize.Rraw # Test new .() DT = data.table(x=1:2, y=1:6, key="x") @@ -2270,23 +2188,7 @@ test(749, DT[,c("c","d","e"):=list(.N,sum(b),a*10L),by=a], data.table(a=rep(6:8, test(750.1, copy(DT)[a<8,`:=`(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) test(750.2, copy(DT)[a<8,let(f=b+sum(d),g=.N),by=c][,6:7,with=FALSE], data.table(f=INT(2,12,13,NA,NA,NA),g=INT(1,2,2,NA,NA,NA))) -# varname holding colnames, by group, linked from #2120. -options(datatable.optimize=0L) -DT = data.table(a=rep(1:3,1:3),b=1:6) -colname = "newcol" -test(751.1, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) -options(datatable.optimize=2L) -DT = data.table(a=rep(1:3,1:3),b=1:6) -colname = "newcol" -test(751.2, DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) - -# Add tests for nested := in j by group, #1987 -options(datatable.optimize=0L) -DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752.1, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) -options(datatable.optimize=2L) -DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752.2, DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +# tests 751, 752 moved to optimize.Rraw # Test duplicate() of recycled plonking RHS, #2298 DT = data.table(a=letters[3:1],x=1:3) @@ -2590,18 +2492,8 @@ test(864.3, rbindlist(list(data.table(logical(0),logical(0)), DT<-data.table(baz message="Column 1 [[]'baz'[]] of item 2 is missing in item 1.*Use fill=TRUE.*or use.names=FALSE.*v1.12.2") # Steve's find that setnames failed for numeric 'old' when pointing to duplicated names -DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) -options(datatable.optimize = 0L) -test(865.1, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 1L) -test(865.2, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output="(GForce FALSE)") -options(datatable.optimize = 2L) -test(865.3, ans1<-DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], - output="GForce optimized.*gsum[(]v[)], gsum[(]w[)]") # v1.9.7 treats wrapped {} better, so this is now optimized -options(datatable.optimize = Inf) -test(866, names(ans1), c("a","b","name1","name2")) -test(867, names(ans2<-DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a","b","name1","name2")) # list names extracted here -test(868, ans1, ans2) +# tests 865-868 moved to optimize.Rraw + # and related to setnames, too DT = data.table(a=1:3,b=1:6,key="a") test(869, DT[J(2,42,84),print(.SD),by=.EACHI], output=" b\n.*1.*2\n2:.*5.*Empty data.table [(]0 rows and 3 cols[)]: a,V2,V3") # .* for when verbose mode @@ -3998,10 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -options(datatable.optimize=0L) -test(1133.7, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) -options(datatable.optimize=2L) -test(1133.75, DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7, levels=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -4104,10 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -options(datatable.optimize=0L) -test(1143.4, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -options(datatable.optimize=2L) +test(1143.4, levels=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5, levels=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) @@ -4382,40 +4269,7 @@ test(1181, forderv(INT(1,3,5000000,NA)), INT(4,1,2,3)) test(1182, forderv(INT(1,-1,5000000,NA)), INT(4,2,1,3)) test(1183, forderv(INT(-3,-7,1,-6000000,NA,3,5000000,NA,8)), INT(5,8,4,2,1,3,6,9,7)) -# tests of gsum and gmean with NA -DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) -set(DT,c(3L,8L),"y",NA) -set(DT,c(5L,9L),"v",NA) -set(DT,10:12,"y",NA) -set(DT,10:12,"v",NA) -options(datatable.optimize=1) # turn off GForce -test(1184.1, DT[, sum(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1184.2, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) -options(datatable.optimize=0) # turn off fastmean optimization to get the answer to match to -test(1185.2, ans <- DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output="All optimizations.*off") -options(datatable.optimize=1) # turn on old fastmean optimization only -test(1185.3, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="Old mean.*changed j") -options(datatable.optimize=Inf) # turn on GForce -test(1185.4, DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], ans, output="GForce optimized j to") -test(1186, DT[, sum(v), by=x, verbose=TRUE], output="GForce optimized j to") -test(1187.1, DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) -MyVar = TRUE -test(1187.2, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, list(sum(y,na.rm=TRUE), mean(y,na.rm=TRUE)), by=x]) -test(1187.3, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, mean(y,na.rm=TRUE), by=x]) -MyVar = FALSE -test(1187.4, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, list(sum(y,na.rm=FALSE), mean(y,na.rm=FALSE)), by=x]) -test(1187.5, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output="GForce optimized j to", - DT[, mean(y,na.rm=FALSE), by=x]) -# GForce should not turn on when the .ok function isn't triggered -test(1187.6, DT[, mean(y, trim=.2), by=x, verbose=TRUE], - data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), - output='j unchanged', warning="'trim' is not yet optimized") +# tests 1184-1187 moved to optimize.Rraw # test from Zach Mayer a <- c("\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" \"", "\"\"\"\") \" \" \" \" ,\"") @@ -4610,18 +4464,17 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") -options(datatable.optimize=1) # turn off GForce, to test dogroups -test(1230, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +# turn off GForce, to test dogroups +test(1230, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232, levels=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233, levels=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234, levels=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -options(datatable.optimize=Inf) +test(1235, levels=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236, levels=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237, levels=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4887,44 +4740,40 @@ set.seed(45L) dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) -options(datatable.optimize=0L) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01, levels=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.02, levels=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.03, levels=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.04, levels=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.05, levels=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # newly added tests for #861 -- optimise, but no GForce -test(1268.06, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06, levels=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # don't optimise .I in c(...) -test(1268.07, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") - -options(datatable.optimize=1L) -test(1268.08, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") - -options(datatable.optimize=Inf) -test(1268.15, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, +test(1268.07, levels=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") + +test(1268.08, levels=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") +test(1268.09, levels=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.10, levels=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') +test(1268.11, levels=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.12, levels=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.13, levels=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.14, levels=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") + +test(1268.15, levels=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, +test(1268.16, levels=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, +test(1268.17, levels=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, +test(1268.18, levels=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, +test(1268.19, levels=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, +test(1268.20, levels=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, +test(1268.21, levels=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") - -test(1268.22, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22, levels=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") @@ -5267,21 +5116,7 @@ set(DT,1L,"b",3L) test(1302, 0L[1L], 3L-3L) test(1303, 0L, 3L-3L) -# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. -set.seed(2L) -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -options(datatable.optimize = 1L) -ans1 <- dt[, list(.N, sum(y)), by=x] -options(datatable.optimize = 2L) -ans2 <- dt[, list(.N, sum(y)), by=x] -test(1304.1, ans1, ans2) - -dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -options(datatable.optimize = 1L) -ans1 <- dt[, list(.N, sum(y)), by=x] -options(datatable.optimize = 2L) -ans2 <- dt[, list(.N, sum(y)), by=x] -test(1304.2, ans1, ans2) +# test 1304 moved to optimize.Rraw # FR #338 DT <- data.table(x=1:5, y=6:10) @@ -5330,59 +5165,7 @@ DT = data.table(a=1:3,b=6:1) test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use") # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread). -# gmin and gmax extensive testing (because there are tricky cases) -DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) -# make sure GForce is running -options(datatable.optimize=3L) - -# for integers -test(1313.01, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := INT(NA)] -test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) - -# for numeric -DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) -DT[x==6, y := NA_real_] -test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) - -# for date (attribute check.. especially after issues/689 !!!) -DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18, DT[, list(y=max(y)), by=x], DT[c(5,10)]) -DT[c(1,6), y := NA] -test(1313.19, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) - -# for character -set.seed(1L) -DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) -DT[x==7, y := c("","b","c")] -test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) -DT[x==6, y := NA_character_] -test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +# test 1313 moved to optimze.Rraw # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -6495,10 +6278,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -options(datatable.optimize = 2L) -test(1437.21, DT[x == 2, verbose = TRUE], output = "^ x y") -options(datatable.optimize = Inf) -test(1437.22, DT[x == 2, verbose = TRUE], output = "Optimized subsetting") +test(1437.21, levels=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6583,35 +6363,22 @@ if (.Machine$sizeof.pointer>4) { # temporarily disabled for 32bit, #2767 for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery))] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery))] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery)), which = TRUE] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery)), which = TRUE] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ - options("datatable.optimize" = 3L) - ansOpt <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] - options("datatable.optimize" = 2L) - ansRef <- DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, ansOpt, ansRef) + test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } } -options(datatable.optimize = Inf) # fread dec=',' e.g. France test(1439, fread("A;B\n1;2,34\n", dec="12"), error=base_messages$stopifnot("nchar(dec) == 1L")) @@ -8146,14 +7913,7 @@ test(1564.1, truelength(dt[, .SD]), 1025L) test(1564.2, truelength(dt[a==5, .SD]), 1025L) test(1564.3, dt[a==5, .SD][, b := 1L], data.table(a=5L, b=1L)) -# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now -dt = data.table(a=sample(3,20,TRUE), b=1:10) -options(datatable.optimize = 0L) -test(1565.1, ans <- dt[, .N, by=a, verbose=TRUE], output="All optimizations are turned off") -options(datatable.optimize = 1L) -test(1565.2, dt[ , .N, by=a, verbose=TRUE], ans, output="lapply optimization is on, j unchanged") -options(datatable.optimize = Inf) -test(1565.3, dt[ , .N, by=a, verbose=TRUE], ans, output = "GForce optimized j to") +# test 1565 moved to optimize.Rraw # Fix for #1212 set.seed(123) @@ -8262,83 +8022,7 @@ test(1578.7, fread(f, skip=49L), data.table(V1=1:2, V2=3:4)) test(1578.8, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4)) test(1578.9, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50 -# gforce optimisations -dt = data.table(x = sample(letters, 300, TRUE), - i1 = sample(-10:10, 300, TRUE), - i2 = sample(c(-10:10, NA), 300, TRUE), - d1 = as.numeric(sample(-10:10, 300, TRUE)), - d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) -if (test_bit64) { - dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] - dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] -} - -# make sure gforce is on -options(datatable.optimize=2L) - -# testing gforce::gmedian -test(1579.01, dt[, lapply(.SD, median), by=x], - dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x]) -test(1579.02, dt[, lapply(.SD, median, na.rm=TRUE), by=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03, dt[, lapply(.SD, median), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], - dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) -ans = capture.output(dt[, lapply(.SD, median), by=x, verbose=TRUE]) -test(1579.05, any(grepl("GForce optimized", ans)), TRUE) - -# testing gforce::ghead and gforce::gtail -# head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) -test(1579.07, dt[, head(.SD,1), by=x], dt[, utils::head(.SD,1), by=x]) -test(1579.08, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) -test(1579.09, dt[, head(.SD,1), keyby=x], dt[, utils::head(.SD,1), keyby=x]) -test(1579.10, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) -test(1579.11, dt[, head(.SD,1L), by=x], dt[, utils::head(.SD,1L), by=x]) -test(1579.12, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) -test(1579.13, dt[, head(.SD,1L), keyby=x], dt[, utils::head(.SD,1L), keyby=x]) - -test(1579.14, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) -test(1579.15, dt[, tail(.SD,1), by=x], dt[, utils::tail(.SD,1), by=x]) -test(1579.16, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) -test(1579.17, dt[, tail(.SD,1), keyby=x], dt[, utils::tail(.SD,1), keyby=x]) -test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) -test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) -test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -# 1579.22 tested gtail with n>1; now 1579.4+ below - -mysub <- function(x, n) x[n] -test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) - -# gforce head/tail for n>1, #5060 -set.seed(99) -DT = data.table(x = sample(letters[1:5], 20, TRUE), - y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly - i = sample(c(-2L,0L,3L,NA), 20, TRUE), - d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), - s = sample(c("foo","bar",NA), 20, TRUE), - l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) -if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -options(datatable.optimize=2L) -test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") - -options(datatable.optimize = Inf) +# test 1579 moved to optimize.Rraw # test for #1419, rleid doesn't remove names attribute x = c("a"=TRUE, "b"=FALSE) @@ -8346,104 +8030,12 @@ nx = copy(names(x)) r = rleid(x) test(1580, nx, names(x)) -# FR #971, partly addressed (only subsets in 'i') -# make sure GForce kicks in and the results are identical -dt = dt[, .(x, d1, d2)] -options(datatable.optimize=1L) - -test(1581.01, ans1 <- dt[x %in% letters[15:20], - c(.N, lapply(.SD, sum, na.rm=TRUE), - lapply(.SD, min, na.rm=TRUE), - lapply(.SD, max, na.rm=TRUE), - lapply(.SD, mean, na.rm=TRUE), - lapply(.SD, median, na.rm=TRUE) - ), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.02, ans2 <- dt[x %in% letters[15:20], - c(.N, lapply(.SD, sum, na.rm=TRUE), - lapply(.SD, min, na.rm=TRUE), - lapply(.SD, max, na.rm=TRUE), - lapply(.SD, mean, na.rm=TRUE), - lapply(.SD, median, na.rm=TRUE) - ), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.03, ans1, ans2) - -# subsets in 'i' for head and tail -options(datatable.optimize=1L) -test(1581.04, ans1 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.05, ans2 <- dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.06, ans1, ans2) - -options(datatable.optimize=1L) -test(1581.07, ans1 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.08, ans2 <- dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.09, ans1, ans2) - -options(datatable.optimize=1L) -test(1581.10, ans1 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=2L) -test(1581.11, ans2 <- dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.12, ans1, ans2) -options(datatable.optimize = Inf) - -# #3209 g[[ -options(datatable.optimize=1L) -test(1581.13, ans1 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], - output = "(GForce FALSE)") -options(datatable.optimize=Inf) -test(1581.14, ans2 <- dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], - output = "GForce optimized j") -test(1581.15, ans1, ans2) -# also, block for non-atomic input, #4159 -dt = data.table(a=1:3) -dt[ , l := .(list(1, 2, 3))] -test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], - dt[ , l := unlist(l)], output='(GForce FALSE)') -# make sure not to apply when `[[` is applied to a nested call, #4413 -DT = data.table(f1=c("a","b"), f2=c("x","y")) -l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) -test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], - data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) -test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], - data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) -# When the object being [[ is in parent.frame(), not x, -# need eval to have enclos=parent.frame(), #4612 -DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) -DT0 = copy(DT) -fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] -fun(DT) -test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) +# test 1581 moved to optimize.Rraw # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) -# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level -# verbatim test from the original report: -options(datatable.optimize=Inf) # ensure gforce is on -DT = data.table( - C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), - C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), - Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) -test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], - data.table(C1=c("A","A","B","B","C","C"), - C2=c("a","b","b","c","c","d"), - agg=c(1,4,5,8,9,10))) -# extra test with a size-1 group containing one NaN too -DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +# test 1583 moved to optimize.Rraw # Fixed a minor bug in fread when blank.lines.skip=TRUE f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE) @@ -8563,32 +8155,7 @@ test(1592.2, names(setnames(DT, -1, c("m", "n"))), c("x", "m", "n")) # fix for #1513 test(1593, CJ(c(1,2,2), c(1,2,3)), data.table(V1=rep(c(1,2), c(3,6)), V2=c(1,2,3,1,1,2,2,3,3), key=c("V1", "V2"))) -# FR #523, var, sd and prod -options(datatable.optimize = Inf) # ensure gforce is on -DT = data.table(x=sample(5, 100, TRUE), - y1=sample(6, 100, TRUE), - y2=sample(c(1:10,NA), 100, TRUE), - z1=runif(100), - z2=sample(c(runif(10),NA,NaN), 100, TRUE)) -test(1594.01, DT[, lapply(.SD, var, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::var, na.rm=FALSE), by=x]) -test(1594.02, DT[, lapply(.SD, var, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::var, na.rm=TRUE), by=x]) -test(1594.03, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gvar") -# coverage: default group .N=1 case -idx=DT[ , .I[1L], by=x]$V1 -out=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05, DT[(idx), var(y1), by=x], out) -test(1594.06, DT[(idx), var(y1, na.rm=TRUE), by=x], out) -test(1594.07, DT[(idx), var(z1), by=x], out) -test(1594.08, DT[(idx), var(z1, na.rm=TRUE), by=x], out) - -test(1594.09, DT[, lapply(.SD, sd, na.rm=FALSE), by=x], DT[, lapply(.SD, stats::sd, na.rm=FALSE), by=x]) -test(1594.10, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gsd") - -test(1594.12, DT[, lapply(.SD, prod, na.rm=FALSE), by=x], DT[, lapply(.SD, base::prod, na.rm=FALSE), by=x]) -test(1594.13, DT[, lapply(.SD, prod, na.rm=TRUE), by=x], DT[, lapply(.SD, base::prod, na.rm=TRUE), by=x]) -test(1594.14, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output="GForce optimized j to.*gprod") - +# test 1594 moved to optimize.Rraw # FR #1517 dt1 = data.table(x=c(1,1,2), y=1:3) @@ -9211,24 +8778,20 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -options(datatable.optimize = 1L) # no gforce -test(1629.09, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) -ans1 = dt[, sum(v1), by=grp] +test(1629.09, levels=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10, levels=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11, levels=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12, levels=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce -options(datatable.optimize = Inf) # Inf -test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3], dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp]) test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4], copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp]) -options(datatable.optimize = Inf) # by=.EACHI and operations with 'i' test(1629.16, dt[.(c(2,3)), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"], dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3]) @@ -9311,11 +8874,7 @@ test(1637.3, dt[, data.table(a, .SD), by = a], data.table(a=1,a=1,b=1)) test(1637.4, dt[, data.table(b, .SD), by = cumsum(a)], data.table(cumsum=1, b=1, b=1)) test(1637.5, dt[, data.table(a, b), by = cumsum(a)], data.table(cumsum=1, a=1, b=1)) -# when datatable.optimize<1, no optimisation of j should take place: -options(datatable.optimize=0L) -dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) -test(1638, dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") -options(datatable.optimize=Inf) +# test 1638 moved to optimize.Rraw rm_all() @@ -13197,9 +12756,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -options(datatable.optimize = Inf) -test(1918.5, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5, levels=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6, levels=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -14082,17 +13640,12 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -options(datatable.optimize = 0L) -verbose_output = capture.output(x[order(a), .N, verbose = TRUE]) -test(1967.69, !any(grepl('forder.c', verbose_output, fixed = TRUE))) -test(1967.70, any(grepl('[1] 5', verbose_output, fixed = TRUE))) -options('datatable.optimize' = 1L) -test(1967.71, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69, levels=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71, levels=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72, levels=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") -options(datatable.optimize = Inf) x = data.table( i1 = c(234L, 250L, 169L, 234L, 147L, 96L, 96L, 369L, 147L, 96L), @@ -14243,14 +13796,9 @@ x <- as.array(1:5) test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 -options(datatable.optimize=0L) DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") -options(datatable.optimize=Inf) -DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.3, DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.4, DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1, levels=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2, levels=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -14282,8 +13830,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -options('datatable.optimize' = Inf) -test(1984.10, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10, levels=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -15168,16 +14715,11 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], # gforce wrongly applied to external variable; #875 DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 -options(datatable.optimize = Inf) -test(2043.1, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -options(datatable.optimize = 1L) -test(2043.2, DT[, list(mean(z), mean(y)), by=x], ans) -options(datatable.optimize = 0L) -test(2043.3, DT[, list(mean(z), mean(y)), by=x], ans) -options(datatable.optimize = Inf) -test(2043.4, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +opt = c(Inf,1L,0L) +test(2043.1, levels=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4, levels=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5, levels=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -16340,12 +15882,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -old = options(datatable.optimize=0L) -test(2098.5, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -options(old) +test(2098.5, levels=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6, levels=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7, levels=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8, levels=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17770,16 +17310,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - old = options(datatable.optimize=2L) - test(2208.1, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1, levels=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3, levels=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) - options(old) + test(2208.7, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17995,17 +17533,16 @@ if (test_bit64) test(2219.2, DT[3, A:=as.integer64("4611686018427387906")], data # gforce improve coverage DT = data.table(g=1:2, i=c(NA, 1:4, NA), f=factor(letters[1:6]), l=as.list(1:6)) -options(datatable.optimize = 2L) funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, levels=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -18021,7 +17558,6 @@ test(2223.1, DT[.(4), nomatch=FALSE], data.table(A=integer(), key="A")) test(2223.2, DT[.(4), nomatch=NA_character_], data.table(A=4L, key="A")) # gshift, #5205 -options(datatable.optimize = 2L) set.seed(123) DT = data.table(x = sample(letters[1:5], 20, TRUE), y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly @@ -18043,7 +17579,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, levels=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -18132,86 +17668,8 @@ test(2230.12, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x test(2230.13, merge(DT, y, by="k2", NULL, NULL, FALSE, FALSE, FALSE, TRUE, c(".x", ".y"), TRUE, getOption("datatable.allow.cartesian"), NULL, unk1=1L, unk2=2L, 3L, 4L), merge(DT, y, by="k2"), warning=c("Supplied both `by` and `by.x`/`by.y`. `by` argument will be ignored.", "2 unnamed arguments.*2 unknown keyword arguments.*\\[unk1, unk2\\]")) +# tests 2231 moved to optimize.Rraw -# weighted.mean GForce optimized, #3977 -old = options(datatable.optimize=1L) -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.01, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce FALSE") -test(2231.02, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce FALSE") -test(2231.03, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce FALSE") -# multiple groups -DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") -test(2231.05, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce FALSE") -test(2231.06, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce FALSE") -# (only x XOR w) containing NA -DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") -test(2231.08, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce FALSE") -test(2231.09, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.10, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (only x XOR w) containing NaN -DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce FALSE") -test(2231.12, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") -test(2231.13, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce FALSE") -test(2231.14, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (only x XOR w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce FALSE") -test(2231.16, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce FALSE") -test(2231.17, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.18, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# (x and w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.20, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce FALSE") -test(2231.22, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce FALSE") -# same as previous test cases but now GForce optimized -options(datatable.optimize=2L) -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.31, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output="GForce optimized j to") -test(2231.32, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output="GForce optimized j to") -test(2231.33, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") -# multiple groups -DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.34, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") -test(2231.35, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output="GForce optimized j to") -test(2231.36, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output="GForce optimized j to") -# (only x XOR w) containing NA -DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.37, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") -test(2231.38, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output="GForce optimized j to") -test(2231.39, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.40, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (only x XOR w) containing NaN -DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.41, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output="GForce optimized j to") -test(2231.42, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") -test(2231.43, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output="GForce optimized j to") -test(2231.44, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (only x XOR w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.45, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output="GForce optimized j to") -test(2231.46, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output="GForce optimized j to") -test(2231.47, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.48, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# (x and w) containing NA and NaN -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.49, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.50, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.51, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output="GForce optimized j to") -test(2231.52, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output="GForce optimized j to") -# let wrongly named arguments get lost in ellipsis #5543 -DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output="GForce optimized j to") -test(2231.62, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output="GForce optimized j to") -test(2231.63, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) -options(old) # cols argument for unique.data.table, #5243 DT = data.table(g = rep(letters, 3), v1=1:78, v2=78:1) @@ -18223,17 +17681,18 @@ test(2232.3, unique(DT[1:26], by='g', cols='v1'), DT[1:26, !'v2']) test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") # support := with GForce #1414 -options(datatable.optimize = 2L) DT = data.table(a=1:3,b=(1:9)/10) -test(2233.01, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output="GForce optimized j to") +opt = 0:2 +out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") +test(2233.01, levels=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output="GForce optimized j to") +test(2233.02, levels=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03, DT[, v := head(b, 2L), a], error="Supplied 6 items to be assigned to 9 items of column 'v'.") +test(2233.03, levels=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output="GForce optimized j to") -test(2233.05, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output="GForce optimized j to") +test(2233.04, levels=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05, levels=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -18304,27 +17763,21 @@ test(2233.38, copy(DT)[, val:=v[1L], keyby=.(A,B), verbose=TRUE], data.table(A=I set.seed(10) n = 100 a = data.table(id1=1:n, id2=sample(1:900,n,replace=TRUE), flag=sample(c(0,0,0,1),n,replace=TRUE)) -for (opt in c(0,Inf)) { - options(datatable.optimize=opt) - out = if (opt) "GForce.*gsum" else "GForce FALSE" - B = copy(a) - A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle - num_bump = (opt>0)/100 - test(2233.39+num_bump+0.001, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= - setorder(A, id1) - test(2233.39+num_bump+0.002, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) - test(2233.39+num_bump+0.003, any(A[,t1!=t2]), FALSE) - test(2233.39+num_bump+0.004, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) - test(2233.39+num_bump+0.005, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) -} +opt = c(0,Inf) +out = c("GForce FALSE", "GForce.*gsum") +B = copy(a) +A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle +test(2233.391, levels=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +setorder(A, id1) +test(2233.392, levels=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393, levels=opt, any(A[,t1!=t2]), FALSE) +test(2233.394, levels=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395, levels=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -options(datatable.optimize=0) -test(2233.41, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce FALSE") -options(datatable.optimize=Inf) -test(2233.42, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output="GForce.*gsum") +test(2233.41, levels=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -18335,32 +17788,20 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43, - options = list(datatable.verbose=TRUE, datatable.optimize=0), +test(2233.43, levels=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) ][, sum_v1_idT:=sum(v1), by=c("id", "t") ][, sum_v1_id :=sum(v1), by=c("id")], ans, - output="GForce FALSE") -test(2233.44, - options = list(datatable.verbose=TRUE, datatable.optimize=Inf), - copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") - ][, n_idT :=dim(.SD)[[1]], by=list(t, id) - ][, sum_v2_id :=sum(v2), by=.(id) - ][, sum_v1_idT:=sum(v1), by=c("id", "t") - ][, sum_v1_id :=sum(v1), by=c("id")], - ans, - output="GForce.*gsum") + output=c("GForce FALSE", "GForce.*gsum")) # optimized := with gforce functions that can return lists #5403 -old = options(datatable.verbose=TRUE) DT = data.table(grp=1:2, x=1:4) out = "Making each group and running j (GForce TRUE)" -test(2233.45, copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) -test(2233.46, copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) -test(2233.47, copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) -options(old) +test(2233.45, options=c(datatable.verbose=TRUE), copy(DT)[, c("y", "z") := .(shift(x, type="lag", n=1), shift(x, type="lead", n=1)), by=grp], data.table(grp=1:2, x=1:4, y=c(NA, NA, 1:2), z=c(3:4, NA, NA)), output=out) +test(2233.46, options=c(datatable.verbose=TRUE), copy(DT)[, l := shift(x, n=c(0, 0)), by=grp], data.table(grp=1:2, x=1:4, l=list(INT(1, 1), INT(2, 2), INT(3, 3), INT(4, 4))), output=out) +test(2233.47, options=c(datatable.verbose=TRUE), copy(DT)[, c("l1", "l2") := shift(x, n=c(-1, 1)), by=grp], data.table(grp=1:2, x=1:4, l1=c(3:4,NA,NA), l2=c(NA,NA,1:2)), output=out) # support by=.I; #1732 DT = data.table(V1=1:5, V2=3:7, V3=5:1) @@ -18497,20 +17938,14 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( dt = data.table(x = c(2,2,1,1), y = 1:4, z=letters[1:4]) i=c(1,2) j=1L -test(2243.41, options=c(datatable.optimize=1L), dt[, .I[TRUE], x]$V1, 1:4) -test(2243.42, options=c(datatable.optimize=1L), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA))) -test(2243.51, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.52, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[1], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.54, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[j], x]$V1, c(1L, 3L), output="GForce TRUE") -test(2243.55, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, options=list(datatable.optimize=2L, datatable.verbose=TRUE), - dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +opt = c(1L,2L) +out = c("GForce FALSE", "GForce TRUE") +test(2243.41, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42, levels=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18531,11 +17966,9 @@ test(2245.3, dt[1], data.table(foo = 1L, bar = 4L)) # Default in this environmen # data.table:: doesn't turn off GForce, #5942 DT = data.table(a = rep(1:5, 2L), b = 1:10) -old = options(datatable.optimize=Inf, datatable.verbose=TRUE) -test(2246.1, DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") -test(2246.2, DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") -test(2246.3, DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") -options(old) +test(2246.1, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::shift(b), by=a], DT[, shift(b), by=a], output="GForce TRUE") +test(2246.2, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::first(b), by=a], DT[, first(b), by=a], output="GForce TRUE") +test(2246.3, options=list(datatable.optimize=Inf, datatable.verbose=TRUE), DT[, data.table::last(b), by=a], DT[, last(b), by=a], output="GForce TRUE") # 5392 split(x,f) works with formula f dt = data.table(x=1:4, y=factor(letters[1:2])) @@ -18833,14 +18266,7 @@ test(2262.6, set(null.data.table(), j=c("a","b"), value=list(1:2, 3:4)), dt3) test(2262.7, data.table(a=1, b=2)[, c("a", "b") := list(NULL, NULL)], null.data.table()) test(2262.8, data.table(a=1, b=2)[, c("a", "b") := list(NULL)], null.data.table()) -# GForce retains attributes in by arguments #5567 -dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) -test(2263.1, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce FALSE") -test(2263.2, options=list(datatable.verbose=TRUE, datatable.optimize=0L), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce FALSE") -test(2263.3, options=list(datatable.verbose=TRUE, datatable.optimize=0L), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce FALSE") -test(2263.4, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, b], data.table(b=dt$b, N=1L), output="GForce optimized j to") -test(2263.5, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output="GForce optimized j to") -test(2263.6, options=list(datatable.verbose=TRUE, datatable.optimize=Inf), names(attributes(dt[, .N, b]$b)), c("class", "att"), output="GForce optimized j to") +# test 2263 moved to optimize.Rraw # tests for printing indices alongside data.tables NN = 200 @@ -18976,7 +18402,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270, options=c(datatable.optimize=1L), DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270, levels=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -19288,75 +18714,7 @@ test(2282.08, rowwiseDT(A=,B=,1,2,C=,4), error="Header must be the first N argum ncols = 1e6 test(2282.09, rowwiseDT(A=,ncols), data.table(A=ncols)) -# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 - -M <- as.data.table(mtcars) -M[, " " := hp] -M[, "." := hp] - -sdnames <- setdiff(names(M), "cyl") -sdlist <- vector("list", length(sdnames)) -names(sdlist) <- sdnames - -for (opt in c(0, 1, 2)) { - test(2283 + opt/10 + 0.001, options=c(datatable.optimize=opt), - names(M[, c(m=lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(m=sdlist)))) - test(2283 + opt/10 + 0.002, options=c(datatable.optimize=opt), - names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", sdnames)) - test(2283 + opt/10 + 0.003, options=c(datatable.optimize=opt), - names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", names(c(m=sdlist)))) - test(2283 + opt/10 + 0.004, options=c(datatable.optimize=opt), - names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), - c("cyl", "mpg", names(c(mpg=sdlist)))) - test(2283 + opt/10 + 0.005, options=c(datatable.optimize=opt), - names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "V1", sdnames)) - test(2283 + opt/10 + 0.006, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), - c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) - test(2283 + opt/10 + 0.007, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, sdnames)) - test(2283 + opt/10 + 0.008, options=c(datatable.optimize=opt), - names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(mean=sdlist, sum=sdlist)))) - test(2283 + opt/10 + 0.009, options=c(datatable.optimize=opt), - names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, names(c(sum=sdlist))) ) - test(2283 + opt/10 + 0.010, options=c(datatable.optimize=opt), - names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(" "=sdlist, "."=sdlist)))) - test(2283 + opt/10 + 0.011, options=c(datatable.optimize=opt), - names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(a=0, b=0))), sdnames)) - test(2283 + opt/10 + 0.012, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, 0))), sdnames)) - test(2283 + opt/10 + 0.013, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) - test(2283 + opt/10 + 0.014, options=c(datatable.optimize=opt), - names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0))), sdnames)) - test(2283 + opt/10 + 0.015, options=c(datatable.optimize=opt), - names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) - test(2283 + opt/10 + 0.016, options=c(datatable.optimize=opt), - names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) - test(2283 + opt/10 + 0.017, options=c(datatable.optimize=opt), - names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) - test(2283 + opt/10 + 0.018, options=c(datatable.optimize=opt), - names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) - test(2283 + opt/10 + 0.019, options=c(datatable.optimize=opt), - names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "V2", "b", "vs", "am")) -} +# test 2283 moved tests to optimize.Rraw # Confusing behavior with DT[, min(var):max(var)] #2069 DT = data.table(t = c(2L, 1L, 3L), a=0, b=1) diff --git a/man/test.Rd b/man/test.Rd index 594040aca9..4a25a592c5 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL) + options = NULL, env = NULL, levels = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -22,6 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } +\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize.level=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y for equality. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From 04e578230641f00646a297b9285d2ff74b499eac Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Thu, 30 Oct 2025 16:35:03 +0100 Subject: [PATCH 04/78] add lapply(list(col1, col2, ...), fun) pattern --- R/data.table.R | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/R/data.table.R b/R/data.table.R index aa184f5c4c..ffe5cfc24b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -364,6 +364,18 @@ replace_dot_alias = function(e) { jvnames = massage_result$jvnames funi = massage_result$funi } + # Pattern 3a2: lapply(list(col1, col2, ...), fun) + else if (is.call(jsub) && jsub %iscall% "lapply" && length(jsub) >= 2L && + jsub[[2L]] %iscall% "list" && length(jsub[[2L]]) > 1L) { + cnames = as.list(jsub[[2L]])[-1L] + if (all(vapply_1b(cnames, is.name))) { + cnames = vapply_1c(cnames, as.character) + massage_result = .massageSD(jsub, cnames, SDenv, funi) + jsub = massage_result$jsub + jvnames = NULL # consistent with datatable.optimize=0L behavior + funi = massage_result$funi + } + } # Pattern 3b: Map(fun, .SD) else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { massage_result = .massageSD(jsub, sdvars, SDenv, funi) From a8dde19a1aa0cca46eb9f9cfb318c57f8d79989e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 31 Oct 2025 14:51:43 +0100 Subject: [PATCH 05/78] turn on optimization --- R/data.table.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index ffe5cfc24b..c7eed447a5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -225,7 +225,7 @@ replace_dot_alias = function(e) { # For now, we optimise all functions mentioned in 'optfuns' below. optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") is_valid = TRUE - any_SD = FALSE + any_optimized = FALSE jsubl = as.list.default(jsub) oldjvnames = jvnames jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. @@ -235,7 +235,7 @@ replace_dot_alias = function(e) { # Case 1: Plain name (.SD or .N) if (is.name(this)) { # no need to check length(this)==1L; is.name() returns single TRUE or FALSE (documented); can't have a vector of names if (this == ".SD") { # optimise '.SD' alone - any_SD = TRUE + any_optimized = TRUE jsubl[[i_]] = lapply(sdvars, as.name) jvnames = c(jvnames, sdvars) } else if (this == ".N") { @@ -254,7 +254,7 @@ replace_dot_alias = function(e) { is_lapply = this[[1L]] == "lapply" && length(this) >= 2L && this[[2L]] == ".SD" is_map = this[[1L]] == "Map" && length(this) >= 3L && this[[3L]] == ".SD" if ((is_lapply || is_map) && length(sdvars)) { - any_SD = TRUE + any_optimized = TRUE massage_result = .massageSD(this, sdvars, SDenv, funi) funi = massage_result$funi jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.) @@ -289,6 +289,7 @@ replace_dot_alias = function(e) { jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && jvnames = c(jvnames, jn__) jsubl[[i_]] = jl__ + any_optimized = TRUE } } # Case 2c: Single-value functions like mean, sum, etc. @@ -299,7 +300,7 @@ replace_dot_alias = function(e) { else if (length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. - any_SD = TRUE + any_optimized = TRUE jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) jvnames = c(jvnames, sdvars) } @@ -326,7 +327,7 @@ replace_dot_alias = function(e) { } # Return result - if (!is_valid || !any_SD) { + if (!is_valid || !any_optimized) { # Can't optimize - return original list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE) } else { From 67f28749e1e18e7e90a76437b5c9b1139597043e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 1 Nov 2025 16:50:33 +0100 Subject: [PATCH 06/78] add type conversion support to GForce --- R/data.table.R | 20 ++++++++++++++++---- src/gsumm.c | 2 +- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index c7eed447a5..52686ed812 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -378,7 +378,9 @@ replace_dot_alias = function(e) { } } # Pattern 3b: Map(fun, .SD) - else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars)) { + # Only optimize if .SD appears exactly once to avoid cases like Map(rep, .SD, .SD) + else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars) && + sum(vapply_1b(as.list(jsub), function(x) identical(x, quote(.SD)))) == 1L) { massage_result = .massageSD(jsub, sdvars, SDenv, funi) jsub = massage_result$jsub jvnames = massage_result$jvnames @@ -3313,13 +3315,15 @@ is_constantish = function(q, check_singleton=FALSE) { .gweighted.mean_ok = function(q, x) { #3977 q = match.call(gweighted.mean, q) is_constantish(q[["na.rm"]]) && + !(is.symbol(q[["na.rm"]]) && q[["na.rm"]] %chin% names(x)) && (is.null(q[["w"]]) || eval(call('is.numeric', q[["w"]]), envir=x)) } # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls where x is a column of .SD .get_gcall = function(q) { if (!is.call(q)) return(NULL) # is.symbol() is for #1369, #1974 and #2949 - if (!is.symbol(q[[2L]])) return(NULL) + if (!is.symbol(q[[2L]]) && !is.call(q[[2L]])) return(NULL) + if (is.call(q[[2L]]) && !.is_type_conversion(q[[2L]])) return(NULL) q1 = q[[1L]] if (is.symbol(q1)) return(if (q1 %chin% gfuns) q1) if (!q1 %iscall% "::") return(NULL) @@ -3332,12 +3336,20 @@ is_constantish = function(q, check_singleton=FALSE) { # is robust to unnamed expr. Note that NA names are not possible here. .arg_is_narm = function(expr, which=3L) !is.null(nm <- names(expr)[which]) && startsWith(nm, "na") +.is_type_conversion = function(expr) { + is.call(expr) && is.symbol(expr[[1L]]) && expr[[1L]] %chin% + c("as.numeric", "as.double", "as.integer", "as.character", "as.integer64", + "as.complex", "as.logical", "as.Date", "as.POSIXct", "as.factor") +} + .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 q1 = .get_gcall(q) if (is.null(q1)) return(FALSE) - if (!(q2 <- q[[2L]]) %chin% names(x) && q2 != ".I") return(FALSE) # 875 - if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]))) return(TRUE) + q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] + if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && + !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) switch(as.character(q1), "shift" = .gshift_ok(q), "weighted.mean" = .gweighted.mean_ok(q, x), diff --git a/src/gsumm.c b/src/gsumm.c index 5970f59194..be8ce16119 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -410,7 +410,7 @@ SEXP gsum(SEXP x, SEXP narmArg) //Rprintf(_("gsum int took %.3f\n"), wallclock()-started); if (overflow) { UNPROTECT(1); // discard the result with overflow - warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.")); + warning(_("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience. Consider using 'as.numeric' on the column beforehand to avoid this warning.")); ans = PROTECT(allocVector(REALSXP, ngrp)); double *restrict ansp = REAL(ans); memset(ansp, 0, ngrp*sizeof(double)); From 2876ebef76346c1f315c74c498e9f39e91f4a5f4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 1 Nov 2025 16:57:13 +0100 Subject: [PATCH 07/78] remove stale branch --- R/test.data.table.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index bfc61412a3..decb600bb1 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -380,8 +380,6 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no val = get(p, envir=environment()) if (length(val) > 0L) { cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels - } else if (p %in% names(cl)) { - cl[[p]] = NULL } } From c445c3830ce83428181f6fe9559de8e059d95161 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:13:14 +0100 Subject: [PATCH 08/78] add tests --- inst/tests/optimize.Rraw | 455 +++++++++++++++++++++++++++++++++++++++ inst/tests/tests.Rraw | 16 ++ tests/optimize.R | 2 + 3 files changed, 473 insertions(+) create mode 100644 inst/tests/optimize.Rraw create mode 100644 tests/optimize.R diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw new file mode 100644 index 0000000000..57d53f0521 --- /dev/null +++ b/inst/tests/optimize.Rraw @@ -0,0 +1,455 @@ +require(methods) +if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { + if ((tt<-compiler::enableJIT(-1))>0) + cat("This is dev mode and JIT is enabled (level ", tt, ") so there will be a brief pause around the first test.\n", sep="") +} else { + require(data.table) + test = data.table:::test + null.data.table = data.table:::null.data.table + INT = data.table:::INT +} + +sugg = c("bit64") +for (s in sugg) { + assign(paste0("test_",s), loaded<-suppressWarnings(suppressMessages( + library(s, character.only=TRUE, logical.return=TRUE, quietly=TRUE, warn.conflicts=FALSE, pos="package:base") # attach at the end for #5101 + ))) + if (!loaded) cat("\n**** Suggested package",s,"is not installed or has dependencies missing. Tests using it will be skipped.\n\n") +} + +# := by group +DT = data.table(a=1:3,b=(1:9)/10) +test(611.1, levels=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +setkey(DT,a) +test(611.2, levels=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +# Combining := by group with i +test(611.3, levels=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4, levels=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +# 612 was just level repetition of 611 +# Assign to subset ok (NA initialized in the other items) ok : +test(613, levels=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614, levels=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615, levels=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) + +# Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 +ans = copy(DT)[,r:=NA_real_] +test(618.1, levels=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2, levels=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3, levels=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4, levels=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5, levels=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +# test 619 was level 2 of 618 + +DT = data.table(x=letters, key="x") +test(621, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") + +set.seed(2) +DT = data.table(a=rnorm(5)*10, b=1:5) +test(623, levels=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) + +# Setup for test 656.x - gforce tests +set.seed(9) +n = 1e3 +DT = data.table(grp1=sample.int(150L, n, replace=TRUE), + grp2=sample.int(150L, n, replace=TRUE), + x=rnorm(n), + y=rnorm(n)) +opt = 0:2 +out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') +test(656.1, levels=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2, levels=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3, levels=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) + +# Test := keyby does setkey, #2065 +DT = data.table(x=1:2, y=1:6) +ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") +test(670.1, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), + warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") +test(670.4, levels=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5, levels=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +# test 671 was level 2 of 670 + +# varname holding colnames, by group, linked from #2120. +DT = data.table(a=rep(1:3,1:3),b=1:6) +colname = "newcol" +test(751, levels=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) + +# Add tests for nested := in j by group, #1987 +DT = data.table(a=rep(1:3,2:4),b=1:9) +test(752, levels=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) + +DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) +opt = c(0:2, Inf) +out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') +# v1.9.7 treats wrapped {} better, so this is now optimized +test(865, levels=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867, levels=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here + +# tests of gsum and gmean with NA +DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) +set(DT,c(3L,8L),"y",NA) +set(DT,c(5L,9L),"v",NA) +set(DT,10:12,"y",NA) +set(DT,10:12,"v",NA) +opt = c(1L, 2L) +out = c("(GForce FALSE)", "GForce optimized j to") +test(1184.1, levels=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2, levels=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2, levels=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], + output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) +test(1187.1, levels=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], + data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +MyVar = TRUE +test(1187.2, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +MyVar = FALSE +test(1187.4, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +# GForce should not turn on when the .ok function isn't triggered +test(1187.6, levels=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], + data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), + output='j unchanged', warning="'trim' is not yet optimized") + +# FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. +set.seed(2L) +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) +test(1304.1, levels=0:2, dt[, list(.N, sum(y)), by=x]) +dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") +test(1304.2, levels=0:2, dt[, list(.N, sum(y)), by=x]) + +# gmin and gmax extensive testing (because there are tricky cases) +DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) +opts = 0:2 +# for integers +test(1313.01, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := INT(NA)] +test(1313.05, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +# for numeric +DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) +test(1313.09, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +# testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) +DT[x==6, y := NA_real_] +test(1313.13, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +# for date (attribute check.. especially after issues/689 !!!) +DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) +test(1313.17, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18, levels=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +DT[c(1,6), y := NA] +test(1313.19, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20, levels=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21, levels=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22, levels=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +# for character +set.seed(1L) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] +test(1313.23, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +DT[x==6, y := NA_character_] +test(1313.27, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) + +# Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now +dt = data.table(a=sample(3,20,TRUE), b=1:10) +test(1565.1, levels=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], + output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) + +# gforce optimisations +dt = data.table(x = sample(letters, 300, TRUE), + i1 = sample(-10:10, 300, TRUE), + i2 = sample(c(-10:10, NA), 300, TRUE), + d1 = as.numeric(sample(-10:10, 300, TRUE)), + d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) +if (test_bit64) { + dt[, `:=`(d3 = as.integer64(sample(-10:10, 300, TRUE)))] + dt[, `:=`(d4 = as.integer64(sample(c(-10:10,NA), 300, TRUE)))] +} +opt = 0:2 +out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') +# make sure gforce is on +# testing gforce::gmedian +test(1579.01, levels=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], + dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") +test(1579.02, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) +test(1579.03, levels=2L, dt[, lapply(.SD, median), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) +test(1579.04, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], + dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) +# testing gforce::ghead and gforce::gtail +# head(.SD, 1) and tail(.SD, 1) optimisation +test(1579.06, levels=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08, levels=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10, levels=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12, levels=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14, levels=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16, levels=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18, levels=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20, levels=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +# 1579.22 tested gtail with n>1; now 1579.4+ below +mysub <- function(x, n) x[n] +test(1579.23, levels=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, levels=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, levels=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, levels=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, levels=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28, levels=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +test(1579.401, levels=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, levels=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, levels=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, levels=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, levels=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, levels=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, levels=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, levels=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, levels=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, levels=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") + + +# FR #971, partly addressed (only subsets in 'i') +# make sure GForce kicks in and the results are identical +dt = data.table(x = sample(letters, 300, TRUE), + d1 = as.numeric(sample(-10:10, 300, TRUE)), + d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) +opt = 1:2 +out = c("GForce FALSE","GForce optimized j") +test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], + c(.N, lapply(.SD, sum, na.rm=TRUE), + lapply(.SD, min, na.rm=TRUE), + lapply(.SD, max, na.rm=TRUE), + lapply(.SD, mean, na.rm=TRUE), + lapply(.SD, median, na.rm=TRUE) + ), by=x, verbose=TRUE], + output = out) + +# subsets in 'i' for head and tail +test(1581.04, levels=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07, levels=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10, levels=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +# #3209 g[[ +test(1581.13, levels=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +# also, block for non-atomic input, #4159 +dt = data.table(a=1:3) +dt[ , l := .(list(1, 2, 3))] +test(1581.16, dt[ , .(l = l[[1L]]), by=a, verbose=TRUE], + dt[ , l := unlist(l)], output='(GForce FALSE)') +# make sure not to apply when `[[` is applied to a nested call, #4413 +DT = data.table(f1=c("a","b"), f2=c("x","y")) +l = list(a = c(x = "ax", y = "ay"), b = c(x = "bx", y = "by")) +test(1581.17, DT[ , as.list(l[[f1]])[[f2]], by=c("f1","f2")], + data.table(f1 = c("a", "b"), f2 = c("x", "y"), V1 = c("ax", "by"))) +test(1581.18, DT[, v:=l[[f1]][f2], by=c("f1","f2")], + data.table(f1=c("a","b"), f2=c("x","y"), v=c("ax", "by"))) +# When the object being [[ is in parent.frame(), not x, +# need eval to have enclos=parent.frame(), #4612 +DT = data.table(id = c(1, 1, 2), value = c("a", "b", "c")) +DT0 = copy(DT) +fun = function(DT, tag = c("A", "B")) DT[, var := tag[[.GRP]], by = "id"] +fun(DT) +test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) + +# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level +# verbatim test from the original report: +DT = data.table( + C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), + C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), + Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) +opt = 0:2 +test(1583.1, levels=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], + data.table(C1=c("A","A","B","B","C","C"), + C2=c("a","b","b","c","c","d"), + agg=c(1,4,5,8,9,10))) +# extra test with a size-1 group containing one NaN too +DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) +test(1583.2, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4, levels=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5, levels=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) + +# FR #523, var, sd and prod +DT = data.table(x=sample(5, 100, TRUE), + y1=sample(6, 100, TRUE), + y2=sample(c(1:10,NA), 100, TRUE), + z1=runif(100), + z2=sample(c(runif(10),NA,NaN), 100, TRUE)) +opt = 0:2 +out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") +test(1594.01, levels=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +# coverage: default group .N=1 case +idx=DT[ , .I[1L], by=x]$V1 +ans=data.table(x=DT[(idx), x], V1=NA_real_) +test(1594.05, levels=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06, levels=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07, levels=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08, levels=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) + +test(1594.09, levels=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) + +test(1594.12, levels=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) + +# when datatable.optimize<1, no optimisation of j should take place: +dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) +test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], output="All optimizations are turned off") + +# weighted.mean GForce optimized, #3977 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +opt = c(1L,2L) +out = c("GForce FALSE", "GForce optimized j to") +test(2231.01, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02, levels=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03, levels=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +# multiple groups +DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.04, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06, levels=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +# (only x XOR w) containing NA +DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.07, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (only x XOR w) containing NaN +DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.11, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (only x XOR w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) +test(2231.15, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# (x and w) containing NA and NaN +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.19, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) +test(2231.21, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# let wrongly named arguments get lost in ellipsis #5543 +DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) +test(2231.61, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62, levels=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64, levels=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) + +# GForce retains attributes in by arguments #5567 +dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) +opt = c(0,Inf) +out = c("GForce FALSE", "GForce optimized j to") +test(2263.1, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3, levels=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) + +# named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 +M <- as.data.table(mtcars) +M[, " " := hp] +M[, "." := hp] + +sdnames <- setdiff(names(M), "cyl") +sdlist <- vector("list", length(sdnames)) +names(sdlist) <- sdnames + +opts = 0:2 +test(2283 + 0.01, levels=opts, + names(M[, c(m=lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(m=sdlist)))) +test(2283 + 0.02, levels=opts, + names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", sdnames)) +test(2283 + 0.03, levels=opts, + names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), + c("cyl", "Mpg", names(c(m=sdlist)))) +test(2283 + 0.04, levels=opts, + names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), + c("cyl", "mpg", names(c(mpg=sdlist)))) +test(2283 + 0.05, levels=opts, + names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", "V1", sdnames)) +test(2283 + 0.06, levels=opts, + names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), + c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) +test(2283 + 0.07, levels=opts, + names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, sdnames)) +test(2283 + 0.08, levels=opts, + names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(mean=sdlist, sum=sdlist)))) +test(2283 + 0.09, levels=opts, + names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), + c("cyl", sdnames, names(c(sum=sdlist))) ) +test(2283 + 0.10, levels=opts, + names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), + c("cyl", names(c(" "=sdlist, "."=sdlist)))) +test(2283 + 0.11, levels=opts, + names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(a=0, b=0))), sdnames)) +test(2283 + 0.12, levels=opts, + names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, 0))), sdnames)) +test(2283 + 0.13, levels=opts, + names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) +test(2283 + 0.14, levels=opts, + names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(A=list(0))), sdnames)) +test(2283 + 0.15, levels=opts, + names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) +test(2283 + 0.16, levels=opts, + names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), + c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) +test(2283 + 0.17, levels=opts, + names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) +test(2283 + 0.18, levels=opts, + names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "b", "vs", "am")) +test(2283 + 0.19, levels=opts, + names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), + c("cyl", "V1", "V2", "b", "vs", "am")) + +# Test lapply(list(...), fun) optimization +dt = as.data.table(iris) +opt = c(0L, 1L) +out = c('GForce FALSE', 'lapply optimization changed j') +test(2284.1, levels=opt, dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species, verbose=TRUE], output=out) +test(2284.2, levels=opt, names(dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species])) +test(2284.3, levels=opt, dt[, c(list(sum(Petal.Length), sum(Sepal.Length))), by=Species, verbose=TRUE], output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5540b212d3..6c191e91f6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13676,6 +13676,7 @@ DT = data.table(A=1:5, B=-3i, C=2147483647L) test(1968.2, storage.mode(DT$C), "integer") test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)), warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'") +test(1968.35, DT[, sum(as.numeric(C)), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294))) DT[3,C:=NA] test(1968.4, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(NA, 4294967294)), warning="coerced to 'numeric'") test(1968.5, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 4294967294)), warning="coerced to 'numeric'") @@ -21213,3 +21214,18 @@ test(2344.03, setkey(d1[, .(V1, label = c("one", "zero", "one"), V2)][data.table # keep sub-key in case of multiple keys, even with new columns and changing column order DT = data.table(V1 = 1:2, V2 = 3:4, V3 = 5:6, key = c("V1", "V2", "V3")) test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", "V2")) + +# gforce should also work with Map in j #5336 +# conversions should not turn gforce off #2934 +# lapply gforce should also work without .SD #5032 +dt = data.table(a=1:4, b=1:2) +out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.03, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) +test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +dt = data.table(a=1:2, b=1, c=1:4) +test(2345.06, levels=1:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) +test(2345.07, levels=1:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) diff --git a/tests/optimize.R b/tests/optimize.R new file mode 100644 index 0000000000..69dc4954cc --- /dev/null +++ b/tests/optimize.R @@ -0,0 +1,2 @@ +require(data.table) +test.data.table(script="optimize.Rraw") From 5410e315bcf1dee46ba2b1de63ac1c381801ec9a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:26:27 +0100 Subject: [PATCH 09/78] update man --- man/test.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test.Rd b/man/test.Rd index 4a25a592c5..5a2f459f2d 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize.level=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y for equality. } +\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From dece1c65b279a2196946947780b8f010bc1f4bbb Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:32:29 +0100 Subject: [PATCH 10/78] merge tests --- inst/tests/optimize.Rraw | 8 -------- inst/tests/tests.Rraw | 6 ++++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 57d53f0521..16a6bb5a43 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -445,11 +445,3 @@ test(2283 + 0.18, levels=opts, test(2283 + 0.19, levels=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) - -# Test lapply(list(...), fun) optimization -dt = as.data.table(iris) -opt = c(0L, 1L) -out = c('GForce FALSE', 'lapply optimization changed j') -test(2284.1, levels=opt, dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species, verbose=TRUE], output=out) -test(2284.2, levels=opt, names(dt[, lapply(list(Petal.Length, Sepal.Length), sum), by=Species])) -test(2284.3, levels=opt, dt[, c(list(sum(Petal.Length), sum(Sepal.Length))), by=Species, verbose=TRUE], output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6c191e91f6..9f22e624fd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21227,5 +21227,7 @@ dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.06, levels=1:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) -test(2345.07, levels=1:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=c("lapply optimization changed j", "GForce TRUE")) +out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") +test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) From 5e1789dd3c9898879b937433d8be671cd84cb210 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 15:37:37 +0100 Subject: [PATCH 11/78] polish test fun --- R/test.data.table.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index decb600bb1..645addcd80 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -379,17 +379,16 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no for (p in vector_params) { val = get(p, envir=environment()) if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # ccycle through values if fewer than levels + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than levels } } if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) - eval(cl, parent.frame()) + eval(cl, parent.frame()) # actual test call } return(invisible()) } - # print(match.call()) if (!is.null(env)) { old = Sys.getenv(names(env), names=TRUE, unset=NA) to_unset = !lengths(env) From 62f1c483115caa295525af56bfe3b8a839a2b476 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 16:40:56 +0100 Subject: [PATCH 12/78] add arithmetic --- R/data.table.R | 66 ++++++++++++++++++++++++++++++------------- inst/tests/tests.Rraw | 5 ++++ 2 files changed, 52 insertions(+), 19 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 52686ed812..8ac34f8336 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3344,31 +3344,59 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 + if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + q1 = .get_gcall(q) - if (is.null(q1)) return(FALSE) - q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] - if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 - if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && - !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) - switch(as.character(q1), - "shift" = .gshift_ok(q), - "weighted.mean" = .gweighted.mean_ok(q, x), - "tail" = , "head" = .ghead_ok(q), - "[[" = , "[" = `.g[_ok`(q, x, envir), - FALSE - ) + if (!is.null(q1)) { + q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] + if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 + if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && + !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) + return(switch(as.character(q1), + "shift" = .gshift_ok(q), + "weighted.mean" = .gweighted.mean_ok(q, x), + "tail" = , "head" = .ghead_ok(q), + "[[" = , "[" = `.g[_ok`(q, x, envir), + FALSE + )) + } + + # check if arithmetic operator -> recursively validate ALL branches (like in AST) + if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + for (i in 2:length(q)) { + if (!.gforce_ok(q[[i]], x, envir)) return(FALSE) + } + return(TRUE) + } + + FALSE } .gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { - call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. - q[[1L]] = as.name(paste0("g", call_name)) - # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok - # do not evaluate vars present as columns in x - if (length(q) >= 3L) { - for (i in 3:length(q)) { - if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 + if (!is.call(q)) return(q) + + q1 = .get_gcall(q) + if (!is.null(q1)) { + call_name = if (is.symbol(q[[1L]])) q[[1L]] else q[[1L]][[3L]] # latter is like data.table::shift, #5942. .gshift_ok checked this will work. + q[[1L]] = as.name(paste0("g", call_name)) + # gforce needs to evaluate arguments before calling C part TODO: move the evaluation into gforce_ok + # do not evaluate vars present as columns in x + if (length(q) >= 3L) { + for (i in 3:length(q)) { + if (is.symbol(q[[i]]) && !(q[[i]] %chin% names_x)) q[[i]] = eval(q[[i]], envir) # tests 1187.2 & 1187.4 + } + } + return(q) + } + + # if arithmetic operator, recursively substitute its operands. we know what branches are valid from .gforce_ok + if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + for (i in 2:length(q)) { + q[[i]] = .gforce_jsub(q[[i]], names_x, envir) } + return(q) } + # should not reach here since .gforce_ok q } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f22e624fd..2a58d0ee23 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21218,6 +21218,7 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # gforce should also work with Map in j #5336 # conversions should not turn gforce off #2934 # lapply gforce should also work without .SD #5032 +# support arithmetic in j with gforce #3815 dt = data.table(a=1:4, b=1:2) out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) @@ -21231,3 +21232,7 @@ out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +dt = data.table(a=1:4, b=1:2) +out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +test(2345.09, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +dt = data.table(a=1:4, b=1:2) From c47ec27e7039a47f444913d040089235878263cc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 17:32:11 +0100 Subject: [PATCH 13/78] add AST walker and update tests --- R/data.table.R | 11 +++++++++-- inst/tests/tests.Rraw | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 8ac34f8336..88d34510d3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3342,9 +3342,12 @@ is_constantish = function(q, check_singleton=FALSE) { "as.complex", "as.logical", "as.Date", "as.POSIXct", "as.factor") } +.gforce_ops = c("+", "-", "*", "/", "^", "%%", "%/%") + .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + if (q %iscall% "(") return(.gforce_ok(q[[2L]], x, envir)) q1 = .get_gcall(q) if (!is.null(q1)) { @@ -3362,7 +3365,7 @@ is_constantish = function(q, check_singleton=FALSE) { } # check if arithmetic operator -> recursively validate ALL branches (like in AST) - if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { for (i in 2:length(q)) { if (!.gforce_ok(q[[i]], x, envir)) return(FALSE) } @@ -3374,6 +3377,10 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_jsub = function(q, names_x, envir=parent.frame(2L)) { if (!is.call(q)) return(q) + if (q %iscall% "(") { + q[[2L]] = .gforce_jsub(q[[2L]], names_x, envir) + return(q) + } q1 = .get_gcall(q) if (!is.null(q1)) { @@ -3390,7 +3397,7 @@ is_constantish = function(q, check_singleton=FALSE) { } # if arithmetic operator, recursively substitute its operands. we know what branches are valid from .gforce_ok - if (is.symbol(q[[1L]]) && q[[1L]] %chin% c("+", "-", "*", "/", "^", "%%", "%/%")) { + if (is.symbol(q[[1L]]) && q[[1L]] %chin% .gforce_ops) { for (i in 2:length(q)) { q[[i]] = .gforce_jsub(q[[i]], names_x, envir) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2a58d0ee23..f8db7b2d34 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21219,20 +21219,20 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # conversions should not turn gforce off #2934 # lapply gforce should also work without .SD #5032 # support arithmetic in j with gforce #3815 -dt = data.table(a=1:4, b=1:2) out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +dt = data.table(a=1:4, b=1:2) test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) -test(2345.03, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +dt = data.table(a=1:4, b=1:2) +test(2345.11, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12, levels=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.04, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.05, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -out = c("GForce FALSE", "lapply optimization changed j", "GForce TRUE") -test(2345.06, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.07, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.08, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) -dt = data.table(a=1:4, b=1:2) -out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") -test(2345.09, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.21, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) +test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) From 1d324d60b991547bbedfa944e0ceaeb3b894041f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 18:16:34 +0100 Subject: [PATCH 14/78] add tests --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 88d34510d3..155e1f74ef 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3346,7 +3346,7 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 - if (!is.call(q)) return(FALSE) # plain columns are not gforce-able since they might not aggregate (see test 104.1) + if (!is.call(q)) return(is.numeric(q)) # plain columns are not gforce-able since they might not aggregate (see test 104.1) if (q %iscall% "(") return(.gforce_ok(q[[2L]], x, envir)) q1 = .get_gcall(q) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f8db7b2d34..2449e0cf94 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21236,3 +21236,7 @@ test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33, levels=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34, levels=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35, levels=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36, levels=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") From 22cf35eb5b897eda3c8e576589bd0e08b4c1d045 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 2 Nov 2025 19:42:11 +0100 Subject: [PATCH 15/78] add NEWS --- NEWS.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3ae0dc33fd..9435971257 100644 --- a/NEWS.md +++ b/NEWS.md @@ -296,7 +296,16 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T # user system elapsed # 0.028 0.000 0.005 ``` - 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + +21. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. + + This rewrite also introduces several new optimizations: + - Enables Map instead of lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) + - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) + - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) + - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) ### BUG FIXES From 25a7e2e4a2540e80f950ec87fec4a5b4319863d0 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 09:45:52 +0100 Subject: [PATCH 16/78] make function name in massageSD more expressive --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 155e1f74ef..41d0b5246f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -158,7 +158,7 @@ replace_dot_alias = function(e) { # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) # replaced SDenv$.SD to SDenv to deal with Bug #87 reported by Ricardo (Nice catch!) - thisfun = paste0("..FUN", funi) # Fix for #985 + thisfun = paste0("..LAPPLY_FUN", funi) # Fix for #985 assign(thisfun, eval(fun, SDenv, SDenv), SDenv) # to avoid creating function() for each column of .SD lockBinding(thisfun, SDenv) txt[[1L]] = as.name(thisfun) From eb8056cb12fdf7319333b69eef626f6cda3478d7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 10:35:35 +0100 Subject: [PATCH 17/78] rename levels argument to optimization --- R/test.data.table.R | 16 +- inst/tests/benchmark.Rraw | 6 +- inst/tests/optimize.Rraw | 336 +++++++++++++++++++------------------- inst/tests/tests.Rraw | 200 +++++++++++------------ man/test.Rd | 2 +- 5 files changed, 280 insertions(+), 280 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 645addcd80..fa2cb426c8 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,25 +361,25 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,levels=NULL) { - # if levels is provided, test across multiple optimization levels - if (!is.null(levels)) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimization=NULL) { + # if optimization is provided, test across multiple optimization levels + if (!is.null(optimization)) { cl = match.call() - cl$levels = NULL # Remove levels from the recursive call + cl$optimization = NULL # Remove optimization levels from the recursive call vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") # Check if y was explicitly provided (not just the default) y_provided = !missing(y) - compare = !y_provided && length(levels)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + compare = !y_provided && length(optimization)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) - for (i in seq_along(levels)) { + for (i in seq_along(optimization)) { cl$num = num + (i - 1L) * 1e-6 - opt_level = list(datatable.optimize = levels[i]) + opt_level = list(datatable.optimize = optimization[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level for (p in vector_params) { val = get(p, envir=environment()) if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than levels + cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels } } diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 52a91c723f..cdc541559b 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -test(637.1, levels=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -test(637.2, levels=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -test(637.3, levels=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 16a6bb5a43..cade0c60c7 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -19,34 +19,34 @@ for (s in sugg) { # := by group DT = data.table(a=1:3,b=(1:9)/10) -test(611.1, levels=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(611.1,optimization=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(611.2, levels=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(611.2,optimization=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Combining := by group with i -test(611.3, levels=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4, levels=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +test(611.3,optimization=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4,optimization=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # 612 was just level repetition of 611 # Assign to subset ok (NA initialized in the other items) ok : -test(613, levels=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614, levels=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615, levels=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +test(613,optimization=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614,optimization=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615,optimization=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618.1, levels=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) -test(618.2, levels=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3, levels=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) -test(618.4, levels=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) -test(618.5, levels=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +test(618.1,optimization=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2,optimization=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3,optimization=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4,optimization=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5,optimization=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) # test 619 was level 2 of 618 DT = data.table(x=letters, key="x") -test(621, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622, levels=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") +test(621,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623, levels=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +test(623,optimization=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Setup for test 656.x - gforce tests set.seed(9) @@ -57,36 +57,36 @@ DT = data.table(grp1=sample.int(150L, n, replace=TRUE), y=rnorm(n)) opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -test(656.1, levels=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) -test(656.2, levels=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) -test(656.3, levels=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) +test(656.1,optimization=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2,optimization=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3,optimization=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) -test(670.2, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) -test(670.3, levels=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(670.1,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -test(670.4, levels=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -test(670.5, levels=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(670.4,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5,optimization=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # test 671 was level 2 of 670 # varname holding colnames, by group, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751, levels=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751,optimization=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752, levels=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752,optimization=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) opt = c(0:2, Inf) out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized -test(865, levels=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) -test(867, levels=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +test(865,optimization=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867,optimization=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here # tests of gsum and gmean with NA DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) @@ -96,82 +96,82 @@ set(DT,10:12,"y",NA) set(DT,10:12,"v",NA) opt = c(1L, 2L) out = c("(GForce FALSE)", "GForce optimized j to") -test(1184.1, levels=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) -test(1184.2, levels=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.2, levels=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], +test(1184.1,optimization=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2,optimization=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2,optimization=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -test(1187.1, levels=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], +test(1187.1,optimization=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) MyVar = TRUE -test(1187.2, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.3, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.2,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) MyVar = FALSE -test(1187.4, levels=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.5, levels=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.4,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) # GForce should not turn on when the .ok function isn't triggered -test(1187.6, levels=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], +test(1187.6,optimization=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), output='j unchanged', warning="'trim' is not yet optimized") # FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. set.seed(2L) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -test(1304.1, levels=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.1,optimization=0:2, dt[, list(.N, sum(y)), by=x]) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -test(1304.2, levels=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.2,optimization=0:2, dt[, list(.N, sum(y)), by=x]) # gmin and gmax extensive testing (because there are tricky cases) DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) opts = 0:2 # for integers -test(1313.01, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.01,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := INT(NA)] -test(1313.05, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +test(1313.05,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.09,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := NA_real_] -test(1313.13, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +test(1313.13,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18, levels=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +test(1313.17,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) DT[c(1,6), y := NA] -test(1313.19, levels=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20, levels=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21, levels=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22, levels=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +test(1313.19,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21,optimization=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22,optimization=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) DT[x==7, y := c("","b","c")] -test(1313.23, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25, levels=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26, levels=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.23,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) DT[x==6, y := NA_character_] -test(1313.27, levels=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28, levels=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +test(1313.27,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565.1, levels=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], +test(1565.1,optimization=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) # gforce optimisations @@ -188,32 +188,32 @@ opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') # make sure gforce is on # testing gforce::gmedian -test(1579.01, levels=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], +test(1579.01,optimization=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") -test(1579.02, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], +test(1579.02,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03, levels=2L, dt[, lapply(.SD, median), keyby=x], +test(1579.03,optimization=2L, dt[, lapply(.SD, median), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04, levels=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], +test(1579.04,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) # testing gforce::ghead and gforce::gtail # head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06, levels=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.08, levels=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.10, levels=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.12, levels=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) -test(1579.14, levels=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.16, levels=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.18, levels=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.20, levels=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.06,optimization=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08,optimization=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10,optimization=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12,optimization=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14,optimization=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16,optimization=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18,optimization=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20,optimization=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) # 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23, levels=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24, levels=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25, levels=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26, levels=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, levels=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28, levels=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +test(1579.23,optimization=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24,optimization=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25,optimization=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26,optimization=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27,optimization=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28,optimization=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) # gforce head/tail for n>1, #5060 set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), @@ -223,16 +223,16 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), s = sample(c("foo","bar",NA), 20, TRUE), l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -test(1579.401, levels=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402, levels=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403, levels=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404, levels=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405, levels=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406, levels=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407, levels=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408, levels=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409, levels=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410, levels=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") +test(1579.401,optimization=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402,optimization=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403,optimization=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404,optimization=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405,optimization=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406,optimization=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407,optimization=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408,optimization=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409,optimization=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410,optimization=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") # FR #971, partly addressed (only subsets in 'i') @@ -242,7 +242,7 @@ dt = data.table(x = sample(letters, 300, TRUE), d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) opt = 1:2 out = c("GForce FALSE","GForce optimized j") -test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], +test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), @@ -252,11 +252,11 @@ test(1581.01, levels=opt, ans1 <- dt[x %in% letters[15:20], output = out) # subsets in 'i' for head and tail -test(1581.04, levels=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.07, levels=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.10, levels=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +test(1581.04,optimization=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07,optimization=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10,optimization=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) # #3209 g[[ -test(1581.13, levels=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +test(1581.13,optimization=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) # also, block for non-atomic input, #4159 dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] @@ -284,16 +284,16 @@ DT = data.table( C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) opt = 0:2 -test(1583.1, levels=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], +test(1583.1,optimization=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], data.table(C1=c("A","A","B","B","C","C"), C2=c("a","b","b","c","c","d"), agg=c(1,4,5,8,9,10))) # extra test with a size-1 group containing one NaN too DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2, levels=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3, levels=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4, levels=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5, levels=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.2,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4,optimization=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5,optimization=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # FR #523, var, sd and prod DT = data.table(x=sample(5, 100, TRUE), @@ -303,24 +303,24 @@ DT = data.table(x=sample(5, 100, TRUE), z2=sample(c(runif(10),NA,NaN), 100, TRUE)) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(1594.01, levels=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) -test(1594.02, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) -test(1594.03, levels=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.01,optimization=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) # coverage: default group .N=1 case idx=DT[ , .I[1L], by=x]$V1 ans=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05, levels=opt, DT[(idx), var(y1), by=x], ans) -test(1594.06, levels=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) -test(1594.07, levels=opt, DT[(idx), var(z1), by=x], ans) -test(1594.08, levels=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) +test(1594.05,optimization=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06,optimization=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07,optimization=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08,optimization=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) -test(1594.09, levels=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) -test(1594.10, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11, levels=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.09,optimization=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) -test(1594.12, levels=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) -test(1594.13, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) -test(1594.14, levels=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.12,optimization=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) # when datatable.optimize<1, no optimisation of j should take place: dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) @@ -330,53 +330,53 @@ test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], outp DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) opt = c(1L,2L) out = c("GForce FALSE", "GForce optimized j to") -test(2231.01, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) -test(2231.02, levels=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) -test(2231.03, levels=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.01,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02,optimization=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03,optimization=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) # multiple groups DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.05, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.06, levels=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +test(2231.04,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06,optimization=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) # (only x XOR w) containing NA DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.08, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) -test(2231.09, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.10, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.07,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NaN DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) -test(2231.12, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.13, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) -test(2231.14, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.11,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.16, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.17, levels=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.18, levels=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.15,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (x and w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.20, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.19,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.22, levels=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.21,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # let wrongly named arguments get lost in ellipsis #5543 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61, levels=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) -test(2231.62, levels=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -test(2231.63, levels=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64, levels=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) +test(2231.61,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62,optimization=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64,optimization=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) # GForce retains attributes in by arguments #5567 dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) opt = c(0,Inf) out = c("GForce FALSE", "GForce optimized j to") -test(2263.1, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) -test(2263.2, levels=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) -test(2263.3, levels=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) +test(2263.1,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3,optimization=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) # named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 M <- as.data.table(mtcars) @@ -388,60 +388,60 @@ sdlist <- vector("list", length(sdnames)) names(sdlist) <- sdnames opts = 0:2 -test(2283 + 0.01, levels=opts, +test(2283 + 0.01, optimization=opts, names(M[, c(m=lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(m=sdlist)))) -test(2283 + 0.02, levels=opts, +test(2283 + 0.02, optimization=opts, names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", sdnames)) -test(2283 + 0.03, levels=opts, +test(2283 + 0.03, optimization=opts, names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", names(c(m=sdlist)))) -test(2283 + 0.04, levels=opts, +test(2283 + 0.04, optimization=opts, names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), c("cyl", "mpg", names(c(mpg=sdlist)))) -test(2283 + 0.05, levels=opts, +test(2283 + 0.05, optimization=opts, names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "V1", sdnames)) -test(2283 + 0.06, levels=opts, +test(2283 + 0.06, optimization=opts, names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) -test(2283 + 0.07, levels=opts, +test(2283 + 0.07, optimization=opts, names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, sdnames)) -test(2283 + 0.08, levels=opts, +test(2283 + 0.08, optimization=opts, names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(mean=sdlist, sum=sdlist)))) -test(2283 + 0.09, levels=opts, +test(2283 + 0.09, optimization=opts, names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, names(c(sum=sdlist))) ) -test(2283 + 0.10, levels=opts, +test(2283 + 0.10, optimization=opts, names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(" "=sdlist, "."=sdlist)))) -test(2283 + 0.11, levels=opts, +test(2283 + 0.11, optimization=opts, names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(a=0, b=0))), sdnames)) -test(2283 + 0.12, levels=opts, +test(2283 + 0.12, optimization=opts, names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, 0))), sdnames)) -test(2283 + 0.13, levels=opts, +test(2283 + 0.13, optimization=opts, names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) -test(2283 + 0.14, levels=opts, +test(2283 + 0.14, optimization=opts, names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0))), sdnames)) -test(2283 + 0.15, levels=opts, +test(2283 + 0.15, optimization=opts, names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.16, levels=opts, +test(2283 + 0.16, optimization=opts, names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.17, levels=opts, +test(2283 + 0.17, optimization=opts, names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.18, levels=opts, +test(2283 + 0.18, optimization=opts, names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.19, levels=opts, +test(2283 + 0.19, optimization=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2449e0cf94..869616111e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3890,7 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -test(1133.7, levels=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7,optimization=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3993,8 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -test(1143.4, levels=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5, levels=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) @@ -4465,16 +4465,16 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") # turn off GForce, to test dogroups -test(1230, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1230,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231, levels=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232, levels=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233, levels=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234, levels=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232,optimization=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233,optimization=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234,optimization=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235, levels=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236, levels=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237, levels=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1235,optimization=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236,optimization=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237,optimization=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4741,39 +4741,39 @@ dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01, levels=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02, levels=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03, levels=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04, levels=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05, levels=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01,optimization=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.02,optimization=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') +test(1268.03,optimization=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.04,optimization=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.05,optimization=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # newly added tests for #861 -- optimise, but no GForce -test(1268.06, levels=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06,optimization=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") # don't optimise .I in c(...) -test(1268.07, levels=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") +test(1268.07,optimization=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") -test(1268.08, levels=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09, levels=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10, levels=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11, levels=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12, levels=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13, levels=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14, levels=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.08,optimization=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") +test(1268.09,optimization=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.10,optimization=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') +test(1268.11,optimization=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.12,optimization=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.13,optimization=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") +test(1268.14,optimization=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.15, levels=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, +test(1268.15,optimization=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16, levels=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, +test(1268.16,optimization=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17, levels=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, +test(1268.17,optimization=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18, levels=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, +test(1268.18,optimization=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19, levels=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, +test(1268.19,optimization=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20, levels=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, +test(1268.20,optimization=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21, levels=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, +test(1268.21,optimization=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") -test(1268.22, levels=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22,optimization=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") @@ -6278,7 +6278,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -test(1437.21, levels=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) +test(1437.21,optimization=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6364,16 +6364,16 @@ for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery))]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, levels=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) + test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } @@ -8778,13 +8778,13 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -test(1629.09, levels=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +test(1629.09,optimization=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10, levels=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11, levels=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10,optimization=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11,optimization=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12, levels=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12,optimization=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] @@ -12756,8 +12756,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -test(1918.5, levels=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6, levels=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5,optimization=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6,optimization=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -13640,11 +13640,11 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -test(1967.69, levels=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') -test(1967.71, levels=1L, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69,optimization=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71,optimization=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72, levels=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72,optimization=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") x = data.table( @@ -13798,8 +13798,8 @@ test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1, levels=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2, levels=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1,optimization=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2,optimization=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -13831,7 +13831,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -test(1984.10, levels=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10,optimization=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -14717,10 +14717,10 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 opt = c(Inf,1L,0L) -test(2043.1, levels=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -test(2043.4, levels=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +test(2043.1,optimization=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4,optimization=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5, levels=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5,optimization=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -15883,10 +15883,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -test(2098.5, levels=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6, levels=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7, levels=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8, levels=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +test(2098.5,optimization=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6,optimization=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7,optimization=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8,optimization=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17311,14 +17311,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - test(2208.1, levels=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3, levels=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1,optimization=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3,optimization=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7, levels=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8, levels=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + test(2208.7,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17538,12 +17538,12 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, levels=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, levels=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, optimization=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -17580,7 +17580,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, levels=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, optimization=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -17685,15 +17685,15 @@ test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") DT = data.table(a=1:3,b=(1:9)/10) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(2233.01, levels=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) +test(2233.01,optimization=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02, levels=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) +test(2233.02,optimization=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03, levels=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") +test(2233.03,optimization=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04, levels=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) -test(2233.05, levels=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) +test(2233.04,optimization=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05,optimization=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -17768,17 +17768,17 @@ opt = c(0,Inf) out = c("GForce FALSE", "GForce.*gsum") B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle -test(2233.391, levels=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +test(2233.391,optimization=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) -test(2233.392, levels=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) -test(2233.393, levels=opt, any(A[,t1!=t2]), FALSE) -test(2233.394, levels=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) -test(2233.395, levels=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) +test(2233.392,optimization=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393,optimization=opt, any(A[,t1!=t2]), FALSE) +test(2233.394,optimization=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395,optimization=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -test(2233.41, levels=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) +test(2233.41,optimization=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -17789,7 +17789,7 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43, levels=c(0,Inf), options = list(datatable.verbose=TRUE), +test(2233.43,optimization=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) @@ -17941,12 +17941,12 @@ i=c(1,2) j=1L opt = c(1L,2L) out = c("GForce FALSE", "GForce TRUE") -test(2243.41, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.42, levels=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) -test(2243.54, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) -test(2243.55, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56, levels=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +test(2243.41,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42,optimization=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18403,7 +18403,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270, levels=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270,optimization=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -21221,22 +21221,22 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # support arithmetic in j with gforce #3815 out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") dt = data.table(a=1:4, b=1:2) -test(2345.01, levels=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2345.02, levels=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.01,optimization=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02,optimization=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) dt = data.table(a=1:4, b=1:2) -test(2345.11, levels=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2345.12, levels=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.11,optimization=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12,optimization=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.13, levels=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.14, levels=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13,optimization=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14,optimization=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.21, levels=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.22, levels=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.23, levels=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +test(2345.21,optimization=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22,optimization=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23,optimization=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) -test(2345.31, levels=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2345.32, levels=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2345.33, levels=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2345.34, levels=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2345.35, levels=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2345.36, levels=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.31,optimization=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32,optimization=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33,optimization=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34,optimization=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35,optimization=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36,optimization=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") diff --git a/man/test.Rd b/man/test.Rd index 5a2f459f2d..f9d3e4efa1 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{levels}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=level)} set accordingly. All levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } +\item{optimization}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimization)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From 4544398e0b5ff8f7e8b3a4ef55c32fd242cfee63 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:22:00 +0100 Subject: [PATCH 18/78] update docs --- man/test.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test.Rd b/man/test.Rd index f9d3e4efa1..2e216041c5 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL, levels = NULL) + options = NULL, env = NULL, optimization = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } From d40edb82c4c23da141a2bcc797fbc85208f6d51e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:46:16 +0100 Subject: [PATCH 19/78] restore test nums --- inst/tests/benchmark.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index cdc541559b..0132e2995d 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +test(637.1, optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +test(637.3, optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes From 5e7efb73faf7ce4d83c3ec0790434eb3508cd770 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 11:49:47 +0100 Subject: [PATCH 20/78] remove double tests --- inst/tests/tests.Rraw | 2 -- 1 file changed, 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 869616111e..70cbf36ac4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3995,8 +3995,6 @@ DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.1 test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.6, copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.7, copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 From 3826927933e72c718237ebf16238422889f93800 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 3 Nov 2025 12:02:12 +0100 Subject: [PATCH 21/78] simplify tests --- inst/tests/tests.Rraw | 45 +++++++++++++------------------------------ 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 70cbf36ac4..db7ed84e64 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -4738,43 +4738,24 @@ set.seed(45L) dt = data.table(a=sample(2,10,TRUE), b=sample(3,10,TRUE), c=sample(4,10,TRUE), d=sample(5,10,TRUE)) dt2 = data.table(x=c(1,1,1,2,2,2), y=1:6) +gf_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "GForce optimized j to") +lp_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization changed j") +opt = c(0L, 1L, Inf) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01,optimization=0L, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.02,optimization=0L, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = 'All optimizations are turned off') -test(1268.03,optimization=0L, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.04,optimization=0L, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") -test(1268.05,optimization=0L, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.01,optimization=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) +test(1268.02,optimization=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) +test(1268.03,optimization=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.04,optimization=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) +test(1268.05,optimization=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # newly added tests for #861 -- optimise, but no GForce -test(1268.06,optimization=0L, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="All optimizations are turned off") +test(1268.06,optimization=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # don't optimise .I in c(...) -test(1268.07,optimization=0L, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="All optimizations are turned off") - -test(1268.08,optimization=1L, ans1 <- dt[ , c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output="Old mean optimization.*(GForce FALSE)") -test(1268.09,optimization=1L, ans2 <- dt[, c(lapply(.SD, mean), .N), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.10,optimization=1L, ans3 <- dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output = 'Old mean optimization.*GForce FALSE') -test(1268.11,optimization=1L, ans4 <- dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose = TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.12,optimization=1L, ans5 <- dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.13,optimization=1L, ans6 <- dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output="Old mean optimization.*GForce FALSE") -test(1268.14,optimization=1L, ans7 <- dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output="Old mean optimization.*GForce FALSE") - -test(1268.15,optimization=Inf, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], ans1, - output="GForce optimized j to 'list(gmean(b), gmean(c), gmean(d), gsum(b), gsum(c), gsum(d))'") -test(1268.16,optimization=Inf, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], ans2, - output = "lapply optimization changed j from 'c(lapply(.SD, mean), .N)' to 'list(mean(b), mean(c), mean(d), .N)'") -test(1268.17,optimization=Inf, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], ans3, - output = "lapply optimization changed j from 'c(list(c), lapply(.SD, mean))' to 'list(c, mean(b), mean(c), mean(d))") -test(1268.18,optimization=Inf, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], ans4, - output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.19,optimization=Inf, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], ans5, - output = "GForce optimized j to 'list(gsum(d), gmean(b), gmean(c), gmean(d))'") -test(1268.20,optimization=Inf, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], ans6, - output = "lapply optimization changed j from 'c(list(sum(d), .I), lapply(.SD, mean))' to 'list(sum(d), .I, mean(b), mean(c), mean(d))'") -test(1268.21,optimization=Inf, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], ans7, - output = "lapply optimization is on, j unchanged as 'c(.I, lapply(.SD, mean))'") -test(1268.22,optimization=Inf, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.07,optimization=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], + output= c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization is on, j unchanged as")) +# tests .08-.21 were different optimization levels +test(1268.22,optimization=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") - ### FR #2722 tests end here ### # Wide range numeric and integer64, to test all bits From 982343fa6f4df114f413c643de1dbc15b08ef0ee Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 4 Nov 2025 08:09:14 +0100 Subject: [PATCH 22/78] phrasing Co-authored-by: Michael Chirico --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9435971257..771ea9e3c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -302,7 +302,7 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 21. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. This rewrite also introduces several new optimizations: - - Enables Map instead of lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) + - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) From 1e6ad035b3ec0b67e200eccf150f1d4c5aafefb8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:22:56 +0100 Subject: [PATCH 23/78] use mget for all vector params --- R/test.data.table.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index fa2cb426c8..95a3cb4446 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -367,19 +367,19 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no cl = match.call() cl$optimization = NULL # Remove optimization levels from the recursive call - vector_params = c("error", "warning", "message", "output", "notOutput", "ignore.warning") # Check if y was explicitly provided (not just the default) y_provided = !missing(y) - compare = !y_provided && length(optimization)>1L && !any(vapply_1b(vector_params, function(p) length(get(p, envir=environment())) > 0L)) + vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) + compare = !y_provided && length(optimization)>1L && !any(lengths(vector_params)) for (i in seq_along(optimization)) { cl$num = num + (i - 1L) * 1e-6 opt_level = list(datatable.optimize = optimization[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level - for (p in vector_params) { - val = get(p, envir=environment()) + for (param in names(vector_params)) { + val = vector_params[[param]] if (length(val) > 0L) { - cl[[p]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels + cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels } } From 9e1297e558c8c1fa270abbc92cf232b1f217ea88 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:25:24 +0100 Subject: [PATCH 24/78] rename optimization parameter --- R/test.data.table.R | 4 +- inst/tests/benchmark.Rraw | 6 +- inst/tests/optimize.Rraw | 336 +++++++++++++++++++------------------- inst/tests/tests.Rraw | 172 +++++++++---------- man/test.Rd | 4 +- 5 files changed, 261 insertions(+), 261 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 95a3cb4446..af39e3a099 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -361,11 +361,11 @@ gc_mem = function() { # nocov end } -test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimization=NULL) { +test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels if (!is.null(optimization)) { cl = match.call() - cl$optimization = NULL # Remove optimization levels from the recursive call + cl$optimize= NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) y_provided = !missing(y) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 0132e2995d..1cb35a557d 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -193,10 +193,10 @@ test(301.1, nrow(DT[,sum(B),by=C])==100010) set.seed(1) DT = data.table(a=sample(1:100, 1e6, replace=TRUE), b=sample(1:1000, 1e6, replace=TRUE), key="a") opt = c(0L,2L) -test(637.1, optimization=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) -test(637.2, optimization=opt, key(copy(DT)[J(43L), a:=99L]), NULL) +test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c(156L, 808L, 848L), m=DT[J(1), sum(b)], key="a")) +test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) -test(637.3, optimization=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL) # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index cade0c60c7..cb096e94ce 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -19,34 +19,34 @@ for (s in sugg) { # := by group DT = data.table(a=1:3,b=(1:9)/10) -test(611.1,optimization=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) +test(611.1,optimize=c(0L, 2L), DT[,v:=sum(b),by=a], data.table(a=1:3,b=(1:9)/10,v=c(1.2,1.5,1.8))) setkey(DT,a) -test(611.2,optimization=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) +test(611.2,optimize=c(0L, 2L), DT[,v:=min(b),by=a], data.table(a=1:3,b=(1:9)/10,v=(1:3)/10,key="a")) # Combining := by group with i -test(611.3,optimization=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) -test(611.4,optimization=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) +test(611.3,optimize=c(0L, 2L), DT[a>1,p:=sum(b)]$p, rep(c(NA,3.3),c(3,6))) +test(611.4,optimize=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each=3)) # 612 was just level repetition of 611 # Assign to subset ok (NA initialized in the other items) ok : -test(613,optimization=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) -test(614,optimization=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) -test(615,optimization=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +test(613,optimize=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) +test(614,optimize=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) +test(615,optimize=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] -test(618.1,optimization=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) -test(618.2,optimization=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) -test(618.3,optimization=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) -test(618.4,optimization=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) -test(618.5,optimization=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) +test(618.1,optimize=c(0L, 2L), copy(DT)[a>3,r:=sum(b)], ans) +test(618.2,optimize=c(0L, 2L), copy(DT)[J(-1),r:=sum(b)], ans) +test(618.3,optimize=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) +test(618.4,optimize=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) +test(618.5,optimize=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) # test 619 was level 2 of 618 DT = data.table(x=letters, key="x") -test(621,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained -test(622,optimization=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") +test(621,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained +test(622,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning="ignoring nomatch") set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) -test(623,optimization=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +test(623,optimize=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) # Setup for test 656.x - gforce tests set.seed(9) @@ -57,36 +57,36 @@ DT = data.table(grp1=sample.int(150L, n, replace=TRUE), y=rnorm(n)) opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') -test(656.1,optimization=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) -test(656.2,optimization=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) -test(656.3,optimization=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) +test(656.1,optimize=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) +test(656.2,optimize=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) +test(656.3,optimize=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) ans = data.table(x=rep(1:2,each=3),y=c(1L,3L,5L,2L,4L,6L),z=rep(c(9L,12L),each=3),key="x") -test(670.1,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) -test(670.2,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) -test(670.3,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), +test(670.1,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x], ans) +test(670.2,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby="x"], ans) +test(670.3,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),keyby=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L)), warning="The setkey() normally performed by keyby= has been skipped (as if by= was used) because := is being used together with keyby= but the keyby= contains some expressions. To avoid this warning, use by= instead, or provide existing column names to keyby=") -test(670.4,optimization=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) -test(670.5,optimization=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") +test(670.4,optimize=c(0L, 2L), copy(DT)[,z:=sum(y),by=x%%2], data.table(x=1:2,y=1:6,z=c(9L,12L))) +test(670.5,optimize=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with keyby is only possible when i is not supplied since") # test 671 was level 2 of 670 # varname holding colnames, by group, linked from #2120. DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" -test(751,optimization=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +test(751,optimize=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) -test(752,optimization=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +test(752,optimize=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) opt = c(0:2, Inf) out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized -test(865,optimization=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) -test(867,optimization=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +test(865,optimize=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) +test(867,optimize=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here # tests of gsum and gmean with NA DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) @@ -96,82 +96,82 @@ set(DT,10:12,"y",NA) set(DT,10:12,"v",NA) opt = c(1L, 2L) out = c("(GForce FALSE)", "GForce optimized j to") -test(1184.1,optimization=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) -test(1184.2,optimization=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") -test(1185.2,optimization=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], +test(1184.1,optimize=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +test(1184.2,optimize=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.2,optimize=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -test(1187.1,optimization=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], +test(1187.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) MyVar = TRUE -test(1187.2,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.3,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.2,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.3,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) MyVar = FALSE -test(1187.4,optimization=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) -test(1187.5,optimization=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) +test(1187.4,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) +test(1187.5,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) # GForce should not turn on when the .ok function isn't triggered -test(1187.6,optimization=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], +test(1187.6,optimize=2L, DT[, mean(y, trim=.2), by=x, verbose=TRUE], data.table(x = c("a", "b", "c", "d"), V1 = c(NA, 3.33333333333333, NA, NA)), output='j unchanged', warning="'trim' is not yet optimized") # FR #334. Test to just make sure that GForce and dogroups with .N are giving the same results. set.seed(2L) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10)) -test(1304.1,optimization=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.1,optimize=0:2, dt[, list(.N, sum(y)), by=x]) dt <- data.table(x=sample(rep(1:5e3, each=3)), y=sample(10), key="x") -test(1304.2,optimization=0:2, dt[, list(.N, sum(y)), by=x]) +test(1304.2,optimize=0:2, dt[, list(.N, sum(y)), by=x]) # gmin and gmax extensive testing (because there are tricky cases) DT <- data.table(x=rep(1:6, each=3), y=INT(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -2147483647, -2147483647, -2147483647, 2147483647, 2147483647, 2147483647)) opts = 0:2 # for integers -test(1313.01,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.02,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.03,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.04,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.01,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.02,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.03,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.04,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := INT(NA)] -test(1313.05,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.06,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) -test(1313.08,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) +test(1313.05,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.06,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.07,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) -test(1313.09,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.10,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.11,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.12,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.09,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.10,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.11,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.12,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) # testing all NA - GForce automatically converts to numeric.. optimize=1L errors due to change from integer/numeric (like median) DT[x==6, y := NA_real_] -test(1313.13,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.14,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) -test(1313.16,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) +test(1313.13,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.14,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.15,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) -test(1313.17,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.18,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) +test(1313.17,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.18,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(5,10)]) DT[c(1,6), y := NA] -test(1313.19,optimization=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) -test(1313.20,optimization=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) -test(1313.21,optimization=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) -test(1313.22,optimization=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) +test(1313.19,optimize=opts, DT[, list(y=min(y)), by=x], DT[c(1,6)]) +test(1313.20,optimize=opts, DT[, list(y=max(y)), by=x], DT[c(1,6)]) +test(1313.21,optimize=opts, DT[, list(y=min(y, na.rm=TRUE)), by=x], DT[c(2,7)]) +test(1313.22,optimize=opts, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) DT[x==7, y := c("","b","c")] -test(1313.23,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.24,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.25,optimization=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) -test(1313.26,optimization=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) +test(1313.23,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.24,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.25,optimize=opts, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) +test(1313.26,optimize=opts, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=x]) DT[x==6, y := NA_character_] -test(1313.27,optimization=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) -test(1313.28,optimization=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) -test(1313.30,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) +test(1313.27,optimize=opts, DT[, min(y), by=x], DT[, base::min(y), by=x]) +test(1313.28,optimize=opts, DT[, max(y), by=x], DT[, base::max(y), by=x]) +test(1313.29,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565.1,optimization=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], +test(1565.1,optimize=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) # gforce optimisations @@ -188,32 +188,32 @@ opt = 0:2 out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') # make sure gforce is on # testing gforce::gmedian -test(1579.01,optimization=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], +test(1579.01,optimize=2L, dt[, lapply(.SD, median), by=x, verbose=TRUE], dt[, lapply(.SD, function(x) median(as.numeric(x))), by=x], output="GForce optimized") -test(1579.02,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], +test(1579.02,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), by=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), by=x]) -test(1579.03,optimization=2L, dt[, lapply(.SD, median), keyby=x], +test(1579.03,optimize=2L, dt[, lapply(.SD, median), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x))), keyby=x]) -test(1579.04,optimization=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], +test(1579.04,optimize=2L, dt[, lapply(.SD, median, na.rm=TRUE), keyby=x], dt[, lapply(.SD, function(x) median(as.numeric(x), na.rm=TRUE)), keyby=x]) # testing gforce::ghead and gforce::gtail # head(.SD, 1) and tail(.SD, 1) optimisation -test(1579.06,optimization=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.08,optimization=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.10,optimization=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.12,optimization=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) -test(1579.14,optimization=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1579.16,optimization=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) -test(1579.18,optimization=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) -test(1579.20,optimization=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.06,optimize=opt, dt[, head(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.08,optimize=opt, dt[, head(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.10,optimize=opt, dt[, head(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.12,optimize=opt, dt[, head(.SD,1L), keyby=x, verbose=TRUE], output=out) +test(1579.14,optimize=opt, dt[, tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1579.16,optimize=opt, dt[, tail(.SD,1), keyby=x, verbose=TRUE], output=out) +test(1579.18,optimize=opt, dt[, tail(.SD,1L), by=x, verbose=TRUE], output=out) +test(1579.20,optimize=opt, dt[, tail(.SD,1L), keyby=x, verbose=TRUE], output=out) # 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23,optimization=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") -test(1579.24,optimization=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.25,optimization=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.26,optimization=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27,optimization=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 -test(1579.28,optimization=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +test(1579.23,optimize=2L, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24,optimize=opt, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25,optimize=opt, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26,optimize=opt, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27,optimize=opt, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28,optimize=opt, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) # gforce head/tail for n>1, #5060 set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), @@ -223,16 +223,16 @@ DT = data.table(x = sample(letters[1:5], 20, TRUE), s = sample(c("foo","bar",NA), 20, TRUE), l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -test(1579.401,optimization=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(1579.402,optimization=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(1579.403,optimization=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(1579.404,optimization=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(1579.405,optimization=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(1579.406,optimization=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(1579.407,optimization=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(1579.408,optimization=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(1579.409,optimization=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(1579.410,optimization=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") +test(1579.401,optimize=0:2, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402,optimize=2L, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403,optimize=2L, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404,optimize=2L, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405,optimize=2L, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406,optimize=2L, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407,optimize=2L, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408,optimize=2L, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409,optimize=2L, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410,optimize=2L, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") # FR #971, partly addressed (only subsets in 'i') @@ -242,7 +242,7 @@ dt = data.table(x = sample(letters, 300, TRUE), d2 = as.numeric(sample(c(NA, NaN, -10:10), 300, TRUE))) opt = 1:2 out = c("GForce FALSE","GForce optimized j") -test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], +test(1581.01,optimize=opt, ans1 <- dt[x %in% letters[15:20], c(.N, lapply(.SD, sum, na.rm=TRUE), lapply(.SD, min, na.rm=TRUE), lapply(.SD, max, na.rm=TRUE), @@ -252,11 +252,11 @@ test(1581.01,optimization=opt, ans1 <- dt[x %in% letters[15:20], output = out) # subsets in 'i' for head and tail -test(1581.04,optimization=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.07,optimization=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) -test(1581.10,optimization=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +test(1581.04,optimize=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.07,optimize=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +test(1581.10,optimize=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) # #3209 g[[ -test(1581.13,optimization=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +test(1581.13,optimize=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) # also, block for non-atomic input, #4159 dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] @@ -284,16 +284,16 @@ DT = data.table( C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) opt = 0:2 -test(1583.1,optimization=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], +test(1583.1,optimize=opt, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], data.table(C1=c("A","A","B","B","C","C"), C2=c("a","b","b","c","c","d"), agg=c(1,4,5,8,9,10))) # extra test with a size-1 group containing one NaN too DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -test(1583.2,optimization=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) -test(1583.3,optimization=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) -test(1583.4,optimization=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) -test(1583.5,optimization=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.2,optimize=2L, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4,optimize=opt, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5,optimize=opt, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # FR #523, var, sd and prod DT = data.table(x=sample(5, 100, TRUE), @@ -303,24 +303,24 @@ DT = data.table(x=sample(5, 100, TRUE), z2=sample(c(runif(10),NA,NaN), 100, TRUE)) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(1594.01,optimization=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) -test(1594.02,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) -test(1594.03,optimization=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.01,optimize=opt, DT[, lapply(.SD, var, na.rm=FALSE), by=x]) +test(1594.02,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x]) +test(1594.03,optimize=opt, DT[, lapply(.SD, var, na.rm=TRUE), by=x, verbose=TRUE], output=out) # coverage: default group .N=1 case idx=DT[ , .I[1L], by=x]$V1 ans=data.table(x=DT[(idx), x], V1=NA_real_) -test(1594.05,optimization=opt, DT[(idx), var(y1), by=x], ans) -test(1594.06,optimization=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) -test(1594.07,optimization=opt, DT[(idx), var(z1), by=x], ans) -test(1594.08,optimization=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) +test(1594.05,optimize=opt, DT[(idx), var(y1), by=x], ans) +test(1594.06,optimize=opt, DT[(idx), var(y1, na.rm=TRUE), by=x], ans) +test(1594.07,optimize=opt, DT[(idx), var(z1), by=x], ans) +test(1594.08,optimize=opt, DT[(idx), var(z1, na.rm=TRUE), by=x], ans) -test(1594.09,optimization=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) -test(1594.10,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) -test(1594.11,optimization=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.09,optimize=opt,DT[, lapply(.SD, sd, na.rm=FALSE), by=x]) +test(1594.10,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x], DT[, lapply(.SD, stats::sd, na.rm=TRUE), by=x]) +test(1594.11,optimize=opt, DT[, lapply(.SD, sd, na.rm=TRUE), by=x, verbose=TRUE], output=out) -test(1594.12,optimization=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) -test(1594.13,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) -test(1594.14,optimization=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) +test(1594.12,optimize=opt, DT[, lapply(.SD, prod, na.rm=FALSE), by=x]) +test(1594.13,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x]) +test(1594.14,optimize=opt, DT[, lapply(.SD, prod, na.rm=TRUE), by=x, verbose=TRUE], output=out) # when datatable.optimize<1, no optimisation of j should take place: dt = data.table(x=1:5, y=6:10, z=c(1,1,1,2,2)) @@ -330,53 +330,53 @@ test(1638, options=c(datatable.optimize=0L), dt[, .SD, by=z, verbose=TRUE], outp DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) opt = c(1L,2L) out = c("GForce FALSE", "GForce optimized j to") -test(2231.01,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) -test(2231.02,optimization=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) -test(2231.03,optimization=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.01,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45333333333333), output=out) +test(2231.02,optimize=opt, DT[, weighted.mean(w, x), g, verbose=TRUE], data.table(g=1L, V1=3.89473684210526), output=out) +test(2231.03,optimize=opt, DT[, weighted.mean(x), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) # multiple groups DT = data.table(x=c(1L,2L,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.04,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.05,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) -test(2231.06,optimization=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) +test(2231.04,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.05,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2,5)), output=out) +test(2231.06,optimize=opt, DT[, weighted.mean(x, w), seq(nrow(DT)), verbose=TRUE], data.table(seq=1L:8L, V1=c(1,2,2,3,4,5,5,6)), output=out) # (only x XOR w) containing NA DT = data.table(x=c(1L,NA,2L,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.07,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.08,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) -test(2231.09,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.10,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.07,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.08,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA_real_)), output=out) +test(2231.09,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.10,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NaN DT = data.table(x=c(1L,2L,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,2L,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.11,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) -test(2231.12,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.13,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) -test(2231.14,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.11,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, NA)), output=out) +test(2231.12,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.13,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NaN, 5)), output=out) +test(2231.14,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (only x XOR w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,1L,1L,1L,2L,NA,NaN,2L), g=rep(1L:2L, each=4L)) -test(2231.15,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) -test(2231.16,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) -test(2231.17,optimization=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.18,optimization=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.15,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA_real_, NA_real_)), output=out) +test(2231.16,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, NA)), output=out) +test(2231.17,optimize=opt, DT[, weighted.mean(x, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.18,optimize=opt, DT[, weighted.mean(x, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # (x and w) containing NA and NaN DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NA,NaN,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.19,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.20,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.19,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.20,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) -test(2231.21,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) -test(2231.22,optimization=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +test(2231.21,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) +test(2231.22,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) # let wrongly named arguments get lost in ellipsis #5543 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) -test(2231.61,optimization=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) -test(2231.62,optimization=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) -test(2231.63,optimization=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) -test(2231.64,optimization=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) +test(2231.61,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) +test(2231.62,optimize=opt, DT[, weighted.mean(x, weight=w), g, verbose=TRUE], data.table(g=1L, V1=3.325), output=out) +test(2231.63,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g], DT[, stats::weighted.mean(x, w, na.rm=FALSE), g]) +test(2231.64,optimize=opt, DT[, weighted.mean(x, weight=w, na.rm=TRUE)], DT[, stats::weighted.mean(x, weight=w, na.rm=TRUE)]) # GForce retains attributes in by arguments #5567 dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer"), att=1), c=structure(c(1L,2L,1L,2L), class = c("class_c", "integer"))) opt = c(0,Inf) out = c("GForce FALSE", "GForce optimized j to") -test(2263.1,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) -test(2263.2,optimization=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) -test(2263.3,optimization=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) +test(2263.1,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +test(2263.2,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +test(2263.3,optimize=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) # named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 M <- as.data.table(mtcars) @@ -388,60 +388,60 @@ sdlist <- vector("list", length(sdnames)) names(sdlist) <- sdnames opts = 0:2 -test(2283 + 0.01, optimization=opts, +test(2283 + 0.01, optimize=opts, names(M[, c(m=lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(m=sdlist)))) -test(2283 + 0.02, optimization=opts, +test(2283 + 0.02, optimize=opts, names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", sdnames)) -test(2283 + 0.03, optimization=opts, +test(2283 + 0.03, optimize=opts, names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), c("cyl", "Mpg", names(c(m=sdlist)))) -test(2283 + 0.04, optimization=opts, +test(2283 + 0.04, optimize=opts, names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), c("cyl", "mpg", names(c(mpg=sdlist)))) -test(2283 + 0.05, optimization=opts, +test(2283 + 0.05, optimize=opts, names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", "V1", sdnames)) -test(2283 + 0.06, optimization=opts, +test(2283 + 0.06, optimize=opts, names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) -test(2283 + 0.07, optimization=opts, +test(2283 + 0.07, optimize=opts, names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, sdnames)) -test(2283 + 0.08, optimization=opts, +test(2283 + 0.08, optimize=opts, names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(mean=sdlist, sum=sdlist)))) -test(2283 + 0.09, optimization=opts, +test(2283 + 0.09, optimize=opts, names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), c("cyl", sdnames, names(c(sum=sdlist))) ) -test(2283 + 0.10, optimization=opts, +test(2283 + 0.10, optimize=opts, names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), c("cyl", names(c(" "=sdlist, "."=sdlist)))) -test(2283 + 0.11, optimization=opts, +test(2283 + 0.11, optimize=opts, names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(a=0, b=0))), sdnames)) -test(2283 + 0.12, optimization=opts, +test(2283 + 0.12, optimize=opts, names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, 0))), sdnames)) -test(2283 + 0.13, optimization=opts, +test(2283 + 0.13, optimize=opts, names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) -test(2283 + 0.14, optimization=opts, +test(2283 + 0.14, optimize=opts, names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(A=list(0))), sdnames)) -test(2283 + 0.15, optimization=opts, +test(2283 + 0.15, optimize=opts, names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.16, optimization=opts, +test(2283 + 0.16, optimize=opts, names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) -test(2283 + 0.17, optimization=opts, +test(2283 + 0.17, optimize=opts, names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.18, optimization=opts, +test(2283 + 0.18, optimize=opts, names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "b", "vs", "am")) -test(2283 + 0.19, optimization=opts, +test(2283 + 0.19, optimize=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), c("cyl", "V1", "V2", "b", "vs", "am")) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db7ed84e64..d8395a80e3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3890,7 +3890,7 @@ test(1133.3, DT[, new := c(1,2), by=x], error="Supplied 2 items to be assigned test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned to group 1 of size 5 in column 'new'") test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) -test(1133.7,optimization=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +test(1133.7,optimize=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3993,8 +3993,8 @@ DT<-data.table(X=factor(2006:2012),Y=rep(1:7,2)) test(1143.2, DT[, Z:=paste(X,.N,sep=" - "), by=list(X)], data.table(X=factor(2006:2012),Y=rep(1:7,2), Z=paste(as.character(2006:2012), 2L, sep=" - "))) DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.160")), y=c(1L,2L)) test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) -test(1143.4,optimization=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) -test(1143.5,optimization=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.4,optimize=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +test(1143.5,optimize=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 @@ -4463,16 +4463,16 @@ test(1228.6, class(DT), class(DT[a>1, c:=sum(b), by=a])) # Test that ad hoc by detects if ordered and dogroups switches to memcpy if contiguous, #1050 DT = data.table(a=1:3,b=1:6,key="a") # turn off GForce, to test dogroups -test(1230,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1230,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") setkey(DT,NULL) -test(1231,optimization=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") -test(1232,optimization=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") -test(1233,optimization=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") -test(1234,optimization=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized +test(1231,optimize=1L, DT[, sum(b), by=a, verbose=TRUE], output="memcpy contiguous groups") +test(1232,optimize=1L, DT[, sum(b), by=a+1, verbose=TRUE], output="memcpy contiguous groups") +test(1233,optimize=1L, DT[, sum(b), by=a%%2, verbose=TRUE], output="collecting discontiguous groups") +test(1234,optimize=1L, DT[, sum(a), by=b, verbose=TRUE], output="memcpy contiguous groups") # as from v1.12.0 the out-of-order but grouped-ness is detected and utilized setkey(DT,a) -test(1235,optimization=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") -test(1236,optimization=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") -test(1237,optimization=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1235,optimize=1L, DT[.(2:3),sum(b),by=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") +test(1236,optimize=1L, DT[.(3:2),sum(b),by=.EACHI,verbose=TRUE], data.table(a=3:2,V1=c(9L,7L)), output="memcpy contiguous groups") +test(1237,optimize=1L, DT[.(3:2),sum(b),keyby=.EACHI,verbose=TRUE], data.table(a=2:3,V1=c(7L,9L),key="a"), output="memcpy contiguous groups") # check that key is not preserved when length of fastorder is > 0 DT <- data.table(x=1:5, y=6:10, key="x") @@ -4742,18 +4742,18 @@ gf_out = c("All optimizations are turned off", "Old mean optimization.*(GForce F lp_out = c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization changed j") opt = c(0L, 1L, Inf) # auto-naming behavior is different for no-optimization case; just check optimization is off -test(1268.01,optimization=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) -test(1268.02,optimization=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) -test(1268.03,optimization=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) -test(1268.04,optimization=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) -test(1268.05,optimization=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.01,optimize=opt, dt[, c(lapply(.SD, mean), lapply(.SD, sum)), by=a, verbose=TRUE], output = gf_out) +test(1268.02,optimize=opt, dt[, c(lapply(.SD, mean), .N), by=a, verbose=TRUE], output = lp_out) +test(1268.03,optimize=opt, dt[, c(list(c), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.04,optimize=opt, dt[, c(sum(d), lapply(.SD, mean)), by=a, verbose=TRUE], output= gf_out) +test(1268.05,optimize=opt, dt[, c(list(sum(d)), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # newly added tests for #861 -- optimise, but no GForce -test(1268.06,optimization=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) +test(1268.06,optimize=opt, dt[, c(list(sum(d), .I), lapply(.SD, mean)), by=a, verbose=TRUE], output= lp_out) # don't optimise .I in c(...) -test(1268.07,optimization=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], +test(1268.07,optimize=opt, dt2[, c(.I, lapply(.SD, mean)), by=x, verbose=TRUE], output= c("All optimizations are turned off", "Old mean optimization.*(GForce FALSE)", "lapply optimization is on, j unchanged as")) # tests .08-.21 were different optimization levels -test(1268.22,optimization=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], +test(1268.22,optimize=opt, dt[, c(as.list(c), lapply(.SD, mean)), by=a], error = "j doesn't evaluate to the same number of columns for each group") ### FR #2722 tests end here ### @@ -6257,7 +6257,7 @@ test(1437.17, DT[!a %chin% c("A", "B") & x == 2], DT[c(4, 5, 6)]) test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized subsetting") test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting -test(1437.21,optimization=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) +test(1437.21,optimize=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -6343,16 +6343,16 @@ for(t in seq_len(nrow(all))){ ## test the query with missing j thisQuery <- all$query[t] test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery))]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery))]) ## repeat the test with 'which = TRUE' test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), which = TRUE]) ## repeat the test with the j queries for(thisJquery in jQueries) { ## do it with and without existing "by" for(thisBy in bys){ test_no <- test_no + 1L - test(1438.0 + test_no*0.0001, optimization=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) + test(1438.0 + test_no*0.0001, optimize=c(2L,3L), DT[eval(parse(text = thisQuery)), eval(parse(text = thisJquery)), by = thisBy]) } } } @@ -8757,13 +8757,13 @@ test(1629.07, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3]) dt2 = copy(dt) test(1629.08, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)]) # grouping operations -test(1629.09,optimization=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) +test(1629.09,optimize=c(1L,2L), dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL]) ans1 = dt[, base::sum(v1), by=grp] ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3] -test(1629.10,optimization=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) -test(1629.11,optimization=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], +test(1629.10,optimize=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)]) +test(1629.11,optimize=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) -test(1629.12,optimization=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +test(1629.12,optimize=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) # gforce # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] @@ -12735,8 +12735,8 @@ DT[ , V1:=as.ordered(V1)] test(1918.3, DT[, min(V1)], structure(1L, .Label = lev, class = c("ordered", "factor"))) test(1918.4, DT[, max(V1)], structure(5L, .Label = lev, class = c("ordered", "factor"))) ## make sure GForce is activated -test(1918.5,optimization=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) -test(1918.6,optimization=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) +test(1918.5,optimize=Inf, DT[, min(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(1:3, .Label=lev, class=c("ordered", "factor")))) +test(1918.6,optimize=Inf, DT[, max(V1), by=V2], data.table(V2=c("f", "g", "h"), V1=structure(3:5, .Label=lev, class=c("ordered", "factor")))) # as.ITime.character bug for NA handling #2940 test(1919, as.ITime(c('xxx', '10:43')), structure(c(NA, 38580L), class = "ITime")) @@ -13619,11 +13619,11 @@ suppressWarnings(rm(`___data.table_internal_test_1967.68___`)) test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot find symbol') ### [.data.table verbosity & non-equi-join tests -test(1967.69,optimization=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') -test(1967.71,optimization=1L, x[order(a), .N, verbose = TRUE], 5L, +test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) -test(1967.72,optimization=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, +test(1967.72,optimize=1L, x[x, .N, on = 'a', verbose = TRUE], 5L, output = "on= matches existing key") x = data.table( @@ -13777,8 +13777,8 @@ test(1980, names(data.table(x)), "x") # crash when n="lead", #3354 DT = data.table( id = 1:5 , val = letters[1:5] ) -test(1981.1,optimization=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) -test(1981.2,optimization=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +test(1981.1,optimize=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) +test(1981.2,optimize=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") # 1982 moved to benchmark.Rraw, #5517 @@ -13810,7 +13810,7 @@ test(1984.081, DT[, sum(a), by=as.raw(0)], error="Column or expression.*1.*t test(1984.082, data.table(A=1:4, L=list(1, 1:2, 1, 1:3), V=1:4)[, sum(V), by=.(A,L)], # better error message, 4308 error="Column or expression.*2.*type 'list'.*not.*supported") test(1984.09, DT[, sum(a), by=.(1,1:2)], error="The items in the 'by' or 'keyby' list have lengths [1, 2]. Each must be length 10; the same length as there are rows in x (after subsetting if i is provided).") -test(1984.10,optimization=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], +test(1984.10,optimize=Inf, DT[ , 1, by = .(a %% 2), verbose = TRUE], data.table(a = c(1, 0), V1 = c(1, 1)), output = 'Optimization is on but left j unchanged') DT[ , f := rep(1:2, each = 5)] @@ -14696,10 +14696,10 @@ test(2042.4, DT[ , round(mean(DiffTime)), by=Group, verbose=TRUE], DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 opt = c(Inf,1L,0L) -test(2043.1,optimization=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) -test(2043.4,optimization=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) +test(2043.1,optimize=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +test(2043.4,optimize=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 -test(2043.5,optimization=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z +test(2043.5,optimize=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z # test type coercion in joins, #2592 dt1 <- data.table(int = 1L:10L, @@ -15862,10 +15862,10 @@ test(2098.1, DT[do.call(order, mget(groups)), verbose=TRUE], ans<-data.table(id= test(2098.2, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, output=out) test(2098.3, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) test(2098.4, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) -test(2098.5,optimization=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") -test(2098.6,optimization=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") -test(2098.7,optimization=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) -test(2098.8,optimization=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) +test(2098.5,optimize=0L, DT[do.call(order, mget(groups)), verbose=TRUE], ans, notOutput="forder.c") +test(2098.6,optimize=0L, DT[with(DT, do.call(order, mget(groups))), verbose=TRUE], ans, notOutput="forder.c") +test(2098.7,optimize=0L, DT[do.call(forder, mget(groups)), verbose=TRUE], ans, output=out) +test(2098.8,optimize=0L, DT[with(DT, do.call(forder, mget(groups))), verbose=TRUE], ans, output=out) # Error in update join when joining on factor, #3559 d1 <- data.table(fac = factor(letters[1:4]), char = letters[1:4], val = c(1L, NA, 3L, NA)) @@ -17290,14 +17290,14 @@ test(2207, dcast(DT, x~y, value.var="z"), data.table(x=1:3, a=c(1+6i, 3+4i, 5+2i # gmin/gmax for integer64, #4444 if (test_bit64) { DT = data.table(grp=c(1L, 1L, 1L, 2L), i64=as.integer64(c(NA, 1:3))) - test(2208.1,optimization=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.2,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) - test(2208.3,optimization=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) - test(2208.4,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) + test(2208.1,optimize=2L, DT[, min(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.2,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1, 3)))) + test(2208.3,optimize=2L, DT[, max(i64), by=grp], data.table(grp=1:2, V1=as.integer64(c(NA, 3)))) + test(2208.4,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(2:3))) # create an all-NA group DT[, i64:=rev(i64)] - test(2208.7,optimization=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) - test(2208.8,optimization=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) + test(2208.7,optimize=2L, DT[, min(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(1,NA)))) + test(2208.8,optimize=2L, DT[, max(i64, na.rm=TRUE), by=grp], data.table(grp=1:2, V1=as.integer64(c(3,NA)))) } # when user supplies dec=',' don't try sep=',', #4483 @@ -17517,12 +17517,12 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") + test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE") testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimization=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) + test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun)) } testnum = testnum + 1L -test(2220.0 + testnum*0.01, optimization=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") +test(2220.0 + testnum*0.01, optimize=2L, DT[, prod(l), g], error="GForce prod can only be applied to columns, not .SD or similar.") # tables() error when called from inside a function(...), #5197 test(2221, (function(...) tables())(), output = "No objects of class data.table exist") @@ -17559,7 +17559,7 @@ for (col in names(DT)[-1]) { for (type in c('lag','lead','shift','cyclic')) { # fill is tested by group in tests 2218.*; see comments in #5205 # sapply(sapply()) changed to for(for(for())) to save 29MiB, #5517 - test(2224.1+i/10000, optimization=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE + test(2224.1+i/10000, optimize=2L, # 192 tests here when test_bit64=TRUE; 168 when FALSE EVAL(sprintf("DT[, shift(%s, %d, type='%s'), by=x]$V1", col, n, type)), ans[[i]]) i = i+1L @@ -17664,15 +17664,15 @@ test(2232.4, unique(DT, by='g', cols='v3'), error="non-existing column(s)") DT = data.table(a=1:3,b=(1:9)/10) opt = 0:2 out = c("GForce FALSE", "GForce FALSE", "GForce optimized j to") -test(2233.01,optimization=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) +test(2233.01,optimize=opt, DT[, v := min(b), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:3)/10), output=out) # GForce returning full length -test(2233.02,optimization=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) +test(2233.02,optimize=opt, DT[, v := head(b, 3L), a, verbose=TRUE], data.table(a=1:3, b=(1:9)/10, v=(1:9)/10), output=out) # GForce neither returning 1 per group nor full length -test(2233.03,optimization=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") +test(2233.03,optimize=opt, DT[, v := head(b, 2L), a], error="Supplied .* items to be assigned to .* column 'v'.") # compare to non GForce version DT = data.table(a=1:3,b=(1:9)/10) -test(2233.04,optimization=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) -test(2233.05,optimization=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) +test(2233.04,optimize=opt, copy(DT)[, v := min(b), a, verbose=TRUE], copy(DT)[, v := base::min(b), a, ], output=out) +test(2233.05,optimize=opt, copy(DT)[, v := head(b, 3L), a, verbose=TRUE], copy(DT)[, v := utils::head(b, 3L), a], output=out) # with key and grouping by key DT = data.table(a=1:3,b=(1:9)/10, key="a") @@ -17747,17 +17747,17 @@ opt = c(0,Inf) out = c("GForce FALSE", "GForce.*gsum") B = copy(a) A = a[sample(seq_len(nrow(a)), nrow(a))] # shuffle -test(2233.391,optimization=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= +test(2233.391,optimize=opt, A[, t1 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) # y=A dummy just to test output= setorder(A, id1) -test(2233.392,optimization=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) -test(2233.393,optimization=opt, any(A[,t1!=t2]), FALSE) -test(2233.394,optimization=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) -test(2233.395,optimization=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) +test(2233.392,optimize=opt, A[, t2 := sum(flag, na.rm=TRUE), by=id2, verbose=TRUE], A, output=out) +test(2233.393,optimize=opt, any(A[,t1!=t2]), FALSE) +test(2233.394,optimize=opt, any(A[, length(unique(t1))>1, by=id2]$V1), FALSE) +test(2233.395,optimize=opt, any(A[, length(unique(t2))>1, by=id2]$V1), FALSE) # test from #5337 n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] -test(2233.41,optimization=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) +test(2233.41,optimize=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) # test from #5345 set.seed(1) DT = data.table( @@ -17768,7 +17768,7 @@ DT = data.table( ) load(testDir("test2233-43.Rdata")) # ans setDT(ans) # to silence verbose messages about internal.selfref being NULL when loaded from disk -test(2233.43,optimization=c(0,Inf), options = list(datatable.verbose=TRUE), +test(2233.43,optimize=c(0,Inf), options = list(datatable.verbose=TRUE), copy(DT)[, sum_v2_idT:=sum(v2), by=c("id", "t") ][, n_idT :=dim(.SD)[[1]], by=list(t, id) ][, sum_v2_id :=sum(v2), by=.(id) @@ -17920,12 +17920,12 @@ i=c(1,2) j=1L opt = c(1L,2L) out = c("GForce FALSE", "GForce TRUE") -test(2243.41,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") -test(2243.42,optimization=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") -test(2243.53,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) -test(2243.54,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) -test(2243.55,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") -test(2243.56,optimization=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") +test(2243.41,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +test(2243.42,optimize=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +test(2243.53,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) +test(2243.54,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) +test(2243.55,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") +test(2243.56,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1:2], x]$V1, 1:4, output="GForce FALSE") DT = data.table(1) test(2244.1, DT[, `:=`(a=1, )], error="`:=`.*Did you forget a trailing comma\\?") @@ -18382,7 +18382,7 @@ test(2269.2, fread("x\n?\n \n", colClasses="POSIXct", na.strings="?"), dt) # Error found by revdep in #6284: mean(a,b) is valid, expr names() can be NULL DT = data.table(a = 1, b = 2) -test(2270,optimization=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") +test(2270,optimize=1L, DT[, mean(b, 1), by=a], data.table(a=1, V1=2), warning="Unable to optimize call to mean()") # Missing newline in verbose output -> harder to read DT1 = data.table(a=1:2) @@ -21200,22 +21200,22 @@ test(2344.04, key(DT[, .(V4 = c("b", "a"), V2, V5 = c("y", "x"), V1)]), c("V1", # support arithmetic in j with gforce #3815 out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") dt = data.table(a=1:4, b=1:2) -test(2345.01,optimization=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2345.02,optimization=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2345.01,optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2345.02,optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) dt = data.table(a=1:4, b=1:2) -test(2345.11,optimization=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2345.12,optimization=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.11,optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2345.12,optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2345.13,optimization=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2345.14,optimization=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.13,optimize=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2345.14,optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") dt = data.table(a=1:2, b=1, c=1:4) -test(2345.21,optimization=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2345.22,optimization=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2345.23,optimization=0:2, names(dt[, lapply(list(b, c), sum), by=a])) +test(2345.21,optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2345.22,optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2345.23,optimize=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) -test(2345.31,optimization=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2345.32,optimization=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2345.33,optimization=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2345.34,optimization=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2345.35,optimization=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2345.36,optimization=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") +test(2345.31,optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2345.32,optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2345.33,optimize=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2345.34,optimize=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2345.35,optimize=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2345.36,optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") diff --git a/man/test.Rd b/man/test.Rd index 2e216041c5..db30064c02 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -8,7 +8,7 @@ test(num, x, y = TRUE, error = NULL, warning = NULL, message = NULL, output = NULL, notOutput = NULL, ignore.warning = NULL, - options = NULL, env = NULL, optimization = NULL) + options = NULL, env = NULL, optimize = NULL) } \arguments{ \item{num}{ A unique identifier for a test, helpful in identifying the source of failure when testing is not working. Currently, we use a manually-incremented system with tests formatted as \code{n.m}, where essentially \code{n} indexes an issue and \code{m} indexes aspects of that issue. For the most part, your new PR should only have one value of \code{n} (scroll to the end of \code{inst/tests/tests.Rraw} to see the next available ID) and then index the tests within your PR by increasing \code{m}. Note -- \code{n.m} is interpreted as a number, so \code{123.4} and \code{123.40} are actually the same -- please \code{0}-pad as appropriate. Test identifiers are checked to be in increasing order at runtime to prevent duplicates being possible. } @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{optimization}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimization)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } +\item{optimize}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimize)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From f6981d6db8d5582535f0d7541ee2c5d0476536d9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:27:42 +0100 Subject: [PATCH 25/78] rename optimization parameter also in test --- R/test.data.table.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index af39e3a099..32f3a14037 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -363,18 +363,18 @@ gc_mem = function() { test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels - if (!is.null(optimization)) { + if (!is.null(optimize)) { cl = match.call() cl$optimize= NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) y_provided = !missing(y) vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - compare = !y_provided && length(optimization)>1L && !any(lengths(vector_params)) + compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) - for (i in seq_along(optimization)) { + for (i in seq_along(optimize)) { cl$num = num + (i - 1L) * 1e-6 - opt_level = list(datatable.optimize = optimization[i]) + opt_level = list(datatable.optimize = optimize[i]) cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level for (param in names(vector_params)) { val = vector_params[[param]] From 9fc4734d4dc7214d0e5da3463c7e62a37d81277c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:42:48 +0100 Subject: [PATCH 26/78] add optimize param checks --- R/test.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index 32f3a14037..ca1e22e795 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -364,6 +364,8 @@ gc_mem = function() { test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,notOutput=NULL,ignore.warning=NULL,options=NULL,env=NULL,optimize=NULL) { # if optimization is provided, test across multiple optimization levels if (!is.null(optimize)) { + if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) + stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) cl = match.call() cl$optimize= NULL # Remove optimization levels from the recursive call From c07999ac76e74cd79df42ac96a78f4364414ddf4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 4 Nov 2025 08:53:15 +0100 Subject: [PATCH 27/78] remove trailing ws --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index ca1e22e795..7b8341944d 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -372,7 +372,7 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,message=NULL,output=NULL,no # Check if y was explicitly provided (not just the default) y_provided = !missing(y) vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) + compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) for (i in seq_along(optimize)) { cl$num = num + (i - 1L) * 1e-6 From 6c7e368e3d186f4944065bd77ed66afd6c44dee1 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 15 Dec 2025 14:52:28 +0100 Subject: [PATCH 28/78] Update man/test.Rd Co-authored-by: aitap --- man/test.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/test.Rd b/man/test.Rd index db30064c02..23652ceb35 100644 --- a/man/test.Rd +++ b/man/test.Rd @@ -22,7 +22,7 @@ test(num, x, y = TRUE, \item{ignore.warning}{ A single character string. Any warnings emitted by \code{x} that contain this string are dropped. Remaining warnings are compared to the expected \code{warning} as normal. } \item{options}{ A named list of options to set for the duration of the test. Any code evaluated during this call to \code{test()} (usually, \code{x}, or maybe \code{y}) will run with the named options set, and the original options will be restored on return. This is a named list since different options can have different types in general, but in typical usage, only one option is set at a time, in which case a named vector is also accepted. } \item{env}{ A named list of environment variables to set for the duration of the test, much like \code{options}. A list entry set to \code{NULL} will unset (i.e., \code{\link{Sys.unsetenv}}) the corresponding variable. } -\item{optimize}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimize)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no y is supplied, the results from the different levels are compared to each other for equality. If a y is supplied, the results from each level are compared to y. } +\item{optimize}{ A vector of different optimization levels to test. The code in \code{x} will be run once for each optimization level, with \code{options(datatable.optimize=optimize)} set accordingly. All optimization levels must pass the test for the overall test to pass. If no \code{y} is supplied, the results from the different levels are compared to each other for equality. If a \code{y} is supplied, the results from each level are compared to \code{y}. } } \note{ \code{NA_real_} and \code{NaN} are treated as equal, use \code{identical} if distinction is needed. See examples below. From 047f6be9db607874bcba9ce0e77947c19376a7a9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 5 Jan 2026 02:04:16 +0100 Subject: [PATCH 29/78] readd context --- inst/tests/optimize.Rraw | 57 ++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index cb096e94ce..7eccc49460 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -390,58 +390,77 @@ names(sdlist) <- sdnames opts = 0:2 test(2283 + 0.01, optimize=opts, names(M[, c(m=lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(m=sdlist)))) + c("cyl", names(c(m=sdlist))), + context=sprintf("optimize=%s [I]", format(opt))) test(2283 + 0.02, optimize=opts, names(M[, c(Mpg=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", sdnames)) + c("cyl", "Mpg", sdnames), + context=sprintf("optimize=%s [II]", format(opt))) test(2283 + 0.03, optimize=opts, names(M[, c(Mpg=list(mpg), m=lapply(.SD, mean)), by="cyl"]), - c("cyl", "Mpg", names(c(m=sdlist)))) + c("cyl", "Mpg", names(c(m=sdlist))), + context=sprintf("optimize=%s [III]", format(opt))) test(2283 + 0.04, optimize=opts, names(M[, c(mpg=list(mpg), mpg=lapply(.SD, mean)), by="cyl"]), - c("cyl", "mpg", names(c(mpg=sdlist)))) + c("cyl", "mpg", names(c(mpg=sdlist))), + context=sprintf("optimize=%s [IV]", format(opt))) test(2283 + 0.05, optimize=opts, names(M[, c(list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", "V1", sdnames)) + c("cyl", "V1", sdnames), + context=sprintf("optimize=%s [V]", format(opt))) test(2283 + 0.06, optimize=opts, names(M[, c(lapply(.SD, mean), list(mpg)), by="cyl"]), - c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L))) + c("cyl", sdnames, sprintf("V%d", length(sdnames)+1L)), + context=sprintf("optimize=%s [VI]", format(opt))) test(2283 + 0.07, optimize=opts, names(M[, c(lapply(.SD, mean), lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, sdnames)) + c("cyl", sdnames, sdnames), + context=sprintf("optimize=%s [VII]", format(opt))) test(2283 + 0.08, optimize=opts, names(M[, c(mean=lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(mean=sdlist, sum=sdlist)))) + c("cyl", names(c(mean=sdlist, sum=sdlist))), + context=sprintf("optimize=%s [VIII]", format(opt))) test(2283 + 0.09, optimize=opts, names(M[, c(lapply(.SD, mean), sum=lapply(.SD, sum)), by="cyl"]), - c("cyl", sdnames, names(c(sum=sdlist))) ) + c("cyl", sdnames, names(c(sum=sdlist))), + context=sprintf("optimize=%s [IX]", format(opt))) test(2283 + 0.10, optimize=opts, names(M[, c(" "=lapply(.SD, mean), "."=lapply(.SD, sum)), by="cyl"]), - c("cyl", names(c(" "=sdlist, "."=sdlist)))) + c("cyl", names(c(" "=sdlist, "."=sdlist))), + context=sprintf("optimize=%s [X]", format(opt))) test(2283 + 0.11, optimize=opts, names(M[, c(A=list(a=mpg, b=hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(a=0, b=0))), sdnames)) + c("cyl", names(c(A=list(a=0, b=0))), sdnames), + context=sprintf("optimize=%s [XI]", format(opt))) test(2283 + 0.12, optimize=opts, names(M[, c(A=list(mpg, hp), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, 0))), sdnames)) + c("cyl", names(c(A=list(0, 0))), sdnames), + context=sprintf("optimize=%s [XII]", format(opt))) test(2283 + 0.13, optimize=opts, names(M[, c(A=list(mpg, b=hp, wt), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0, b=0, 0))), sdnames)) + c("cyl", names(c(A=list(0, b=0, 0))), sdnames), + context=sprintf("optimize=%s [XIII]", format(opt))) test(2283 + 0.14, optimize=opts, names(M[, c(A=list(mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(A=list(0))), sdnames)) + c("cyl", names(c(A=list(0))), sdnames), + context=sprintf("optimize=%s [XIV]", format(opt))) test(2283 + 0.15, optimize=opts, names(M[, c(" "=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames)) + c("cyl", names(c(" "=list(" "=0, "."=0, 0))), sdnames), + context=sprintf("optimize=%s [XV]", format(opt))) test(2283 + 0.16, optimize=opts, names(M[, c("."=list(" "=hp, "."=disp, mpg), lapply(.SD, mean)), by="cyl"]), - c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames)) + c("cyl", names(c("."=list(" "=0, "."=0, 0))), sdnames), + context=sprintf("optimize=%s [XVI]", format(opt))) test(2283 + 0.17, optimize=opts, names(M[, c(list(mpg, b=hp), lapply(.SD, mean)), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) + c("cyl", "V1", "b", "vs", "am"), + context=sprintf("optimize=%s [XVII]", format(opt))) test(2283 + 0.18, optimize=opts, names(M[, c(list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "b", "vs", "am")) + c("cyl", "V1", "b", "vs", "am"), + context=sprintf("optimize=%s [XVIII]", format(opt))) test(2283 + 0.19, optimize=opts, names(M[, c(mpg[1], list(mpg, b=hp), c(lapply(.SD, mean))), by="cyl", .SDcols=c("vs", "am")]), - c("cyl", "V1", "V2", "b", "vs", "am")) + c("cyl", "V1", "V2", "b", "vs", "am"), + context=sprintf("optimize=%s [XIX]", format(opt))) From 5a7a9a36bb7e3484dca3b359e086fd5ee864797f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 6 Jan 2026 23:15:35 -0800 Subject: [PATCH 30/78] Update NEWS.md --- NEWS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index b1f3f50f1d..0c1f71b28c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,6 +16,14 @@ 1. `nafill()`, `setnafill()` extended to work on logical vectors (part of [#3992](https://github.com/Rdatatable/data.table/issues/3992)). Thanks @jangorecki for the request and @MichaelChirico for the PR. +2. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. + + This rewrite also introduces several new optimizations: + - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) + - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) + - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) + - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) + ### Notes 1. {data.table} now depends on R 3.5.0 (2018). @@ -334,14 +342,6 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. -21. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. - - This rewrite also introduces several new optimizations: - - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) - - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) - - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) - ### BUG FIXES 1. `fread()` no longer warns on certain systems on R 4.5.0+ where the file owner can't be resolved, [#6918](https://github.com/Rdatatable/data.table/issues/6918). Thanks @ProfFancyPants for the report and PR. From b49550319ee86b0ce3256a0232d932a6cc5812e1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 6 Jan 2026 23:16:19 -0800 Subject: [PATCH 31/78] revert spurious diff --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0c1f71b28c..dc1ac06d5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -340,7 +340,7 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T # 0.028 0.000 0.005 ``` -20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. ### BUG FIXES From 03bcdd86a1318ee3cdab8cbe96562df33067a5e5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 6 Jan 2026 23:16:47 -0800 Subject: [PATCH 32/78] ? --- NEWS.md | 1 - 1 file changed, 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index dc1ac06d5b..f45231cab2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -339,7 +339,6 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T # user system elapsed # 0.028 0.000 0.005 ``` - 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. ### BUG FIXES From fe525bf15e4e87aaaaee63f45e860c1d5ffb4dd9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:20:52 +0100 Subject: [PATCH 33/78] add space Co-authored-by: Michael Chirico --- R/test.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 7b99dbed53..eb8c40015d 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -378,7 +378,7 @@ test = function(num, x, y=TRUE, if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) cl = match.call() - cl$optimize= NULL # Remove optimization levels from the recursive call + cl$optimize = NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) y_provided = !missing(y) From 71b983834ca341c3559d76e7e4fe5f202d7ffbb7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:29:42 +0100 Subject: [PATCH 34/78] reference deletion of tests Co-authored-by: Michael Chirico --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 7eccc49460..93d7aa11dc 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -30,6 +30,7 @@ test(611.4,optimize=c(0L, 2L), DT[a>1,q:=sum(b),by=a]$q, rep(c(NA,1.5,1.8),each= test(613,optimize=c(0L, 2L), DT[J(2),w:=8.3]$w, rep(c(NA,8.3,NA),each=3)) test(614,optimize=c(0L, 2L), DT[J(3),x:=9L]$x, rep(c(NA_integer_,NA_integer_,9L),each=3)) test(615,optimize=c(0L, 2L), DT[J(2),z:=list(list(c(10L,11L)))]$z, rep(list(NULL, 10:11, NULL),each=3)) +# 616, 617 removed in #5245 # Empty i clause, #2034. Thanks to Chris for testing, tests from him. Plus changes from #759 ans = copy(DT)[,r:=NA_real_] From 494cfe2a9525a955aae827e50c7a5fe1f0d1ac66 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:29:56 +0100 Subject: [PATCH 35/78] reference deletion of tests2 Co-authored-by: Michael Chirico --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 93d7aa11dc..54123041b6 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -40,6 +40,7 @@ test(618.3,optimize=c(0L, 2L), copy(DT)[NA,r:=sum(b)], ans) test(618.4,optimize=c(0L, 2L), copy(DT)[0,r:=sum(b)], ans) test(618.5,optimize=c(0L, 2L), copy(DT)[NULL,r:=sum(b)], null.data.table()) # test 619 was level 2 of 618 +# test 620 was removed in #5245 DT = data.table(x=letters, key="x") test(621,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo"], DT) # when no update, key should be retained From 6f42ff58d58152be53f1c646b7aee0483db1446c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:30:29 +0100 Subject: [PATCH 36/78] add comment about removed tests Co-authored-by: Michael Chirico --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 54123041b6..79f986a6c6 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -62,6 +62,7 @@ out = c('GForce FALSE', 'GForce FALSE' ,'GForce TRUE') test(656.1,optimize=opt, DT[ , mean(x), by=grp1, verbose=TRUE], output=out) test(656.2,optimize=opt, DT[ , list(mean(x)), by=grp1, verbose=TRUE], output=out) test(656.3,optimize=opt, DT[ , list(mean(x), mean(y)), by=grp1, verbose=TRUE], output=out) +# 657-658 were for levels 1,2, resp. # Test := keyby does setkey, #2065 DT = data.table(x=1:2, y=1:6) From ac306eb776b2334a2a142c90906e894d65ea81a4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:31:30 +0100 Subject: [PATCH 37/78] add comment about optimization level comparison Co-authored-by: Michael Chirico --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 79f986a6c6..2473a05035 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -90,6 +90,7 @@ out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized test(865,optimize=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) test(867,optimize=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +# 866 was testing an intermediate step; 868 was testing equality between optimization levels # tests of gsum and gmean with NA DT = data.table(x=rep(c("a","b","c","d"),each=3), y=c(1L,3L,6L), v=as.numeric(1:12)) From 431dfc2554bc648681243dc276bbf01c55d96107 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 10:32:56 +0100 Subject: [PATCH 38/78] add comment about removed test Co-authored-by: Michael Chirico --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 2473a05035..b8eaa77fbf 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -104,6 +104,7 @@ test(1184.1,optimize=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) test(1184.2,optimize=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") test(1185.2,optimize=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) +# 1186 was 1184.1 for active GForce test(1187.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) MyVar = TRUE From 158136bdc1f7dc2cd9576eac0d7b74136da4dd63 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 10:34:11 +0100 Subject: [PATCH 39/78] fix typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index beaf698ef9..1b8e023c4e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5146,7 +5146,7 @@ DT = data.table(a=1:3,b=6:1) test(1312, DT[,setkey(.SD),by=a], error="Setting a physical key on .SD is reserved for possible future use") # was warning "Already keyed by this key but had invalid row order" due to the key not being cleared after the previous group. A solution could have been to put back the original key on populating .SD for each group. But instead we reserve it for future use and push the user towards doing it a different more efficient way (see Arun's speedups in the datatable-help thread). -# test 1313 moved to optimze.Rraw +# test 1313 moved to optimize.Rraw # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = From 0c2f61faeae1f339a968e69fada33da9821ff676 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 10:38:47 +0100 Subject: [PATCH 40/78] remove doubled test --- inst/tests/tests.Rraw | 1 - 1 file changed, 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1b8e023c4e..31a56e6dc8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21315,7 +21315,6 @@ test(2358.14,optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE] dt = data.table(a=1:2, b=1, c=1:4) test(2358.21,optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) test(2358.22,optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) -test(2358.23,optimize=0:2, names(dt[, lapply(list(b, c), sum), by=a])) dt = data.table(a=1:4, b=1:2) test(2358.31,optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) test(2358.32,optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) From 2c7ebafc9527aab43bc567a31fbac4d9c1d776fd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 10:42:07 +0100 Subject: [PATCH 41/78] add comment --- inst/tests/optimize.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index b8eaa77fbf..1d1fa942f5 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -49,6 +49,7 @@ test(622,optimize=c(0L, 2L), copy(DT)[J("bb"), x:="foo",nomatch=0], DT, warning= set.seed(2) DT = data.table(a=rnorm(5)*10, b=1:5) test(623,optimize=c(0L, 2L), copy(DT)[,s:=sum(b),by=round(a)%%2]$s, c(10L,5L,5L,10L,10L)) +# test 623 subsumes 623.1 and 623.2 for testing both levels # Setup for test 656.x - gforce tests set.seed(9) From 371e246659772bdca0c5ff88464726f6d8ac3a98 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 10:54:26 +0100 Subject: [PATCH 42/78] update subsuming comments --- inst/tests/optimize.Rraw | 11 ++++++++--- inst/tests/tests.Rraw | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 1d1fa942f5..33a0424667 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -80,10 +80,12 @@ test(670.5,optimize=c(0L, 2L), copy(DT)[x>1,z:=sum(y),keyby=x], error=":= with k DT = data.table(a=rep(1:3,1:3),b=1:6) colname = "newcol" test(751,optimize=c(0L, 2L), DT[,(colname):=sum(b),by=a], data.table(a=rep(1:3,1:3),b=1:6,newcol=INT(1,5,5,15,15,15))) +# test 751 subsumes 751.1 and 751.2 for testing both levels # Add tests for nested := in j by group, #1987 DT = data.table(a=rep(1:3,2:4),b=1:9) test(752,optimize=c(0L, 2L), DT[,head(.SD,2)[,new:=1:.N],by=a], data.table(a=rep(1:3,each=2),b=c(1:4,6:7),new=1:2)) +# test 752 subsumes 752.1 and 752.2 for testing both levels DT = data.table(a=1:3,b=1:3,v=1:6,w=1:6) opt = c(0:2, Inf) @@ -91,6 +93,7 @@ out = c('GForce FALSE', 'GForce FALSE', 'GForce TRUE', 'GForce TRUE') # v1.9.7 treats wrapped {} better, so this is now optimized test(865,optimize=opt, DT[,{list(name1=sum(v),name2=sum(w))},by=c('a', 'b'),verbose=TRUE], output=out) test(867,optimize=opt, names(DT[,list(name1=sum(v),name2=sum(w)),by=c('a', 'b')]), c("a", "b", "name1", "name2")) # list names extracted here +# test 865 subsumes 865.1, 865.2, 865.3 for testing all levels # 866 was testing an intermediate step; 868 was testing equality between optimization levels # tests of gsum and gmean with NA @@ -102,12 +105,14 @@ set(DT,10:12,"v",NA) opt = c(1L, 2L) out = c("(GForce FALSE)", "GForce optimized j to") test(1184.1,optimize=opt, DT[, sum(v), by=x, verbose=TRUE], output=out) +# test 1184.1 subsumes 1184.1 and 1186 for testing both levels test(1184.2,optimize=1L, DT[, mean(v), by=x, verbose=TRUE], output="(GForce FALSE)") +test(1185.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], + data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +# test 1185.1 subsumes 1185.1 and 1187.1 for testing all levels test(1185.2,optimize=c(0L,1L,Inf), DT[, list(mean(y), mean(v), mean(y,na.rm=TRUE), mean(v,na.rm=TRUE)), by=x, verbose=TRUE], output=c("All optimizations.*off", "Old mean.*changed j", "GForce optimized j to")) -# 1186 was 1184.1 for active GForce -test(1187.1,optimize=c(0L, 1L, 2L), DT[, list(sum(y), sum(v), sum(y,na.rm=TRUE), sum(v,na.rm=TRUE)), by=x], - data.table(x=c("a","b","c","d"), V1=c(NA,10L,NA,NA), V2=c(6,NA,NA,NA), V3=c(4L,10L,7L,0L), V4=c(6,10,15,0))) +# test 1185.2 subsumes 1185.2, 1185.3, and 1185.4 for testing all levels MyVar = TRUE test(1187.2,optimize=opt, DT[, list(sum(y,na.rm=MyVar), mean(y,na.rm=MyVar)), by=x, verbose=TRUE], output=out) test(1187.3,optimize=opt, DT[, mean(y,na.rm=MyVar), by=x, verbose=TRUE], output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 31a56e6dc8..223e1bebb3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -3892,6 +3892,7 @@ test(1133.4, DT[, new := c(1L,2L), by=x], error="Supplied 2 items to be assigned test(1133.5, DT, data.table(x=INT(1,1,1,1,1,2,2), new=99L)) test(1133.6, DT[, new := rep(-.GRP, .N), by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(-1,-1,-1,-1,-1,-2,-2))) test(1133.7,optimize=c(0L, 2L), DT[, new := .N, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(5,5,5,5,5,2,2))) +# test 1133.7 subsumes 1133.7 and 1133.75 for testing both levels # on a new column with warning on 2nd assign DT[,new:=NULL] test(1133.8, DT[, new := if (.GRP==1L) 7L else 3.4, by=x], data.table(x=INT(1,1,1,1,1,2,2), new=INT(7,7,7,7,7,3,3)), @@ -3996,6 +3997,7 @@ DT = data.table(x=as.POSIXct(c("2009-02-17 17:29:23.042", "2009-02-17 17:29:25.1 test(1143.3, DT[, list(lx=x[.N]), by=x], data.table(x=DT$x, lx=DT$x)) test(1143.4,optimize=c(0L, 2L), copy(DT)[,`:=`(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) test(1143.5,optimize=c(0L, 2L), copy(DT)[, let(lx=tail(x,1L)), by=y], copy(DT)[, lx:=x]) +# tests 1143.4 and 1143.5 subsume 1143.4, 1143.5, 1143.6 and 1143.7 for testing both levels # FR #2356 - retain names of named vector as column with keep.rownames=TRUE x <- 1:5 From e2694e11ec74f238eabd89782feb6e05ef61da55 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 11:40:59 +0100 Subject: [PATCH 43/78] add subsuming comments --- inst/tests/optimize.Rraw | 10 ++++++++-- inst/tests/tests.Rraw | 5 +++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 33a0424667..67932d5f7f 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -182,8 +182,9 @@ test(1313.30,optimize=2L, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1 # Fix for #1251, DT[, .N, by=a] and DT[, .(.N), by=a] uses GForce now dt = data.table(a=sample(3,20,TRUE), b=1:10) -test(1565.1,optimize=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], +test(1565, optimize=c(0,1,Inf), dt[, .N, by=a, verbose=TRUE], output=c("All optimizations are turned off", "lapply optimization is on, j unchanged", "GForce optimized j to")) +# test 1565 subsumes 1565.1, 1565.2 and 1565.3 for testing all levels # gforce optimisations dt = data.table(x = sample(letters, 300, TRUE), @@ -261,13 +262,17 @@ test(1581.01,optimize=opt, ans1 <- dt[x %in% letters[15:20], lapply(.SD, median, na.rm=TRUE) ), by=x, verbose=TRUE], output = out) - +# test 1581.01 subsumes 1581.01, 1581.02 and 1581.03 for testing all levels # subsets in 'i' for head and tail test(1581.04,optimize=opt, dt[x %in% letters[15:20], head(.SD,1), by=x, verbose=TRUE], output=out) +# test 1581.04 subsumes 1581.04, 1581.05 and 1581.06 for testing all levels test(1581.07,optimize=opt, dt[x %in% letters[15:20], tail(.SD,1), by=x, verbose=TRUE], output=out) +# test 1581.07 subsumes 1581.07, 1581.08 and 1581.09 for testing all levels test(1581.10,optimize=opt, dt[x %in% letters[15:20], .SD[2], by=x, verbose=TRUE], output=out) +# test 1581.10 subsumes 1581.10, 1581.11 and 1581.12 for testing all levels # #3209 g[[ test(1581.13,optimize=opt, dt[x %in% letters[15:20], d1[[2]], by=x, verbose=TRUE], output=out) +# test 1581.13 subsumes 1581.13, 1581.14 and 1581.15 for testing all levels # also, block for non-atomic input, #4159 dt = data.table(a=1:3) dt[ , l := .(list(1, 2, 3))] @@ -374,6 +379,7 @@ test(2231.20,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE DT = data.table(x=c(1L,NA,NaN,3L,4L,5L,5L,6L), w=c(1L,NaN,NA,1L,2L,2L,2L,2L), g=rep(1L:2L, each=4L)) test(2231.21,optimize=opt, DT[, weighted.mean(x, w, na.rm=FALSE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(NA, 5)), output=out) test(2231.22,optimize=opt, DT[, weighted.mean(x, w, na.rm=TRUE), g, verbose=TRUE], data.table(g=c(1L,2L), V1=c(2, 5)), output=out) +# tests 2231.31-2231.52 were subsumed in 2231.01-2231.22 for testing different optimization levels # let wrongly named arguments get lost in ellipsis #5543 DT = data.table(x=c(3.7,3.3,3.5,2.8), w=c(5,5,4,1), g=1L) test(2231.61,optimize=opt, DT[, weighted.mean(x, w), g, verbose=TRUE], data.table(g=1L, V1=3.45+1/300), output=out) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 223e1bebb3..34b1901e73 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6262,6 +6262,7 @@ test(1437.18, DT[x == 2, .(test = x+y), verbose = TRUE], output = "Optimized sub test(1437.19, DT[x == 2, test := x+y, verbose = TRUE], output = "Optimized subsetting") ## optimize option level 3 is required to get optimized subsetting test(1437.21,optimize=c(2,Inf), DT[x == 2, verbose = TRUE], output = c("^ x y", "Optimized subsetting")) +# test 1437.21 subsumes 1437.21 and 1437.22 ## NaN on right hand side is treated correctly. NA on right hand side is not reaching .prepareFastSubset, so not tested here DT <- data.table(x = c(1L:10L, NA_integer_, NA_integer_), y = c(1:10, NA_real_, NaN)) test(1437.23, DT[y == NaN], DT[0]) @@ -8767,6 +8768,7 @@ test(1629.10,optimize=c(1L,2L), dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], a test(1629.11,optimize=c(1L,2L), dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp]) test(1629.12,optimize=c(1L,Inf), dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp]) +# test 1629.12 subsumes 1629.12 and 1629.13 for testing both levels # gforce # even more complex, shouldn't run any optimisation dt[, v4 := v1/2] @@ -13621,6 +13623,7 @@ test(1967.68, setDT(`___data.table_internal_test_1967.68___`), error = 'Cannot f ### [.data.table verbosity & non-equi-join tests test(1967.69,optimize=0L, x[order(a), .N, verbose = TRUE], output='[1] 5', notOutput='forder.c') +# test 1967.69 subsumes 1967.69 and 1967.70 test(1967.71,optimize=1L, x[order(a), .N, verbose = TRUE], 5L, output = "forder.c received 5 rows and 1 column") setkey(x) @@ -13780,6 +13783,7 @@ test(1980, names(data.table(x)), "x") DT = data.table( id = 1:5 , val = letters[1:5] ) test(1981.1,optimize=c(0L, Inf), DT[, new_col := shift(val, "lead")], error=base_messages$stopifnot("is.numeric(n)")) test(1981.2,optimize=c(0L, Inf), DT[, new_col := shift(val, NA_integer_)], error="Item 1 of n is NA") +# tests 1981.1 and 1981.2 subsume 1981.1, 1981.2, 1981.3 and 1981.4 for testing different levels # 1982 moved to benchmark.Rraw, #5517 @@ -14696,6 +14700,7 @@ DT = data.table(x=INT(1,1,1,2,2), y=1:5) z = 1:5 opt = c(Inf,1L,0L) test(2043.1,optimize=opt, DT[, list(mean(z), mean(y)), by=x], data.table(x=1:2, V1=c(3,3), V2=c(2.0,4.5))) +# test 2043.1 subsumes 2043.1, 2043.2 and 2043.3 for testing different levels test(2043.4,optimize=opt, DT[, list(sd(z), sd(y)), by=x], data.table(x=1:2, V1=sd(z), V2=c(sd(1:3), sd(4:5)))) z = 1:4 test(2043.5,optimize=opt, DT[, list(mean(z), mean(y)), by=x], ans<-data.table(x=1:2, V1=c(2.5,2.5), V2=c(2.0,4.5))) # was length error about z From da771d48920ef7df651f5fcced58107d4542615b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 11:47:55 +0100 Subject: [PATCH 44/78] finish double checking of moving tests --- inst/tests/optimize.Rraw | 3 +++ inst/tests/tests.Rraw | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/inst/tests/optimize.Rraw b/inst/tests/optimize.Rraw index 67932d5f7f..88dd27498c 100644 --- a/inst/tests/optimize.Rraw +++ b/inst/tests/optimize.Rraw @@ -392,8 +392,11 @@ dt = data.table(a=letters[1:4], b=structure(1:4, class = c("class_b", "integer") opt = c(0,Inf) out = c("GForce FALSE", "GForce optimized j to") test(2263.1,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, b], data.table(b=dt$b, N=1L), output=out) +# test 2263.1 subsumes 2263.1 and 2263.4 for different optimization levels test(2263.2,optimize=opt, options=list(datatable.verbose=TRUE), dt[, .N, .(b,c)], data.table(b=dt$b, c=dt$c, N=1L), output=out) +# test 2263.2 subsumes 2263.2 and 2263.5 for different optimization levels test(2263.3,optimize=opt, options=list(datatable.verbose=TRUE), names(attributes(dt[, .N, b]$b)), c("class", "att"), output=out) +# test 2263.3 subsumes 2263.3 and 2263.6 for different optimization levels # named arguments of c() in j get prepended to lapply(.SD, FUN) #2311 M <- as.data.table(mtcars) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 34b1901e73..a0e901397a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17773,6 +17773,7 @@ n=4; k=2 mm = data.table(a = rep(1:k,n), b=seq_len(n*k), d=rep(1:n,k)) ans = copy(mm)[, e:=INT(NA,8,NA,12,NA,8,NA,12)] test(2233.41,optimize=opt, copy(mm)[a==2, e:=sum(b), by=d, verbose=TRUE], ans, output=c("GForce FALSE", "GForce.*gsum")) +# test 2233.41 subsumes 2231.41 and 2231.42 for different optimization levels # test from #5345 set.seed(1) DT = data.table( @@ -17791,6 +17792,7 @@ test(2233.43,optimize=c(0,Inf), options = list(datatable.verbose=TRUE), ][, sum_v1_id :=sum(v1), by=c("id")], ans, output=c("GForce FALSE", "GForce.*gsum")) +# test 2233.43 subsumes 2231.43 and 2231.44 for different optimization levels # optimized := with gforce functions that can return lists #5403 DT = data.table(grp=1:2, x=1:4) out = "Making each group and running j (GForce TRUE)" @@ -17936,7 +17938,9 @@ j=1L opt = c(1L,2L) out = c("GForce FALSE", "GForce TRUE") test(2243.41,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[TRUE], x]$V1, 1:4, output="GForce FALSE") +# test 2243.41 subsumes 2243.41 and 2243.51 for different optimization levels test(2243.42,optimize=opt, options=c(datatable.verbose=TRUE), dt[, z[y], x], data.table(x=c(2,2,1,1), V1=c("a","b",NA,NA)), output="GForce FALSE") +# test 2243.42 subsumes 2243.42 and 2243.52 for different optimization levels test(2243.53,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[1], x]$V1, c(1L, 3L), output=out) test(2243.54,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[j], x]$V1, c(1L, 3L), output=out) test(2243.55,optimize=opt, options=c(datatable.verbose=TRUE), dt[, .I[i], x]$V1, 1:4, output="GForce FALSE") From af15282e50a6ec01be94a411848e3eb67627f0d7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Wed, 7 Jan 2026 11:53:41 +0100 Subject: [PATCH 45/78] make optimize more robust Co-authored-by: Michael Chirico --- R/test.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/test.data.table.R b/R/test.data.table.R index eb8c40015d..ea76818431 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -378,6 +378,8 @@ test = function(num, x, y=TRUE, if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) cl = match.call() + if ("datatable.optimize" %in% names(cl$options)) + stopf("Trying to set optimization level through both options= and optimize=") cl$optimize = NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) From b61f2803f85fdaecfc836c9c687f8d69461a87c3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 11:56:50 +0100 Subject: [PATCH 46/78] add comment about removing tests in benchmark.Rraw --- inst/tests/benchmark.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 1cb35a557d..246a1d5daa 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -197,6 +197,7 @@ test(637.1, optimize=opt, copy(DT)[, m:=sum(b), by=a][1:3], data.table(a=1L, b=c test(637.2, optimize=opt, key(copy(DT)[J(43L), a:=99L]), NULL) setkey(DT, a) test(637.3, optimize=opt, key(copy(DT)[, a:=99L, by=a]), NULL) +# test 637 subsumes 637 and 638 for different optimization levels # Test X[Y] slowdown, #2216 # Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes From d8e34d3e2c28d633de3d61358b5d1eb23b54ab60 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 11:58:56 +0100 Subject: [PATCH 47/78] be clearer in NEWS --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index f45231cab2..fb59c2fad8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,8 +21,8 @@ This rewrite also introduces several new optimizations: - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) - lapply optimization works without .SD (e.g., `lapply(list(col1, col2), fun)` -> `list(fun(col1), fun(col2))` [#5032](https://github.com/Rdatatable/data.table/issues/5032) - - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))`) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - - Arithmetic operation support in GForce (e.g., `max(x) - min(x)`) [#3815](https://github.com/Rdatatable/data.table/issues/3815) + - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934) + - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815) ### Notes From c5fb65a97bc5bd507a8e96843684392aa021295c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 14:32:56 +0100 Subject: [PATCH 48/78] add nocovs for errors --- R/test.data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index ea76818431..841ecc8de1 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -376,10 +376,10 @@ test = function(num, x, y=TRUE, # if optimization is provided, test across multiple optimization levels if (!is.null(optimize)) { if (!is.numeric(optimize) || length(optimize) < 1L || anyNA(optimize) || any(optimize < 0L)) - stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) + stopf("optimize must be numeric, length >= 1, non-NA, and >= 0; got: %s", optimize) # nocov cl = match.call() if ("datatable.optimize" %in% names(cl$options)) - stopf("Trying to set optimization level through both options= and optimize=") + stopf("Trying to set optimization level through both options= and optimize=") # nocov cl$optimize = NULL # Remove optimization levels from the recursive call # Check if y was explicitly provided (not just the default) From 9f0e5cf553e741f1281f88697cb30298b1ed66d7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 15:34:51 +0100 Subject: [PATCH 49/78] add unwrapper for conversions --- R/data.table.R | 9 +++++++-- inst/tests/tests.Rraw | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a56662ccc3..8e407e53cd 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -3402,6 +3402,11 @@ is_constantish = function(q, check_singleton=FALSE) { .gforce_ops = c("+", "-", "*", "/", "^", "%%", "%/%") +.unwrap_conversions = function(expr) { + while (.is_type_conversion(expr) && length(expr) >= 2L) expr = expr[[2L]] + expr +} + .gforce_ok = function(q, x, envir=parent.frame(2L)) { if (is.N(q)) return(TRUE) # For #334 if (!is.call(q)) return(is.numeric(q)) # plain columns are not gforce-able since they might not aggregate (see test 104.1) @@ -3409,8 +3414,8 @@ is_constantish = function(q, check_singleton=FALSE) { q1 = .get_gcall(q) if (!is.null(q1)) { - q2 = if (.is_type_conversion(q[[2L]]) && is.symbol(q[[2L]][[2L]])) q[[2L]][[2L]] else q[[2L]] - if (!q2 %chin% names(x) && q2 != ".I") return(FALSE) # 875 + q2 = .unwrap_conversions(q[[2L]]) + if (!is.symbol(q2) || (!q2 %chin% names(x) && q2 != ".I")) return(FALSE) if (length(q)==2L || (.arg_is_narm(q) && is_constantish(q[[3L]]) && !(is.symbol(q[[3L]]) && q[[3L]] %chin% names(x)))) return(TRUE) return(switch(as.character(q1), diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a0e901397a..8fed7d8fd4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21317,6 +21317,7 @@ out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") dt = data.table(a=1:4, b=1:2) test(2358.01,optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) test(2358.02,optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2358.03,optimize=0:2, dt[, max(as.integer(as.integer(as.integer(a)))), by=b, verbose=TRUE], data.table(b=1:2, V1=3:4), output=out) dt = data.table(a=1:4, b=1:2) test(2358.11,optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) test(2358.12,optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") From 8129198ed1a724fa66a78d862ae2af21fa9a801a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 7 Jan 2026 15:51:08 +0100 Subject: [PATCH 50/78] add more tests --- inst/tests/tests.Rraw | 49 +++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8fed7d8fd4..dd6e3bf7a4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21232,6 +21232,7 @@ foo = function(dt) { dt[,b:=4:6]; return(7:9) } DT = data.table(a=1:3) test(2349, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9)) test(2349.1, DT[,c:=foo(DT)], data.table(a=1:3, b=4:6, c=7:9)) +rm(inner, outer, foo, DT) # exercise rehashing during forder, #6694 strings = as.character(6145:1) @@ -21314,24 +21315,40 @@ local({ # lapply gforce should also work without .SD #5032 # support arithmetic in j with gforce #3815 out = c("GForce FALSE", "GForce FALSE", "GForce TRUE") +# unwrap type conversions dt = data.table(a=1:4, b=1:2) -test(2358.01,optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) -test(2358.02,optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) -test(2358.03,optimize=0:2, dt[, max(as.integer(as.integer(as.integer(a)))), by=b, verbose=TRUE], data.table(b=1:2, V1=3:4), output=out) +test(2358.01, optimize=0:2, dt[, max(as.character(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c("3","4")), output=out) +test(2358.02, optimize=0:2, dt[, max(as.numeric(a)), by=b, verbose=TRUE], data.table(b=1:2, V1=c(3,4)), output=out) +test(2358.03, optimize=0:2, dt[, max(as.integer(as.integer(as.integer(a)))), by=b, verbose=TRUE], data.table(b=1:2, V1=3:4), output=out) +# Map in j dt = data.table(a=1:4, b=1:2) -test(2358.11,optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) -test(2358.12,optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +test(2358.11, optimize=0:2, dt[, Map(sum, .SD), b, verbose=TRUE], dt[, lapply(.SD, sum), b], output=out) +test(2358.12, optimize=0:2, dt[, Map(sum, .SD, .SD), by=b, verbose=TRUE], output="GForce FALSE") +# lapply without .SD dt = data.table(a = NA_integer_, b = 1:2, c = c(TRUE, FALSE)) -test(2358.13,optimize=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") -test(2358.14,optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2358.13, optimize=0:2, dt[, Map(weighted.mean, .SD, na.rm=c), b, .SDcols="a", verbose=TRUE], data.table(b=1:2, a=c(NaN, NA_real_)), output="GForce FALSE") +test(2358.14, optimize=0:2, dt[,list(weighted.mean(a, na.rm=c)), b, verbose=TRUE], data.table(b=1:2, V1=c(NaN, NA_real_)), output="GForce FALSE") +test(2358.15, optimize=0:2, dt[, Map(sum, .SD), by=b, .SDcols=c("a","c"), verbose=TRUE], dt[, lapply(.SD, sum), by=b, .SDcols=c("a","c")], output=out) dt = data.table(a=1:2, b=1, c=1:4) -test(2358.21,optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) -test(2358.22,optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +test(2358.21, optimize=0:2, dt[, lapply(list(b, c), sum), by=a, verbose=TRUE], output=out) +test(2358.22, optimize=0:2, dt[, c(list(sum(b), sum(c))), by=a, verbose=TRUE], output=out) +# support arithmetic in j dt = data.table(a=1:4, b=1:2) -test(2358.31,optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) -test(2358.32,optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) -test(2358.33,optimize=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) -test(2358.34,optimize=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) -test(2358.35,optimize=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) -test(2358.36,optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") -rm(inner, outer, foo, DT) +test(2358.31, optimize=0:2, dt[, .(max(a)-min(a)), by=b, verbose=TRUE], output=out) +test(2358.32, optimize=0:2, dt[, .((max(a) - min(a)) / (max(a) + min(a))), by=b, verbose=TRUE], data.table(b=1:2, V1=c(0.5, 1/3)), output=out) +test(2358.33, optimize=0:2, dt[, sum(a) / .N, b, verbose=TRUE], output=out) +test(2358.34, optimize=0:2, dt[, mean(a) * 2L + sum(a), b, verbose=TRUE], output=out) +test(2358.35, optimize=0:2, dt[, list(range=max(a)-min(a), avg=mean(a)), by=b, verbose=TRUE], output=out) +test(2358.36, optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], output="GForce FALSE") +test(2358.37, optimize=0:2, dt[, sum(a) %% 2, b, verbose=TRUE], output=out) +test(2358.38, optimize=0:2, dt[, sum(a) %/% 2, b, verbose=TRUE], output=out) +test(2358.39, optimize=0:2, dt[, -sum(a), b, verbose=TRUE], output=out) +# mix cases of the above +dt = data.table(a=1:4, b=1:2) +test(2358.41, optimize=0:2, dt[, sum(as.numeric(a)) + mean(as.integer(a)), by=b, verbose=TRUE], output=out) +test(2358.42, optimize=0:2, dt[, Map(sum, .SD), by=b, .SDcols="a", verbose=TRUE], output=out) +test(2358.43, optimize=0:2, dt[, lapply(list(as.numeric(a)), sum), by=b, verbose=TRUE], output="GForce FALSE") +test(2358.44, optimize=0:2, dt[, sum(a) + as.numeric("5"), by=b, verbose=TRUE], data.table(b=1:2, V1=c(9, 11)), output="GForce FALSE") +test(2358.45, optimize=0:2, dt[, sum(a[a > 2]), by=b, verbose=TRUE], output="GForce FALSE") +dt = data.table(a=1:4, b=1:2, c=2:5) +test(2358.46, optimize=0:2, dt[, .(sum(a) + sum(c)), by=b, verbose=TRUE], output=out) From 5c5d88bacb0aeace578bc2a413de8c1c0671d2fe Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 15:58:13 +0100 Subject: [PATCH 51/78] improve comment --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 8e407e53cd..4e4b8dce41 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2008,7 +2008,7 @@ replace_dot_alias = function(e) { SDenv$.NGRP = length(f__) lockBinding(".NGRP", SDenv) - # Call extracted GForce optimization function + # Determine GForce-optimized query if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) GForce = gforce_result$GForce From 100aad5973d897e90f8e9db2493dc5e9fb8758dc Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 16:03:52 +0100 Subject: [PATCH 52/78] unnest one layer --- R/data.table.R | 65 +++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 4e4b8dce41..b29ee03fd7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -412,40 +412,41 @@ replace_dot_alias = function(e) { .optimize_gforce = function(jsub, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { GForce = FALSE - # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with + # TODO: FR #971, make GForce work with joins. joins could work with # nomatch=NULL even now.. but not switching it on yet, will deal it separately. - if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__)) { - if (!length(ansvars) && !use.I) { - GForce = FALSE - if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { - GForce = TRUE - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) - } - } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N - GForce = FALSE - } else { - # Apply GForce - if (jsub %iscall% "list") { - GForce = TRUE - for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { - if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} - } - } else - GForce = .gforce_ok(jsub, SDenv$.SDall, envir) - if (GForce) { - if (jsub %iscall% "list") - for (ii in seq_along(jsub)[-1L]) { - if (is.N(jsub[[ii]])) next; # For #334 - jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) - } - else { - # adding argument to ghead/gtail if none is supplied to g-optimized head/tail - if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L - jsub = .gforce_jsub(jsub, names_x, envir) - } - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) - } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); + if (getOption("datatable.optimize") < 2L || is.data.table(i) || byjoin || !length(f__)) + return(list(GForce=FALSE, jsub=jsub)) + + if (!length(ansvars) && !use.I) { + GForce = FALSE + if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { + GForce = TRUE + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) } + } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N + GForce = FALSE + } else { + # Apply GForce + if (jsub %iscall% "list") { + GForce = TRUE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} + } + } else + GForce = .gforce_ok(jsub, SDenv$.SDall, envir) + if (GForce) { + if (jsub %iscall% "list") + for (ii in seq_along(jsub)[-1L]) { + if (is.N(jsub[[ii]])) next; # For #334 + jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) + } + else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L + jsub = .gforce_jsub(jsub, names_x, envir) + } + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); } list(GForce=GForce, jsub=jsub) From 9167edb227e9f1b3b1b852b404f750b5efab698e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 16:05:44 +0100 Subject: [PATCH 53/78] unnest --- R/data.table.R | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index b29ee03fd7..c7c30c9c4d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -454,37 +454,37 @@ replace_dot_alias = function(e) { # Old mean() optimization fallback when GForce is not used .optimize_mean = function(jsub, SDenv, verbose, GForce) { - if (!GForce && !is.name(jsub)) { - # Still do the old speedup for mean, for now - nomeanopt=FALSE # to be set by .optmean() using <<- inside it - oldjsub = jsub - if (jsub %iscall% "list") { - # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() - # jsub[[1]]=="list" so the first item of todo will always be FALSE - todo = sapply(jsub, `%iscall%`, 'mean') - if (any(todo)) { - w = which(todo) - jsub[w] = lapply(jsub[w], .optmean) - } - } else if (jsub %iscall% "mean") { - jsub = .optmean(jsub) - } - if (nomeanopt) { - warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) + if (GForce || is.name(jsub)) return(jsub) + + # Still do the old speedup for mean, for now + nomeanopt=FALSE # to be set by .optmean() using <<- inside it + oldjsub = jsub + if (jsub %iscall% "list") { + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been is.N() and/or the for-looped if() + # jsub[[1]]=="list" so the first item of todo will always be FALSE + todo = sapply(jsub, `%iscall%`, 'mean') + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) } - if (verbose) { - if (!identical(oldjsub, jsub)) - catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) - else - catf("Old mean optimization is on, left j unchanged.\n") - } - assign("Cfastmean", Cfastmean, SDenv) - # Old comments still here for now ... - # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. - # Maybe change to : - # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower - # when fastmean can do trim. + } else if (jsub %iscall% "mean") { + jsub = .optmean(jsub) + } + if (nomeanopt) { + warningf("Unable to optimize call to mean() and could be very slow. You must name 'na.rm' like that otherwise if you do mean(x,TRUE) the TRUE is taken to mean 'trim' which is the 2nd argument of mean. 'trim' is not yet optimized.", immediate.=TRUE) } + if (verbose) { + if (!identical(oldjsub, jsub)) + catf("Old mean optimization changed j from '%s' to '%s'\n", deparse(oldjsub), deparse(jsub, width.cutoff=200L, nlines=1L)) + else + catf("Old mean optimization is on, left j unchanged.\n") + } + assign("Cfastmean", Cfastmean, SDenv) + # Old comments still here for now ... + # Here in case nomeanopt=TRUE or some calls to mean weren't detected somehow. Better but still slow. + # Maybe change to : + # assign("mean", fastmean, SDenv) # neater than the hard work above, but slower + # when fastmean can do trim. jsub } From 282a09122cf84d38168cac6848fdf23893459822 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 16:10:07 +0100 Subject: [PATCH 54/78] move into helper --- R/data.table.R | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index c7c30c9c4d..e59efaeca8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -491,6 +491,15 @@ replace_dot_alias = function(e) { # attempts to optimize j expressions using lapply, GForce, and mean optimizations .attempt_optimize = function(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, envir) { + if (getOption("datatable.optimize") < 1L) { + if (verbose) catf("All optimizations are turned off\n") + return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames)) + } + if (!(is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N")))) { + if (verbose) catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + return(list(GForce=FALSE, jsub=jsub, jvnames=jvnames)) + } + # Step 1: Apply lapply(.SD) optimization lapply_result = .optimize_lapply(jsub, jvnames, sdvars, SDenv, verbose, envir) jsub = lapply_result$jsub @@ -2010,18 +2019,10 @@ replace_dot_alias = function(e) { lockBinding(".NGRP", SDenv) # Determine GForce-optimized query - if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && jsub %chin% c(".SD", ".N"))) ) { - gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) - GForce = gforce_result$GForce - jsub = gforce_result$jsub - jvnames = gforce_result$jvnames - } else { - GForce = FALSE - if (verbose) { - if (getOption("datatable.optimize")<1L) catf("All optimizations are turned off\n") - else catf("Optimization is on but left j unchanged (single plain symbol): '%s'\n", deparse(jsub, width.cutoff=200L, nlines=1L)) - } - } + gforce_result = .attempt_optimize(jsub, jvnames, sdvars, SDenv, verbose, i, byjoin, f__, ansvars, use.I, lhs, names_x, parent.frame()) + GForce = gforce_result$GForce + jsub = gforce_result$jsub + jvnames = gforce_result$jvnames if (byjoin) { groups = i grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i) From cf6def5573410275e21250e1cd0470d64bd0e287 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 16:14:58 +0100 Subject: [PATCH 55/78] fix NEWS numbering --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 3de3269216..fa87dc94c2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,7 +28,7 @@ 4. `sum()` by group is correct with missing entries and GForce activated ([#7571](https://github.com/Rdatatable/data.table/issues/7571)). Thanks to @rweberc for the report and @manmita for the fix. The issue was caused by a faulty early `break` that spilled between groups, and resulted in silently incorrect results! -2. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. +5. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. This rewrite also introduces several new optimizations: - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) From 0480ee536ca917f520ea2ef257bec7042c9445f5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 16:15:54 +0100 Subject: [PATCH 56/78] remove trailing ws --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 0f7beced83..f9b4470e62 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -454,7 +454,7 @@ replace_dot_alias = function(e) { # Old mean() optimization fallback when GForce is not used .optimize_mean = function(jsub, SDenv, verbose, GForce) { - if (GForce || is.name(jsub)) return(jsub) + if (GForce || is.name(jsub)) return(jsub) # Still do the old speedup for mean, for now nomeanopt=FALSE # to be set by .optmean() using <<- inside it From 4be8a24904bf2c1a5771c7d19c0934dad5191129 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 19:42:07 +0100 Subject: [PATCH 57/78] clean up merge --- inst/tests/tests.Rraw | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1bf6735dce..ab8733644e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17531,11 +17531,7 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L -<<<<<<< HEAD test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) -======= - test(2220.0 + testnum*0.01, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="'na.rm' must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) ->>>>>>> master testnum = testnum + 1L test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun), context=sprintf("fun=%s [factor]", fun)) } From 0a07dbaeb010bfaf322c23c95ceaa3f7ac5734ff Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 9 Jan 2026 19:59:42 +0100 Subject: [PATCH 58/78] update errors --- R/test.data.table.R | 10 ++++++---- inst/tests/tests.Rraw | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/R/test.data.table.R b/R/test.data.table.R index 841ecc8de1..d9bc72fbe5 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -385,7 +385,11 @@ test = function(num, x, y=TRUE, # Check if y was explicitly provided (not just the default) y_provided = !missing(y) vector_params = mget(c("error", "warning", "message", "output", "notOutput", "ignore.warning"), environment()) - compare = !y_provided && length(optimize)>1L && !any(lengths(vector_params)) + vector_params = vector_params[lengths(vector_params) > 0L] + compare = !y_provided && length(optimize)>1L && !length(vector_params) + # When optimize has multiple levels, vector params are recycled across levels. + if (length(optimize) > 1L && "warning" %in% names(vector_params) && length(vector_params$warning) > 1L) + warningf("warning= with multiple values is recycled across optimize levels, not treated as multiple warnings in one run") for (i in seq_along(optimize)) { cl$num = num + (i - 1L) * 1e-6 @@ -393,9 +397,7 @@ test = function(num, x, y=TRUE, cl$options = if (!is.null(options)) c(as.list(options), opt_level) else opt_level for (param in names(vector_params)) { val = vector_params[[param]] - if (length(val) > 0L) { - cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels - } + cl[[param]] = val[((i - 1L) %% length(val)) + 1L] # cycle through values if fewer than optimization levels } if (compare && i == 1L) cl$y = eval(cl$x, parent.frame()) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ab8733644e..51e4350d40 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17531,7 +17531,7 @@ funs = c("sum", "mean", "min", "max", "median", "var", "sd", "prod") testnum = 0L for (fun in funs) { testnum = testnum + 1L - test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="na.rm must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) + test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(i, na.rm='a'), g]"), error="'na.rm' must be TRUE or FALSE", context=sprintf("fun=%s [na.rm='a']", fun)) testnum = testnum + 1L test(2220.0 + testnum*0.01, optimize=2L, EVAL("DT[,",fun,"(f), g]"), error=sprintf("%s is not meaningful for factors.", fun), context=sprintf("fun=%s [factor]", fun)) } From 416bff086ecdee0e040450d850e3b5f53a51f3b9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 11 Jan 2026 00:13:10 -0800 Subject: [PATCH 59/78] try another round of unnesting --- R/data.table.R | 57 +++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index f9b4470e62..45756a1934 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -418,37 +418,42 @@ replace_dot_alias = function(e) { return(list(GForce=FALSE, jsub=jsub)) if (!length(ansvars) && !use.I) { - GForce = FALSE if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { - GForce = TRUE - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n",deparse(jsub, width.cutoff=200L, nlines=1L)) + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + } + return(list(GForce=TRUE, jsub=jsub) + } + + # turn off GForce for the combination of := and .N + if (length(lhs) && is.symbol(jsub)) + return(list(GForce=FALSE, jsub=jsub)) + + # Apply GForce + if (jsub %iscall% "list") { + GForce = TRUE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} + } + } else + GForce = .gforce_ok(jsub, SDenv$.SDall, envir) + + if (!GForce) { + if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n") + return(list(GForce=FALSE, jsub=jsub)) + } + + if (jsub %iscall% "list") { + for (ii in seq.int(from=2L, length.out=length(jsub)-1L) { + if (is.N(jsub[[ii]])) next; # For #334 + jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) } - } else if (length(lhs) && is.symbol(jsub)) { # turn off GForce for the combination of := and .N - GForce = FALSE } else { - # Apply GForce - if (jsub %iscall% "list") { - GForce = TRUE - for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { - if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} - } - } else - GForce = .gforce_ok(jsub, SDenv$.SDall, envir) - if (GForce) { - if (jsub %iscall% "list") - for (ii in seq_along(jsub)[-1L]) { - if (is.N(jsub[[ii]])) next; # For #334 - jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) - } - else { - # adding argument to ghead/gtail if none is supplied to g-optimized head/tail - if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L - jsub = .gforce_jsub(jsub, names_x, envir) - } - if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) - } else if (verbose) catf("GForce is on, but not activated for this query; left j unchanged (see ?GForce)\n"); + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub %iscall% c("head", "tail")) jsub[["n"]] = 6L + jsub = .gforce_jsub(jsub, names_x, envir) } + if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) list(GForce=GForce, jsub=jsub) } From 29864365ec9e032425d85cf86d541f1ec06277d1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 12 Jan 2026 23:25:22 -0800 Subject: [PATCH 60/78] cleaning up my mess (typos, thinkos) --- R/data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 45756a1934..ece61c67f2 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -421,7 +421,7 @@ replace_dot_alias = function(e) { if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) } - return(list(GForce=TRUE, jsub=jsub) + return(list(GForce=TRUE, jsub=jsub)) } # turn off GForce for the combination of := and .N @@ -432,7 +432,7 @@ replace_dot_alias = function(e) { if (jsub %iscall% "list") { GForce = TRUE for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { - if (.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} + if (!.gforce_ok(jsub[[ii]], SDenv$.SDall, envir)) {GForce = FALSE; break} } } else GForce = .gforce_ok(jsub, SDenv$.SDall, envir) @@ -443,7 +443,7 @@ replace_dot_alias = function(e) { } if (jsub %iscall% "list") { - for (ii in seq.int(from=2L, length.out=length(jsub)-1L) { + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { if (is.N(jsub[[ii]])) next; # For #334 jsub[[ii]] = .gforce_jsub(jsub[[ii]], names_x, envir) } From 45a457554b5e7cb72bb818834f69186eda22f460 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 12 Jan 2026 23:38:27 -0800 Subject: [PATCH 61/78] thinko --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index ece61c67f2..877ade743c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -420,8 +420,8 @@ replace_dot_alias = function(e) { if (!length(ansvars) && !use.I) { if ( ((is.name(jsub) && jsub==".N") || (jsub %iscall% 'list' && length(jsub)==2L && jsub[[2L]]==".N")) && !length(lhs) ) { if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) + return(list(GForce=TRUE, jsub=jsub)) } - return(list(GForce=TRUE, jsub=jsub)) } # turn off GForce for the combination of := and .N From 7a1b02177e891bc6bff54dc6da7fa82c9ad20c15 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 12 Jan 2026 23:46:06 -0800 Subject: [PATCH 62/78] restore fallthrough case --- R/data.table.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/data.table.R b/R/data.table.R index 877ade743c..ee0f181103 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -422,6 +422,7 @@ replace_dot_alias = function(e) { if (verbose) catf("GForce optimized j to '%s' (see ?GForce)\n", deparse(jsub, width.cutoff=200L, nlines=1L)) return(list(GForce=TRUE, jsub=jsub)) } + return(list(GForce=FALSE, jsub=jsub)) } # turn off GForce for the combination of := and .N From 383b60aca9d5159f0dcf031cffa97f18f04d9f93 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 12 Jan 2026 23:53:55 -0800 Subject: [PATCH 63/78] delint --- NEWS.md | 2 +- R/data.table.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index a679c7bbb8..407f00427f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,7 +34,7 @@ 7. Fixed compilation failure like "error: unknown type name 'siginfo_t'" in v1.18.0 in some strict environments, e.g., FreeBSD, where the header file declaring the POSIX function `waitid` does not transitively include the header file defining the `siginfo_t` type, [#7516](https://github.com/rdatatable/data.table/issues/7516). Thanks to @jszhao for the report and @aitap for the fix. -6. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. +8. GForce and lapply optimization detection has been refactored to use modular optimization paths and an AST (Abstract Syntax Tree) walker for improved maintainability and extensibility. The new architecture separates optimization detection into distinct, composable phases. This makes future optimization enhancements a lot easier. Thanks to @grantmcdermott, @jangorecki, @MichaelChirico, and @HughParsonage for the suggestions and @ben-schwen for the implementation. This rewrite also introduces several new optimizations: - Enables Map in addition to lapply optimizations (e.g., `Map(fun, .SD)` -> `list(fun(col1), fun(col2), ...)`) [#5336](https://github.com/Rdatatable/data.table/issues/5336) diff --git a/R/data.table.R b/R/data.table.R index ee0f181103..a0a4933142 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -428,7 +428,7 @@ replace_dot_alias = function(e) { # turn off GForce for the combination of := and .N if (length(lhs) && is.symbol(jsub)) return(list(GForce=FALSE, jsub=jsub)) - + # Apply GForce if (jsub %iscall% "list") { GForce = TRUE From 62997d3753663d5203a28519ee139b835003b8f5 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 09:15:47 -0800 Subject: [PATCH 64/78] move the optimization comment into 'documentation' of .massageSD --- R/data.table.R | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a0a4933142..c366a23c8f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -148,6 +148,18 @@ replace_dot_alias = function(e) { } # Transform lapply(.SD, fun) or Map(fun, .SD) into list(fun(col1), fun(col2), ...) +# +# It may seem inefficient to construct a potentially long expression. But, consider calling +# lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it +# is called, involving small memory allocations. +# The R level lapply calls as.list which needs a shallow copy. +# lapply also does a setAttib of names (duplicating the same names over and over again +# for each group) which is terrible for our needs. We replace all that with a +# (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol +# lookup), and the eval() inside dogroups hardly has to do anything. All this results in +# overhead minimised. We don't need to worry about the env passed to the eval in a possible +# lapply replacement, or how to pass ... efficiently to it. +# Plus we optimize lapply first, so that mean() can be optimized too as well, next. .massageSD = function(jsub, sdvars, SDenv, funi) { txt = as.list(jsub)[-1L] if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 @@ -175,17 +187,6 @@ replace_dot_alias = function(e) { jsub = as.call(ans) # important no names here jvnames = sdvars # but here instead list(jsub=jsub, jvnames=jvnames, funi=funi+1L) - # It may seem inefficient to construct a potentially long expression. But, consider calling - # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it - # is called, involving small memory allocations. - # The R level lapply calls as.list which needs a shallow copy. - # lapply also does a setAttib of names (duplicating the same names over and over again - # for each group) which is terrible for our needs. We replace all that with a - # (ok, long, but not huge in memory terms) list() which is primitive (so avoids symbol - # lookup), and the eval() inside dogroups hardly has to do anything. All this results in - # overhead minimised. We don't need to worry about the env passed to the eval in a possible - # lapply replacement, or how to pass ... efficiently to it. - # Plus we optimize lapply first, so that mean() can be optimized too as well, next. } # Optimize .SD subsetting patterns like .SD[1], head(.SD), first(.SD) From d3adad198b045c4ba3cfb7617734360a0d007e57 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 09:38:01 -0800 Subject: [PATCH 65/78] use more typical list style for clarity --- R/data.table.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index c366a23c8f..3713e31c0a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -218,8 +218,12 @@ replace_dot_alias = function(e) { } # FR #2722 is just about optimisation of j=c(.N, lapply(.SD, .)) that is taken care of here. # FR #735 tries to optimise j-expressions of the form c(...) as long as ... contains - # 1) lapply(.SD, ...), 2) simply .SD or .SD[..], 3) .N, 4) list(...) and 5) functions that normally return a single value* - # On 5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* + # (1) lapply(.SD, ...) + # (2) simply .SD or .SD[..] + # (3) .N + # (4) list(...) + # (5) functions that normally return a single value* + # On (5)* the IMPORTANT point to note is that things that are not wrapped within "list(...)" should *always* # return length 1 output for us to optimise. Else, there's no equivalent to optimising c(...) to list(...) AFAICT. # One issue could be that these functions (e.g., mean) can be "re-defined" by the OP to produce a length > 1 output # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. From a50503f2d6fb3ae4705841b6de52375704ee4345 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 09:44:33 -0800 Subject: [PATCH 66/78] typo it's -> its --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 3713e31c0a..f94a6616e7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -244,7 +244,7 @@ replace_dot_alias = function(e) { jsubl[[i_]] = lapply(sdvars, as.name) jvnames = c(jvnames, sdvars) } else if (this == ".N") { - # don't optimise .I in c(.SD, .I), it's length can be > 1 + # don't optimise .I in c(.SD, .I), its length can be > 1 # only c(.SD, list(.I)) should be optimised!! .N is always length 1. jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) } else { From 35bd7d9f2da7f519765a69ec3de847068149561d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 13:49:19 -0800 Subject: [PATCH 67/78] unnest for unusual list() case --- R/data.table.R | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index f94a6616e7..e6be13ddb2 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -274,28 +274,27 @@ replace_dot_alias = function(e) { # Case 2b: list(...) else if (this[[1L]] == "list") { # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen - if (length(this) > 1L) { - jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* - njl__ = names(jl__) %||% rep("", length(jl__)) - njl__nonblank = nzchar(names(jl__)) - if (length(jl__) > 1L) { - jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) - } else { - jn__ = names(jsubl)[i_] - } - jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + if (length(this) == 1L) next + jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) + if (isTRUE(nzchar(names(jsubl)[i_]))) { + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + njl__ = names(jl__) %||% rep("", length(jl__)) + njl__nonblank = nzchar(names(jl__)) + if (length(jl__) > 1L) { + jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) } else { - jn__ = names(jl__) %||% rep("", length(jl__)) + jn__ = names(jsubl)[i_] } - idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) - if (any(idx)) - jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && - jvnames = c(jvnames, jn__) - jsubl[[i_]] = jl__ - any_optimized = TRUE + jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + } else { + jn__ = names(jl__) %||% rep("", length(jl__)) } + idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) + if (any(idx)) + jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && + jvnames = c(jvnames, jn__) + jsubl[[i_]] = jl__ + any_optimized = TRUE } # Case 2c: Single-value functions like mean, sum, etc. else if (this %iscall% optfuns && length(this)>1L) { From 6243cf313a89047edbcf899433e71656c846e415 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 13:55:29 -0800 Subject: [PATCH 68/78] try a better name --- R/data.table.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index e6be13ddb2..86900a84c0 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -278,14 +278,14 @@ replace_dot_alias = function(e) { jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) if (isTRUE(nzchar(names(jsubl)[i_]))) { # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* - njl__ = names(jl__) %||% rep("", length(jl__)) - njl__nonblank = nzchar(names(jl__)) + jl__names = names(jl__) %||% rep("", length(jl__)) + jl__hasname = nzchar(names(jl__)) if (length(jl__) > 1L) { jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) } else { jn__ = names(jsubl)[i_] } - jn__[njl__nonblank] = paste(names(jsubl)[i_], njl__[njl__nonblank], sep=".") + jn__[jl__hasname] = paste(names(jsubl)[i_], jl__names[jl__hasname], sep=".") } else { jn__ = names(jl__) %||% rep("", length(jl__)) } From c5e95d23925b05c4332343348fef6997ad6678ed Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 13 Jan 2026 14:02:09 -0800 Subject: [PATCH 69/78] alignment for readability --- R/data.table.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 86900a84c0..0fa64acbe4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -301,8 +301,10 @@ replace_dot_alias = function(e) { jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) } # Case 2d: .SD[1] or similar subsetting - else if (length(this) == 3L && (this[[1L]] == "[" || this[[1L]] == "head") && - this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { + else if (length(this) == 3L + && (this[[1L]] == "[" || this[[1L]] == "head") + && this[[2L]] == ".SD" + && (is.numeric(this[[3L]]) || this[[3L]] == ".N")) { # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. any_optimized = TRUE jsubl[[i_]] = lapply(sdvars, function(x) { this[[2L]] = as.name(x); this }) From 69c3a7c72282a88949134e62eab6e5e9fbc98c67 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:21:46 +0100 Subject: [PATCH 70/78] add comment about \(x) Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 0fa64acbe4..659fe01fa1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -165,7 +165,7 @@ replace_dot_alias = function(e) { if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 # support Map instead of lapply #5336 fun = if (jsub %iscall% "Map") txt[[1L]] else txt[[2L]] - if (fun %iscall% "function") { + if (fun %iscall% "function") { # NB: '\(x)' only exists pre-parser, so it's also covered # Fix for #2381: added SDenv$.SD to 'eval' to take care of cases like: lapply(.SD, function(x) weighted.mean(x, bla)) where "bla" is a column in DT # http://stackoverflow.com/questions/13441868/data-table-and-stratified-means # adding this does not compromise in speed (that is, not any lesser than without SDenv$.SD) From e7e6444ce99f7a2e99da947db8098a74c25a63f6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 13 Jan 2026 23:27:53 +0100 Subject: [PATCH 71/78] remove old comment --- R/data.table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 659fe01fa1..e2fde8d500 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -234,7 +234,6 @@ replace_dot_alias = function(e) { jsubl = as.list.default(jsub) oldjvnames = jvnames jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. - # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! for (i_ in 2L:length(jsubl)) { this = jsub[[i_]] # Case 1: Plain name (.SD or .N) From f32875b148e3e44b480ee71977dc5ef1adc2f34a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:39:51 +0100 Subject: [PATCH 72/78] use nzchar(jl__names) for the case of `is.null(names(jl__))` this should result in `logical(0)` differently from current version which returns `c(FALSE, FALSE, ...)` Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index e2fde8d500..97c7061215 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -278,7 +278,7 @@ replace_dot_alias = function(e) { if (isTRUE(nzchar(names(jsubl)[i_]))) { # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* jl__names = names(jl__) %||% rep("", length(jl__)) - jl__hasname = nzchar(names(jl__)) + jl__hasname = nzchar(jl__names) if (length(jl__) > 1L) { jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) } else { From 864383fb53d159e72382900e96472ecc75256d4e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:41:31 +0100 Subject: [PATCH 73/78] use identical and quote instead of direct comparison Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 97c7061215..5569967280 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -288,7 +288,7 @@ replace_dot_alias = function(e) { } else { jn__ = names(jl__) %||% rep("", length(jl__)) } - idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) + idx = vapply_1b(jl__, identical, quote(.I)) if (any(idx)) jn__[idx & !nzchar(jn__)] = "I" # this & is correct not && jvnames = c(jvnames, jn__) From b57c2d52d02e9ff3e9a3ecd9989e869ad3fcfca6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 13 Jan 2026 23:42:30 +0100 Subject: [PATCH 74/78] avoid lambda fun Co-authored-by: Michael Chirico --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 5569967280..6ad375acd5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -385,7 +385,7 @@ replace_dot_alias = function(e) { # Pattern 3b: Map(fun, .SD) # Only optimize if .SD appears exactly once to avoid cases like Map(rep, .SD, .SD) else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars) && - sum(vapply_1b(as.list(jsub), function(x) identical(x, quote(.SD)))) == 1L) { + sum(vapply_1b(as.list(jsub), identical, quote(.SD))) == 1L) { massage_result = .massageSD(jsub, sdvars, SDenv, funi) jsub = massage_result$jsub jvnames = massage_result$jvnames From bf6c61feabc88fd6bd70b2fdf959356cdf658394 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Tue, 13 Jan 2026 23:55:59 +0100 Subject: [PATCH 75/78] use early exits instead of is_valid --- R/data.table.R | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 6ad375acd5..1f26314dbb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -229,7 +229,6 @@ replace_dot_alias = function(e) { # Of course this is worrying too much though. If the issue comes up, we'll just remove the relevant optimisations. # For now, we optimise all functions mentioned in 'optfuns' below. optfuns = c("max", "min", "mean", "length", "sum", "median", "sd", "var") - is_valid = TRUE any_optimized = FALSE jsubl = as.list.default(jsub) oldjvnames = jvnames @@ -248,8 +247,7 @@ replace_dot_alias = function(e) { jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) } else { # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) - is_valid = FALSE - break + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) } } # Case 2: Call expression @@ -314,34 +312,30 @@ replace_dot_alias = function(e) { # TODO, TO DO: revisit complex cases (as illustrated below) # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. - is_valid = FALSE - break + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) } # Case 2f: Other cases - skip optimization else { # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? - is_valid = FALSE - break + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) } } # Case 3: Other types - can't optimize else { - is_valid = FALSE - break + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) } } # Return result - if (!is_valid || !any_optimized) { + if (!any_optimized) { # Can't optimize - return original - list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE) - } else { - # Optimization successful - setattr(jsubl, 'names', NULL) - jsub_new = as.call(unlist(jsubl, use.names=FALSE)) - jsub_new[[1L]] = quote(list) - list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE) + return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) } + # Optimization successful + setattr(jsubl, 'names', NULL) + jsub_new = as.call(unlist(jsubl, use.names=FALSE)) + jsub_new[[1L]] = quote(list) + list(jsub=jsub_new, jvnames=jvnames, funi=funi, optimized=TRUE) } # Optimize lapply(.SD, ...) expressions From 13eed1bd9ed283d9f20c029a78f9265e450b7879 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 14 Jan 2026 11:15:48 +0100 Subject: [PATCH 76/78] add test about aggregating grouped column --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bef1137ad7..97b444b038 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21465,6 +21465,7 @@ test(2361.36, optimize=0:2, dt[, .(max(a)-sqrt(min(a))), by=b, verbose=TRUE], ou test(2361.37, optimize=0:2, dt[, sum(a) %% 2, b, verbose=TRUE], output=out) test(2361.38, optimize=0:2, dt[, sum(a) %/% 2, b, verbose=TRUE], output=out) test(2361.39, optimize=0:2, dt[, -sum(a), b, verbose=TRUE], output=out) +test(2361.40, optimize=0:2, dt[, .(sum(a)-sum(b)), b, verbose=TRUE], output="GForce FALSE") # mix cases of the above dt = data.table(a=1:4, b=1:2) test(2361.41, optimize=0:2, dt[, sum(as.numeric(a)) + mean(as.integer(a)), by=b, verbose=TRUE], output=out) From a5f588c844bcb0513694fea6a49258b8bed6fb11 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 14 Jan 2026 11:17:57 +0100 Subject: [PATCH 77/78] move extra branch into comments --- R/data.table.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1f26314dbb..eda001eb8e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -308,12 +308,12 @@ replace_dot_alias = function(e) { jvnames = c(jvnames, sdvars) } # Case 2e: Complex .SD usage - can't optimize - else if (any(all.vars(this) == ".SD")) { - # TODO, TO DO: revisit complex cases (as illustrated below) - # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] - # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. - return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) - } + # else if (any(all.vars(this) == ".SD")) { + # TODO, TO DO: revisit complex cases (as illustrated below) + # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp] + # hard to optimise such cases (+ difficulty in counting exact columns and therefore names). revert back to no optimisation. + # return(list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)) + # } # Case 2f: Other cases - skip optimization else { # TO DO, TODO: maybe a message/warning here so that we can catch the overlooked cases, if any? From 54af1628d92928f0474abed51cf82db2e9278510 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Wed, 14 Jan 2026 13:07:32 +0100 Subject: [PATCH 78/78] refactor 2b case branch --- R/data.table.R | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index eda001eb8e..a4b1a653f1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -273,18 +273,15 @@ replace_dot_alias = function(e) { # also handle c(lapply(.SD, sum), list()) - silly, yes, but can happen if (length(this) == 1L) next jl__ = as.list(jsubl[[i_]])[-1L] # just keep the '.' from list(.) - if (isTRUE(nzchar(names(jsubl)[i_]))) { - # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* - jl__names = names(jl__) %||% rep("", length(jl__)) + # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* + jl__names = names(jl__) %||% rep("", length(jl__)) + pname = names(jsubl)[i_] + if (isTRUE(nzchar(pname))) { jl__hasname = nzchar(jl__names) - if (length(jl__) > 1L) { - jn__ = paste0(names(jsubl)[i_], seq_along(jl__)) - } else { - jn__ = names(jsubl)[i_] - } - jn__[jl__hasname] = paste(names(jsubl)[i_], jl__names[jl__hasname], sep=".") + jn__ = if (length(jl__) > 1L) paste0(pname, seq_along(jl__)) else pname + jn__[jl__hasname] = paste(pname, jl__names[jl__hasname], sep=".") } else { - jn__ = names(jl__) %||% rep("", length(jl__)) + jn__ = jl__names } idx = vapply_1b(jl__, identical, quote(.I)) if (any(idx))