-
Notifications
You must be signed in to change notification settings - Fork 1k
modularize optimization internals #7401
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Conversation
Codecov Report❌ Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## master #7401 +/- ##
==========================================
- Coverage 99.02% 99.01% -0.02%
==========================================
Files 87 87
Lines 16803 16895 +92
==========================================
+ Hits 16640 16728 +88
- Misses 163 167 +4 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
Generated via commit c5e95d2 Download link for the artifact containing the test results: ↓ atime-results.zip
|
|
I'm also not sure about moving the tests to |
| jvnames = c(jvnames, sdvars) | ||
| } | ||
| # Case 2e: Complex .SD usage - can't optimize | ||
| else if (any(all.vars(this) == ".SD")) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's just drop this branch? since it's not yet supported.
R/data.table.R
Outdated
| jsub = as.call(ans) # important no names here | ||
| jvnames = sdvars # but here instead | ||
| list(jsub=jsub, jvnames=jvnames, funi=funi+1L) | ||
| # It may seem inefficient to construct a potentially long expression. But, consider calling |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might be worth benchmarking this (atime?)... written 14 years ago, I wonder if it's still true 5176108
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its stil true with nrow=1e6, ncol=10, but no noticeable effect between master and this PR
Details
library(atime)
library(data.table)
pkg.path <- '.'
limit <- 10
# Package editing function for atime
pkg.edit.fun <- function(old.Package, new.Package, sha, new.pkg.path) {
pkg_find_replace <- function(glob, FIND, REPLACE) {
atime::glob_find_replace(file.path(new.pkg.path, glob), FIND, REPLACE)
}
Package_regex <- gsub(".", "_?", old.Package, fixed = TRUE)
Package_ <- gsub(".", "_", old.Package, fixed = TRUE)
new.Package_ <- paste0(Package_, "_", sha)
pkg_find_replace(
"DESCRIPTION",
paste0("Package:\\s+", old.Package),
paste("Package:", new.Package))
pkg_find_replace(
file.path("src", "Makevars.*in"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
Package_regex,
new.Package_)
pkg_find_replace(
file.path("R", "onLoad.R"),
sprintf('packageVersion\\("%s"\\)', old.Package),
sprintf('packageVersion\\("%s"\\)', new.Package))
pkg_find_replace(
file.path("src", "init.c"),
paste0("R_init_", Package_regex),
paste0("R_init_", gsub("[.]", "_", new.Package_)))
pkg_find_replace(
"NAMESPACE",
sprintf('useDynLib\\("?%s"?', Package_regex),
paste0('useDynLib(', new.Package_))
}
# Commits to compare
versions <- c(
'master' = 'b3acef8',
'PR' = '383b60a'
)
# Vary number of groups
N_groups <- 10^seq(1, 5, 0.25)
set.seed(42)
test_data <- lapply(setNames(nm = N_groups), function(n_groups) {
n_rows <- 1e6
n_cols <- 10
dt <- data.table(grp = sample(n_groups, n_rows, replace = TRUE))
for (i in seq_len(n_cols)) {
set(dt, j = paste0("V", i), value = rnorm(n_rows))
}
dt
})
# Test 1: With optimize = 1 (GForce enabled)
gforce_opt1 <- atime_versions(
pkg.path, N_groups,
setup = {
options(datatable.optimize = 1)
DT <- test_data[[as.character(N)]]
},
expr = {
data.table:::`[.data.table`(DT, , lapply(.SD, sum), by = grp)
},
seconds.limit = limit,
verbose = TRUE,
sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
# Test 2: With optimize = 0 (GForce disabled)
gforce_opt0 <- atime_versions(
pkg.path, N_groups,
setup = {
options(datatable.optimize = 0)
DT <- test_data[[as.character(N)]]
},
expr = {data.table:::`[.data.table`(DT, , lapply(.SD, sum), by = grp)
},
seconds.limit = limit,
verbose = TRUE,
sha.vec = versions,
pkg.edit.fun = pkg.edit.fun
)
library(ggplot2)
opt1_data <- gforce_opt1$measurements
opt1_data$optimize <- "opt1"
opt0_data <- gforce_opt0$measurements
opt0_data$optimize <- "opt0"
combined_data <- rbind(opt1_data, opt0_data)
pdf("gforce_lapply_benchmark.pdf", width = 12, height = 6)
p <- ggplot(combined_data, aes(x = N, y = median, color = expr.name, linetype = optimize)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
scale_x_log10("Number of groups") +
scale_y_log10("Median time (seconds)") +
labs(title = "DT[, lapply(.SD, sum), by=grp] Performance",
subtitle = "Comparing optimize=0 vs optimize=1 across commits",
color = "Version",
linetype = "Optimization") +
theme_bw() +
theme(legend.position = "bottom")
print(p)
dev.off()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nice... wonder if benchmark.Rraw or something like that (a Wiki?) would be a good place to compile these type of low-level optimizations based on R's own logic...
(not required for this PR)
| if (length(names(txt))>1L) .Call(Csetcharvec, names(txt), 2L, "") # fixes bug #110 | ||
| # support Map instead of lapply #5336 | ||
| fun = if (jsub %iscall% "Map") txt[[1L]] else txt[[2L]] | ||
| if (fun %iscall% "function") { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does it make sense to have a script of things that are only testable with the R 4.0.0+ parser? I guess it's pretty rare...
| if (fun %iscall% "function") { | |
| if (fun %iscall% "function") { # NB: '\(x)' only exists pre-parser, so it's also covered |
| jsubl = as.list.default(jsub) | ||
| oldjvnames = jvnames | ||
| jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*length(sdvars) + other jvars ?? not straightforward. | ||
| # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: i guess this is obsolete? there's no i in scope here and no eval() here
| # Apply GForce | ||
| if (jsub %iscall% "list") { | ||
| GForce = TRUE | ||
| for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I changed another 2:length(...) to this seq.int() form, but I see 2:length(...) elsewhere and now I'm second-guessing -- I guess it's fine (better, even) to use the shorter 2:length(...)?
| funi = massage_result$funi | ||
| jsubl[[i_]] = as.list(massage_result$jsub[-1L]) # just keep the '.' from list(.) | ||
| jn__ = massage_result$jvnames | ||
| if (isTRUE(nzchar(names(jsubl)[i_]))) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IINM isTRUE() is only needed if names() might be NA, which is possible in general but shouldn't be here. Let's remove it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh no, it's for the is.null(names(.)) case, got it.
| if (isTRUE(nzchar(names(jsubl)[i_]))) { | ||
| # Fix for #2311, prepend named list arguments of c() to that list's names. See tests 2283.* | ||
| jl__names = names(jl__) %||% rep("", length(jl__)) | ||
| jl__hasname = nzchar(names(jl__)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
?
| jl__hasname = nzchar(names(jl__)) | |
| jl__hasname = nzchar(jl__names) |
| } else { | ||
| jn__ = names(jl__) %||% rep("", length(jl__)) | ||
| } | ||
| idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe?
| idx = unlist(lapply(jl__, function(x) is.name(x) && x == ".I")) | |
| idx = vapply_1b(jl__, identical, quote(.I)) |
| } | ||
|
|
||
| # Return result | ||
| if (!is_valid || !any_optimized) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what's the difference between is_valid and any_optimized?
| jvnames = c(jvnames, gsub("^[.]([N])$", "\\1", this)) | ||
| } else { | ||
| # jvnames = c(jvnames, if (is.null(names(jsubl))) "" else names(jsubl)[i_]) | ||
| is_valid = FALSE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is_valid=FALSE; break style feels a bit like a goto statement where we have to jump past a bunch of other logic to see the flow; why not do early returns directly at the site of the "failure" instead?
Maybe something like this would help
unedited_expr = list(jsub=jsub, jvnames=oldjvnames, funi=funi, optimized=FALSE)
# ...
return(unedited_expr)| # Pattern 3b: Map(fun, .SD) | ||
| # Only optimize if .SD appears exactly once to avoid cases like Map(rep, .SD, .SD) | ||
| else if (is.call(jsub) && jsub %iscall% "Map" && length(jsub) >= 3L && jsub[[3L]] == ".SD" && length(sdvars) && | ||
| sum(vapply_1b(as.list(jsub), function(x) identical(x, quote(.SD)))) == 1L) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
My memory is it's good to avoid creating lambdas if you can avoid it
| sum(vapply_1b(as.list(jsub), function(x) identical(x, quote(.SD)))) == 1L) { | |
| sum(vapply_1b(as.list(jsub), identical, quote(.SD))) == 1L) { |
MichaelChirico
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I think last round of review here. Looking great!

Mapinstead oflapplyturns GForce off #5336Adds arithmetic for GForce as demanded in #3815 but does not add support for blocks in j like
d[, j={x<-x; .(min(x))}, by=y].