From 4ef4ac2b2ade3d1716e0e3683d9ad337c26e5cf6 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Tue, 10 Dec 2024 23:07:28 +0100 Subject: [PATCH 1/7] Don't recursively call deregister_worker() on the current worker Previously we were not filtering out the current worker when calling `deregister_worker()` on `workers()`. --- src/cluster.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.jl b/src/cluster.jl index 4f7c995..d95cd3a 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -1232,7 +1232,7 @@ function deregister_worker(pg, pid) # Notify the cluster manager of this workers death manage(w.manager, w.id, w.config, :deregister) if PGRP.topology !== :all_to_all || isclusterlazy() - for rpid in workers() + for rpid in other_workers() try remote_do(deregister_worker, rpid, pid) catch From bf86b160dd3a8f3215857d2a1ab03bb200a27e4a Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Thu, 2 Jan 2025 21:55:34 +0100 Subject: [PATCH 2/7] Rename the WorkerState instances and add an exterminated state The new `WorkerState_exterminated` state is for indicating that a worker was killed by something other than us. 
--- src/cluster.jl | 45 +++++++++++++++++++++++++++-------------- src/messages.jl | 2 +- src/process_messages.jl | 6 +++--- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index d95cd3a..d2e456e 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -100,7 +100,15 @@ mutable struct WorkerConfig end end -@enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED W_UNKNOWN_STATE +@enum WorkerState begin + WorkerState_created + WorkerState_connected + WorkerState_terminating # rmprocs() has been called on the worker + WorkerState_terminated # Worker was gracefully removed + WorkerState_exterminated # Worker was forcefully removed (not by us) + WorkerState_unknown +end + mutable struct Worker id::Int msg_lock::Threads.ReentrantLock # Lock for del_msgs, add_msgs, and gcflag @@ -131,7 +139,7 @@ mutable struct Worker w.manager = manager w.config = config w.version = version - set_worker_state(w, W_CONNECTED) + set_worker_state(w, WorkerState_connected) register_worker_streams(w) w end @@ -142,7 +150,7 @@ mutable struct Worker @lock map_pid_wrkr if haskey(map_pid_wrkr[], id) return map_pid_wrkr[][id] end - w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func) + w=new(id, Threads.ReentrantLock(), [], [], false, WorkerState_created, Threads.Condition(), time(), conn_func) w.initialized = Event() register_worker(w) w @@ -158,8 +166,15 @@ function set_worker_state(w, state) end end +# Helper function to check if a worker is dead or not. It's recommended to use +# this instead of checking Worker.state manually. 
+function is_worker_dead(w::Worker) + state = @atomic w.state + return state === WorkerState_terminated || state === WorkerState_exterminated +end + function check_worker_state(w::Worker) - if (@atomic w.state) === W_CREATED + if (@atomic w.state) === WorkerState_created if !isclusterlazy() if PGRP.topology === :all_to_all # Since higher pids connect with lower pids, the remote worker @@ -198,7 +213,7 @@ function exec_conn_func(w::Worker) end function wait_for_conn(w) - if (@atomic w.state) === W_CREATED + if (@atomic w.state) === WorkerState_created timeout = worker_timeout() - (time() - w.ct_time) timeout <= 0 && error("peer $(w.id) has not connected to $(myid())") @@ -211,7 +226,7 @@ function wait_for_conn(w) errormonitor(T) lock(w.c_state) do wait(w.c_state) - (@atomic w.state) === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + (@atomic w.state) === WorkerState_created && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") end end nothing @@ -673,7 +688,7 @@ function create_worker(manager::ClusterManager, wconfig::WorkerConfig) if (jw.id != 1) && (jw.id < w.id) lock(jw.c_state) do # wait for wl to join - if (@atomic jw.state) === W_CREATED + if (@atomic jw.state) === WorkerState_created wait(jw.c_state) end end @@ -700,7 +715,7 @@ function create_worker(manager::ClusterManager, wconfig::WorkerConfig) for wl in wlist lock(wl.c_state) do - if (@atomic wl.state) === W_CREATED + if (@atomic wl.state) === WorkerState_created # wait for wl to join wait(wl.c_state) end @@ -918,7 +933,7 @@ function nprocs() n = length(PGRP.workers) # filter out workers in the process of being setup/shutdown. 
for jw in PGRP.workers - if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED) + if !isa(jw, LocalProcess) && ((@atomic jw.state) !== WorkerState_connected) n = n - 1 end end @@ -971,7 +986,7 @@ julia> procs() function procs() if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) # filter out workers in the process of being setup/shutdown. - return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] + return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === WorkerState_connected)] else return Int[x.id for x in PGRP.workers] end @@ -988,7 +1003,7 @@ other_procs() = filter(!=(myid()), procs()) function id_in_procs(id) # faster version of `id in procs()` if myid() == 1 || (PGRP.topology === :all_to_all && !isclusterlazy()) for x in PGRP.workers - if (x.id::Int) == id && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED) + if (x.id::Int) == id && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === WorkerState_connected) return true end end @@ -1012,7 +1027,7 @@ See also [`other_procs()`](@ref). 
""" function procs(pid::Integer) if myid() == 1 - all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)] + all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === WorkerState_connected)] if (pid == 1) || (isa(@lock(map_pid_wrkr, map_pid_wrkr[][pid].manager), LocalManager)) Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)] else @@ -1121,7 +1136,7 @@ function _rmprocs(pids, waitfor) else w = @lock map_pid_wrkr get(map_pid_wrkr[], p, nothing) if !isnothing(w) - set_worker_state(w, W_TERMINATING) + set_worker_state(w, WorkerState_terminating) kill(w.manager, p, w.config) push!(rmprocset, w) end @@ -1130,11 +1145,11 @@ function _rmprocs(pids, waitfor) start = time_ns() while (time_ns() - start) < waitfor*1e9 - all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break + all(is_worker_dead, rmprocset) && break sleep(min(0.1, waitfor - (time_ns() - start)/1e9)) end - unremoved = [wrkr.id for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)] + unremoved = [wrkr.id for wrkr in filter(!is_worker_dead, rmprocset)] if length(unremoved) > 0 estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.") throw(ErrorException(estr)) diff --git a/src/messages.jl b/src/messages.jl index 1a5dd82..fe63a7d 100644 --- a/src/messages.jl +++ b/src/messages.jl @@ -194,7 +194,7 @@ end function flush_gc_msgs() try for w in (PGRP::ProcessGroup).workers - if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag + if isa(w,Worker) && ((@atomic w.state) == WorkerState_connected) && w.gcflag flush_gc_msgs(w) end end diff --git a/src/process_messages.jl b/src/process_messages.jl index d6fdbb1..0f2750a 100644 --- a/src/process_messages.jl +++ b/src/process_messages.jl @@ -210,7 +210,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) handle_msg(msg, header, r_stream, w_stream, version) end 
catch e - oldstate = W_UNKNOWN_STATE + oldstate = WorkerState_unknown # Check again as it may have been set in a message handler but not propagated to the calling block above if wpid < 1 @@ -223,7 +223,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) elseif @lock(map_del_wrkr, !(wpid in map_del_wrkr[])) werr = worker_from_id(wpid) oldstate = @atomic werr.state - set_worker_state(werr, W_TERMINATED) + set_worker_state(werr, oldstate != WorkerState_terminating ? WorkerState_exterminated : WorkerState_terminated) # If unhandleable error occurred talking to pid 1, exit if wpid == 1 @@ -243,7 +243,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool) close(w_stream) if (myid() == 1) && (wpid > 1) - if oldstate != W_TERMINATING + if oldstate != WorkerState_terminating println(stderr, "Worker $wpid terminated.") rethrow() end From be12fd48603d578bf56aa2400e7c6ab2a1c4dfdf Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Tue, 10 Dec 2024 23:13:00 +0100 Subject: [PATCH 3/7] Add support for worker state callbacks --- docs/src/_changelog.md | 3 + docs/src/index.md | 13 +++ src/cluster.jl | 228 ++++++++++++++++++++++++++++++++++++--- test/distributed_exec.jl | 71 ++++++++++++ 4 files changed, 301 insertions(+), 14 deletions(-) diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md index 8f16817..eded6a1 100644 --- a/docs/src/_changelog.md +++ b/docs/src/_changelog.md @@ -9,6 +9,9 @@ This documents notable changes in DistributedNext.jl. The format is based on ## Unreleased +### Added +- Implemented callback support for workers being added/removed etc ([#17]). 
+ ### Fixed - Modified the default implementations of methods like `take!` and `wait` on [`AbstractWorkerPool`](@ref) to be threadsafe and behave more consistently diff --git a/docs/src/index.md b/docs/src/index.md index 64af89d..17c66a6 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -53,6 +53,19 @@ DistributedNext.cluster_cookie() DistributedNext.cluster_cookie(::Any) ``` +## Callbacks + +```@docs +DistributedNext.add_worker_starting_callback +DistributedNext.remove_worker_starting_callback +DistributedNext.add_worker_started_callback +DistributedNext.remove_worker_started_callback +DistributedNext.add_worker_exiting_callback +DistributedNext.remove_worker_exiting_callback +DistributedNext.add_worker_exited_callback +DistributedNext.remove_worker_exited_callback +``` + ## Cluster Manager Interface This interface provides a mechanism to launch and manage Julia workers on different cluster environments. diff --git a/src/cluster.jl b/src/cluster.jl index d2e456e..eb87c1f 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -479,20 +479,28 @@ end ``` """ function addprocs(manager::ClusterManager; kwargs...) + params = merge(default_addprocs_params(manager), Dict{Symbol, Any}(kwargs)) + init_multi() cluster_mgmt_from_master_check() - lock(worker_lock) - try - addprocs_locked(manager::ClusterManager; kwargs...) - finally - unlock(worker_lock) - end + # Call worker-starting callbacks + warning_interval = params[:callback_warning_interval] + _run_callbacks_concurrently("worker-starting", worker_starting_callbacks, + warning_interval, [(manager, params)]) + + # Add new workers + new_workers = @lock worker_lock addprocs_locked(manager::ClusterManager, params) + + # Call worker-started callbacks + _run_callbacks_concurrently("worker-started", worker_started_callbacks, + warning_interval, new_workers) + + return new_workers end -function addprocs_locked(manager::ClusterManager; kwargs...) 
- params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs)) +function addprocs_locked(manager::ClusterManager, params) topology(Symbol(params[:topology])) if PGRP.topology !== :all_to_all @@ -579,7 +587,8 @@ default_addprocs_params() = Dict{Symbol,Any}( :exeflags => ``, :env => [], :enable_threaded_blas => false, - :lazy => true) + :lazy => true, + :callback_warning_interval => 10) function setup_launched_worker(manager, wconfig, launched_q) @@ -888,6 +897,10 @@ const HDR_COOKIE_LEN = 16 const map_pid_wrkr = Lockable(Dict{Int, Union{Worker, LocalProcess}}()) const map_sock_wrkr = Lockable(IdDict()) const map_del_wrkr = Lockable(Set{Int}()) +const worker_starting_callbacks = Dict{Any, Base.Callable}() +const worker_started_callbacks = Dict{Any, Base.Callable}() +const worker_exiting_callbacks = Dict{Any, Base.Callable}() +const worker_exited_callbacks = Dict{Any, Base.Callable}() # whether process is a master or worker in a distributed setup myrole() = LPROCROLE[] @@ -895,6 +908,163 @@ function myrole!(proctype::Symbol) LPROCROLE[] = proctype end +# Callbacks + +function _run_callbacks_concurrently(callbacks_name, callbacks_dict, warning_interval, arglist; catch_exceptions=false) + callback_tasks = Tuple{Any, Task}[] + for args in arglist + for (name, callback) in callbacks_dict + push!(callback_tasks, (name, Threads.@spawn callback(args...))) + end + end + + running_callbacks = () -> ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)] + while timedwait(() -> isempty(running_callbacks()), warning_interval) === :timed_out + callbacks_str = join(running_callbacks(), ", ") + @warn "Waiting for these $(callbacks_name) callbacks to finish: $(callbacks_str)" + end + + if catch_exceptions + for (key, task) in callback_tasks + try + wait(task) + catch ex + @error "Error when running $(callbacks_name) callback '$(key)'" exception=(ex, catch_backtrace()) + end + end + else + # Wait on the tasks so that exceptions bubble up + foreach(wait, 
[x[2] for x in callback_tasks]) + end +end + +function _add_callback(f, key, dict; arg_types=Tuple{Int}) + if isnothing(key) + key = Symbol(gensym(), nameof(f)) + end + + desired_signature = "f(" * join(["::$(t)" for t in arg_types.types], ", ") * ")" + + if !hasmethod(f, arg_types) + throw(ArgumentError("Callback function is invalid, it must be able to be called with these argument types: $(desired_signature)")) + elseif haskey(dict, key) + throw(ArgumentError("A callback function with key '$(key)' already exists")) + end + + dict[key] = f + return key +end + +_remove_callback(key, dict) = delete!(dict, key) + +""" + add_worker_starting_callback(f::Base.Callable; key=nothing) -> key + +Register a callback to be called on the master worker immediately before new +workers are started. Chooses and returns a unique key for the callback if `key` +is not specified. The callback `f` will be called with the `ClusterManager` +instance that is being used and a dictionary of parameters related to adding +workers, i.e. `f(manager, params)`. The `params` dictionary is specific to the +`manager` type. Note that the `LocalManager` and `SSHManager` cluster managers +in DistributedNext are not fully documented yet, see the +[managers.jl](https://github.com/JuliaParallel/DistributedNext.jl/blob/master/src/managers.jl) +file for their definitions. + +!!! warning + Adding workers can fail so it is not guaranteed that the workers requested + in `manager` will exist in the future. e.g. if a worker is requested on a + node that is unreachable then the worker-starting callbacks will be called + but the worker will never be added. + +The worker-starting callbacks will be executed concurrently. If one throws an +exception it will not be caught and will be rethrown by [`addprocs`](@ref). 
+ +Keep in mind that the callbacks will add to the time taken to launch workers; so +try to either keep the callbacks fast to execute, or do the actual work +asynchronously by spawning a task in the callback (beware of race conditions if +you do this). +""" +add_worker_starting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_starting_callbacks; + arg_types=Tuple{ClusterManager, Dict}) +""" + remove_worker_starting_callback(key) + +Remove the callback for `key` that was added with [`add_worker_starting_callback()`](@ref). +""" +remove_worker_starting_callback(key) = _remove_callback(key, worker_starting_callbacks) + +""" + add_worker_started_callback(f::Base.Callable; key=nothing) -> key + +Register a callback to be called on the master worker whenever a worker has +been added. The callback will be called with the added worker ID, +e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is +not specified. + +The worker-started callbacks will be executed concurrently. If one throws an +exception it will not be caught and will be rethrown by [`addprocs()`](@ref). + +Keep in mind that the callbacks will add to the time taken to launch workers; so +try to either keep the callbacks fast to execute, or do the actual +initialization asynchronously by spawning a task in the callback (beware of race +conditions if you do this). +""" +add_worker_started_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_started_callbacks) + +""" + remove_worker_started_callback(key) + +Remove the callback for `key` that was added with [`add_worker_started_callback()`](@ref). +""" +remove_worker_started_callback(key) = _remove_callback(key, worker_started_callbacks) + +""" + add_worker_exiting_callback(f::Base.Callable; key=nothing) -> key + +Register a callback to be called on the master worker immediately before a +worker is removed with [`rmprocs()`](@ref). The callback will be called with the +worker ID, e.g. `f(w::Int)`. 
Chooses and returns a unique key for the callback +if `key` is not specified. + +All worker-exiting callbacks will be executed concurrently and if they don't +all finish before the `callback_timeout` passed to `rmprocs()` then the worker +will be removed anyway. +""" +add_worker_exiting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exiting_callbacks) + +""" + remove_worker_exiting_callback(key) + +Remove the callback for `key` that was added with [`add_worker_exiting_callback()`](@ref). +""" +remove_worker_exiting_callback(key) = _remove_callback(key, worker_exiting_callbacks) + +""" + add_worker_exited_callback(f::Base.Callable; key=nothing) -> key + +Register a callback to be called on the master worker when a worker has exited +for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker +segfaulting etc). Chooses and returns a unique key for the callback if `key` is +not specified. + +The callback will be called with the worker ID and the final +`DistributedNext.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an +enum; a value of `WorkerState_terminated` means a graceful exit and a value of +`WorkerState_exterminated` means the worker died unexpectedly. + +All worker-exited callbacks will be executed concurrently. If a callback throws +an exception it will be caught and printed. +""" +add_worker_exited_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exited_callbacks; + arg_types=Tuple{Int, WorkerState}) + +""" + remove_worker_exited_callback(key) + +Remove the callback for `key` that was added with [`add_worker_exited_callback()`](@ref). +""" +remove_worker_exited_callback(key) = _remove_callback(key, worker_exited_callbacks) + # cluster management related API """ myid() @@ -1081,7 +1251,7 @@ function cluster_mgmt_from_master_check() end """ - rmprocs(pids...; waitfor=typemax(Int)) + rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10) Remove the specified workers. 
Note that only process 1 can add or remove workers. @@ -1095,6 +1265,10 @@ Argument `waitfor` specifies how long to wait for the workers to shut down: returned. The user should call [`wait`](@ref) on the task before invoking any other parallel calls. +The `callback_timeout` specifies how long to wait for any callbacks to execute +before continuing to remove the workers (see +[`add_worker_exiting_callback()`](@ref)). + # Examples ```julia-repl \$ julia -p 5 @@ -1111,24 +1285,38 @@ julia> workers() 6 ``` """ -function rmprocs(pids...; waitfor=typemax(Int)) +function rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10) cluster_mgmt_from_master_check() pids = vcat(pids...) if waitfor == 0 - t = @async _rmprocs(pids, typemax(Int)) + t = @async _rmprocs(pids, typemax(Int), callback_timeout) yield() return t else - _rmprocs(pids, waitfor) + _rmprocs(pids, waitfor, callback_timeout) # return a dummy task object that user code can wait on. return @async nothing end end -function _rmprocs(pids, waitfor) +function _rmprocs(pids, waitfor, callback_timeout) lock(worker_lock) try + # Run the callbacks + callback_tasks = Tuple{Any, Task}[] + for pid in pids + for (name, callback) in worker_exiting_callbacks + push!(callback_tasks, (name, Threads.@spawn callback(pid))) + end + end + + if timedwait(() -> all(istaskdone, [x[2] for x in callback_tasks]), callback_timeout) === :timed_out + timedout_callbacks = ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)] + callbacks_str = join(timedout_callbacks, ", ") + @warn "Some worker-exiting callbacks have not yet finished, continuing to remove workers anyway. 
These are the callbacks still running: $(callbacks_str)" + end + rmprocset = Union{LocalProcess, Worker}[] for p in pids if p == 1 @@ -1280,6 +1468,18 @@ function deregister_worker(pg, pid) delete!(pg.refs, id) end end + + # Call callbacks on the master + if myid() == 1 + for (name, callback) in worker_exited_callbacks + try + callback(pid, w.state) + catch ex + @error "Error when running worker-exited callback '$(name)'" exception=(ex, catch_backtrace()) + end + end + end + return end diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index de6f4fe..86946ad 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -1,6 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license using DistributedNext, Random, Serialization, Sockets +import DistributedNext import DistributedNext: launch, manage @@ -1977,6 +1978,76 @@ include("splitrange.jl") end end +@testset "Worker state callbacks" begin + rmprocs(other_workers()) + + # Adding a callback with an invalid signature should fail + @test_throws ArgumentError DistributedNext.add_worker_started_callback(() -> nothing) + + # Smoke test to ensure that all the callbacks are executed + starting_managers = [] + started_workers = Int[] + exiting_workers = Int[] + exited_workers = [] + starting_key = DistributedNext.add_worker_starting_callback((manager, kwargs) -> push!(starting_managers, manager)) + started_key = DistributedNext.add_worker_started_callback(pid -> (push!(started_workers, pid); error("foo"))) + exiting_key = DistributedNext.add_worker_exiting_callback(pid -> push!(exiting_workers, pid)) + exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> push!(exited_workers, (pid, state))) + + # Test that the worker-started exception bubbles up + @test_throws TaskFailedException addprocs(1) + + pid = only(workers()) + @test only(starting_managers) isa DistributedNext.LocalManager + @test started_workers == [pid] + rmprocs(workers()) + @test 
exiting_workers == [pid] + @test exited_workers == [(pid, DistributedNext.WorkerState_terminated)] + + # Trying to reset an existing callback should fail + @test_throws ArgumentError DistributedNext.add_worker_started_callback(Returns(nothing); key=started_key) + + # Remove the callbacks + DistributedNext.remove_worker_starting_callback(starting_key) + DistributedNext.remove_worker_started_callback(started_key) + DistributedNext.remove_worker_exiting_callback(exiting_key) + DistributedNext.remove_worker_exited_callback(exited_key) + + # Test that the worker-exiting `callback_timeout` option works and that we + # get warnings about slow worker-started callbacks. + event = Base.Event() + callback_task = nothing + started_key = DistributedNext.add_worker_started_callback(_ -> sleep(0.5)) + exiting_key = DistributedNext.add_worker_exiting_callback(_ -> (callback_task = current_task(); wait(event))) + + @test_logs (:warn, r"Waiting for these worker-started callbacks.+") match_mode=:any addprocs(1; callback_warning_interval=0.05) + DistributedNext.remove_worker_started_callback(started_key) + + @test_logs (:warn, r"Some worker-exiting callbacks have not yet finished.+") rmprocs(workers(); callback_timeout=0.5) + DistributedNext.remove_worker_exiting_callback(exiting_key) + + notify(event) + wait(callback_task) + + # Test that the initial callbacks were indeed removed + @test length(starting_managers) == 1 + @test length(started_workers) == 1 + @test length(exiting_workers) == 1 + @test length(exited_workers) == 1 + + # Test that workers that were killed forcefully are detected as such + exit_state = nothing + exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> exit_state = state) + pid = only(addprocs(1)) + + redirect_stderr(devnull) do + remote_do(exit, pid) + timedwait(() -> !isnothing(exit_state), 10) + end + @test exit_state == DistributedNext.WorkerState_exterminated + DistributedNext.remove_worker_exited_callback(exited_key) +end + # Run 
topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. if nprocs() > 1 From 0467de60da622f297b030e437a93ebb0f0737c20 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Wed, 11 Dec 2024 00:08:19 +0100 Subject: [PATCH 4/7] Add an extension to support Revise --- Project.toml | 11 ++++-- docs/src/_changelog.md | 1 + ext/ReviseExt.jl | 30 ++++++++++++++++ test/distributed_exec.jl | 74 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 ext/ReviseExt.jl diff --git a/Project.toml b/Project.toml index 2c1102f..a088bb8 100644 --- a/Project.toml +++ b/Project.toml @@ -7,16 +7,23 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" +[weakdeps] +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" + +[extensions] +ReviseExt = "Revise" + [compat] Aqua = "0.8" Distributed = "1" LibSSH = "0.7" LinearAlgebra = "1" Random = "1" +Revise = "3.7.0" Serialization = "1" Sockets = "1" Test = "1" -julia = "1.9" +julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" @@ -26,4 +33,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Aqua", "Distributed", "LibSSH", "LinearAlgebra", "Test"] +test = ["Aqua", "Distributed", "LibSSH", "LinearAlgebra", "Revise", "Test"] diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md index eded6a1..04378ef 100644 --- a/docs/src/_changelog.md +++ b/docs/src/_changelog.md @@ -11,6 +11,7 @@ This documents notable changes in DistributedNext.jl. The format is based on ### Added - Implemented callback support for workers being added/removed etc ([#17]). +- Added a package extension to support Revise.jl ([#17]). 
### Fixed - Modified the default implementations of methods like `take!` and `wait` on diff --git a/ext/ReviseExt.jl b/ext/ReviseExt.jl new file mode 100644 index 0000000..269add5 --- /dev/null +++ b/ext/ReviseExt.jl @@ -0,0 +1,30 @@ +module ReviseExt + +import DistributedNext +import DistributedNext: myid, workers, remotecall + +import Revise + + +struct DistributedNextWorker <: Revise.AbstractWorker + id::Int +end + +function get_workers() + map(DistributedNextWorker, workers()) +end + +function Revise.remotecall_impl(f, worker::DistributedNextWorker, args...; kwargs...) + remotecall(f, worker.id, args...; kwargs...) +end + +Revise.is_master_worker(::typeof(get_workers)) = myid() == 1 +Revise.is_master_worker(worker::DistributedNextWorker) = worker.id == 1 + +function __init__() + Revise.register_workers_function(get_workers) + DistributedNext.add_worker_started_callback(pid -> Revise.init_worker(DistributedNextWorker(pid)); + key="DistributedNext-integration") +end + +end diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 86946ad..996294a 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -1,5 +1,6 @@ # This file is a part of Julia. 
License is MIT: https://julialang.org/license +import Revise using DistributedNext, Random, Serialization, Sockets import DistributedNext import DistributedNext: launch, manage @@ -1957,7 +1958,9 @@ include("splitrange.jl") @testset "Clear all workers for timeout tests (issue #45785)" begin nprocs() > 1 && rmprocs(workers()) - begin + + # This test requires kill(), and that doesn't work on Windows before 1.11 + if !(Sys.iswindows() && VERSION < v"1.11") # First, assert that we get no messages when we close a cooperative worker w = only(addprocs(1)) @test_nowarn begin @@ -1975,6 +1978,8 @@ include("splitrange.jl") end wait(rmprocs([w])) end + else + @warn "Skipping timeout tests because kill() isn't supported on Windows for this Julia version" end end @@ -2048,6 +2053,73 @@ end DistributedNext.remove_worker_exited_callback(exited_key) end +# This is a simplified copy of a test from Revise.jl's tests +@testset "Revise.jl integration" begin + function rm_precompile(pkgname::AbstractString) + filepath = Base.cache_file_entry(Base.PkgId(pkgname)) + isa(filepath, Tuple) && (filepath = filepath[1]*filepath[2]) # Julia 1.3+ + for depot in DEPOT_PATH + fullpath = joinpath(depot, filepath) + isfile(fullpath) && rm(fullpath) + end + end + + pid = only(addprocs(1)) + + # Test that initialization succeeds by checking that Main.whichtt is defined + # on the worker, which is defined by Revise.init_worker(). 
+ @test timedwait(() ->remotecall_fetch(() -> hasproperty(Main, :whichtt), pid), 10) == :ok + + tmpdir = mktempdir() + @everywhere push!(LOAD_PATH, $tmpdir) # Don't want to share this LOAD_PATH + + # Create a fake package + module_file = joinpath(tmpdir, "ReviseDistributed", "src", "ReviseDistributed.jl") + mkpath(dirname(module_file)) + write(module_file, + """ + module ReviseDistributed + + f() = π + g(::Int) = 0 + + end + """) + + # Check that we can use it + @everywhere using ReviseDistributed + for p in procs() + @test remotecall_fetch(ReviseDistributed.f, p) == π + @test remotecall_fetch(ReviseDistributed.g, p, 1) == 0 + end + + # Test changing and deleting methods + write(module_file, + """ + module ReviseDistributed + + f() = 3.0 + + end + """) + for p in procs() + # We call Revise.revise() inside the timedwait() because file events + # on macOS can have significant latency, meaning a single revise() call + # may not pick up the changes yet. + @test timedwait(10; pollint=0.5) do + Revise.revise() + remotecall_fetch(ReviseDistributed.f, p) == 3.0 + end == :ok + + @test_throws RemoteException remotecall_fetch(ReviseDistributed.g, p, 1) + end + + rmprocs(workers()) + rm_precompile("ReviseDistributed") + pop!(LOAD_PATH) +end + + # Run topology tests last after removing all workers, since a given # cluster at any time only supports a single topology. 
if nprocs() > 1 From 95a9415268ae1f6009107767eb8e9477f5008cbd Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Thu, 2 Jan 2025 23:11:27 +0100 Subject: [PATCH 5/7] Clean up CI a bit --- .github/dependabot.yml | 4 ++++ .github/workflows/ci.yml | 16 ++++------------ .github/workflows/tagbot.yml | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/tagbot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml index d60f070..b31f841 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,3 +5,7 @@ updates: directory: "/" # Location of package manifests schedule: interval: "monthly" + - package-ecosystem: "julia" + directory: "/" + schedule: + interval: "weekly" diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 992be42..121ceae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,17 +67,7 @@ jobs: # GitHub will automatically expand to the correct value (`x86_64` or `aarch64`) # based on the architecture of the underlying GitHub Runner (virtual machine). 
arch: ${{ github.ref == '32' && 'x86' || runner.arch }} - - uses: actions/cache@v5 - env: - cache-name: cache-artifacts - with: - path: ~/.julia/artifacts - key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} - restore-keys: | - ${{ runner.os }}-test-${{ env.cache-name }}- - ${{ runner.os }}-test-${{ matrix.os }} - ${{ runner.os }}- - - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/cache@v3 - name: Decide what the value of JULIA_NUM_THREADS should be id: decide-numthreads-str run: | @@ -115,6 +105,7 @@ jobs: - uses: julia-actions/setup-julia@latest with: version: '1' + - uses: julia-actions/cache@v3 - name: Install dependencies run: julia --project=docs/ -e 'using Pkg; Pkg.instantiate()' - name: Build and deploy @@ -130,8 +121,9 @@ jobs: persist-credentials: false - uses: julia-actions/setup-julia@latest with: - version: '1.12' + version: '1' # version: 'nightly' + - uses: julia-actions/cache@v3 - run: julia --color=yes --project=ci/jet -e 'import Pkg; Pkg.instantiate()' - name: Run the JET tests run: julia --color=yes --project=ci/jet ci/jet/check.jl diff --git a/.github/workflows/tagbot.yml b/.github/workflows/tagbot.yml new file mode 100644 index 0000000..0f81ec9 --- /dev/null +++ b/.github/workflows/tagbot.yml @@ -0,0 +1,18 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + description: "[DEPRECATED] No longer has any effect" + default: "3" +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} From 9d9cf148c9515233c2d30b2ae620e99822f80436 Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Fri, 3 Jan 2025 00:13:59 +0100 Subject: [PATCH 6/7] Replace a timeout task with timedwait() This should fix an exception seen in CI from the lingering timeout task: ``` Test Summary: | Pass Total Time Deserialization 
error recovery and include() | 11 11 3.9s From worker 4: Unhandled Task ERROR: EOFError: read end of file From worker 4: Stacktrace: From worker 4: [1] wait From worker 4: @ .\asyncevent.jl:159 [inlined] From worker 4: [2] sleep(sec::Float64) From worker 4: @ Base .\asyncevent.jl:265 From worker 4: [3] (::DistributedNext.var"#34#37"{DistributedNext.Worker, Float64})() From worker 4: @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\cluster.jl:213 ``` --- src/cluster.jl | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/cluster.jl b/src/cluster.jl index eb87c1f..9ef7734 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -217,16 +217,10 @@ function wait_for_conn(w) timeout = worker_timeout() - (time() - w.ct_time) timeout <= 0 && error("peer $(w.id) has not connected to $(myid())") - T = Threads.@spawn begin - sleep($timeout) - lock(w.c_state) do - notify(w.c_state; all=true) - end - end - errormonitor(T) - lock(w.c_state) do - wait(w.c_state) - (@atomic w.state) === WorkerState_created && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") + if timedwait(() -> (@atomic w.state) === WorkerState_connected, timeout) === :timed_out + # Notify any waiters on the state and throw + @lock w.c_state notify(w.c_state) + error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds") end end nothing From 07bda63610b10b08fb1ff589efc8472097b83c2e Mon Sep 17 00:00:00 2001 From: JamesWrigley Date: Sun, 5 Jan 2025 16:08:09 +0100 Subject: [PATCH 7/7] Add support for worker statuses --- Project.toml | 2 + docs/src/_changelog.md | 2 + docs/src/index.md | 4 ++ src/DistributedNext.jl | 6 ++ src/cluster.jl | 135 ++++++++++++++++++++++++++++++++++++--- test/distributed_exec.jl | 67 +++++++++++++++++-- test/runtests.jl | 2 +- 7 files changed, 200 insertions(+), 18 deletions(-) diff --git a/Project.toml b/Project.toml index a088bb8..f09edba 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ version 
= "1.1.1" [deps] Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" @@ -20,6 +21,7 @@ LibSSH = "0.7" LinearAlgebra = "1" Random = "1" Revise = "3.7.0" +ScopedValues = "1.6.0" Serialization = "1" Sockets = "1" Test = "1" diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md index 04378ef..e1e1e4e 100644 --- a/docs/src/_changelog.md +++ b/docs/src/_changelog.md @@ -12,6 +12,8 @@ This documents notable changes in DistributedNext.jl. The format is based on ### Added - Implemented callback support for workers being added/removed etc ([#17]). - Added a package extension to support Revise.jl ([#17]). +- Added support for setting worker statuses with [`setstatus!`](@ref) and + [`getstatus`](@ref) ([#17]). ### Fixed - Modified the default implementations of methods like `take!` and `wait` on diff --git a/docs/src/index.md b/docs/src/index.md index 17c66a6..1e979ad 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -14,6 +14,10 @@ DistributedNext.rmprocs DistributedNext.interrupt DistributedNext.myid DistributedNext.pmap +DistributedNext.getstatus +DistributedNext.@getstatus +DistributedNext.setstatus! +DistributedNext.@setstatus! 
DistributedNext.RemoteException DistributedNext.ProcessExitedException DistributedNext.Future diff --git a/src/DistributedNext.jl b/src/DistributedNext.jl index 45a8b1f..27d7bfb 100644 --- a/src/DistributedNext.jl +++ b/src/DistributedNext.jl @@ -23,6 +23,12 @@ using Serialization, Sockets import Serialization: serialize, deserialize import Sockets: connect, wait_connected +@static if VERSION < v"1.11" + using ScopedValues: ScopedValue, @with +else + using Base.ScopedValues: ScopedValue, @with +end + # NOTE: clusterserialize.jl imports additional symbols from Serialization for use export diff --git a/src/cluster.jl b/src/cluster.jl index 9ef7734..bbbbad5 100644 --- a/src/cluster.jl +++ b/src/cluster.jl @@ -891,6 +891,8 @@ const HDR_COOKIE_LEN = 16 const map_pid_wrkr = Lockable(Dict{Int, Union{Worker, LocalProcess}}()) const map_sock_wrkr = Lockable(IdDict()) const map_del_wrkr = Lockable(Set{Int}()) +const _exited_callback_pid = ScopedValue{Int}(-1) +const map_pid_statuses = Lockable(Dict{Int, Any}()) const worker_starting_callbacks = Dict{Any, Base.Callable}() const worker_started_callbacks = Dict{Any, Base.Callable}() const worker_exiting_callbacks = Dict{Any, Base.Callable}() @@ -1042,9 +1044,9 @@ segfaulting etc). Chooses and returns a unique key for the callback if `key` is not specified. The callback will be called with the worker ID and the final -`Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an -enum, a value of `WorkerState_terminated` means a graceful exit and a value of -`WorkerState_exterminated` means the worker died unexpectedly. +`Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is +an enum, a value of `WorkerState_terminated` means a graceful exit and a value +of `WorkerState_exterminated` means the worker died unexpectedly. All worker-exited callbacks will be executed concurrently. If a callback throws an exception it will be caught and printed. 
@@ -1238,6 +1240,112 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out. """ other_workers() = filter(!=(myid()), workers()) +""" + @setstatus! x + @setstatus! x pid + +Set the status for the calling module on worker `pid` (defaults to the current +worker) to `x`. `x` may be any serializable object but it's recommended to keep +it small enough to cheaply send over a network. Statuses can be retrieved inside +worker-exited callbacks (see [`add_worker_exited_callback`](@ref)) before the +worker is fully deregistered. + +Statuses are keyed by the calling `Module`, so multiple libraries can +independently track their own status on the same worker without conflicting. + +This can be handy if you want a way to know what a worker is doing at any given +time, or (in combination with a worker-exited callback) for knowing what a +worker was last doing before it died. + +# Examples +```julia-repl +julia> DistributedNext.@setstatus! "working on dataset 42" +"working on dataset 42" + +julia> DistributedNext.@getstatus +"working on dataset 42" +``` + +See also [`setstatus!`](@ref) for the function form that accepts an explicit module key. +""" +macro setstatus!(x) + mod = __module__ + :(setstatus!($(esc(x)), $mod)) +end + +macro setstatus!(x, pid) + mod = __module__ + :(setstatus!($(esc(x)), $mod, $(esc(pid)))) +end + +""" + setstatus!(x, mod::Module, pid::Int=myid()) + +Function form of [`@setstatus!`](@ref). Sets the status for module `mod` on +worker `pid` to `x`. 
+""" +function setstatus!(x, mod::Module, pid::Int=myid()) + if !id_in_procs(pid) + throw(ArgumentError("Worker $(pid) does not exist, cannot set its status")) + end + + if myid() == 1 + @lock map_pid_statuses begin + statuses = get!(map_pid_statuses[], pid, Dict{Module, Any}()) + statuses[mod] = x + end + else + remotecall_fetch(setstatus!, 1, x, mod, pid) + end +end + +function _getstatus(pid, mod) + @lock map_pid_statuses begin + statuses = get(map_pid_statuses[], pid, nothing) + isnothing(statuses) ? nothing : get(statuses, mod, nothing) + end +end + +""" + @getstatus + @getstatus pid + +Get the status set by the calling module for worker `pid` (defaults to the +current worker). If one was never explicitly set with [`@setstatus!`](@ref) +this will return `nothing`. + +See also [`getstatus`](@ref) for the function form. +""" +macro getstatus() + mod = __module__ + :(getstatus($mod)) +end +macro getstatus(pid) + mod = __module__ + :(getstatus($mod, $(esc(pid)))) +end + +""" + getstatus(mod::Module, pid::Int=myid()) + +Function form of [`@getstatus`](@ref). Gets the status for module `mod` on +worker `pid`. Returns `nothing` if no status was set. +""" +function getstatus(mod::Module, pid::Int=myid()) + # During the worker-exited callbacks this function may be called, at which + # point the worker will not exist in procs(). Thus we check whether the function is + # being called for an exited worker and allow it if so. 
+ if !id_in_procs(pid) && _exited_callback_pid[] != pid + throw(ArgumentError("Worker $(pid) does not exist, cannot get its status")) + end + + if myid() == 1 + _getstatus(pid, mod) + else + remotecall_fetch(getstatus, 1, mod, pid) + end +end + function cluster_mgmt_from_master_check() if myid() != 1 throw(ErrorException("Only process 1 can add and remove workers")) @@ -1463,15 +1571,22 @@ function deregister_worker(pg, pid) end end - # Call callbacks on the master if myid() == 1 - for (name, callback) in worker_exited_callbacks - try - callback(pid, w.state) - catch ex - @error "Error when running worker-exited callback '$(name)'" exception=(ex, catch_backtrace()) - end + params = default_addprocs_params(w.manager) + warning_interval = params[:callback_warning_interval] + + # Call callbacks on the master, with the scoped value set so that + # getstatus() can be called for the exiting worker without failing the + # pid check. We go to some effort to make sure this works after + # deregistering the worker because if it's called beforehand the worker + # will incorrectly be shown in e.g. procs(). + @with _exited_callback_pid => pid begin + _run_callbacks_concurrently("worker-exited", worker_exited_callbacks, + warning_interval, [(pid, w.state)]; catch_exceptions=true) end + + # Delete its statuses + @lock map_pid_statuses delete!(map_pid_statuses[], pid) end return diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl index 996294a..fc9c0d4 100644 --- a/test/distributed_exec.jl +++ b/test/distributed_exec.jl @@ -3,7 +3,7 @@ import Revise using DistributedNext, Random, Serialization, Sockets import DistributedNext -import DistributedNext: launch, manage +import DistributedNext: launch, manage, getstatus, setstatus!, @getstatus, @setstatus! 
@test cluster_cookie() isa String @@ -1826,7 +1826,9 @@ end let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp pkg_project = joinpath(Base.pkgdir(DistributedNext), "Project.toml") project = mkdir(joinpath(tmp, "project")) - depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))] + # Keep the writable depot in the depots list so that external + # dependencies (i.e. ScopedValues.jl) can be loaded. + depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2")), Base.DEPOT_PATH[1]] load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@", pkg_project] pathsep = Sys.iswindows() ? ";" : ":" env = Dict( @@ -1935,7 +1937,7 @@ end project = mktempdir() env = Dict( "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib", $(repr(pathsep)), "$(escaped_pkg_project)"), - "JULIA_DEPOT_PATH" => DEPOT_PATH[1], + "JULIA_DEPOT_PATH" => DEPOT_PATH[end], "TMPDIR" => ENV["TMPDIR"], ) addprocs(1; env = env, exeflags = `--project=\$(project)`) @@ -1943,7 +1945,7 @@ end addprocs(1; env = env) """ * funcscode * """ for w in workers() - @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[1]] + @test remotecall_fetch(depot_path, w) == [DEPOT_PATH[end]] @test remotecall_fetch(load_path, w) == [LOAD_PATH[1], "@stdlib", "$(escaped_pkg_project)"] @test remotecall_fetch(active_project, w) == project @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml") @@ -1983,7 +1985,40 @@ include("splitrange.jl") end end +@testset "Worker statuses" begin + rmprocs(other_workers()) + + # Test with the local worker using macros + @test isnothing(@getstatus()) + @setstatus!("foo") + @test @getstatus() == "foo" + @test_throws ArgumentError getstatus(Main, 2) + + # Test with a remote worker using the function form + pid = only(addprocs(1)) + @test isnothing(getstatus(Main, pid)) + remotecall_wait(setstatus!, pid, "bar", Main, pid) + @test remotecall_fetch(getstatus, pid, Main) == "bar" + + # Test that different 
modules have independent statuses + setstatus!("from_main", Main, pid) + setstatus!("from_distributed", DistributedNext, pid) + @test getstatus(Main, pid) == "from_main" + @test getstatus(DistributedNext, pid) == "from_distributed" + + rmprocs(pid) +end + @testset "Worker state callbacks" begin + # Helper function to wait for a worker to have been completely deregistered + # (including worker-exited callbacks finished) by waiting for the workers + # status to have been deleted. Only works if the worker has a status of + # course. + function wait_for_deregistration(pid) + statuses = DistributedNext.map_pid_statuses + @test timedwait(() -> @lock(statuses, !haskey(statuses[], pid)), 10) == :ok + end + rmprocs(other_workers()) # Adding a callback with an invalid signature should fail @@ -2040,16 +2075,34 @@ end @test length(exiting_workers) == 1 @test length(exited_workers) == 1 - # Test that workers that were killed forcefully are detected as such + # Test that workers that were killed forcefully are detected as such, and + # that statuses can be retrieved in the callback. exit_state = nothing - exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> exit_state = state) + last_status = nothing + exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> (exit_state = state; last_status = @getstatus(pid))) pid = only(addprocs(1)) + @setstatus!("foo", pid) + # Kill the process with stderr redirected so the error messages don't + # unnecessarily show up in the logs. 
redirect_stderr(devnull) do remote_do(exit, pid) - timedwait(() -> !isnothing(exit_state), 10) + wait_for_deregistration(pid) end @test exit_state == DistributedNext.WorkerState_exterminated + @test last_status == "foo" + DistributedNext.remove_worker_exited_callback(exited_key) + + # Test that exceptions in worker-exited callbacks are caught + exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> error("foo")) + @test_logs (:error, r"Error when running worker-exited callback.+") match_mode=:any begin + pid = only(addprocs(1)) + # Set a dummy status so that wait_for_deregistration() works + @setstatus!("foo", pid) + rmprocs(pid) + + wait_for_deregistration(pid) + end DistributedNext.remove_worker_exited_callback(exited_key) end diff --git a/test/runtests.jl b/test/runtests.jl index ab596e9..99fd04c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -35,5 +35,5 @@ include("managers.jl") include("distributed_stdlib_detection.jl") @testset "Aqua" begin - Aqua.test_all(DistributedNext) + Aqua.test_all(DistributedNext; stale_deps=(; ignore=[:ScopedValues])) end