From 4ef4ac2b2ade3d1716e0e3683d9ad337c26e5cf6 Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Tue, 10 Dec 2024 23:07:28 +0100
Subject: [PATCH 1/7] Don't recursively call deregister_worker() on the current
 worker

Previously we were not filtering out the current worker when calling
`deregister_worker()` on `workers()`.
---
 src/cluster.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index 4f7c995..d95cd3a 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -1232,7 +1232,7 @@ function deregister_worker(pg, pid)
             # Notify the cluster manager of this workers death
             manage(w.manager, w.id, w.config, :deregister)
             if PGRP.topology !== :all_to_all || isclusterlazy()
-                for rpid in workers()
+                for rpid in other_workers()
                     try
                         remote_do(deregister_worker, rpid, pid)
                     catch

From bf86b160dd3a8f3215857d2a1ab03bb200a27e4a Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Thu, 2 Jan 2025 21:55:34 +0100
Subject: [PATCH 2/7] Rename the WorkerState instances and add an exterminated
 state

The new `WorkerState_exterminated` state is for indicating that a worker was
killed by something other than us.
---
 src/cluster.jl          | 45 +++++++++++++++++++++++++++--------------
 src/messages.jl         |  2 +-
 src/process_messages.jl |  6 +++---
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index d95cd3a..d2e456e 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -100,7 +100,15 @@ mutable struct WorkerConfig
     end
 end
 
-@enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED W_UNKNOWN_STATE
+@enum WorkerState begin
+    WorkerState_created
+    WorkerState_connected
+    WorkerState_terminating    # rmprocs() has been called on the worker
+    WorkerState_terminated     # Worker was gracefully removed
+    WorkerState_exterminated   # Worker was forcefully removed (not by us)
+    WorkerState_unknown
+end
+
 mutable struct Worker
     id::Int
     msg_lock::Threads.ReentrantLock # Lock for del_msgs, add_msgs, and gcflag
@@ -131,7 +139,7 @@ mutable struct Worker
         w.manager = manager
         w.config = config
         w.version = version
-        set_worker_state(w, W_CONNECTED)
+        set_worker_state(w, WorkerState_connected)
         register_worker_streams(w)
         w
     end
@@ -142,7 +150,7 @@ mutable struct Worker
         @lock map_pid_wrkr if haskey(map_pid_wrkr[], id)
             return map_pid_wrkr[][id]
         end
-        w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Threads.Condition(), time(), conn_func)
+        w=new(id, Threads.ReentrantLock(), [], [], false, WorkerState_created, Threads.Condition(), time(), conn_func)
         w.initialized = Event()
         register_worker(w)
         w
@@ -158,8 +166,15 @@ function set_worker_state(w, state)
     end
 end
 
+# Helper function to check if a worker is dead or not. It's recommended to use
+# this instead of checking Worker.state manually.
+function is_worker_dead(w::Worker)
+    state = @atomic w.state
+    return state === WorkerState_terminated || state === WorkerState_exterminated
+end
+
 function check_worker_state(w::Worker)
-    if (@atomic w.state) === W_CREATED
+    if (@atomic w.state) === WorkerState_created
         if !isclusterlazy()
             if PGRP.topology === :all_to_all
                 # Since higher pids connect with lower pids, the remote worker
@@ -198,7 +213,7 @@ function exec_conn_func(w::Worker)
 end
 
 function wait_for_conn(w)
-    if (@atomic w.state) === W_CREATED
+    if (@atomic w.state) === WorkerState_created
         timeout =  worker_timeout() - (time() - w.ct_time)
         timeout <= 0 && error("peer $(w.id) has not connected to $(myid())")
 
@@ -211,7 +226,7 @@ function wait_for_conn(w)
         errormonitor(T)
         lock(w.c_state) do
             wait(w.c_state)
-            (@atomic w.state) === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
+            (@atomic w.state) === WorkerState_created && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
         end
     end
     nothing
@@ -673,7 +688,7 @@ function create_worker(manager::ClusterManager, wconfig::WorkerConfig)
             if (jw.id != 1) && (jw.id < w.id)
                 lock(jw.c_state) do
                     # wait for wl to join
-                    if (@atomic jw.state) === W_CREATED
+                    if (@atomic jw.state) === WorkerState_created
                         wait(jw.c_state)
                     end
                 end
@@ -700,7 +715,7 @@ function create_worker(manager::ClusterManager, wconfig::WorkerConfig)
 
         for wl in wlist
             lock(wl.c_state) do
-                if (@atomic wl.state) === W_CREATED
+                if (@atomic wl.state) === WorkerState_created
                     # wait for wl to join
                     wait(wl.c_state)
                 end
@@ -918,7 +933,7 @@ function nprocs()
         n = length(PGRP.workers)
         # filter out workers in the process of being setup/shutdown.
         for jw in PGRP.workers
-            if !isa(jw, LocalProcess) && ((@atomic jw.state) !== W_CONNECTED)
+            if !isa(jw, LocalProcess) && ((@atomic jw.state) !== WorkerState_connected)
                 n = n - 1
             end
         end
@@ -971,7 +986,7 @@ julia> procs()
 function procs()
     if myid() == 1 || (PGRP.topology === :all_to_all  && !isclusterlazy())
         # filter out workers in the process of being setup/shutdown.
-        return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)]
+        return Int[x.id for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === WorkerState_connected)]
     else
         return Int[x.id for x in PGRP.workers]
     end
@@ -988,7 +1003,7 @@ other_procs() = filter(!=(myid()), procs())
 function id_in_procs(id)  # faster version of `id in procs()`
     if myid() == 1 || (PGRP.topology === :all_to_all  && !isclusterlazy())
         for x in PGRP.workers
-            if (x.id::Int) == id && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === W_CONNECTED)
+            if (x.id::Int) == id && (isa(x, LocalProcess) || (@atomic (x::Worker).state) === WorkerState_connected)
                 return true
             end
         end
@@ -1012,7 +1027,7 @@ See also [`other_procs()`](@ref).
 """
 function procs(pid::Integer)
     if myid() == 1
-        all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === W_CONNECTED)]
+        all_workers = [x for x in PGRP.workers if isa(x, LocalProcess) || ((@atomic x.state) === WorkerState_connected)]
         if (pid == 1) || (isa(@lock(map_pid_wrkr, map_pid_wrkr[][pid].manager), LocalManager))
             Int[x.id for x in filter(w -> (w.id==1) || (isa(w.manager, LocalManager)), all_workers)]
         else
@@ -1121,7 +1136,7 @@ function _rmprocs(pids, waitfor)
             else
                 w = @lock map_pid_wrkr get(map_pid_wrkr[], p, nothing)
                 if !isnothing(w)
-                    set_worker_state(w, W_TERMINATING)
+                    set_worker_state(w, WorkerState_terminating)
                     kill(w.manager, p, w.config)
                     push!(rmprocset, w)
                 end
@@ -1130,11 +1145,11 @@ function _rmprocs(pids, waitfor)
 
         start = time_ns()
         while (time_ns() - start) < waitfor*1e9
-            all(w -> (@atomic w.state) === W_TERMINATED, rmprocset) && break
+            all(is_worker_dead, rmprocset) && break
             sleep(min(0.1, waitfor - (time_ns() - start)/1e9))
         end
 
-        unremoved = [wrkr.id for wrkr in filter(w -> (@atomic w.state) !== W_TERMINATED, rmprocset)]
+        unremoved = [wrkr.id for wrkr in filter(!is_worker_dead, rmprocset)]
         if length(unremoved) > 0
             estr = string("rmprocs: pids ", unremoved, " not terminated after ", waitfor, " seconds.")
             throw(ErrorException(estr))
diff --git a/src/messages.jl b/src/messages.jl
index 1a5dd82..fe63a7d 100644
--- a/src/messages.jl
+++ b/src/messages.jl
@@ -194,7 +194,7 @@ end
 function flush_gc_msgs()
     try
         for w in (PGRP::ProcessGroup).workers
-            if isa(w,Worker) && ((@atomic w.state) == W_CONNECTED) && w.gcflag
+            if isa(w,Worker) && ((@atomic w.state) == WorkerState_connected) && w.gcflag
                 flush_gc_msgs(w)
             end
         end
diff --git a/src/process_messages.jl b/src/process_messages.jl
index d6fdbb1..0f2750a 100644
--- a/src/process_messages.jl
+++ b/src/process_messages.jl
@@ -210,7 +210,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool)
             handle_msg(msg, header, r_stream, w_stream, version)
         end
     catch e
-        oldstate = W_UNKNOWN_STATE
+        oldstate = WorkerState_unknown
 
         # Check again as it may have been set in a message handler but not propagated to the calling block above
         if wpid < 1
@@ -223,7 +223,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool)
         elseif @lock(map_del_wrkr, !(wpid in map_del_wrkr[]))
             werr = worker_from_id(wpid)
             oldstate = @atomic werr.state
-            set_worker_state(werr, W_TERMINATED)
+            set_worker_state(werr, oldstate != WorkerState_terminating ? WorkerState_exterminated : WorkerState_terminated)
 
             # If unhandleable error occurred talking to pid 1, exit
             if wpid == 1
@@ -243,7 +243,7 @@ function message_handler_loop(r_stream::IO, w_stream::IO, incoming::Bool)
         close(w_stream)
 
         if (myid() == 1) && (wpid > 1)
-            if oldstate != W_TERMINATING
+            if oldstate != WorkerState_terminating
                 println(stderr, "Worker $wpid terminated.")
                 rethrow()
             end

From be12fd48603d578bf56aa2400e7c6ab2a1c4dfdf Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Tue, 10 Dec 2024 23:13:00 +0100
Subject: [PATCH 3/7] Add support for worker state callbacks

---
 docs/src/_changelog.md   |   3 +
 docs/src/index.md        |  13 +++
 src/cluster.jl           | 228 ++++++++++++++++++++++++++++++++++++---
 test/distributed_exec.jl |  71 ++++++++++++
 4 files changed, 301 insertions(+), 14 deletions(-)

diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md
index 8f16817..eded6a1 100644
--- a/docs/src/_changelog.md
+++ b/docs/src/_changelog.md
@@ -9,6 +9,9 @@ This documents notable changes in DistributedNext.jl. The format is based on
 
 ## Unreleased
 
+### Added
+- Implemented callback support for workers being added/removed etc ([#17]).
+
 ### Fixed
 - Modified the default implementations of methods like `take!` and `wait` on
   [`AbstractWorkerPool`](@ref) to be threadsafe and behave more consistently
diff --git a/docs/src/index.md b/docs/src/index.md
index 64af89d..17c66a6 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -53,6 +53,19 @@ DistributedNext.cluster_cookie()
 DistributedNext.cluster_cookie(::Any)
 ```
 
+## Callbacks
+
+```@docs
+DistributedNext.add_worker_starting_callback
+DistributedNext.remove_worker_starting_callback
+DistributedNext.add_worker_started_callback
+DistributedNext.remove_worker_started_callback
+DistributedNext.add_worker_exiting_callback
+DistributedNext.remove_worker_exiting_callback
+DistributedNext.add_worker_exited_callback
+DistributedNext.remove_worker_exited_callback
+```
+
 ## Cluster Manager Interface
 
 This interface provides a mechanism to launch and manage Julia workers on different cluster environments.
diff --git a/src/cluster.jl b/src/cluster.jl
index d2e456e..eb87c1f 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -479,20 +479,28 @@ end
 ```
 """
 function addprocs(manager::ClusterManager; kwargs...)
+    params = merge(default_addprocs_params(manager), Dict{Symbol, Any}(kwargs))
+
     init_multi()
 
     cluster_mgmt_from_master_check()
 
-    lock(worker_lock)
-    try
-        addprocs_locked(manager::ClusterManager; kwargs...)
-    finally
-        unlock(worker_lock)
-    end
+    # Call worker-starting callbacks
+    warning_interval = params[:callback_warning_interval]
+    _run_callbacks_concurrently("worker-starting", worker_starting_callbacks,
+                                warning_interval, [(manager, params)])
+
+    # Add new workers
+    new_workers = @lock worker_lock addprocs_locked(manager::ClusterManager, params)
+
+    # Call worker-started callbacks
+    _run_callbacks_concurrently("worker-started", worker_started_callbacks,
+                                warning_interval, new_workers)
+
+    return new_workers
 end
 
-function addprocs_locked(manager::ClusterManager; kwargs...)
-    params = merge(default_addprocs_params(manager), Dict{Symbol,Any}(kwargs))
+function addprocs_locked(manager::ClusterManager, params)
     topology(Symbol(params[:topology]))
 
     if PGRP.topology !== :all_to_all
@@ -579,7 +587,8 @@ default_addprocs_params() = Dict{Symbol,Any}(
     :exeflags => ``,
     :env      => [],
     :enable_threaded_blas => false,
-    :lazy => true)
+    :lazy => true,
+    :callback_warning_interval => 10)
 
 
 function setup_launched_worker(manager, wconfig, launched_q)
@@ -888,6 +897,10 @@ const HDR_COOKIE_LEN = 16
 const map_pid_wrkr = Lockable(Dict{Int, Union{Worker, LocalProcess}}())
 const map_sock_wrkr = Lockable(IdDict())
 const map_del_wrkr = Lockable(Set{Int}())
+const worker_starting_callbacks = Dict{Any, Base.Callable}()
+const worker_started_callbacks = Dict{Any, Base.Callable}()
+const worker_exiting_callbacks = Dict{Any, Base.Callable}()
+const worker_exited_callbacks = Dict{Any, Base.Callable}()
 
 # whether process is a master or worker in a distributed setup
 myrole() = LPROCROLE[]
@@ -895,6 +908,163 @@ function myrole!(proctype::Symbol)
     LPROCROLE[] = proctype
 end
 
+# Callbacks
+
+function _run_callbacks_concurrently(callbacks_name, callbacks_dict, warning_interval, arglist; catch_exceptions=false)
+    callback_tasks = Tuple{Any, Task}[]
+    for args in arglist
+        for (name, callback) in callbacks_dict
+            push!(callback_tasks, (name, Threads.@spawn callback(args...)))
+        end
+    end
+
+    running_callbacks = () -> ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)]
+    while timedwait(() -> isempty(running_callbacks()), warning_interval) === :timed_out
+        callbacks_str = join(running_callbacks(), ", ")
+        @warn "Waiting for these $(callbacks_name) callbacks to finish: $(callbacks_str)"
+    end
+
+    if catch_exceptions
+        for (key, task) in callback_tasks
+            try
+                wait(task)
+            catch ex
+                @error "Error when running $(callbacks_name) callback '$(key)'" exception=(ex, catch_backtrace())
+            end
+        end
+    else
+        # Wait on the tasks so that exceptions bubble up
+        foreach(wait, [x[2] for x in callback_tasks])
+    end
+end
+
+function _add_callback(f, key, dict; arg_types=Tuple{Int})
+    if isnothing(key)
+        key = Symbol(gensym(), nameof(f))
+    end
+
+    desired_signature = "f(" * join(["::$(t)" for t in arg_types.types], ", ") * ")"
+
+    if !hasmethod(f, arg_types)
+        throw(ArgumentError("Callback function is invalid, it must be able to be called with these argument types: $(desired_signature)"))
+    elseif haskey(dict, key)
+        throw(ArgumentError("A callback function with key '$(key)' already exists"))
+    end
+
+    dict[key] = f
+    return key
+end
+
+_remove_callback(key, dict) = delete!(dict, key)
+
+"""
+    add_worker_starting_callback(f::Base.Callable; key=nothing) -> key
+
+Register a callback to be called on the master worker immediately before new
+workers are started. Chooses and returns a unique key for the callback if `key`
+is not specified. The callback `f` will be called with the `ClusterManager`
+instance that is being used and a dictionary of parameters related to adding
+workers, i.e. `f(manager, params)`. The `params` dictionary is specific to the
+`manager` type. Note that the `LocalManager` and `SSHManager` cluster managers
+in DistributedNext are not fully documented yet; see the
+[managers.jl](https://github.com/JuliaParallel/DistributedNext.jl/blob/master/src/managers.jl)
+file for their definitions.
+
+!!! warning
+    Adding workers can fail, so it is not guaranteed that the workers requested
+    in `manager` will exist in the future. For example, if a worker is requested
+    on a node that is unreachable then the worker-starting callbacks will be
+    called but the worker will never be added.
+
+The worker-starting callbacks will be executed concurrently. If one throws an
+exception it will not be caught and will be rethrown by [`addprocs`](@ref).
+
+Keep in mind that the callbacks will add to the time taken to launch workers; so
+try to either keep the callbacks fast to execute, or do the actual work
+asynchronously by spawning a task in the callback (beware of race conditions if
+you do this).
+"""
+add_worker_starting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_starting_callbacks;
+                                                                            arg_types=Tuple{ClusterManager, Dict})
+"""
+    remove_worker_starting_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_starting_callback()`](@ref).
+"""
+remove_worker_starting_callback(key) = _remove_callback(key, worker_starting_callbacks)
+
+"""
+    add_worker_started_callback(f::Base.Callable; key=nothing) -> key
+
+Register a callback to be called on the master worker whenever a worker has
+been added. The callback will be called with the added worker ID,
+e.g. `f(w::Int)`. Chooses and returns a unique key for the callback if `key` is
+not specified.
+
+The worker-started callbacks will be executed concurrently. If one throws an
+exception it will not be caught and will be rethrown by [`addprocs()`](@ref).
+
+Keep in mind that the callbacks will add to the time taken to launch workers; so
+try to either keep the callbacks fast to execute, or do the actual
+initialization asynchronously by spawning a task in the callback (beware of race
+conditions if you do this).
+"""
+add_worker_started_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_started_callbacks)
+
+"""
+    remove_worker_started_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_started_callback()`](@ref).
+"""
+remove_worker_started_callback(key) = _remove_callback(key, worker_started_callbacks)
+
+"""
+    add_worker_exiting_callback(f::Base.Callable; key=nothing) -> key
+
+Register a callback to be called on the master worker immediately before a
+worker is removed with [`rmprocs()`](@ref). The callback will be called with the
+worker ID, e.g. `f(w::Int)`. Chooses and returns a unique key for the callback
+if `key` is not specified.
+
+All worker-exiting callbacks will be executed concurrently and if they don't
+all finish before the `callback_timeout` passed to `rmprocs()` then the worker
+will be removed anyway.
+"""
+add_worker_exiting_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exiting_callbacks)
+
+"""
+    remove_worker_exiting_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_exiting_callback()`](@ref).
+"""
+remove_worker_exiting_callback(key) = _remove_callback(key, worker_exiting_callbacks)
+
+"""
+    add_worker_exited_callback(f::Base.Callable; key=nothing) -> key
+
+Register a callback to be called on the master worker when a worker has exited
+for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
+segfaulting etc). Chooses and returns a unique key for the callback if `key` is
+not specified.
+
+The callback will be called with the worker ID and the final
+`DistributedNext.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is
+an enum: a value of `WorkerState_terminated` means a graceful exit and a value of
+`WorkerState_exterminated` means the worker died unexpectedly.
+
+All worker-exited callbacks will be executed concurrently. If a callback throws
+an exception it will be caught and printed.
+"""
+add_worker_exited_callback(f::Base.Callable; key=nothing) = _add_callback(f, key, worker_exited_callbacks;
+                                                                          arg_types=Tuple{Int, WorkerState})
+
+"""
+    remove_worker_exited_callback(key)
+
+Remove the callback for `key` that was added with [`add_worker_exited_callback()`](@ref).
+"""
+remove_worker_exited_callback(key) = _remove_callback(key, worker_exited_callbacks)
+
 # cluster management related API
 """
     myid()
@@ -1081,7 +1251,7 @@ function cluster_mgmt_from_master_check()
 end
 
 """
-    rmprocs(pids...; waitfor=typemax(Int))
+    rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
 
 Remove the specified workers. Note that only process 1 can add or remove
 workers.
@@ -1095,6 +1265,10 @@ Argument `waitfor` specifies how long to wait for the workers to shut down:
     returned. The user should call [`wait`](@ref) on the task before invoking any other
     parallel calls.
 
+The `callback_timeout` specifies how long (in seconds) to wait for any
+worker-exiting callbacks to execute before continuing to remove the workers
+(see [`add_worker_exiting_callback()`](@ref)).
+
 # Examples
 ```julia-repl
 \$ julia -p 5
@@ -1111,24 +1285,38 @@ julia> workers()
  6
 ```
 """
-function rmprocs(pids...; waitfor=typemax(Int))
+function rmprocs(pids...; waitfor=typemax(Int), callback_timeout=10)
     cluster_mgmt_from_master_check()
 
     pids = vcat(pids...)
     if waitfor == 0
-        t = @async _rmprocs(pids, typemax(Int))
+        t = @async _rmprocs(pids, typemax(Int), callback_timeout)
         yield()
         return t
     else
-        _rmprocs(pids, waitfor)
+        _rmprocs(pids, waitfor, callback_timeout)
         # return a dummy task object that user code can wait on.
         return @async nothing
     end
 end
 
-function _rmprocs(pids, waitfor)
+function _rmprocs(pids, waitfor, callback_timeout)
     lock(worker_lock)
     try
+        # Run the callbacks
+        callback_tasks = Tuple{Any, Task}[]
+        for pid in pids
+            for (name, callback) in worker_exiting_callbacks
+                push!(callback_tasks, (name, Threads.@spawn callback(pid)))
+            end
+        end
+
+        if timedwait(() -> all(istaskdone, [x[2] for x in callback_tasks]), callback_timeout) === :timed_out
+            timedout_callbacks = ["'$(key)'" for (key, task) in callback_tasks if !istaskdone(task)]
+            callbacks_str = join(timedout_callbacks, ", ")
+            @warn "Some worker-exiting callbacks have not yet finished, continuing to remove workers anyway. These are the callbacks still running: $(callbacks_str)"
+        end
+
         rmprocset = Union{LocalProcess, Worker}[]
         for p in pids
             if p == 1
@@ -1280,6 +1468,18 @@ function deregister_worker(pg, pid)
             delete!(pg.refs, id)
         end
     end
+
+    # Call callbacks on the master
+    if myid() == 1
+        for (name, callback) in worker_exited_callbacks
+            try
+                callback(pid, w.state)
+            catch ex
+                @error "Error when running worker-exited callback '$(name)'" exception=(ex, catch_backtrace())
+            end
+        end
+    end
+
     return
 end
 
diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl
index de6f4fe..86946ad 100644
--- a/test/distributed_exec.jl
+++ b/test/distributed_exec.jl
@@ -1,6 +1,7 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 using DistributedNext, Random, Serialization, Sockets
+import DistributedNext
 import DistributedNext: launch, manage
 
 
@@ -1977,6 +1978,76 @@ include("splitrange.jl")
     end
 end
 
+@testset "Worker state callbacks" begin
+    rmprocs(other_workers())
+
+    # Adding a callback with an invalid signature should fail
+    @test_throws ArgumentError DistributedNext.add_worker_started_callback(() -> nothing)
+
+    # Smoke test to ensure that all the callbacks are executed
+    starting_managers = []
+    started_workers = Int[]
+    exiting_workers = Int[]
+    exited_workers = []
+    starting_key = DistributedNext.add_worker_starting_callback((manager, kwargs) -> push!(starting_managers, manager))
+    started_key = DistributedNext.add_worker_started_callback(pid -> (push!(started_workers, pid); error("foo")))
+    exiting_key = DistributedNext.add_worker_exiting_callback(pid -> push!(exiting_workers, pid))
+    exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> push!(exited_workers, (pid, state)))
+
+    # Test that the worker-started exception bubbles up
+    @test_throws TaskFailedException addprocs(1)
+
+    pid = only(workers())
+    @test only(starting_managers) isa DistributedNext.LocalManager
+    @test started_workers == [pid]
+    rmprocs(workers())
+    @test exiting_workers == [pid]
+    @test exited_workers == [(pid, DistributedNext.WorkerState_terminated)]
+
+    # Trying to reset an existing callback should fail
+    @test_throws ArgumentError DistributedNext.add_worker_started_callback(Returns(nothing); key=started_key)
+
+    # Remove the callbacks
+    DistributedNext.remove_worker_starting_callback(starting_key)
+    DistributedNext.remove_worker_started_callback(started_key)
+    DistributedNext.remove_worker_exiting_callback(exiting_key)
+    DistributedNext.remove_worker_exited_callback(exited_key)
+
+    # Test that the worker-exiting `callback_timeout` option works and that we
+    # get warnings about slow worker-started callbacks.
+    event = Base.Event()
+    callback_task = nothing
+    started_key = DistributedNext.add_worker_started_callback(_ -> sleep(0.5))
+    exiting_key = DistributedNext.add_worker_exiting_callback(_ -> (callback_task = current_task(); wait(event)))
+
+    @test_logs (:warn, r"Waiting for these worker-started callbacks.+") match_mode=:any addprocs(1; callback_warning_interval=0.05)
+    DistributedNext.remove_worker_started_callback(started_key)
+
+    @test_logs (:warn, r"Some worker-exiting callbacks have not yet finished.+") rmprocs(workers(); callback_timeout=0.5)
+    DistributedNext.remove_worker_exiting_callback(exiting_key)
+
+    notify(event)
+    wait(callback_task)
+
+    # Test that the initial callbacks were indeed removed
+    @test length(starting_managers) == 1
+    @test length(started_workers) == 1
+    @test length(exiting_workers) == 1
+    @test length(exited_workers) == 1
+
+    # Test that workers that were killed forcefully are detected as such
+    exit_state = nothing
+    exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> exit_state = state)
+    pid = only(addprocs(1))
+
+    redirect_stderr(devnull) do
+        remote_do(exit, pid)
+        timedwait(() -> !isnothing(exit_state), 10)
+    end
+    @test exit_state == DistributedNext.WorkerState_exterminated
+    DistributedNext.remove_worker_exited_callback(exited_key)
+end
+
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
 if nprocs() > 1

From 0467de60da622f297b030e437a93ebb0f0737c20 Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Wed, 11 Dec 2024 00:08:19 +0100
Subject: [PATCH 4/7] Add an extension to support Revise

---
 Project.toml             | 11 ++++--
 docs/src/_changelog.md   |  1 +
 ext/ReviseExt.jl         | 30 ++++++++++++++++
 test/distributed_exec.jl | 74 +++++++++++++++++++++++++++++++++++++++-
 4 files changed, 113 insertions(+), 3 deletions(-)
 create mode 100644 ext/ReviseExt.jl

diff --git a/Project.toml b/Project.toml
index 2c1102f..a088bb8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,16 +7,23 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 
+[weakdeps]
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+
+[extensions]
+ReviseExt = "Revise"
+
 [compat]
 Aqua = "0.8"
 Distributed = "1"
 LibSSH = "0.7"
 LinearAlgebra = "1"
 Random = "1"
+Revise = "3.7.0"
 Serialization = "1"
 Sockets = "1"
 Test = "1"
-julia = "1.9"
+julia = "1.10"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
@@ -26,4 +33,4 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Aqua", "Distributed", "LibSSH", "LinearAlgebra", "Test"]
+test = ["Aqua", "Distributed", "LibSSH", "LinearAlgebra", "Revise", "Test"]
diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md
index eded6a1..04378ef 100644
--- a/docs/src/_changelog.md
+++ b/docs/src/_changelog.md
@@ -11,6 +11,7 @@ This documents notable changes in DistributedNext.jl. The format is based on
 
 ### Added
 - Implemented callback support for workers being added/removed etc ([#17]).
+- Added a package extension to support Revise.jl ([#17]).
 
 ### Fixed
 - Modified the default implementations of methods like `take!` and `wait` on
diff --git a/ext/ReviseExt.jl b/ext/ReviseExt.jl
new file mode 100644
index 0000000..269add5
--- /dev/null
+++ b/ext/ReviseExt.jl
@@ -0,0 +1,30 @@
+module ReviseExt
+
+import DistributedNext
+import DistributedNext: myid, workers, remotecall
+
+import Revise
+
+
+struct DistributedNextWorker <: Revise.AbstractWorker
+    id::Int
+end
+
+function get_workers()
+    map(DistributedNextWorker, workers())
+end
+
+function Revise.remotecall_impl(f, worker::DistributedNextWorker, args...; kwargs...)
+    remotecall(f, worker.id, args...; kwargs...)
+end
+
+Revise.is_master_worker(::typeof(get_workers)) = myid() == 1
+Revise.is_master_worker(worker::DistributedNextWorker) = worker.id == 1
+
+function __init__()
+    Revise.register_workers_function(get_workers)
+    DistributedNext.add_worker_started_callback(pid -> Revise.init_worker(DistributedNextWorker(pid));
+                                                key="DistributedNext-integration")
+end
+
+end
diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl
index 86946ad..996294a 100644
--- a/test/distributed_exec.jl
+++ b/test/distributed_exec.jl
@@ -1,5 +1,6 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
+import Revise
 using DistributedNext, Random, Serialization, Sockets
 import DistributedNext
 import DistributedNext: launch, manage
@@ -1957,7 +1958,9 @@ include("splitrange.jl")
 
 @testset "Clear all workers for timeout tests (issue #45785)" begin
     nprocs() > 1 && rmprocs(workers())
-    begin
+
+    # This test requires kill(), and that doesn't work on Windows before 1.11
+    if !(Sys.iswindows() && VERSION < v"1.11")
         # First, assert that we get no messages when we close a cooperative worker
         w = only(addprocs(1))
         @test_nowarn begin
@@ -1975,6 +1978,8 @@ include("splitrange.jl")
             end
             wait(rmprocs([w]))
         end
+    else
+        @warn "Skipping timeout tests because kill() isn't supported on Windows for this Julia version"
     end
 end
 
@@ -2048,6 +2053,73 @@ end
     DistributedNext.remove_worker_exited_callback(exited_key)
 end
 
+# This is a simplified copy of a test from Revise.jl's tests
+@testset "Revise.jl integration" begin
+    function rm_precompile(pkgname::AbstractString)
+        filepath = Base.cache_file_entry(Base.PkgId(pkgname))
+        isa(filepath, Tuple) && (filepath = filepath[1]*filepath[2])  # Julia 1.3+
+        for depot in DEPOT_PATH
+            fullpath = joinpath(depot, filepath)
+            isfile(fullpath) && rm(fullpath)
+        end
+    end
+
+    pid = only(addprocs(1))
+
+    # Test that initialization succeeds by checking that Main.whichtt is defined
+    # on the worker, which is defined by Revise.init_worker().
+    @test timedwait(() ->remotecall_fetch(() -> hasproperty(Main, :whichtt), pid), 10) == :ok
+
+    tmpdir = mktempdir()
+    @everywhere push!(LOAD_PATH, $tmpdir)  # Don't want to share this LOAD_PATH
+
+    # Create a fake package
+    module_file = joinpath(tmpdir, "ReviseDistributed", "src", "ReviseDistributed.jl")
+    mkpath(dirname(module_file))
+    write(module_file,
+          """
+          module ReviseDistributed
+
+          f() = π
+          g(::Int) = 0
+
+          end
+          """)
+
+    # Check that we can use it
+    @everywhere using ReviseDistributed
+    for p in procs()
+        @test remotecall_fetch(ReviseDistributed.f, p)    == π
+        @test remotecall_fetch(ReviseDistributed.g, p, 1) == 0
+    end
+
+    # Test changing and deleting methods
+    write(module_file,
+          """
+          module ReviseDistributed
+
+          f() = 3.0
+
+          end
+          """)
+    for p in procs()
+        # We call Revise.revise() inside the timedwait() because file events
+        # on macOS can have significant latency, meaning a single revise() call
+        # may not pick up the changes yet.
+        @test timedwait(10; pollint=0.5) do
+            Revise.revise()
+            remotecall_fetch(ReviseDistributed.f, p) == 3.0
+        end == :ok
+
+        @test_throws RemoteException remotecall_fetch(ReviseDistributed.g, p, 1)
+    end
+
+    rmprocs(workers())
+    rm_precompile("ReviseDistributed")
+    pop!(LOAD_PATH)
+end
+
+
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
 if nprocs() > 1

From 95a9415268ae1f6009107767eb8e9477f5008cbd Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Thu, 2 Jan 2025 23:11:27 +0100
Subject: [PATCH 5/7] Clean up CI a bit

---
 .github/dependabot.yml       |  4 ++++
 .github/workflows/ci.yml     | 16 ++++------------
 .github/workflows/tagbot.yml | 18 ++++++++++++++++++
 3 files changed, 26 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/tagbot.yml

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index d60f070..b31f841 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,7 @@ updates:
     directory: "/" # Location of package manifests
     schedule:
       interval: "monthly"
+  - package-ecosystem: "julia"
+    directory: "/"
+    schedule:
+      interval: "weekly"
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 992be42..121ceae 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -67,17 +67,7 @@ jobs:
           # GitHub will automatically expand to the correct value (`x86_64` or `aarch64`)
           # based on the architecture of the underlying GitHub Runner (virtual machine).
           arch: ${{ github.ref == '32' && 'x86' || runner.arch }}
-      - uses: actions/cache@v5
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-${{ matrix.os }}
-            ${{ runner.os }}-
-      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/cache@v3
       - name: Decide what the value of JULIA_NUM_THREADS should be
         id: decide-numthreads-str
         run: |
@@ -115,6 +105,7 @@ jobs:
       - uses: julia-actions/setup-julia@latest
         with:
           version: '1'
+      - uses: julia-actions/cache@v3
       - name: Install dependencies
         run: julia --project=docs/ -e 'using Pkg; Pkg.instantiate()'
       - name: Build and deploy
@@ -130,8 +121,9 @@ jobs:
           persist-credentials: false
       - uses: julia-actions/setup-julia@latest
         with:
-          version: '1.12'
+          version: '1'
           # version: 'nightly'
+      - uses: julia-actions/cache@v3
       - run: julia --color=yes --project=ci/jet -e 'import Pkg; Pkg.instantiate()'
       - name: Run the JET tests
         run: julia --color=yes --project=ci/jet ci/jet/check.jl
diff --git a/.github/workflows/tagbot.yml b/.github/workflows/tagbot.yml
new file mode 100644
index 0000000..0f81ec9
--- /dev/null
+++ b/.github/workflows/tagbot.yml
@@ -0,0 +1,18 @@
+name: TagBot
+on:
+  issue_comment:
+    types:
+      - created
+  workflow_dispatch:
+    inputs:
+      lookback:
+        description: "[DEPRECATED] No longer has any effect"
+        default: "3"
+jobs:
+  TagBot:
+    if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}

From 9d9cf148c9515233c2d30b2ae620e99822f80436 Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Fri, 3 Jan 2025 00:13:59 +0100
Subject: [PATCH 6/7] Replace a timeout task with timedwait()

This should fix an exception seen in CI from the lingering timeout task:
```
 Test Summary:                                | Pass  Total  Time
Deserialization error recovery and include() |   11     11  3.9s
      From worker 4:	Unhandled Task ERROR: EOFError: read end of file
      From worker 4:	Stacktrace:
      From worker 4:	 [1] wait
      From worker 4:	   @ .\asyncevent.jl:159 [inlined]
      From worker 4:	 [2] sleep(sec::Float64)
      From worker 4:	   @ Base .\asyncevent.jl:265
      From worker 4:	 [3] (::DistributedNext.var"#34#37"{DistributedNext.Worker, Float64})()
      From worker 4:	   @ DistributedNext D:\a\DistributedNext.jl\DistributedNext.jl\src\cluster.jl:213
```
---
 src/cluster.jl | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/cluster.jl b/src/cluster.jl
index eb87c1f..9ef7734 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -217,16 +217,10 @@ function wait_for_conn(w)
         timeout =  worker_timeout() - (time() - w.ct_time)
         timeout <= 0 && error("peer $(w.id) has not connected to $(myid())")
 
-        T = Threads.@spawn begin
-            sleep($timeout)
-            lock(w.c_state) do
-                notify(w.c_state; all=true)
-            end
-        end
-        errormonitor(T)
-        lock(w.c_state) do
-            wait(w.c_state)
-            (@atomic w.state) === WorkerState_created && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
+        if timedwait(() -> (@atomic w.state) === WorkerState_connected, timeout) === :timed_out
+            # Notify any waiters on the state and throw
+            @lock w.c_state notify(w.c_state)
+            error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
         end
     end
     nothing

From 07bda63610b10b08fb1ff589efc8472097b83c2e Mon Sep 17 00:00:00 2001
From: JamesWrigley <james@puiterwijk.org>
Date: Sun, 5 Jan 2025 16:08:09 +0100
Subject: [PATCH 7/7] Add support for worker statuses

---
 Project.toml             |   2 +
 docs/src/_changelog.md   |   2 +
 docs/src/index.md        |   4 ++
 src/DistributedNext.jl   |   6 ++
 src/cluster.jl           | 135 ++++++++++++++++++++++++++++++++++++---
 test/distributed_exec.jl |  67 +++++++++++++++++--
 test/runtests.jl         |   2 +-
 7 files changed, 200 insertions(+), 18 deletions(-)

diff --git a/Project.toml b/Project.toml
index a088bb8..f09edba 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ version = "1.1.1"
 
 [deps]
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 
@@ -20,6 +21,7 @@ LibSSH = "0.7"
 LinearAlgebra = "1"
 Random = "1"
 Revise = "3.7.0"
+ScopedValues = "1.6.0"
 Serialization = "1"
 Sockets = "1"
 Test = "1"
diff --git a/docs/src/_changelog.md b/docs/src/_changelog.md
index 04378ef..e1e1e4e 100644
--- a/docs/src/_changelog.md
+++ b/docs/src/_changelog.md
@@ -12,6 +12,8 @@ This documents notable changes in DistributedNext.jl. The format is based on
 ### Added
 - Implemented callback support for workers being added/removed etc ([#17]).
 - Added a package extension to support Revise.jl ([#17]).
+- Added support for setting worker statuses with [`setstatus!`](@ref) and
+  [`getstatus`](@ref) ([#17]).
 
 ### Fixed
 - Modified the default implementations of methods like `take!` and `wait` on
diff --git a/docs/src/index.md b/docs/src/index.md
index 17c66a6..1e979ad 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -14,6 +14,10 @@ DistributedNext.rmprocs
 DistributedNext.interrupt
 DistributedNext.myid
 DistributedNext.pmap
+DistributedNext.getstatus
+DistributedNext.@getstatus
+DistributedNext.setstatus!
+DistributedNext.@setstatus!
 DistributedNext.RemoteException
 DistributedNext.ProcessExitedException
 DistributedNext.Future
diff --git a/src/DistributedNext.jl b/src/DistributedNext.jl
index 45a8b1f..27d7bfb 100644
--- a/src/DistributedNext.jl
+++ b/src/DistributedNext.jl
@@ -23,6 +23,12 @@ using Serialization, Sockets
 import Serialization: serialize, deserialize
 import Sockets: connect, wait_connected
 
+@static if VERSION < v"1.11"
+    using ScopedValues: ScopedValue, @with
+else
+    using Base.ScopedValues: ScopedValue, @with
+end
+
 # NOTE: clusterserialize.jl imports additional symbols from Serialization for use
 
 export
diff --git a/src/cluster.jl b/src/cluster.jl
index 9ef7734..bbbbad5 100644
--- a/src/cluster.jl
+++ b/src/cluster.jl
@@ -891,6 +891,8 @@ const HDR_COOKIE_LEN = 16
 const map_pid_wrkr = Lockable(Dict{Int, Union{Worker, LocalProcess}}())
 const map_sock_wrkr = Lockable(IdDict())
 const map_del_wrkr = Lockable(Set{Int}())
+const _exited_callback_pid = ScopedValue{Int}(-1)
+const map_pid_statuses = Lockable(Dict{Int, Any}())
 const worker_starting_callbacks = Dict{Any, Base.Callable}()
 const worker_started_callbacks = Dict{Any, Base.Callable}()
 const worker_exiting_callbacks = Dict{Any, Base.Callable}()
@@ -1042,9 +1044,9 @@ segfaulting etc). Chooses and returns a unique key for the callback if `key` is
 not specified.
 
 The callback will be called with the worker ID and the final
-`Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an
-enum, a value of `WorkerState_terminated` means a graceful exit and a value of
-`WorkerState_exterminated` means the worker died unexpectedly.
+`DistributedNext.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is
+an enum, a value of `WorkerState_terminated` means a graceful exit and a value
+of `WorkerState_exterminated` means the worker died unexpectedly.
 
 All worker-exited callbacks will be executed concurrently. If a callback throws
 an exception it will be caught and printed.
@@ -1238,6 +1240,112 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out.
 """
 other_workers() = filter(!=(myid()), workers())
 
+"""
+    @setstatus! x
+    @setstatus! x pid
+
+Set the status for the calling module on worker `pid` (defaults to the current
+worker) to `x`. `x` may be any serializable object but it's recommended to keep
+it small enough to cheaply send over a network. Statuses can be retrieved inside
+worker-exited callbacks (see [`add_worker_exited_callback`](@ref)) before the
+worker is fully deregistered.
+
+Statuses are keyed by the calling `Module`, so multiple libraries can
+independently track their own status on the same worker without conflicting.
+
+This can be handy if you want a way to know what a worker is doing at any given
+time, or (in combination with a worker-exited callback) for knowing what a
+worker was last doing before it died.
+
+# Examples
+```julia-repl
+julia> DistributedNext.@setstatus! "working on dataset 42"
+"working on dataset 42"
+
+julia> DistributedNext.@getstatus
+"working on dataset 42"
+```
+
+See also [`setstatus!`](@ref) for the function form that accepts an explicit module key.
+"""
+macro setstatus!(x)
+    mod = __module__
+    :(setstatus!($(esc(x)), $mod))
+end
+
+macro setstatus!(x, pid)
+    mod = __module__
+    :(setstatus!($(esc(x)), $mod, $(esc(pid))))
+end
+
+"""
+    setstatus!(x, mod::Module, pid::Int=myid())
+
+Function form of [`@setstatus!`](@ref). Sets the status for module `mod` on
+worker `pid` to `x`. Throws an `ArgumentError` if worker `pid` does not exist.
+"""
+function setstatus!(x, mod::Module, pid::Int=myid())
+    if !id_in_procs(pid)
+        throw(ArgumentError("Worker $(pid) does not exist, cannot set its status"))
+    end
+    # Statuses are stored on process 1; other processes forward the request for `pid`.
+    if myid() == 1
+        @lock map_pid_statuses begin
+            statuses = get!(map_pid_statuses[], pid, Dict{Module, Any}())
+            statuses[mod] = x
+        end
+    else
+        remotecall_fetch(setstatus!, 1, x, mod, pid)
+    end
+end
+
+function _getstatus(pid, mod)
+    @lock map_pid_statuses begin
+        statuses = get(map_pid_statuses[], pid, nothing)
+        isnothing(statuses) ? nothing : get(statuses, mod, nothing)
+    end
+end
+
+"""
+    @getstatus
+    @getstatus pid
+
+Get the status set by the calling module for worker `pid` (defaults to the
+current worker). If one was never explicitly set with [`@setstatus!`](@ref)
+this will return `nothing`.
+
+See also [`getstatus`](@ref) for the function form.
+"""
+macro getstatus()
+    mod = __module__
+    :(getstatus($mod))
+end
+macro getstatus(pid)
+    mod = __module__
+    :(getstatus($mod, $(esc(pid))))
+end
+
+"""
+    getstatus(mod::Module, pid::Int=myid())
+
+Function form of [`@getstatus`](@ref). Gets the status for module `mod` on
+worker `pid`. Returns `nothing` if no status was set.
+"""
+function getstatus(mod::Module, pid::Int=myid())
+    # This function may be called from worker-exited callbacks, at which point
+    # the exited worker is no longer in procs(). Thus we check whether the
+    # function is being called for an exited worker and allow it if so.
+    if !id_in_procs(pid) && _exited_callback_pid[] != pid
+        throw(ArgumentError("Worker $(pid) does not exist, cannot get its status"))
+    end
+
+    if myid() == 1
+        _getstatus(pid, mod)
+    else
+        remotecall_fetch(getstatus, 1, mod, pid)
+    end
+end
+
 function cluster_mgmt_from_master_check()
     if myid() != 1
         throw(ErrorException("Only process 1 can add and remove workers"))
@@ -1463,15 +1571,22 @@ function deregister_worker(pg, pid)
         end
     end
 
-    # Call callbacks on the master
     if myid() == 1
-        for (name, callback) in worker_exited_callbacks
-            try
-                callback(pid, w.state)
-            catch ex
-                @error "Error when running worker-exited callback '$(name)'" exception=(ex, catch_backtrace())
-            end
+        params = default_addprocs_params(w.manager)
+        warning_interval = params[:callback_warning_interval]
+
+        # Call callbacks on the master, with the scoped value set so that
+        # getstatus() can be called for the exited worker without failing the
+        # pid check. We go to some effort to make this work after deregistering
+        # the worker, because if the callbacks ran beforehand the worker would
+        # incorrectly still be shown in e.g. procs().
+        @with _exited_callback_pid => pid begin
+            _run_callbacks_concurrently("worker-exited", worker_exited_callbacks,
+                                        warning_interval, [(pid, w.state)]; catch_exceptions=true)
         end
+
+        # Delete its statuses
+        @lock map_pid_statuses delete!(map_pid_statuses[], pid)
     end
 
     return
diff --git a/test/distributed_exec.jl b/test/distributed_exec.jl
index 996294a..fc9c0d4 100644
--- a/test/distributed_exec.jl
+++ b/test/distributed_exec.jl
@@ -3,7 +3,7 @@
 import Revise
 using DistributedNext, Random, Serialization, Sockets
 import DistributedNext
-import DistributedNext: launch, manage
+import DistributedNext: launch, manage, getstatus, setstatus!, @getstatus, @setstatus!
 
 
 @test cluster_cookie() isa String
@@ -1826,7 +1826,9 @@ end
     let julia = `$(Base.julia_cmd()) --startup-file=no`; mktempdir() do tmp
         pkg_project = joinpath(Base.pkgdir(DistributedNext), "Project.toml")
         project = mkdir(joinpath(tmp, "project"))
-        depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2"))]
+        # Keep the writable depot in the depots list so that external
+        # dependencies (i.e. ScopedValues.jl) can be loaded.
+        depots = [mkdir(joinpath(tmp, "depot1")), mkdir(joinpath(tmp, "depot2")), Base.DEPOT_PATH[1]]
         load_path = [mkdir(joinpath(tmp, "load_path")), "@stdlib", "@", pkg_project]
         pathsep = Sys.iswindows() ? ";" : ":"
         env = Dict(
@@ -1935,7 +1937,7 @@ end
         project = mktempdir()
         env = Dict(
             "JULIA_LOAD_PATH" => string(LOAD_PATH[1], $(repr(pathsep)), "@stdlib", $(repr(pathsep)), "$(escaped_pkg_project)"),
-            "JULIA_DEPOT_PATH" => DEPOT_PATH[1],
+            "JULIA_DEPOT_PATH" => DEPOT_PATH[end],
             "TMPDIR" => ENV["TMPDIR"],
         )
         addprocs(1; env = env, exeflags = `--project=\$(project)`)
@@ -1943,7 +1945,7 @@ end
         addprocs(1; env = env)
         """ * funcscode * """
         for w in workers()
-            @test remotecall_fetch(depot_path, w)          == [DEPOT_PATH[1]]
+            @test remotecall_fetch(depot_path, w)          == [DEPOT_PATH[end]]
             @test remotecall_fetch(load_path, w)           == [LOAD_PATH[1], "@stdlib", "$(escaped_pkg_project)"]
             @test remotecall_fetch(active_project, w)      == project
             @test remotecall_fetch(Base.active_project, w) == joinpath(project, "Project.toml")
@@ -1983,7 +1985,40 @@ include("splitrange.jl")
     end
 end
 
+@testset "Worker statuses" begin
+    rmprocs(other_workers())
+
+    # Test with the local worker using macros
+    @test isnothing(@getstatus())
+    @setstatus!("foo")
+    @test @getstatus() == "foo"
+    @test_throws ArgumentError getstatus(Main, 2)
+
+    # Test with a remote worker using the function form
+    pid = only(addprocs(1))
+    @test isnothing(getstatus(Main, pid))
+    remotecall_wait(setstatus!, pid, "bar", Main, pid)
+    @test remotecall_fetch(getstatus, pid, Main) == "bar"
+
+    # Test that different modules have independent statuses
+    setstatus!("from_main", Main, pid)
+    setstatus!("from_distributed", DistributedNext, pid)
+    @test getstatus(Main, pid) == "from_main"
+    @test getstatus(DistributedNext, pid) == "from_distributed"
+
+    rmprocs(pid)
+end
+
 @testset "Worker state callbacks" begin
+    # Helper function to wait for a worker to have been completely deregistered
+    # (including worker-exited callbacks finished) by waiting for the worker's
+    # status to have been deleted. This only works if the worker has a status,
+    # of course.
+    function wait_for_deregistration(pid)
+        statuses = DistributedNext.map_pid_statuses
+        @test timedwait(() -> @lock(statuses, !haskey(statuses[], pid)), 10) == :ok
+    end
+
     rmprocs(other_workers())
 
     # Adding a callback with an invalid signature should fail
@@ -2040,16 +2075,34 @@ end
     @test length(exiting_workers) == 1
     @test length(exited_workers) == 1
 
-    # Test that workers that were killed forcefully are detected as such
+    # Test that workers that were killed forcefully are detected as such, and
+    # that statuses can be retrieved in the callback.
     exit_state = nothing
-    exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> exit_state = state)
+    last_status = nothing
+    exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> (exit_state = state; last_status = @getstatus(pid)))
     pid = only(addprocs(1))
+    @setstatus!("foo", pid)
 
+    # Kill the process with stderr redirected so the error messages don't
+    # unnecessarily show up in the logs.
     redirect_stderr(devnull) do
         remote_do(exit, pid)
-        timedwait(() -> !isnothing(exit_state), 10)
+        wait_for_deregistration(pid)
     end
     @test exit_state == DistributedNext.WorkerState_exterminated
+    @test last_status == "foo"
+    DistributedNext.remove_worker_exited_callback(exited_key)
+
+    # Test that exceptions in worker-exited callbacks are caught
+    exited_key = DistributedNext.add_worker_exited_callback((pid, state) -> error("foo"))
+    @test_logs (:error, r"Error when running worker-exited callback.+") match_mode=:any begin
+        pid = only(addprocs(1))
+        # Set a dummy status so that wait_for_deregistration() works
+        @setstatus!("foo", pid)
+        rmprocs(pid)
+
+        wait_for_deregistration(pid)
+    end
     DistributedNext.remove_worker_exited_callback(exited_key)
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index ab596e9..99fd04c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -35,5 +35,5 @@ include("managers.jl")
 include("distributed_stdlib_detection.jl")
 
 @testset "Aqua" begin
-    Aqua.test_all(DistributedNext)
+    Aqua.test_all(DistributedNext; stale_deps=(; ignore=[:ScopedValues]))
 end