From 9975663e43888ecde0af217aa1a5b1ba101b9145 Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Wed, 27 Sep 2023 18:38:58 +0200 Subject: [PATCH 1/5] Add metrics on the internal-workers --- service/domain_worker.ml | 4 +-- service/domain_worker.mli | 5 ++-- service/pool.ml | 16 +++++++--- service/solver.ml | 61 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 77 insertions(+), 9 deletions(-) diff --git a/service/domain_worker.ml b/service/domain_worker.ml index dab3cdb..6df1f5c 100644 --- a/service/domain_worker.ml +++ b/service/domain_worker.ml @@ -11,7 +11,7 @@ type request = { cancelled : unit Eio.Promise.t option; } -type reply = ((OpamPackage.t list, string) result * float, [`Msg of string]) result +type reply = ((OpamPackage.t list, string) result * float, [`Msg of string | `Cancelled]) result let env (vars : Worker.Vars.t) v = match v with @@ -31,7 +31,7 @@ let env (vars : Worker.Vars.t) v = let solve { packages; root_pkgs; pinned_pkgs; vars; cancelled } = match cancelled with - | Some p when Promise.is_resolved p -> Error (`Msg "Cancelled") + | Some p when Promise.is_resolved p -> Error `Cancelled | _ -> try let pins = root_pkgs @ pinned_pkgs |> OpamPackage.Name.Map.of_list in diff --git a/service/domain_worker.mli b/service/domain_worker.mli index 9e51e24..502b38e 100644 --- a/service/domain_worker.mli +++ b/service/domain_worker.mli @@ -15,10 +15,11 @@ type request = { (** If resolved, the result is not needed. *) } -type reply = ((OpamPackage.t list, string) result * float, [`Msg of string]) result +type reply = ((OpamPackage.t list, string) result * float, [`Msg of string | `Cancelled]) result (** [Ok (Ok selection)] if there is a solution. [Ok (Error msg)] if there is no solution. - [Error msg] if the request was invalid. *) + [Error msg] if the request was invalid. + [Error Cancelled] if the request was cancelled before started. *) val env : Solver_service_api.Worker.Vars.t -> string -> OpamVariable.variable_contents option (** [env vars name] is the value of [name] in [vars]. *) diff --git a/service/pool.ml b/service/pool.ml index 6ba96ea..26f9963 100644 --- a/service/pool.ml +++ b/service/pool.ml @@ -1,14 +1,18 @@ open Eio.Std -type ('request, 'reply) t = ('request * 'reply Promise.u) Eio.Stream.t +type ('request, 'reply) t = { + requests: ('request * 'reply Promise.u) Eio.Stream.t; running: int Atomic.t; n_workers: int +} let rec run_worker t handle = - let request, set_reply = Eio.Stream.take t in + let request, set_reply = Eio.Stream.take t.requests in + Atomic.incr t.running; handle request |> Promise.resolve set_reply; + Atomic.decr t.running; run_worker t handle let create ~sw ~domain_mgr ~n_workers handle = - let t = Eio.Stream.create 0 in + let t = { requests = Eio.Stream.create 0; running = Atomic.make 0; n_workers } in for _i = 1 to n_workers do Fiber.fork_daemon ~sw (fun () -> Eio.Domain_manager.run domain_mgr (fun () -> run_worker t handle) @@ -18,5 +22,9 @@ let create ~sw ~domain_mgr ~n_workers handle = let use t request = let reply, set_reply = Promise.create () in - Eio.Stream.add t (request, set_reply); + Eio.Stream.add t.requests (request, set_reply); Promise.await reply + +let running_workers t = Atomic.get t.running + +let n_workers t = t.n_workers diff --git a/service/solver.ml b/service/solver.ml index 160d9da..81739e9 100644 --- a/service/solver.ml +++ b/service/solver.ml @@ -12,6 +12,49 @@ type t = { let ocaml = OpamPackage.Name.of_string "ocaml" +module Metrics = struct + open Prometheus + + let namespace = "ocluster" + let subsystem = "worker" + + let request_handling_total = + let help = "Total number of handled solve requests" in + Counter.v ~help ~namespace ~subsystem "requests_handled_total" + + let request_handling = + let help = "Number of handled requests by state" in + Gauge.v_label ~label_name:"state" ~help ~namespace ~subsystem "solve_request_state" + + let update_request_handling pool = + let running = Pool.running_workers pool in + let waiting = (Pool.n_workers pool) - running in + Gauge.set (request_handling "running") (float_of_int running); + Gauge.set (request_handling "waiting") (float_of_int waiting) + + + let request_ok = + let help = "Total number of success solve requests" in + Counter.v ~help ~namespace ~subsystem "success_solve" + + let request_fail = + let help = "Total number of fail solve requests" in + Counter.v ~help ~namespace ~subsystem "fail_solve" + + let request_no_solution = + let help = "Total number of no solution solve requests " in + Counter.v ~help ~namespace ~subsystem "no_solution_solve" + + let request_cancelled = + let help = "Total number of cancel without running solve requests" in + Counter.v ~help ~namespace ~subsystem "cancel_without_running_solve" + + let request_cancelled_after = + let help = "Total number of cancel when running solve requests" in + Counter.v ~help ~namespace ~subsystem "cancel_when_running_solve" + +end + (* If a local package has a literal constraint on OCaml's version and it doesn't match the platform, we just remove that package from the set to test, so other packages can still be tested. *) @@ -47,6 +90,7 @@ let solve_for_platform ?cancelled t ~log ~opam_repository_commits ~packages ~roo ) else ( let slice = { Domain_worker.vars; root_pkgs; packages; pinned_pkgs; cancelled } in match Pool.use t.pool slice with + | Error `Cancelled -> Error `Cancelled | Error (`Msg m) -> Error (`Msg m) | Ok (results, time) -> match results with @@ -115,12 +159,15 @@ let solve ?cancelled t ~log request = in Log.info log "Solving for %a" Fmt.(list ~sep:comma string) root_pkgs; let serious_errors = ref [] in + let cancels_without_running = ref 0 in let*! root_pkgs = parse_opams request.root_pkgs in let*! pinned_pkgs = parse_opams request.pinned_pkgs in let*! packages = Stores.packages t.stores opam_repository_commits in let results = platforms |> Fiber.List.map (fun (id, vars) -> + Prometheus.Counter.inc_one Metrics.request_handling_total; + Metrics.update_request_handling t.pool; let result = solve_for_platform t id ?cancelled @@ -132,12 +179,14 @@ let solve ?cancelled t ~log request = ~pins ~vars in + Metrics.update_request_handling t.pool; (id, result) ) |> List.filter_map (fun (id, result) -> Log.info log "= %s =" id; match result with | Ok result -> + Prometheus.Counter.inc_one Metrics.request_ok; Log.info log "-> @[%a@]" Fmt.(list ~sep:sp string) result.Selection.packages; @@ -145,17 +194,27 @@ let solve ?cancelled t ~log request = Fmt.(list ~sep:semi (pair ~sep:comma string string)) result.Selection.commits; Some result + | Error `Cancelled -> + Prometheus.Counter.inc_one Metrics.request_cancelled; + incr cancels_without_running; + Log.info log "%s" "Cancelled"; + None | Error (`No_solution msg) -> + Prometheus.Counter.inc_one Metrics.request_no_solution; Log.info log "%s" msg; None | Error (`Msg msg) -> + Prometheus.Counter.inc_one Metrics.request_fail; Log.info log "%s" msg; serious_errors := msg :: !serious_errors; None ) in match cancelled with - | Some p when Promise.is_resolved p -> Error `Cancelled + | Some p when Promise.is_resolved p -> + let cancels = (List.length platforms) - (!cancels_without_running) in + Prometheus.Counter.inc Metrics.request_cancelled_after (float_of_int cancels); + Error `Cancelled | _ -> match !serious_errors with | [] -> Ok results From 08be7f2c038e836f6c2076e5e47c3a030ca009a6 Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Mon, 9 Oct 2023 15:10:44 +0200 Subject: [PATCH 2/5] Update from Thomas's comment --- service/pool.ml | 6 ++---- service/solver.ml | 16 ++++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/service/pool.ml b/service/pool.ml index 26f9963..d717edd 100644 --- a/service/pool.ml +++ b/service/pool.ml @@ -6,9 +6,7 @@ type ('request, 'reply) t = { let rec run_worker t handle = let request, set_reply = Eio.Stream.take t.requests in - Atomic.incr t.running; handle request |> Promise.resolve set_reply; - Atomic.decr t.running; run_worker t handle let create ~sw ~domain_mgr ~n_workers handle = @@ -25,6 +23,6 @@ let use t request = Eio.Stream.add t.requests (request, set_reply); Promise.await reply -let running_workers t = Atomic.get t.running - let n_workers t = t.n_workers + +let wait_requests t = Eio.Stream.length t.requests diff --git a/service/solver.ml b/service/solver.ml index 81739e9..4d3344c 100644 --- a/service/solver.ml +++ b/service/solver.ml @@ -12,6 +12,7 @@ type t = { let ocaml = OpamPackage.Name.of_string "ocaml" + module Metrics = struct open Prometheus @@ -26,13 +27,13 @@ module Metrics = struct let help = "Number of handled requests by state" in Gauge.v_label ~label_name:"state" ~help ~namespace ~subsystem "solve_request_state" - let update_request_handling pool = - let running = Pool.running_workers pool in - let waiting = (Pool.n_workers pool) - running in + let update_request_handling pool n_requests = + let workers = Pool.n_workers pool in + let waiting = Pool.wait_requests pool in + let running = min !n_requests workers in Gauge.set (request_handling "running") (float_of_int running); Gauge.set (request_handling "waiting") (float_of_int waiting) - let request_ok = let help = "Total number of success solve requests" in Counter.v ~help ~namespace ~subsystem "success_solve" @@ -160,6 +161,7 @@ let solve ?cancelled t ~log request = Log.info log "Solving for %a" Fmt.(list ~sep:comma string) root_pkgs; let serious_errors = ref [] in let cancels_without_running = ref 0 in + let n_requests = ref 0 in let*! root_pkgs = parse_opams request.root_pkgs in let*! pinned_pkgs = parse_opams request.pinned_pkgs in let*! packages = Stores.packages t.stores opam_repository_commits in @@ -167,7 +169,8 @@ let solve ?cancelled t ~log request = platforms |> Fiber.List.map (fun (id, vars) -> Prometheus.Counter.inc_one Metrics.request_handling_total; - Metrics.update_request_handling t.pool; + incr n_requests; + Metrics.update_request_handling t.pool n_requests; let result = solve_for_platform t id ?cancelled @@ -179,7 +182,8 @@ let solve ?cancelled t ~log request = ~pins ~vars in - Metrics.update_request_handling t.pool; + decr n_requests; + Metrics.update_request_handling t.pool n_requests; (id, result) ) |> List.filter_map (fun (id, result) -> From fbe5cd92e26dd79b7865f33018b8897b324755a0 Mon Sep 17 00:00:00 2001 From: Alpha Issiaga DIALLO Date: Wed, 18 Oct 2023 15:39:32 +0200 Subject: [PATCH 3/5] Update service/solver.ml solver instead of worker make sense. Co-authored-by: Mark Elvers --- service/solver.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service/solver.ml b/service/solver.ml index 4d3344c..b2451d8 100644 --- a/service/solver.ml +++ b/service/solver.ml @@ -17,7 +17,7 @@ module Metrics = struct open Prometheus let namespace = "ocluster" - let subsystem = "worker" + let subsystem = "solver" let request_handling_total = let help = "Total number of handled solve requests" in From b857d5f56a556a5a2e0b294f10db969cef3b4b1d Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Fri, 20 Oct 2023 16:54:27 +0200 Subject: [PATCH 4/5] Refactoring, fixing the number of waiting requests The pool stream wasn't accumulating the requests. So it was incorrect to consider its length as waiting requests. --- service/pool.ml | 2 +- service/solver.ml | 38 ++++++++++++++++++++------------------ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/service/pool.ml b/service/pool.ml index d717edd..6f5c6d7 100644 --- a/service/pool.ml +++ b/service/pool.ml @@ -10,7 +10,7 @@ let rec run_worker t handle = run_worker t handle let create ~sw ~domain_mgr ~n_workers handle = - let t = { requests = Eio.Stream.create 0; running = Atomic.make 0; n_workers } in + let t = { requests = Eio.Stream.create max_int; running = Atomic.make 0; n_workers } in for _i = 1 to n_workers do Fiber.fork_daemon ~sw (fun () -> Eio.Domain_manager.run domain_mgr (fun () -> run_worker t handle) diff --git a/service/solver.ml b/service/solver.ml index b2451d8..3a6c25d 100644 --- a/service/solver.ml +++ b/service/solver.ml @@ -25,7 +25,7 @@ module Metrics = struct let request_handling = let help = "Number of handled requests by state" in - Gauge.v_label ~label_name:"state" ~help ~namespace ~subsystem "solve_request_state" + Gauge.v_label ~label_name:"state" ~help ~namespace ~subsystem "request_state" let update_request_handling pool n_requests = let workers = Pool.n_workers pool in @@ -36,23 +36,23 @@ module Metrics = struct let request_ok = let help = "Total number of success solve requests" in - Counter.v ~help ~namespace ~subsystem "success_solve" + Counter.v ~help ~namespace ~subsystem "success" let request_fail = let help = "Total number of fail solve requests" in - Counter.v ~help ~namespace ~subsystem "fail_solve" + Counter.v ~help ~namespace ~subsystem "failed" let request_no_solution = let help = "Total number of no solution solve requests " in - Counter.v ~help ~namespace ~subsystem "no_solution_solve" + Counter.v ~help ~namespace ~subsystem "no_solution" - let request_cancelled = - let help = "Total number of cancel without running solve requests" in - Counter.v ~help ~namespace ~subsystem "cancel_without_running_solve" + let request_cancel_waiting = + let help = "Total number of cancel when the requests are waiting" in + Counter.v ~help ~namespace ~subsystem "cancel_waiting" - let request_cancelled_after = - let help = "Total number of cancel when running solve requests" in - Counter.v ~help ~namespace ~subsystem "cancel_when_running_solve" + let request_cancel_running = + let help = "Total number of cancel when the requests are running" in + Counter.v ~help ~namespace ~subsystem "cancel_running" end @@ -160,8 +160,9 @@ let solve ?cancelled t ~log request = in Log.info log "Solving for %a" Fmt.(list ~sep:comma string) root_pkgs; let serious_errors = ref [] in - let cancels_without_running = ref 0 in + let cancel_waiting = ref 0 in let n_requests = ref 0 in + let pool = t.pool in let*! root_pkgs = parse_opams request.root_pkgs in let*! pinned_pkgs = parse_opams request.pinned_pkgs in let*! packages = Stores.packages t.stores opam_repository_commits in @@ -170,7 +171,7 @@ let solve ?cancelled t ~log request = |> Fiber.List.map (fun (id, vars) -> Prometheus.Counter.inc_one Metrics.request_handling_total; incr n_requests; - Metrics.update_request_handling t.pool n_requests; + Metrics.update_request_handling pool n_requests; let result = solve_for_platform t id ?cancelled @@ -182,12 +183,13 @@ let solve ?cancelled t ~log request = ~pins ~vars in - decr n_requests; - Metrics.update_request_handling t.pool n_requests; + Metrics.update_request_handling pool n_requests; (id, result) ) |> List.filter_map (fun (id, result) -> Log.info log "= %s =" id; + decr n_requests; + Metrics.update_request_handling pool n_requests; match result with | Ok result -> Prometheus.Counter.inc_one Metrics.request_ok; @@ -199,8 +201,8 @@ let solve ?cancelled t ~log request = result.Selection.commits; Some result | Error `Cancelled -> - Prometheus.Counter.inc_one Metrics.request_cancelled; - incr cancels_without_running; + Prometheus.Counter.inc_one Metrics.request_cancel_waiting; + incr cancel_waiting; Log.info log "%s" "Cancelled"; None | Error (`No_solution msg) -> @@ -216,8 +218,8 @@ let solve ?cancelled t ~log request = in match cancelled with | Some p when Promise.is_resolved p -> - let cancels = (List.length platforms) - (!cancels_without_running) in - Prometheus.Counter.inc Metrics.request_cancelled_after (float_of_int cancels); + let cancels = (List.length platforms) - (!cancel_waiting) in + Prometheus.Counter.inc Metrics.request_cancel_running (float_of_int cancels); Error `Cancelled | _ -> match !serious_errors with From c790a40c0bb6733f3b8fc65aab583a4069cbfe9f Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Mon, 13 Nov 2023 13:39:03 +0100 Subject: [PATCH 5/5] Add the total time spent finding solutions. --- service/solver.ml | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/service/solver.ml b/service/solver.ml index 3a6c25d..4a4ec85 100644 --- a/service/solver.ml +++ b/service/solver.ml @@ -20,7 +20,7 @@ module Metrics = struct let subsystem = "solver" let request_handling_total = - let help = "Total number of handled solve requests" in + let help = "Total number of handled requests" in Counter.v ~help ~namespace ~subsystem "requests_handled_total" let request_handling = @@ -32,18 +32,19 @@ module Metrics = struct let waiting = Pool.wait_requests pool in let running = min !n_requests workers in Gauge.set (request_handling "running") (float_of_int running); - Gauge.set (request_handling "waiting") (float_of_int waiting) + Gauge.set (request_handling "waiting") (float_of_int waiting); + Gauge.set (request_handling "capacity") (float_of_int workers) let request_ok = - let help = "Total number of success solve requests" in + let help = "Total number of success requests" in Counter.v ~help ~namespace ~subsystem "success" let request_fail = - let help = "Total number of fail solve requests" in + let help = "Total number of fail requests" in Counter.v ~help ~namespace ~subsystem "failed" let request_no_solution = - let help = "Total number of no solution solve requests " in + let help = "Total number of no solution requests " in Counter.v ~help ~namespace ~subsystem "no_solution" let request_cancel_waiting = @@ -54,6 +55,9 @@ module Metrics = struct let help = "Total number of cancel when the requests are running" in Counter.v ~help ~namespace ~subsystem "cancel_running" + let request_timing = + let help = "Total time spent finding solutions" in + Summary.v ~help ~namespace ~subsystem "timing" end (* If a local package has a literal constraint on OCaml's version and it doesn't match @@ -96,9 +100,11 @@ let solve_for_platform ?cancelled t ~log ~opam_repository_commits ~packages ~roo | Ok (results, time) -> match results with | Error e -> + Prometheus.Summary.observe Metrics.request_timing time; Log.info log "%s: eliminated all possibilities in %.2f s" id time; Error (`No_solution e) | Ok packages -> + Prometheus.Summary.observe Metrics.request_timing time; Log.info log "%s: found solution in %.2f s" id time; let repo_packages = packages