Skip to content

Commit b4bf111

Browse files
committed
background task to time out incomplete audit logs
1 parent 8f06de6 commit b4bf111

21 files changed

Lines changed: 782 additions & 12 deletions

File tree

dev-tools/omdb/src/bin/omdb/nexus.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ use nexus_types::deployment::OximeterReadPolicy;
5252
use nexus_types::fm;
5353
use nexus_types::internal_api::background::AbandonedVmmReaperStatus;
5454
use nexus_types::internal_api::background::AttachedSubnetManagerStatus;
55+
use nexus_types::internal_api::background::AuditLogTimeoutIncompleteStatus;
5556
use nexus_types::internal_api::background::BlueprintPlannerStatus;
5657
use nexus_types::internal_api::background::BlueprintRendezvousStats;
5758
use nexus_types::internal_api::background::BlueprintRendezvousStatus;
@@ -1220,6 +1221,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) {
12201221
"attached_subnet_manager" => {
12211222
print_task_attached_subnet_manager_status(details);
12221223
}
1224+
"audit_log_timeout_incomplete" => {
1225+
print_task_audit_log_timeout_incomplete(details);
1226+
}
12231227
"blueprint_planner" => {
12241228
print_task_blueprint_planner(details);
12251229
}
@@ -2679,6 +2683,38 @@ fn print_task_saga_recovery(details: &serde_json::Value) {
26792683
}
26802684
}
26812685

2686+
/// Prints the status details reported by the `audit_log_timeout_incomplete`
/// background task.
///
/// `details` is the task's status as raw JSON. If it cannot be deserialized
/// as an `AuditLogTimeoutIncompleteStatus`, a warning containing both the
/// error and the raw value is written to stderr and nothing else is printed.
/// Otherwise the `timed_out`, `cutoff` and `max_update_per_activation` fields
/// are printed to stdout, one per line, followed by `error` only when it is
/// present.
fn print_task_audit_log_timeout_incomplete(details: &serde_json::Value) {
2687+
match serde_json::from_value::<AuditLogTimeoutIncompleteStatus>(
2688+
// `from_value` takes the value by ownership; clone so that `details` is
// still available for the warning message below.
details.clone(),
2689+
) {
2690+
Err(error) => eprintln!(
2691+
"warning: failed to interpret task details: {:?}: {:?}",
2692+
error, details
2693+
),
2694+
Ok(status) => {
2695+
// Labels for each printed field; kept as constants so the column width
// can be computed at compile time.
const TIMED_OUT: &str = "timed_out:";
2696+
const CUTOFF: &str = "cutoff:";
2697+
const MAX_UPDATE: &str = "max_update_per_activation:";
2698+
const ERROR: &str = "error:";
2699+
// Common left-aligned label width used by every `println!` below.
// NOTE(review): assumes `const_max_len` (defined elsewhere) returns the
// length of the longest label, so the `+ 1` leaves one space before the
// value -- confirm.
const WIDTH: usize =
2700+
const_max_len(&[TIMED_OUT, CUTOFF, MAX_UPDATE, ERROR]) + 1;
2701+
2702+
println!(" {TIMED_OUT:<WIDTH$}{}", status.timed_out);
2703+
println!(
2704+
" {CUTOFF:<WIDTH$}{}",
2705+
// NOTE(review): presumably `cutoff` is a chrono `DateTime`, in which case
// this renders RFC 3339 with automatically chosen sub-second precision
// and a `Z` suffix -- confirm against the status type.
status.cutoff.to_rfc3339_opts(SecondsFormat::AutoSi, true),
2706+
);
2707+
println!(
2708+
" {MAX_UPDATE:<WIDTH$}{}",
2709+
status.max_update_per_activation
2710+
);
2711+
// The error line is omitted entirely when the task reported no error.
if let Some(error) = &status.error {
2712+
println!(" {ERROR:<WIDTH$}{error}");
2713+
}
2714+
}
2715+
};
2716+
}
2717+
26822718
fn print_task_session_cleanup(details: &serde_json::Value) {
26832719
match serde_json::from_value::<SessionCleanupStatus>(details.clone()) {
26842720
Err(error) => eprintln!(

dev-tools/omdb/tests/env.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ task: "attached_subnet_manager"
3838
distributes attached subnets to sleds and switch
3939

4040

41+
task: "audit_log_timeout_incomplete"
42+
transitions stale incomplete audit log entries to timeout status so they
43+
become visible in the audit log
44+
45+
4146
task: "bfd_manager"
4247
Manages bidirectional fowarding detection (BFD) configuration on rack
4348
switches
@@ -283,6 +288,11 @@ task: "attached_subnet_manager"
283288
distributes attached subnets to sleds and switch
284289

285290

291+
task: "audit_log_timeout_incomplete"
292+
transitions stale incomplete audit log entries to timeout status so they
293+
become visible in the audit log
294+
295+
286296
task: "bfd_manager"
287297
Manages bidirectional fowarding detection (BFD) configuration on rack
288298
switches
@@ -515,6 +525,11 @@ task: "attached_subnet_manager"
515525
distributes attached subnets to sleds and switch
516526

517527

528+
task: "audit_log_timeout_incomplete"
529+
transitions stale incomplete audit log entries to timeout status so they
530+
become visible in the audit log
531+
532+
518533
task: "bfd_manager"
519534
Manages bidirectional fowarding detection (BFD) configuration on rack
520535
switches

dev-tools/omdb/tests/successes.out

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,11 @@ task: "attached_subnet_manager"
273273
distributes attached subnets to sleds and switch
274274

275275

276+
task: "audit_log_timeout_incomplete"
277+
transitions stale incomplete audit log entries to timeout status so they
278+
become visible in the audit log
279+
280+
276281
task: "bfd_manager"
277282
Manages bidirectional fowarding detection (BFD) configuration on rack
278283
switches
@@ -588,6 +593,14 @@ task: "attached_subnet_manager"
588593
no dendrite instances found
589594
no sleds found
590595

596+
task: "audit_log_timeout_incomplete"
597+
configured period: every <REDACTED_DURATION>m
598+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
599+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
600+
timed_out: 0
601+
cutoff: <REDACTED_TIMESTAMP>
602+
max_update_per_activation: 1000
603+
591604
task: "bfd_manager"
592605
configured period: every <REDACTED_DURATION>s
593606
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
@@ -1183,6 +1196,14 @@ task: "attached_subnet_manager"
11831196
no dendrite instances found
11841197
no sleds found
11851198

1199+
task: "audit_log_timeout_incomplete"
1200+
configured period: every <REDACTED_DURATION>m
1201+
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>
1202+
started at <REDACTED_TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
1203+
timed_out: 0
1204+
cutoff: <REDACTED_TIMESTAMP>
1205+
max_update_per_activation: 1000
1206+
11861207
task: "bfd_manager"
11871208
configured period: every <REDACTED_DURATION>s
11881209
last completed activation: <REDACTED ITERATIONS>, triggered by <TRIGGERED_BY_REDACTED>

nexus-config/src/nexus_config.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,8 @@ pub struct BackgroundTaskConfig {
437437
pub attached_subnet_manager: AttachedSubnetManagerConfig,
438438
/// configuration for console session cleanup task
439439
pub session_cleanup: SessionCleanupConfig,
440+
/// configuration for audit log incomplete timeout task
441+
pub audit_log_timeout_incomplete: AuditLogTimeoutIncompleteConfig,
440442
}
441443

442444
#[serde_as]
@@ -450,6 +452,21 @@ pub struct SessionCleanupConfig {
450452
pub max_delete_per_activation: u32,
451453
}
452454

455+
/// Configuration for the background task that times out incomplete audit log
/// entries.
///
/// Both durations are (de)serialized as a whole number of seconds, e.g.
/// `audit_log_timeout_incomplete.period_secs = 600` in the TOML config.
#[serde_as]
456+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
457+
pub struct AuditLogTimeoutIncompleteConfig {
458+
/// period (in seconds) for periodic activations of this task
459+
#[serde_as(as = "DurationSeconds<u64>")]
460+
pub period_secs: Duration,
461+
462+
/// how old an incomplete entry must be before it is timed out
/// (expressed in seconds in the config file)
463+
#[serde_as(as = "DurationSeconds<u64>")]
464+
pub timeout_secs: Duration,
465+
466+
/// max rows per SQL statement
///
/// NOTE(review): the field name suggests this is also the cap on rows
/// updated in a single activation -- confirm against the task
/// implementation.
467+
pub max_update_per_activation: u32,
468+
}
469+
453470
#[serde_as]
454471
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
455472
pub struct DnsTasksConfig {
@@ -1267,6 +1284,9 @@ mod test {
12671284
attached_subnet_manager.period_secs = 60
12681285
session_cleanup.period_secs = 300
12691286
session_cleanup.max_delete_per_activation = 10000
1287+
audit_log_timeout_incomplete.period_secs = 600
1288+
audit_log_timeout_incomplete.timeout_secs = 14400
1289+
audit_log_timeout_incomplete.max_update_per_activation = 1000
12701290
[default_region_allocation_strategy]
12711291
type = "random"
12721292
seed = 0
@@ -1534,6 +1554,12 @@ mod test {
15341554
period_secs: Duration::from_secs(300),
15351555
max_delete_per_activation: 10_000,
15361556
},
1557+
audit_log_timeout_incomplete:
1558+
AuditLogTimeoutIncompleteConfig {
1559+
period_secs: Duration::from_secs(600),
1560+
timeout_secs: Duration::from_secs(14400),
1561+
max_update_per_activation: 1000,
1562+
},
15371563
},
15381564
multicast: MulticastConfig { enabled: false },
15391565
default_region_allocation_strategy:
@@ -1641,6 +1667,9 @@ mod test {
16411667
attached_subnet_manager.period_secs = 60
16421668
session_cleanup.period_secs = 300
16431669
session_cleanup.max_delete_per_activation = 10000
1670+
audit_log_timeout_incomplete.period_secs = 600
1671+
audit_log_timeout_incomplete.timeout_secs = 14400
1672+
audit_log_timeout_incomplete.max_update_per_activation = 1000
16441673
16451674
[default_region_allocation_strategy]
16461675
type = "random"

nexus/background-task-interface/src/init.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ pub struct BackgroundTasks {
3737
pub task_instance_reincarnation: Activator,
3838
pub task_service_firewall_propagation: Activator,
3939
pub task_abandoned_vmm_reaper: Activator,
40+
pub task_audit_log_timeout_incomplete: Activator,
4041
pub task_vpc_route_manager: Activator,
4142
pub task_saga_recovery: Activator,
4243
pub task_lookup_region_port: Activator,

nexus/db-model/src/audit_log.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -276,15 +276,15 @@ pub enum AuditLogCompletion {
276276
/// error, and I don't think we even have API timeouts) but rather that the
277277
/// attempts to complete the log entry failed (or were never even attempted
278278
/// because, e.g., Nexus crashed during the operation), and this entry had
279-
/// to be cleaned up later by a background job (which doesn't exist yet)
280-
/// after a timeout. Note we represent this result status as "Unknown" in
281-
/// the external API because timeout is an implementation detail and makes
282-
/// it sound like the operation timed out.
279+
/// to be cleaned up later by a background job after a timeout. Note we
280+
/// represent this result status as "Unknown" in the external API because
281+
/// timeout is an implementation detail and makes it sound like the
282+
/// operation timed out.
283283
Timeout,
284284
}
285285

286286
#[derive(AsChangeset, Clone)]
287-
#[diesel(table_name = audit_log)]
287+
#[diesel(table_name = audit_log, treat_none_as_null = true)]
288288
pub struct AuditLogCompletionUpdate {
289289
pub time_completed: DateTime<Utc>,
290290
pub result_kind: AuditLogResultKind,

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(237, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(238, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(238, "audit-log-incomplete-timeout"),
3132
KnownVersion::new(237, "switch-slot-enum"),
3233
KnownVersion::new(
3334
236,

0 commit comments

Comments
 (0)