Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions ci/nightly/pipeline.template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2374,6 +2374,19 @@ steps:
agents:
queue: hetzner-aarch64-16cpu-32gb

- id: orchestratord-revert-rollout
label: Orchestratord revert in-progress rollout
artifact_paths: ["mz_debug_*.zip"]
depends_on: devel-docker-tags
timeout_in_minutes: 60
plugins:
- ./ci/plugins/mzcompose:
composition: orchestratord
run: revert-rollout
ci-builder: stable
agents:
queue: hetzner-aarch64-16cpu-32gb

- id: orchestratord-upgrade-operator
label: Orchestratord operator upgrade
artifact_paths: ["mz_debug_*.zip"]
Expand Down
17 changes: 13 additions & 4 deletions doc/user/content/self-managed-deployments/upgrading/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,19 +273,28 @@ kubectl logs -l app.kubernetes.io/name=materialize-operator -n materialize

You may want to cancel an in-progress rollout if the upgrade has failed. This may be indicated by new pods not being healthy. Before cancelling, verify that the upgrade has not already completed by checking that the deploy generation (found via `status.activeGeneration`) is still the one from before the upgrade. Once an upgrade has already happened, you cannot revert using this method.

To cancel an in-progress rollout and revert to the last completed rollout state, first retrieve the last rollout request ID from your Materialize CR:
To cancel an in-progress rollout and revert to the last completed rollout
state, revert both `requestRollout` and `environmentdImageRef` back to the
values from the last completed rollout. Reverting `environmentdImageRef`
alongside `requestRollout` keeps the spec aligned with what is actually
running, so a later rollout doesn't accidentally pick up the previously
attempted upgrade image.

First, retrieve the last completed rollout request ID and the matching
environmentd image ref from your Materialize CR:

```shell
kubectl get materialize <instance-name> -n materialize-environment -o jsonpath='{.status.lastCompletedRolloutRequest}'
kubectl get materialize <instance-name> -n materialize-environment \
-o jsonpath='{.status.lastCompletedRolloutRequest} {.status.lastCompletedRolloutEnvironmentdImageRef}'
```

Then, set the `requestRollout` back to this value:
Then, set both fields back to these values in a single patch:

```shell
kubectl patch materialize <instance-name> \
-n materialize-environment \
--type='merge' \
-p "{\"spec\": {\"requestRollout\": \"<lastCompletedRolloutRequest-value>\"}}"
-p "{\"spec\": {\"requestRollout\": \"<lastCompletedRolloutRequest-value>\", \"environmentdImageRef\": \"<lastCompletedRolloutEnvironmentdImageRef-value>\"}}"
```

## Version Specific Upgrade Notes
Expand Down
75 changes: 75 additions & 0 deletions src/cloud-resources/src/crd/materialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -414,6 +414,20 @@ pub mod v1alpha1 {
.map_or_else(Uuid::nil, |status| status.last_completed_rollout_request)
}

/// Returns the environmentd image ref of the currently-active
/// generation: the image of the last completed rollout, falling back
/// to the spec image when no rollout has completed yet. Downstream
/// resources (balancerd, console) should track this rather than
/// [`MaterializeSpec::environmentd_image_ref`] so they stay aligned
/// with the running environmentd when the spec is mid-rollout or has
/// been partially reverted (DEP-42).
pub fn active_environmentd_image_ref(&self) -> &str {
self.status
.as_ref()
.and_then(|s| s.last_completed_rollout_environmentd_image_ref.as_deref())
.unwrap_or(&self.spec.environmentd_image_ref)
}

pub fn set_force_promote(&mut self) {
self.spec.force_promote = self.spec.request_rollout;
}
Expand Down Expand Up @@ -824,4 +838,65 @@ mod tests {
);
}
}

#[mz_ore::test]
fn active_environmentd_image_ref() {
use super::v1alpha1::MaterializeStatus;

const OLD: &str = "materialize/environmentd:v26.0.0";
const NEW: &str = "materialize/environmentd:v27.0.0";

let mz_with = |spec_image: &str, status: Option<MaterializeStatus>| Materialize {
spec: MaterializeSpec {
environmentd_image_ref: spec_image.to_owned(),
..Default::default()
},
metadata: ObjectMeta::default(),
status,
};

// No status yet (pre-initial-reconcile): fall back to spec.
let mz = mz_with(NEW, None);
assert_eq!(mz.active_environmentd_image_ref(), NEW);

// Status present but last_completed_rollout_environmentd_image_ref
// unset (e.g. resource upgraded from older orchestratord that didn't
// populate the field): fall back to spec.
let mz = mz_with(
NEW,
Some(MaterializeStatus {
last_completed_rollout_environmentd_image_ref: None,
..Default::default()
}),
);
assert_eq!(mz.active_environmentd_image_ref(), NEW);

// Steady state: spec image == last completed image. Either source is
// fine; the method must return that image.
let mz = mz_with(
NEW,
Some(MaterializeStatus {
last_completed_rollout_environmentd_image_ref: Some(NEW.to_owned()),
..Default::default()
}),
);
assert_eq!(mz.active_environmentd_image_ref(), NEW);

// DEP-42 / mid-rollout: spec image == NEW but last_completed_* still
// holds OLD — either because the user canceled the rollout by
// reverting only requestRollout, or because the new generation has
// not yet been promoted. The active environmentd is still OLD, so
// downstream resources must track OLD. Without this method,
// balancerd would inherit the spec's NEW image while environmentd
// still runs OLD, leaving balancerd pods skewed from the running
// env.
let mz = mz_with(
NEW,
Some(MaterializeStatus {
last_completed_rollout_environmentd_image_ref: Some(OLD.to_owned()),
..Default::default()
}),
);
assert_eq!(mz.active_environmentd_image_ref(), OLD);
}
}
7 changes: 4 additions & 3 deletions src/orchestratord/src/controller/materialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,7 @@ impl k8s_controller::Context for Context {
metadata: mz.managed_resource_meta(mz.name_unchecked()),
spec: BalancerSpec {
balancerd_image_ref: matching_image_from_environmentd_image_ref(
&mz.spec.environmentd_image_ref,
mz.active_environmentd_image_ref(),
"balancerd",
None,
),
Expand Down Expand Up @@ -759,8 +759,9 @@ impl k8s_controller::Context for Context {
// enforced by wait_for_balancer

if self.config.create_console {
let active_environmentd_image_ref = mz.active_environmentd_image_ref();
let environmentd_image_tag =
parse_image_tag(&mz.spec.environmentd_image_ref).unwrap_or("latest");
parse_image_tag(active_environmentd_image_ref).unwrap_or("latest");
let console_image_tag = self
.config
.console_image_tag_map
Expand All @@ -772,7 +773,7 @@ impl k8s_controller::Context for Context {
metadata: mz.managed_resource_meta(mz.name_unchecked()),
spec: ConsoleSpec {
console_image_ref: matching_image_from_environmentd_image_ref(
&mz.spec.environmentd_image_ref,
active_environmentd_image_ref,
"console",
Some(&console_image_tag),
),
Expand Down
185 changes: 185 additions & 0 deletions test/orchestratord/mzcompose.py
Original file line number Diff line number Diff line change
Expand Up @@ -2396,6 +2396,191 @@ def check():
check_balancerd_version(versions[-1])


def workflow_revert_rollout(c: Composition, parser: WorkflowArgumentParser) -> None:
# Regression test for DEP-42: if a user starts an upgrade and then cancels
# by reverting only `spec.requestRollout` without also reverting
# `spec.environmentdImageRef`, the spec image stays ahead of the image
# actually running in environmentd. Downstream resources (balancerd,
# console) must continue tracking the last completed rollout's image
# rather than the diverged spec image — otherwise they end up
# version-skewed from environmentd.
#
# We exercise this end-to-end by initial-deploying on a prior released
# version, then starting a `ManuallyPromote` upgrade to the current
# build's image and parking it at `ReadyToPromote` (never promoting it),
# and finally reverting only `requestRollout`.
parser.add_argument(
"--recreate-cluster",
action=argparse.BooleanOptionalAction,
help="Recreate cluster if it exists already",
)
parser.add_argument("--tag", type=str, help="Custom version tag to use")
parser.add_argument(
"--orchestratord-override",
default=True,
action=argparse.BooleanOptionalAction,
help="Override orchestratord tag",
)
args = parser.parse_args()

def get_cr(plural: str) -> dict[str, Any]:
# orchestratord creates the Balancer and Console CRs after writing
# `lastCompletedRolloutRequest`, so `post_run_check` can return
# before they exist. Retry until the resource shows up.
result: dict[str, Any] = {}

def fetch() -> None:
data = json.loads(
spawn.capture(
[
"kubectl",
"get",
plural,
"-n",
"materialize-environment",
"-o",
"json",
]
)
)
assert len(data["items"]) >= 1, f"{plural} not yet present"
result["item"] = data["items"][0]

retry(fetch, 120)
return result["item"]

definition = setup(c, args)

# Initial deploy on a prior released version, so the upgrade target
# (current build) is a real, pullable, different image. orchestratord
# itself stays on the current build — that's the binary whose fix is
# under test.
initial_version = get_all_self_managed_versions()[-1]
initial_image = get_image(
c.compose["services"]["environmentd"]["image"],
str(initial_version),
)
upgrade_image = get_image(
c.compose["services"]["environmentd"]["image"],
args.tag,
)
assert upgrade_image != initial_image, (
"test setup invariant: initial and upgrade env images must differ "
f"(both are {initial_image!r})"
)

definition["materialize"]["spec"]["environmentdImageRef"] = initial_image
init(definition)
run(definition, False)

initial_mz = get_cr("materializes")
initial_request = initial_mz["spec"]["requestRollout"]
assert initial_mz["status"]["lastCompletedRolloutRequest"] == initial_request
assert (
initial_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"]
== initial_image
), (
"status.lastCompletedRolloutEnvironmentdImageRef was not populated "
"with the rolled-out image"
)
initial_balancerd_image_ref = get_cr("balancers")["spec"]["balancerdImageRef"]
initial_console_image_ref = get_cr("consoles")["spec"]["consoleImageRef"]

# Kick off a real ManuallyPromote upgrade to the current build's image
# and leave it parked at ReadyToPromote. At this point the new-generation
# environmentd is up and running, but the active environmentd is still
# the initial-version pod, so `status.lastCompletedRollout*` must not
# have advanced.
definition["materialize"]["spec"]["rolloutStrategy"] = "ManuallyPromote"
definition["materialize"]["spec"]["environmentdImageRef"] = upgrade_image
definition["materialize"]["spec"]["requestRollout"] = str(uuid.uuid4())
spawn.runv(
["kubectl", "apply", "-f", "-"],
stdin=yaml.dump_all(
[
definition["namespace"],
definition["secret"],
definition["materialize"],
]
).encode(),
)
for _ in range(900):
time.sleep(1)
if is_ready_to_manually_promote():
break
else:
spawn.runv(
[
"kubectl",
"get",
"materializes",
"-n",
"materialize-environment",
"-o",
"yaml",
],
)
raise RuntimeError("upgrade never became ready for manual promotion")

parked_mz = get_cr("materializes")
assert parked_mz["status"]["lastCompletedRolloutRequest"] == initial_request
assert (
parked_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"] == initial_image
)

# Even mid-rollout (spec.envImageRef = upgrade, status = initial),
# downstream CRs must track the active image, not the spec image.
parked_balancerd_image_ref = get_cr("balancers")["spec"]["balancerdImageRef"]
assert parked_balancerd_image_ref == initial_balancerd_image_ref, (
f"Balancer CR balancerdImageRef drifted during a parked upgrade: "
f"{initial_balancerd_image_ref!r} -> {parked_balancerd_image_ref!r}"
)
parked_console_image_ref = get_cr("consoles")["spec"]["consoleImageRef"]
assert parked_console_image_ref == initial_console_image_ref, (
f"Console CR consoleImageRef drifted during a parked upgrade: "
f"{initial_console_image_ref!r} -> {parked_console_image_ref!r}"
)

# Cancel the upgrade by reverting only `requestRollout`. Deliberately
# leave `environmentdImageRef` pointed at the upgrade image — that's the
# DEP-42 scenario: spec image and running image diverge.
definition["materialize"]["spec"]["requestRollout"] = initial_request
spawn.runv(
["kubectl", "apply", "-f", "-"],
stdin=yaml.dump(definition["materialize"]).encode(),
)

def check_revert_applied() -> None:
mz = get_cr("materializes")
assert mz["spec"]["requestRollout"] == initial_request
assert mz["spec"]["environmentdImageRef"] == upgrade_image

retry(check_revert_applied, 60)

# Let orchestratord reconcile several times after the revert.
time.sleep(30)

final_mz = get_cr("materializes")
assert final_mz["status"]["lastCompletedRolloutRequest"] == initial_request
assert (
final_mz["status"]["lastCompletedRolloutEnvironmentdImageRef"] == initial_image
), "lastCompletedRolloutEnvironmentdImageRef should not change without a completed rollout"

final_balancerd_image_ref = get_cr("balancers")["spec"]["balancerdImageRef"]
assert final_balancerd_image_ref == initial_balancerd_image_ref, (
f"Balancer CR balancerdImageRef tracked diverged spec image instead "
f"of the last completed rollout image: "
f"{initial_balancerd_image_ref!r} -> {final_balancerd_image_ref!r}"
)

final_console_image_ref = get_cr("consoles")["spec"]["consoleImageRef"]
assert final_console_image_ref == initial_console_image_ref, (
f"Console CR consoleImageRef tracked diverged spec image instead of "
f"the last completed rollout image: "
f"{initial_console_image_ref!r} -> {final_console_image_ref!r}"
)


def workflow_default(c: Composition, parser: WorkflowArgumentParser) -> None:
parser.add_argument(
"--recreate-cluster",
Expand Down
Loading