diff --git a/chart/templates/skyhook-crd.yaml b/chart/templates/skyhook-crd.yaml index dece8b0e..2f761e59 100644 --- a/chart/templates/skyhook-crd.yaml +++ b/chart/templates/skyhook-crd.yaml @@ -536,20 +536,30 @@ spec: type: object type: object x-kubernetes-map-type: atomic - runtimeRequired: - default: false - description: This skyhook is required to have been completed before - any workloads can start - type: boolean priority: description: Priority determines the order in which skyhooks are applied. Lower values are applied first. type: integer minimum: 1 default: 200 + runtimeRequired: + default: false + description: This skyhook is required to have been completed before + any workloads can start + type: boolean + sequencing: + default: node + description: |- + Sequencing controls whether priority ordering is enforced globally or per-node. + "node" (default): a node can proceed past this skyhook independently once it completes on that node. + "all": all nodes must complete this skyhook before any node starts the next priority. + enum: + - all + - node + type: string serial: default: false - description: Serial tells skyhook if it allowed to run in parallel or - not when applying packages + description: Serial tells skyhook if it is allowed to run packages + in parallel. If true, the operator will run one package at a time. type: boolean type: object status: diff --git a/docs/ordering_of_skyhooks.md b/docs/ordering_of_skyhooks.md index 645435ef..f184b13d 100644 --- a/docs/ordering_of_skyhooks.md +++ b/docs/ordering_of_skyhooks.md @@ -1,42 +1,102 @@ # Ordering of Skyhooks -## What -Skyhooks are applied in a repeatable and specific order based on their `priority` field. Each custom resource supports a `priority` field which is a non-zero positive integer. Skyhooks will be processed in order starting from 1, any Skyhooks with the same `priority` will be processed by sorting them by their `metadata.name` field. 
+ +## Priority + +Skyhooks are applied in a repeatable and specific order based on their `priority` field. Each custom resource supports a `priority` field which is a non-zero positive integer. Skyhooks will be processed in order starting from 1. Skyhooks with the same `priority` are processed by sorting on their `metadata.name` field. **NOTE**: Any Skyhook which does NOT provide a `priority` field will be assigned a priority value of 200. -## Per-Node Ordering +--- + +## Sequencing + +The `sequencing` field on each Skyhook controls how it gates the next priority level. This determines whether nodes progress independently or must synchronize. + +### `sequencing: node` (default) + +Per-node ordering. A node proceeds past this skyhook independently once it completes on that node. Other nodes do not need to finish first. + +``` +Node A completes Skyhook 1 → Node A immediately starts Skyhook 2 +Node B still on Skyhook 1 → Node B shows "waiting" on Skyhook 2 +Node A completes Skyhook 2 → Node A is fully complete +Node B completes Skyhook 1 → Node B starts Skyhook 2 +``` + +This prevents deadlocks where stuck or bad nodes block healthy nodes from progressing. + +### `sequencing: all` + +Global ordering. **ALL** nodes must complete this skyhook before **ANY** node starts the next priority level. Use this when the next priority depends on every node being at the same stage (e.g., cluster-wide configuration that must be applied everywhere before proceeding). + +```yaml +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cluster-config +spec: + priority: 10 + sequencing: all # all nodes must finish before priority 11+ starts + ... +``` + +``` +Node A completes cluster-config → Node A waits +Node B still on cluster-config → both nodes blocked from priority 11 +Node B completes cluster-config → both nodes start priority 11 +``` + +When a skyhook with `sequencing: all` is not yet globally complete, it shows `waiting` status at the skyhook level. 
Individual nodes inherit this waiting state rather than being evaluated independently. -**Important**: Priority ordering is enforced **per-node**, not globally across all nodes. This means: -- Node A can proceed to Skyhook 2 as soon as Skyhook 1 completes on Node A -- Node A does NOT wait for Node B to complete Skyhook 1 -- If Node B is stuck on Skyhook 1, Node A can still progress through all its skyhooks +### Mixing modes -This per-node behavior prevents deadlocks where a few stuck/bad nodes would block all other healthy nodes from progressing through their skyhook sequence. +Different skyhooks can use different sequencing modes. A skyhook's `sequencing` field determines how **it** gates the next priority: -### Example -With two nodes (A, B) and two skyhooks (priority 1 and priority 2): -- Node A completes Skyhook 1 → Node A immediately starts Skyhook 2 -- Node B is still processing Skyhook 1 → Node B shows "waiting" status on Skyhook 2 -- Node A completes Skyhook 2 → Node A is fully complete -- Node B eventually completes Skyhook 1 → Node B starts Skyhook 2 +``` +Priority 1: driver-install (sequencing: node) ← nodes progress independently +Priority 2: cluster-config (sequencing: all) ← sync point: all must finish +Priority 3: workload-setup (sequencing: node) ← resumes per-node after sync +``` + +In this example, fast nodes can install drivers independently, but all nodes must complete the cluster config before any node starts workload setup. + +### Caution: Deadlock risks + +**`sequencing: all` + `runtimeRequired: true`** — This combination can deadlock your cluster. With `runtimeRequired`, nodes are tainted until the skyhook completes, preventing workloads from scheduling. With `sequencing: all`, every node must complete before any node moves to the next priority. If a single node fails (unhealthy, can't schedule pods, bad hardware), all nodes remain tainted and blocked indefinitely. 
New nodes joining the cluster with the same selector will also be tainted and must complete before the gate releases — if those nodes aren't healthy, the deadlock worsens. + +**`sequencing: all` with unreliable packages** — Even without `runtimeRequired`, `sequencing: all` means one stuck node blocks all nodes from progressing to the next priority. If your package has a bug or a node has an issue that prevents completion, the entire rollout stalls. Prefer `sequencing: node` (the default) unless you have a strong reason to require cluster-wide synchronization. + +**`runtimeRequired: true` with untested packages** — Since `runtimeRequired` leaves nodes tainted until the skyhook completes, a broken package image or misconfigured package will leave nodes tainted and unable to run workloads. Always test packages on a small node group first before applying with `runtimeRequired` to your full cluster. + +--- ## Flow Control Annotations Two flow control features can be set in the annotations of each skyhook: - * `skyhook.nvidia.com/disable`: bool. When `true` it will skip this Skyhook from processing and continue with any other ones further down the priority order. - * `skyhook.nvidia.com/pause`: bool. When `true` it will NOT process this Skyhook and it WILL NOT continue to process any Skyhook's after this one on that node. This will effectively stop all application of Skyhooks starting with this one. NOTE: This ability used to be on the Skyhook spec itself as the `pause` field and has been moved here to be consistent with `disable` and to avoid incrementing the generation of a Skyhook Custom Resource instance when changing it. + +- `skyhook.nvidia.com/disable`: bool. When `true`, skips this Skyhook from processing and continues with any others further down the priority order. +- `skyhook.nvidia.com/pause`: bool. When `true`, does NOT process this Skyhook and will NOT continue to process any Skyhooks after this one on that node. 
This effectively stops all application of Skyhooks starting with this one. + +**NOTE**: `pause` was previously on the Skyhook spec and has been moved to annotations to be consistent with `disable` and to avoid incrementing the generation when toggling it. + +--- + +## Recommended Priority Buckets + +To coordinate work without explicit communication, we recommend bucketing Skyhooks by priority range: + +| Range | Purpose | Examples | +|-------|---------|----------| +| 1–99 | Initialization and infrastructure | Security tools, monitoring agents | +| 100–199 | Configuration | SSH access, network settings | +| 200+ | User-level configuration | Workload tuning, application setup | + +--- ## Why -This solves a few problems: -The first is to to better support debugging. Prior to this it was impossible to know the order Skyhooks would get applied to nodes as they would all run in parallel. This can, and has, lead to issues debugging a problem as it isn't deterministic. Now every node will always receive updates in the same order as every other node. Additionaly, this removes the possiblility of conflicts between Skyhooks by heaving each one run in order. +**Deterministic ordering** — Prior to priority ordering, Skyhooks ran in parallel with no deterministic order. This made debugging difficult since different nodes could receive updates in different sequences. Priority ordering ensures every node processes Skyhooks in the same order. -The second is to provide the ability for complex tasks to be sequenced. This comes up when needing to apply different sets of work to different node groups in a particular order. +**Complex sequencing** — Some workflows require applying different sets of work to different node groups in a particular order. Priority ordering with `sequencing: all` enables cluster-wide synchronization points. 
-The third is to provide the community a way to bucket Skyhooks according to where they might live in a stream of updates and therefore better coordinate work without explicit communication. We propose the following buckets: - * 1 - 99 for initialization and infrastucture work - * install security or monitoring tools - * 100 - 199 for configuration work - * configuring ssh access - * 200+ for final user level configuration - * applying tuning for workloads \ No newline at end of file +**Community coordination** — Priority buckets provide a shared convention so different teams can coordinate Skyhook ordering without direct communication. diff --git a/k8s-tests/chainsaw/cli/deployment-policy/chainsaw-test.yaml b/k8s-tests/chainsaw/cli/deployment-policy/chainsaw-test.yaml index 47ff8b8e..ddb8cae7 100644 --- a/k8s-tests/chainsaw/cli/deployment-policy/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/cli/deployment-policy/chainsaw-test.yaml @@ -21,8 +21,8 @@ metadata: name: cli-deployment-policy-reset spec: timeouts: - assert: 120s - exec: 90s + assert: 150s + exec: 30s steps: # Step 0: Reset state and label worker nodes - name: reset-state diff --git a/k8s-tests/chainsaw/cli/deployment-policy/deployment-policy.yaml b/k8s-tests/chainsaw/cli/deployment-policy/deployment-policy.yaml index 83a2f6ac..83661608 100644 --- a/k8s-tests/chainsaw/cli/deployment-policy/deployment-policy.yaml +++ b/k8s-tests/chainsaw/cli/deployment-policy/deployment-policy.yaml @@ -19,7 +19,7 @@ kind: DeploymentPolicy metadata: name: cli-dp-reset-policy spec: - resetBatchStateOnCompletion: true + resetBatchStateOnCompletion: true # Skyhook override (false) should take precedence default: budget: diff --git a/k8s-tests/chainsaw/cli/deployment-policy/skyhook.yaml b/k8s-tests/chainsaw/cli/deployment-policy/skyhook.yaml index db037266..e2e5135c 100644 --- a/k8s-tests/chainsaw/cli/deployment-policy/skyhook.yaml +++ b/k8s-tests/chainsaw/cli/deployment-policy/skyhook.yaml @@ -31,7 +31,7 @@ spec: packages: 
test-package: - version: "1.0.0" + version: "1.1.1" image: ghcr.io/nvidia/skyhook-packages/shellscript configMap: apply.sh: | diff --git a/k8s-tests/chainsaw/deployment-policy/batch-state-reset/chainsaw-test.yaml b/k8s-tests/chainsaw/deployment-policy/batch-state-reset/chainsaw-test.yaml index 2913c8a0..8c66f38c 100644 --- a/k8s-tests/chainsaw/deployment-policy/batch-state-reset/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/deployment-policy/batch-state-reset/chainsaw-test.yaml @@ -25,7 +25,7 @@ metadata: spec: description: Test batch state reset - auto-reset on completion, config precedence, and manual CLI reset timeouts: - assert: 240s + assert: 270s exec: 30s steps: diff --git a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/deployment-policy.yaml b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/deployment-policy.yaml index 059e30b3..6608a9d7 100644 --- a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/deployment-policy.yaml +++ b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/deployment-policy.yaml @@ -22,6 +22,7 @@ kind: DeploymentPolicy metadata: name: overlapping-selector-policy spec: + resetBatchStateOnCompletion: false # Preserve batch state so test can assert batch progression default: budget: percent: 100 diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/README.md b/k8s-tests/chainsaw/skyhook/cleanup-pods/README.md index 4b1ee707..0ad72ec6 100644 --- a/k8s-tests/chainsaw/skyhook/cleanup-pods/README.md +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/README.md @@ -22,6 +22,9 @@ Validates that the operator correctly cleans up pods when a node's state is rese ## Files -- `chainsaw-test.yaml` - Main test configuration -- `skyhook.yaml` - Skyhook resource definition -- `assert.yaml` - State assertions +- `chainsaw-test.yaml` - Main test configuration with lifecycle assertions inline (pods, nodes, skyhook status) for sequential ordering +- `setup.yaml` - Skyhook resource definition with package dependencies +- 
`assert-setup-complete.yaml` - Assertion for initial setup completion +- `assert-config-complete.yaml` - Assertion for config cycle completion +- `force-config.yaml` - Update to trigger a config cycle +- `muck_up.yaml` - Update to make a package error diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml deleted file mode 100644 index 2ac8f1e6..00000000 --- a/k8s-tests/chainsaw/skyhook/cleanup-pods/assert-cleaned-pods.yaml +++ /dev/null @@ -1,162 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: cleanup-pods - skyhook.nvidia.com/package: aa-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "aa", - "version": "1.2.3", - "skyhook": "cleanup-pods", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: cleanup-pods -spec: - nodeName: kind-worker - initContainers: - - name: aa-init - - name: aa-apply - args: - ([0]): apply - ([1]): /root - (length(@)): 3 - - name: aa-applycheck - args: - ([0]): apply-check - ([1]): /root - (length(@)): 3 ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: cleanup-pods - skyhook.nvidia.com/package: aa-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "aa", - "version": "1.2.3", - "skyhook": "cleanup-pods", - "stage": "config", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: cleanup-pods -spec: - nodeName: kind-worker - initContainers: - - name: aa-init - - name: aa-config - args: - ([0]): config - ([1]): /root - (length(@)): 3 - - name: aa-configcheck - args: - ([0]): config-check - ([1]): /root - (length(@)): 3 ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: cleanup-pods - skyhook.nvidia.com/package: bb-1.2 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "bb", - "version": "1.2", - "skyhook": "cleanup-pods", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: cleanup-pods -spec: - nodeName: kind-worker - initContainers: - - name: bb-init - - name: 
bb-apply - args: - ([0]): apply - ([1]): /root - (length(@)): 3 - - name: bb-applycheck - args: - ([0]): apply-check - ([1]): /root - (length(@)): 3 ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: cleanup-pods -status: - status: erroring - observedGeneration: 4 - nodeState: - kind-worker: - aa|1.2.3: - name: aa - state: complete - version: '1.2.3' - image: ghcr.io/nvidia/skyhook/agentless - stage: config - bb|1.2: - name: bb - state: erroring - version: '1.2' - image: ghcr.io/nvidia/skyhook/agentless - stage: apply - kind-worker2: - aa|1.2.3: - name: aa - state: complete - version: '1.2.3' - image: ghcr.io/nvidia/skyhook/agentless - stage: config - bb|1.2: - name: bb - state: erroring - version: '1.2' - image: ghcr.io/nvidia/skyhook/agentless - stage: config - - nodeStatus: - # grab values should be one and is complete - (values(@)): - - erroring - - erroring diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml index 768ab4ad..e63acf01 100644 --- a/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml @@ -21,10 +21,11 @@ metadata: name: cleanup-pods spec: timeouts: - assert: 240s + assert: 120s steps: - - try: - ## setup step, skyhook to complete + - name: setup + description: Create skyhook with dependencies, complete it, then trigger a config cycle + try: - script: content: | ## remove annotation from last run @@ -44,13 +45,176 @@ spec: - sleep: ## there is a race between pods marking the node complete, i think it will trigger eventually, but the event is missed duration: 2s - - try: + - name: cleanup + description: Make package error, reset node annotation, verify orphan pods are cleaned up + try: - update: file: muck_up.yaml ## wack node annotation to rest node + - assert: + ## wait for muck_up to be processed before resetting the node + resource: + apiVersion: 
skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: cleanup-pods + status: + status: erroring - script: content: | ## delete the node annotation on one node to trigger cleanup kubectl annotate node/kind-worker skyhook.nvidia.com/nodeState_cleanup-pods- + ## assert pods are recreated on kind-worker after reset (sequential: pods get cleaned up between stages) + - assert: + ## aa apply pod on kind-worker + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: aa-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa", + "version": "1.2.3", + "skyhook": "cleanup-pods", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods + spec: + nodeName: kind-worker + initContainers: + - name: aa-init + - name: aa-apply + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: aa-applycheck + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 + - assert: + ## aa config pod on kind-worker + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: aa-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa", + "version": "1.2.3", + "skyhook": "cleanup-pods", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods + spec: + nodeName: kind-worker + initContainers: + - name: aa-init + - name: aa-config + args: + ([0]): config + ([1]): /root + (length(@)): 3 + - name: aa-configcheck + args: + ([0]): config-check + ([1]): /root + (length(@)): 3 + - assert: + ## bb apply pod on kind-worker (errors due to EXIT_CODE=2) + 
resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: cleanup-pods + skyhook.nvidia.com/package: bb-1.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "bb", + "version": "1.2", + "skyhook": "cleanup-pods", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: cleanup-pods + spec: + nodeName: kind-worker + initContainers: + - name: bb-init + - name: bb-apply + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: bb-applycheck + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 + ## assert final skyhook status - assert: - file: assert-cleaned-pods.yaml + ## skyhook shows erroring on both nodes + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: cleanup-pods + status: + status: erroring + observedGeneration: 4 + nodeState: + kind-worker: + aa|1.2.3: + name: aa + state: complete + version: '1.2.3' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + bb|1.2: + name: bb + state: erroring + version: '1.2' + image: ghcr.io/nvidia/skyhook/agentless + stage: apply + kind-worker2: + aa|1.2.3: + name: aa + state: complete + version: '1.2.3' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + bb|1.2: + name: bb + state: erroring + version: '1.2' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + nodeStatus: + (values(@)): + - erroring + - erroring diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/README.md b/k8s-tests/chainsaw/skyhook/config-skyhook/README.md index ce824fa1..6cb9a60d 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/README.md +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/README.md @@ -28,7 +28,14 @@ Validates that configuration changes work correctly for the operator, including ## Files -- `chainsaw-test.yaml` - Main test configuration +- 
`chainsaw-test.yaml` - Main test configuration with lifecycle assertions inline (nodes, skyhook status) for sequential ordering - `skyhook.yaml` - Initial skyhook definition -- `update*.yaml` - Various update configurations -- `assert*.yaml` - State assertions for each phase +- `update-while-running.yaml` - Config update applied while skyhook is still running +- `update.yaml` - Standard config update +- `update-no-interrupt.yaml` - Config update with no interrupt +- `update-glob.yaml` - Config update using glob-based package selection +- `assert-cm-deploy.yaml` - ConfigMap assertions for initial deploy phase +- `assert-cm-update-while-running.yaml` - ConfigMap assertions for update-while-running phase +- `assert-cm-update.yaml` - ConfigMap assertions for standard update phase +- `assert-cm-update-no-interrupt.yaml` - ConfigMap assertions for no-interrupt update phase +- `assert-cm-update-glob.yaml` - ConfigMap assertions for glob update phase diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-deploy.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-deploy.yaml new file mode 100644 index 00000000..b102517c --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-deploy.yaml @@ -0,0 +1,78 @@ +# ConfigMaps created during initial deploy (parallel: these are distinct resources that coexist) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-dexter-1.2.3 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + enemies=aliens + lives=3 + enemies.cheat=true + enemies.cheat.level=noGoodRotten + ui.properties: | + color.good=purple + color.bad=yellow + allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-baxter-3.2.1 + namespace: skyhook + labels: + 
skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + enemies=aliens + lives=3 + enemies.cheat=true + enemies.cheat.level=noGoodRotten + ui.properties: | + color.good=purple + color.bad=yellow + allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"baxter"')): true + (contains(@, '"spencer"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-no-interrupt.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-no-interrupt.yaml new file mode 100644 index 00000000..5c60bfab --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-no-interrupt.yaml @@ -0,0 +1,64 @@ +# ConfigMaps after no-interrupt update (parallel: distinct resources that coexist) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-dexter-1.2.3 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + changed + ui.properties: | + changed +--- +kind: 
ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-baxter-3.2.1 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + changed again + ui.properties: | + changed +--- +kind: ConfigMap +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-while-running.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-while-running.yaml new file mode 100644 index 00000000..d9e2123c --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update-while-running.yaml @@ -0,0 +1,73 @@ +# ConfigMaps after update-while-running (parallel: distinct resources that coexist) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-dexter-1.2.3 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + enemies=aliens + lives=3 + enemies.cheat=true + enemies.cheat.level=noGoodRotten + ui.properties: | + color.good=purple + color.bad=yellow + 
allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-baxter-3.2.1 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + changed + ui.properties: | + color.good=purple + color.bad=yellow + allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update.yaml new file mode 100644 index 00000000..d8297452 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-cm-update.yaml @@ -0,0 +1,72 @@ +# ConfigMaps after update (parallel: distinct resources that coexist) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-dexter-1.2.3 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + changed + ui.properties: | + color.good=purple + color.bad=yellow + 
allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: config-skyhook-baxter-3.2.1 + namespace: skyhook + labels: + skyhook.nvidia.com/name: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + game.properties: | + changed again + ui.properties: | + color.good=purple + color.bad=yellow + allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: config-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: config-skyhook +data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true + (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true + (contains(@, '"baxter"')): true + (contains(@, '"spencer"')): true + (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml deleted file mode 100644 index 03438f34..00000000 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-no-interrupt.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_config-skyhook: complete - annotations: - ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): - { - "baxter|3.2.1": { - "name": "baxter", - "version": "3.2.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - } - } - skyhook.nvidia.com/status_config-skyhook: complete -status: - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: config-skyhook -status: - status: complete - observedGeneration: 5 - nodeState: - (values(@)): - - dexter|1.2.3: - name: dexter - state: complete - version: '1.2.3' - stage: config - image: ghcr.io/nvidia/skyhook/agentless - baxter|3.2.1: - name: baxter - state: complete - version: '3.2.1' - stage: config - image: ghcr.io/nvidia/skyhook/agentless - spencer|3.2.3: - name: spencer - state: complete - version: '3.2.3' - stage: post-interrupt - 
image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-dexter-1.2.3 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - changed - ui.properties: | - changed ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-baxter-3.2.1 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - changed again - ui.properties: | - changed ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true - packages.json: - (contains(@, '"agentVersion"')): true - (contains(@, '"dexter"')): true - (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml deleted file mode 100644 index 51518b6c..00000000 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update-while-running.yaml +++ /dev/null @@ -1,160 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_config-skyhook: complete - annotations: - ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): - { - "baxter|3.2.1": { - "name": "baxter", - "version": "3.2.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - } - } - skyhook.nvidia.com/status_config-skyhook: complete -status: - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: config-skyhook -status: - status: complete - observedGeneration: 3 - nodeState: - (values(@)): - - dexter|1.2.3: - name: dexter - state: complete - version: '1.2.3' - stage: post-interrupt - image: 
ghcr.io/nvidia/skyhook/agentless - baxter|3.2.1: - name: baxter - state: complete - version: '3.2.1' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - spencer|3.2.3: - name: spencer - state: complete - version: '3.2.3' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-dexter-1.2.3 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - enemies=aliens - lives=3 - enemies.cheat=true - enemies.cheat.level=noGoodRotten - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-baxter-3.2.1 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - changed - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true - packages.json: - 
(contains(@, '"agentVersion"')): true - (contains(@, '"dexter"')): true - (contains(@, '"3.2.3"')): true \ No newline at end of file diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml deleted file mode 100644 index 370d55e3..00000000 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert-update.yaml +++ /dev/null @@ -1,159 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_config-skyhook: complete - annotations: - ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): - { - "baxter|3.2.1": { - "name": "baxter", - "version": "3.2.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - } - } - skyhook.nvidia.com/status_config-skyhook: complete -status: - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: config-skyhook -status: - status: complete - observedGeneration: 4 - nodeState: - (values(@)): - - dexter|1.2.3: - name: dexter - state: complete - version: '1.2.3' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - baxter|3.2.1: - name: baxter - state: complete - version: '3.2.1' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - spencer|3.2.3: - name: spencer - state: complete - version: '3.2.3' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-dexter-1.2.3 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - 
controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - changed - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-baxter-3.2.1 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - changed again - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true - packages.json: - (contains(@, '"agentVersion"')): true - (contains(@, '"dexter"')): true - (contains(@, '"baxter"')): true - (contains(@, '"spencer"')): true - (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml deleted file mode 100644 index dd7b1005..00000000 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/assert.yaml +++ /dev/null @@ -1,167 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_config-skyhook: in_progress - annotations: - ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): - { - "baxter|3.2.1": { - "name": "baxter", - "version": "3.2.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "apply", - "state": "in_progress" - }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "apply", - "state": "in_progress" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "apply", - "state": "in_progress" - }, - } - skyhook.nvidia.com/status_config-skyhook: in_progress -spec: - taints: - - effect: NoSchedule - key: node.kubernetes.io/unschedulable -status: - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: config-skyhook -status: - status: in_progress - nodeState: - (values(@)): - - baxter|3.2.1: - name: baxter - state: in_progress - version: '3.2.1' - stage: apply - image: ghcr.io/nvidia/skyhook/agentless - spencer|3.2.3: - name: spencer - state: in_progress - version: '3.2.3' - stage: apply - image: ghcr.io/nvidia/skyhook/agentless - dexter|1.2.3: - name: dexter 
- state: in_progress - version: '1.2.3' - stage: apply - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - in_progress ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-dexter-1.2.3 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - enemies=aliens - lives=3 - enemies.cheat=true - enemies.cheat.level=noGoodRotten - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: config-skyhook-baxter-3.2.1 - namespace: skyhook - labels: - skyhook.nvidia.com/name: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - game.properties: | - enemies=aliens - lives=3 - enemies.cheat=true - enemies.cheat.level=noGoodRotten - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: config-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: config-skyhook -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_config-skyhook')): true - (contains(@, 'skyhook.nvidia.com/nodeState_config-skyhook')): true - packages.json: - (contains(@, '"agentVersion"')): true - (contains(@, '"dexter"')): true - (contains(@, '"baxter"')): true - (contains(@, '"spencer"')): true 
- (contains(@, '"3.2.3"')): true diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml index bbab487f..a3b8cdfa 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml @@ -21,7 +21,7 @@ metadata: name: config-skyhook spec: timeouts: - assert: 360s + assert: 240s ## 5 steps with full package lifecycles including mid-flight updates catch: ## if errors, print the most important info - get: apiVersion: v1 @@ -34,32 +34,356 @@ spec: name: config-skyhook format: yaml steps: - - try: + - name: deploy + description: Reset state, create the skyhook, and assert initial config completes + try: - script: content: | ## remove annotation/labels from last run ../skyhook-cli reset config-skyhook --confirm 2>/dev/null || true - create: file: skyhook.yaml + ## sequential: node/skyhook transition through lifecycle stages - assert: - file: assert.yaml - - try: + ## node in_progress with all packages at apply stage + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_config-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): + { + "baxter|3.2.1": { + "name": "baxter", + "version": "3.2.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + } + skyhook.nvidia.com/status_config-skyhook: in_progress + spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable + status: 
+ (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook in_progress + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: config-skyhook + status: + status: in_progress + nodeState: + (values(@)): + - baxter|3.2.1: + name: baxter + state: in_progress + version: '3.2.1' + stage: apply + image: ghcr.io/nvidia/skyhook/agentless + spencer|3.2.3: + name: spencer + state: in_progress + version: '3.2.3' + stage: apply + image: ghcr.io/nvidia/skyhook/agentless + dexter|1.2.3: + name: dexter + state: in_progress + version: '1.2.3' + stage: apply + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - in_progress + ## parallel: configmaps are distinct steady-state resources + - assert: + file: assert-cm-deploy.yaml + - name: update while running + description: Apply an update while the skyhook is still running and verify it is accepted + try: - apply: file: update-while-running.yaml + ## sequential: node/skyhook transition through lifecycle stages + - assert: + ## node complete with all packages at post-interrupt + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_config-skyhook: complete + annotations: + ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): + { + "baxter|3.2.1": { + "name": "baxter", + "version": "3.2.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", + "image": 
"ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + } + } + skyhook.nvidia.com/status_config-skyhook: complete + status: + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete (gen 3) + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: config-skyhook + status: + status: complete + observedGeneration: 3 + nodeState: + (values(@)): + - dexter|1.2.3: + name: dexter + state: complete + version: '1.2.3' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + baxter|3.2.1: + name: baxter + state: complete + version: '3.2.1' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + spencer|3.2.3: + name: spencer + state: complete + version: '3.2.3' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + ## parallel: configmaps are distinct steady-state resources - assert: - file: assert-update-while-running.yaml - - try: + file: assert-cm-update-while-running.yaml + - name: update + description: Apply a standard config update and assert it completes + try: - apply: file: update.yaml + ## sequential: node/skyhook transition through lifecycle stages - assert: - file: assert-update.yaml - - try: + ## node complete with all packages at post-interrupt + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_config-skyhook: complete + annotations: + ("skyhook.nvidia.com/nodeState_config-skyhook" && parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): + { + "baxter|3.2.1": { + "name": "baxter", + "version": "3.2.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "dexter|1.2.3": { + 
"name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + } + } + skyhook.nvidia.com/status_config-skyhook: complete + status: + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete (gen 4) + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: config-skyhook + status: + status: complete + observedGeneration: 4 + nodeState: + (values(@)): + - dexter|1.2.3: + name: dexter + state: complete + version: '1.2.3' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + baxter|3.2.1: + name: baxter + state: complete + version: '3.2.1' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + spencer|3.2.3: + name: spencer + state: complete + version: '3.2.3' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + ## parallel: configmaps are distinct steady-state resources + - assert: + file: assert-cm-update.yaml + - name: no-interrupt update + description: Apply a config update with no interrupt and verify the node is not rebooted + try: - apply: file: update-no-interrupt.yaml + ## sequential: node/skyhook transition through lifecycle stages + - assert: + ## node complete - baxter/dexter at config stage (no interrupt), spencer at post-interrupt + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_config-skyhook: complete + annotations: + ("skyhook.nvidia.com/nodeState_config-skyhook" && 
parse_json("skyhook.nvidia.com/nodeState_config-skyhook")): + { + "baxter|3.2.1": { + "name": "baxter", + "version": "3.2.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + } + } + skyhook.nvidia.com/status_config-skyhook: complete + status: + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/config-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete (gen 5) + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: config-skyhook + status: + status: complete + observedGeneration: 5 + nodeState: + (values(@)): + - dexter|1.2.3: + name: dexter + state: complete + version: '1.2.3' + stage: config + image: ghcr.io/nvidia/skyhook/agentless + baxter|3.2.1: + name: baxter + state: complete + version: '3.2.1' + stage: config + image: ghcr.io/nvidia/skyhook/agentless + spencer|3.2.3: + name: spencer + state: complete + version: '3.2.3' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + ## parallel: configmaps are distinct steady-state resources - assert: - file: assert-update-no-interrupt.yaml - - try: + file: assert-cm-update-no-interrupt.yaml + - name: glob update + description: Apply a config update using glob-based package selection and assert it completes + try: - apply: file: update-glob.yaml + ## parallel: all docs checked simultaneously — the ConfigMap data ("changed via glob") + ## acts as a natural discriminator preventing matches against the previous step's state 
- assert: + timeout: 270s file: assert-update-glob.yaml diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/update-glob.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/update-glob.yaml index d24b7f87..82b7b965 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/update-glob.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/update-glob.yaml @@ -14,18 +14,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Full spec to avoid strategic merge ambiguity — only dexter remains with glob configInterrupt apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook metadata: name: config-skyhook spec: + nodeSelectors: + matchLabels: + skyhook.nvidia.com/test-node: skyhooke2e + interruptionBudget: + count: 1 packages: dexter: - # Add a globbed configInterrupt that matches at least one key + version: "1.2.3" + image: ghcr.io/nvidia/skyhook/agentless + interrupt: + type: reboot + # Globbed configInterrupt that matches at least one key configInterrupts: "*.properties": type: service services: [rsyslog] + env: + - name: SLEEP_LEN + value: "1" # Change a key that should match the glob configMap: game.properties: | diff --git a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml index b1d18a2f..cb9f969e 100644 --- a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # @@ -24,7 +24,9 @@ spec: assert: 240s exec: 90s steps: - - try: + - name: deploy and delete + description: Deploy a skyhook, verify completion metrics, delete it, and confirm all metrics are cleaned up + try: - script: content: | ## remove annotation from last run diff --git a/k8s-tests/chainsaw/skyhook/depends-on/README.md b/k8s-tests/chainsaw/skyhook/depends-on/README.md index 5242a4bb..f5036e3a 100644 --- a/k8s-tests/chainsaw/skyhook/depends-on/README.md +++ b/k8s-tests/chainsaw/skyhook/depends-on/README.md @@ -22,6 +22,5 @@ Validates that package dependencies (dependsOn) work correctly, ensuring package ## Files -- `chainsaw-test.yaml` - Main test configuration +- `chainsaw-test.yaml` - Main test configuration with all assertions inline (pods, skyhook status) for sequential ordering to verify dependency execution order - `skyhook.yaml` - Skyhook with package dependencies -- `assert.yaml` - State assertions diff --git a/k8s-tests/chainsaw/skyhook/depends-on/assert-skyhook.yaml b/k8s-tests/chainsaw/skyhook/depends-on/assert-skyhook.yaml deleted file mode 100644 index 06bcf3a0..00000000 --- a/k8s-tests/chainsaw/skyhook/depends-on/assert-skyhook.yaml +++ /dev/null @@ -1,149 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: aa-fast-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "aa-fast", - "version": "1.2.3", - "skyhook": "depends-on", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: bb-slow-1.2 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "bb-slow", - "version": "1.2", - "skyhook": "depends-on", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: aa-fast-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "aa-fast", - "version": "1.2.3", - "skyhook": "depends-on", - "stage": "config", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: bb-slow-1.2 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "bb-slow", - "version": "1.2", - "skyhook": "depends-on", - "stage": "config", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: cc-last-5.4.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "cc-last", - "version": "5.4.3", - "skyhook": "depends-on", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -kind: Pod -apiVersion: v1 
-metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: depends-on - skyhook.nvidia.com/package: cc-last-5.4.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "cc-last", - "version": "5.4.3", - "skyhook": "depends-on", - "stage": "config", - "image": "ghcr.io/nvidia/skyhook/agentless" - } ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: depends-on -status: - status: complete - nodeState: - (values(@)): - - aa-fast|1.2.3: - image: ghcr.io/nvidia/skyhook/agentless - name: aa-fast - stage: config - state: complete - version: 1.2.3 - bb-slow|1.2: - image: ghcr.io/nvidia/skyhook/agentless - name: bb-slow - stage: config - state: complete - version: "1.2" - cc-last|5.4.3: - image: ghcr.io/nvidia/skyhook/agentless - name: cc-last - stage: config - state: complete - version: 5.4.3 - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete diff --git a/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml index 2d7ce1ce..aaef11f4 100644 --- a/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml @@ -21,15 +21,162 @@ metadata: name: depends-on spec: timeouts: - assert: 180s + assert: 90s steps: - - try: - ## setup step, skyhook to complete + - name: setup + description: Reset node state, create skyhook with dependencies, and assert it completes + try: - script: content: | ## remove annotation from last run ../skyhook-cli reset depends-on --confirm 2>/dev/null || true - create: file: skyhook.yaml + ## sequential: pods get cleaned up between stages, dependency order: aa-fast → bb-slow → cc-last - assert: - file: assert-skyhook.yaml + ## aa-fast apply pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: aa-fast-1.2.3 + 
annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa-fast", + "version": "1.2.3", + "skyhook": "depends-on", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + - assert: + ## bb-slow apply pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: bb-slow-1.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "bb-slow", + "version": "1.2", + "skyhook": "depends-on", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + - assert: + ## aa-fast config pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: aa-fast-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "aa-fast", + "version": "1.2.3", + "skyhook": "depends-on", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + - assert: + ## bb-slow config pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: bb-slow-1.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "bb-slow", + "version": "1.2", + "skyhook": "depends-on", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + - assert: + ## cc-last apply pod (depends on bb-slow) + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: cc-last-5.4.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "cc-last", + "version": "5.4.3", + "skyhook": "depends-on", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" 
+ } + - assert: + ## cc-last config pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: depends-on + skyhook.nvidia.com/package: cc-last-5.4.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "cc-last", + "version": "5.4.3", + "skyhook": "depends-on", + "stage": "config", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ## assert final skyhook status + - assert: + ## skyhook complete with all packages at config stage + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: depends-on + status: + status: complete + nodeState: + (values(@)): + - aa-fast|1.2.3: + image: ghcr.io/nvidia/skyhook/agentless + name: aa-fast + stage: config + state: complete + version: 1.2.3 + bb-slow|1.2: + image: ghcr.io/nvidia/skyhook/agentless + name: bb-slow + stage: config + state: complete + version: "1.2" + cc-last|5.4.3: + image: ghcr.io/nvidia/skyhook/agentless + name: cc-last + stage: config + state: complete + version: 5.4.3 + nodeStatus: + (values(@)): + - complete diff --git a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml index 2b4ce149..1ff5bbbc 100644 --- a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # @@ -35,7 +35,9 @@ spec: name: failure-skyhook format: yaml steps: - - try: + - name: failure + description: Create a skyhook with a failing package and verify node enters erroring state with correct metrics + try: - script: content: | ## remove annotation from last run diff --git a/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml index 67e1b3d1..f1548757 100644 --- a/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/hello-world/chainsaw-test.yaml @@ -22,7 +22,9 @@ metadata: spec: skip: true steps: - - try: + - name: verify + description: Apply a configmap and assert it was created correctly + try: - apply: file: configmap.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/README.md b/k8s-tests/chainsaw/skyhook/interrupt-grouping/README.md index a91630ea..f4c66af5 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/README.md +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/README.md @@ -22,6 +22,5 @@ Validates that interrupts are correctly grouped around services or reboots, ensu ## Files -- `chainsaw-test.yaml` - Main test configuration +- `chainsaw-test.yaml` - Main test configuration with all assertions inline (pods, nodes, skyhook status, ConfigMap) for sequential ordering through apply, config, interrupt, and post-interrupt stages - `skyhook.yaml` - Skyhook with grouped interrupt packages -- `assert*.yaml` - State assertions for interrupt grouping diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml deleted file mode 100644 index 71365650..00000000 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/assert.yaml +++ /dev/null @@ -1,241 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ---- -kind: Node -apiVersion: v1 -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_interrupt-grouping: in_progress - annotations: - skyhook.nvidia.com/status_interrupt-grouping: in_progress -spec: - taints: - - effect: NoSchedule - key: node.kubernetes.io/unschedulable -status: - (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt-grouping - skyhook.nvidia.com/package: dax-1.2.3 - annotations: - skyhook.nvidia.com/package: >- - {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"apply","image":"ghcr.io/nvidia/skyhook/agentless"} - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt-grouping -spec: - initContainers: - - name: dax-init - image: ghcr.io/nvidia/skyhook/agentless:1.2.3 - - name: dax-apply - image: ghcr.io/nvidia/skyhook/agentless:6.2.0 - args: - ([0]): apply - ([1]): /root - (length(@)): 3 - - name: dax-applycheck - image: ghcr.io/nvidia/skyhook/agentless:6.2.0 - args: - ([0]): apply-check - ([1]): /root - (length(@)): 3 - (tolerations[?key == 'nvidia.com/gpu']): - - operator: 
Exists ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt-grouping - skyhook.nvidia.com/package: dax-1.2.3 - annotations: - skyhook.nvidia.com/package: >- - {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"config","image":"ghcr.io/nvidia/skyhook/agentless"} - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt-grouping -spec: - initContainers: - - name: dax-init - - name: dax-config - args: - ([0]): config - ([1]): /root - (length(@)): 3 - - name: dax-configcheck - args: - ([0]): config-check - ([1]): /root - (length(@)): 3 - (tolerations[?key == 'nvidia.com/gpu']): - - operator: Exists ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt-grouping - skyhook.nvidia.com/package: dax-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "dax", - "version": "1.2.3", - "skyhook": "interrupt-grouping", - "stage": "interrupt", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt-grouping -spec: - initContainers: - - name: interrupt - args: - ([0]): interrupt - ([1]): /root - (parse_json(base64_decode([3]))): - { - "type": "service_restart", - "services": [ - "containerd", - "cron", - "foobar" - ] - } - (length(@)): 4 - (tolerations[?key == 'nvidia.com/gpu']): - - operator: Exists ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt-grouping - skyhook.nvidia.com/package: dax-1.2.3 - annotations: - skyhook.nvidia.com/package: >- - {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"post-interrupt","image":"ghcr.io/nvidia/skyhook/agentless"} - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt-grouping -spec: - 
initContainers: - - name: dax-init - - name: dax-post-interrupt - args: - ([0]): post-interrupt - ([1]): /root - (length(@)): 3 - - name: dax-post-interruptcheck - args: - ([0]): post-interrupt-check - ([1]): /root - (length(@)): 3 - (tolerations[?key == 'nvidia.com/gpu']): - - operator: Exists ---- -kind: Node -apiVersion: v1 -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_interrupt-grouping: complete - annotations: - (length("skyhook.nvidia.com/version_interrupt-grouping") >= `6`): true - skyhook.nvidia.com/nodeState_interrupt-grouping: >- - {"dax|1.2.3":{"name":"dax","version":"1.2.3","image":"ghcr.io/nvidia/skyhook/agentless","stage":"post-interrupt","state":"complete"},"zeb|2.1.4":{"name":"zeb","version":"2.1.4","image":"ghcr.io/nvidia/skyhook/agentless","stage":"post-interrupt","state":"complete"}} - skyhook.nvidia.com/status_interrupt-grouping: complete - (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist -status: - (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -kind: Skyhook -apiVersion: skyhook.nvidia.com/v1alpha1 -metadata: - name: interrupt-grouping - annotations: - ## test that version info is present - ## the quotes are to escape the . 
and / in the key - (length("skyhook.nvidia.com/version") >= `6`): true -status: - status: complete - (to_number(observedGeneration) == `2` || to_number(observedGeneration) == `3`): true ## migrate seems to add 1, but not in all cases it seems - nodeState: - (values(@)): - - dax|1.2.3: - name: dax - stage: post-interrupt - state: complete - version: 1.2.3 - image: ghcr.io/nvidia/skyhook/agentless - zeb|2.1.4: - name: zeb - stage: post-interrupt - state: complete - version: 2.1.4 - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: interrupt-grouping - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: interrupt-grouping -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true - (contains(@, 'skyhook.nvidia.com/nodeState_interrupt-grouping')): true - packages.json: - (contains(@, '"agentVersion"')): true - (contains(@, '"dax"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml index de9a1c47..d8857076 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml @@ -33,15 +33,256 @@ spec: name: interrupt-grouping format: yaml timeouts: - assert: 360s ## needs to be long to run at the same time as the other interrupt test, they we fight each other to cordon nodes - # skip: true + assert: 180s ## needs buffer since it fights with the interrupt test for cordoning nodes steps: - - try: + - name: grouping + description: 
Create an interrupt skyhook with group labels and verify nodes are processed respecting group constraints + try: - script: content: | ## remove annotation from last run ../skyhook-cli reset interrupt-grouping --confirm 2>/dev/null || true - create: file: skyhook.yaml + ## sequential: dax pod transitions through apply → config → interrupt → post-interrupt stages - assert: - file: assert.yaml + ## node in_progress, cordoned + resource: + kind: Node + apiVersion: v1 + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt-grouping: in_progress + annotations: + skyhook.nvidia.com/status_interrupt-grouping: in_progress + spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable + status: + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## dax apply pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt-grouping + skyhook.nvidia.com/package: dax-1.2.3 + annotations: + skyhook.nvidia.com/package: >- + {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"apply","image":"ghcr.io/nvidia/skyhook/agentless"} + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt-grouping + spec: + initContainers: + - name: dax-init + image: ghcr.io/nvidia/skyhook/agentless:1.2.3 + - name: dax-apply + image: ghcr.io/nvidia/skyhook/agentless:6.2.0 + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: dax-applycheck + image: ghcr.io/nvidia/skyhook/agentless:6.2.0 + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 + (tolerations[?key == 'nvidia.com/gpu']): + - operator: Exists + - assert: + ## dax config pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + 
labels: + skyhook.nvidia.com/name: interrupt-grouping + skyhook.nvidia.com/package: dax-1.2.3 + annotations: + skyhook.nvidia.com/package: >- + {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"config","image":"ghcr.io/nvidia/skyhook/agentless"} + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt-grouping + spec: + initContainers: + - name: dax-init + - name: dax-config + args: + ([0]): config + ([1]): /root + (length(@)): 3 + - name: dax-configcheck + args: + ([0]): config-check + ([1]): /root + (length(@)): 3 + (tolerations[?key == 'nvidia.com/gpu']): + - operator: Exists + - assert: + ## dax interrupt pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt-grouping + skyhook.nvidia.com/package: dax-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "dax", + "version": "1.2.3", + "skyhook": "interrupt-grouping", + "stage": "interrupt", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt-grouping + spec: + initContainers: + - name: interrupt + args: + ([0]): interrupt + ([1]): /root + (parse_json(base64_decode([3]))): + { + "type": "service_restart", + "services": [ + "containerd", + "cron", + "foobar" + ] + } + (length(@)): 4 + (tolerations[?key == 'nvidia.com/gpu']): + - operator: Exists + - assert: + ## dax post-interrupt pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt-grouping + skyhook.nvidia.com/package: dax-1.2.3 + annotations: + skyhook.nvidia.com/package: >- + {"name":"dax","version":"1.2.3","skyhook":"interrupt-grouping","stage":"post-interrupt","image":"ghcr.io/nvidia/skyhook/agentless"} + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: 
interrupt-grouping + spec: + initContainers: + - name: dax-init + - name: dax-post-interrupt + args: + ([0]): post-interrupt + ([1]): /root + (length(@)): 3 + - name: dax-post-interruptcheck + args: + ([0]): post-interrupt-check + ([1]): /root + (length(@)): 3 + (tolerations[?key == 'nvidia.com/gpu']): + - operator: Exists + - assert: + ## node complete, uncordoned + resource: + kind: Node + apiVersion: v1 + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt-grouping: complete + annotations: + (length("skyhook.nvidia.com/version_interrupt-grouping") >= `6`): true + skyhook.nvidia.com/nodeState_interrupt-grouping: >- + {"dax|1.2.3":{"name":"dax","version":"1.2.3","image":"ghcr.io/nvidia/skyhook/agentless","stage":"post-interrupt","state":"complete"},"zeb|2.1.4":{"name":"zeb","version":"2.1.4","image":"ghcr.io/nvidia/skyhook/agentless","stage":"post-interrupt","state":"complete"}} + skyhook.nvidia.com/status_interrupt-grouping: complete + (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist + status: + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/interrupt-grouping/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete + resource: + kind: Skyhook + apiVersion: skyhook.nvidia.com/v1alpha1 + metadata: + name: interrupt-grouping + annotations: + ## test that version info is present + ## the quotes are to escape the . 
and / in the key + (length("skyhook.nvidia.com/version") >= `6`): true + status: + status: complete + (to_number(observedGeneration) == `2` || to_number(observedGeneration) == `3`): true ## migrate seems to add 1, but not in all cases it seems + nodeState: + (values(@)): + - dax|1.2.3: + name: dax + stage: post-interrupt + state: complete + version: 1.2.3 + image: ghcr.io/nvidia/skyhook/agentless + zeb|2.1.4: + name: zeb + stage: post-interrupt + state: complete + version: 2.1.4 + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + - assert: + ## node-meta configmap + resource: + kind: ConfigMap + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: interrupt-grouping + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: interrupt-grouping + data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_interrupt-grouping')): true + (contains(@, 'skyhook.nvidia.com/nodeState_interrupt-grouping')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dax"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt/README.md b/k8s-tests/chainsaw/skyhook/interrupt/README.md index 2fa71aad..ce0d0e5c 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/README.md +++ b/k8s-tests/chainsaw/skyhook/interrupt/README.md @@ -27,10 +27,12 @@ Validates the interrupt feature of the skyhook operator, including pod draining, ## Files -- `chainsaw-test.yaml` - Main test configuration +- `chainsaw-test.yaml` - Main test configuration with lifecycle assertions inline (pods, nodes, skyhook status) for sequential ordering - `skyhook.yaml` - Skyhook with interrupt configuration -- `pre-pods.yaml` - Pods to create before the skyhook -- `assert*.yaml` - 
State assertions +- `pod.yaml` - Pods to create before the skyhook (drain-on and important-stuff) +- `assert-important-stuff.yaml` - Assertion for the important-stuff pod (used to verify wait-for behavior) +- `assert-drain-me.yaml` - Assertion for the drain-on pod (used to verify drain behavior) +- `assert-cm-b.yaml` - ConfigMap assertions for final package state ## Notes diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml deleted file mode 100644 index 4e865a95..00000000 --- a/k8s-tests/chainsaw/skyhook/interrupt/assert-a.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt - skyhook.nvidia.com/package: jason-1.3.2 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "jason", - "version": "1.3.2", - "skyhook": "interrupt", - "stage": "apply", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt -spec: - initContainers: - - name: jason-init - image: ghcr.io/nvidia/skyhook/agentless:1.3.2 - - name: jason-apply - image: ghcr.io/nvidia/skyhook/agentless:3.2.3 - args: - ([0]): apply - ([1]): /root - (length(@)): 3 - - name: jason-applycheck - image: ghcr.io/nvidia/skyhook/agentless:3.2.3 - args: - ([0]): apply-check - ([1]): /root - (length(@)): 3 ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_interrupt: in_progress - annotations: - ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): - { - "jason|1.3.2": { - "name": "jason", - "version": "1.3.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - } - } - skyhook.nvidia.com/status_interrupt: in_progress -spec: - taints: - - effect: NoSchedule - key: node.kubernetes.io/unschedulable -status: - (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): - - reason: "Not Erroring" - status: "False" ---- diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml deleted file mode 100644 index 95711d58..00000000 --- a/k8s-tests/chainsaw/skyhook/interrupt/assert-b.yaml +++ /dev/null @@ -1,272 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## REMOVE THIS part of the test. When adding more around invalid package cleanup, it seems like this might not make sense to be a test. This commit does break this, -## but making this test pass has implications for resting a node. If you reset a node it would mean you want to uninstall all the packages on the node. -## I could go either way on this, but for now I'm going to remove it. -# --- -# kind: Pod -# apiVersion: v1 -# metadata: -# namespace: skyhook -# labels: -# skyhook.nvidia.com/name: interrupt -# skyhook.nvidia.com/package: invalid-1.2.3 -# annotations: -# ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): -# { -# "name": "invalid", -# "version": "1.2.3", -# "skyhook": "interrupt", -# "stage": "uninstall", -# "image": "ghcr.io/nvidia/skyhook/agentless" -# } -# ownerReferences: -# - apiVersion: skyhook.nvidia.com/v1alpha1 -# kind: Skyhook -# name: interrupt -# spec: -# initContainers: -# - name: invalid-init -# - name: invalid-uninstall -# args: -# ([0]): uninstall -# ([1]): /root -# (length(@)): 3 -# - name: invalid-uninstallcheck -# args: -# ([0]): uninstall-check -# ([1]): /root -# (length(@)): 3 ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_interrupt: in_progress - annotations: - ("skyhook.nvidia.com/nodeState_interrupt" && 
parse_json("skyhook.nvidia.com/nodeState_interrupt")): - { - "jason|1.3.2": { - "name": "jason", - "version": "1.3.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "apply", - "state": "in_progress" - }, - "john|1.2.3": { - "name": "john", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "apply", - "state": "in_progress" - } - } - skyhook.nvidia.com/cordon_interrupt: "true" - skyhook.nvidia.com/status_interrupt: in_progress -spec: - taints: - - effect: NoSchedule - key: node.kubernetes.io/unschedulable -status: - (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: interrupt-dexter-1.2.3 - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: interrupt -data: - game.properties: | - enemies=aliens - lives=3 - enemies.cheat=true - enemies.cheat.level=noGoodRotten - ui.properties: | - color.good=purple - color.bad=yellow - allow.textmode=true - how.nice.to.look=fairlyNice ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: interrupt - skyhook.nvidia.com/package: dexter-1.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "dexter", - "version": "1.2.3", - "skyhook": "interrupt", - "stage": "interrupt", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: interrupt -spec: - initContainers: - - name: interrupt - args: - ([0]): 
interrupt - ([1]): /root - (parse_json(base64_decode([3]))): - { - "type": "node_restart", - } - (length(@)): 4 ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_interrupt: complete - annotations: - ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): - { - "baxter|3.3": { - "name": "baxter", - "version": "3.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "dexter|1.2.3": { - "name": "dexter", - "version": "1.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "foobar|1.2": { - "name": "foobar", - "version": "1.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - }, - "jason|1.3.2": { - "name": "jason", - "version": "1.3.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - }, - "spencer|3.2.3": { - "name": "spencer", - "version": "3.2.3", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - } - } - skyhook.nvidia.com/status_interrupt: complete - (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist -status: - (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: interrupt -status: - status: complete - observedGeneration: 2 - nodeState: - (values(@)): - - baxter|3.3: - name: baxter - stage: post-interrupt - state: complete - version: "3.3" - image: ghcr.io/nvidia/skyhook/agentless - dexter|1.2.3: - name: dexter - stage: post-interrupt - state: complete - version: 1.2.3 - image: ghcr.io/nvidia/skyhook/agentless - 
foobar|1.2: - name: foobar - stage: config - state: complete - version: "1.2" - image: ghcr.io/nvidia/skyhook/agentless - spencer|3.2.3: - name: spencer - stage: config - state: complete - version: 3.2.3 - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/skyhook-node-meta: interrupt - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: interrupt -data: - (length(@)): 3 - labels.json: - (contains(@, 'skyhook.nvidia.com/test-node')): true - (contains(@, 'skyhook.nvidia.com/status_interrupt')): true - annotations.json: - (contains(@, 'skyhook.nvidia.com/status_interrupt')): true - (contains(@, 'skyhook.nvidia.com/nodeState_interrupt')): true - packages.json: - (contains(@, '"agentVersion"')): true - (contains(@, '"dexter"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt/assert-cm-b.yaml b/k8s-tests/chainsaw/skyhook/interrupt/assert-cm-b.yaml new file mode 100644 index 00000000..b715637d --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/interrupt/assert-cm-b.yaml @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ConfigMaps for interrupt assert-b (parallel: distinct steady-state resources) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: interrupt-dexter-1.2.3 + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: interrupt +data: + game.properties: | + enemies=aliens + lives=3 + enemies.cheat=true + enemies.cheat.level=noGoodRotten + ui.properties: | + color.good=purple + color.bad=yellow + allow.textmode=true + how.nice.to.look=fairlyNice +--- +kind: ConfigMap +apiVersion: v1 +metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/skyhook-node-meta: interrupt + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: interrupt +data: + (length(@)): 3 + labels.json: + (contains(@, 'skyhook.nvidia.com/test-node')): true + (contains(@, 'skyhook.nvidia.com/status_interrupt')): true + annotations.json: + (contains(@, 'skyhook.nvidia.com/status_interrupt')): true + (contains(@, 'skyhook.nvidia.com/nodeState_interrupt')): true + packages.json: + (contains(@, '"agentVersion"')): true + (contains(@, '"dexter"')): true diff --git a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml index 1a1f8c13..404d6bc4 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml @@ -22,7 +22,7 @@ metadata: spec: timeouts: - assert: 240s + assert: 180s ## slow packages (SLEEP_LEN=8), interruptionBudget count=1, contention with interrupt-grouping catch: ## if errors, print the most important info - get: apiVersion: v1 @@ -35,7 +35,9 @@ spec: name: interrupt format: yaml steps: - - try: + - name: interrupt + description: Verify nodes cordon and drain before applying packages, then uncordon after interrupt completes + try: - script: 
content: | ## remove annotation from last run @@ -62,8 +64,78 @@ spec: value: 'true' - create: file: skyhook.yaml + ## sequential: pod/node transition through lifecycle stages (was assert-a.yaml) - assert: - file: assert-a.yaml + ## jason apply pod exists + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt + skyhook.nvidia.com/package: jason-1.3.2 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "jason", + "version": "1.3.2", + "skyhook": "interrupt", + "stage": "apply", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt + spec: + initContainers: + - name: jason-init + image: ghcr.io/nvidia/skyhook/agentless:1.3.2 + - name: jason-apply + image: ghcr.io/nvidia/skyhook/agentless:3.2.3 + args: + ([0]): apply + ([1]): /root + (length(@)): 3 + - name: jason-applycheck + image: ghcr.io/nvidia/skyhook/agentless:3.2.3 + args: + ([0]): apply-check + ([1]): /root + (length(@)): 3 + - assert: + ## node in_progress with jason at config complete, cordoned + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): + { + "jason|1.3.2": { + "name": "jason", + "version": "1.3.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + } + } + skyhook.nvidia.com/status_interrupt: in_progress + spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable + status: + (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): + - reason: "Not Erroring" + status: "False" - assert: file: 
assert-important-stuff.yaml - assert: @@ -72,5 +144,184 @@ spec: file: assert-important-stuff.yaml - error: file: assert-drain-me.yaml + ## sequential: node/pod/skyhook transition through lifecycle stages (was assert-b.yaml) + - assert: + ## node in_progress with jason/dexter/john, cordoned + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): + { + "jason|1.3.2": { + "name": "jason", + "version": "1.3.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + }, + "john|1.2.3": { + "name": "john", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "apply", + "state": "in_progress" + } + } + skyhook.nvidia.com/cordon_interrupt: "true" + skyhook.nvidia.com/status_interrupt: in_progress + spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable + status: + (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## dexter interrupt pod exists + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: interrupt + skyhook.nvidia.com/package: dexter-1.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "dexter", + "version": "1.2.3", + "skyhook": "interrupt", + "stage": "interrupt", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: interrupt + 
spec: + initContainers: + - name: interrupt + args: + ([0]): interrupt + ([1]): /root + (parse_json(base64_decode([3]))): + { + "type": "node_restart", + } + (length(@)): 4 + - assert: + ## node complete with all packages done, uncordoned + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_interrupt: complete + annotations: + ("skyhook.nvidia.com/nodeState_interrupt" && parse_json("skyhook.nvidia.com/nodeState_interrupt")): + { + "baxter|3.3": { + "name": "baxter", + "version": "3.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "dexter|1.2.3": { + "name": "dexter", + "version": "1.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "foobar|1.2": { + "name": "foobar", + "version": "1.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "jason|1.3.2": { + "name": "jason", + "version": "1.3.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + }, + "spencer|3.2.3": { + "name": "spencer", + "version": "3.2.3", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + } + } + skyhook.nvidia.com/status_interrupt: complete + (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist + status: + (conditions[?type == 'skyhook.nvidia.com/interrupt/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/interrupt/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: interrupt + status: + status: complete + observedGeneration: 2 + nodeState: + (values(@)): + - baxter|3.3: + name: baxter + stage: post-interrupt + state: complete + version: 
"3.3" + image: ghcr.io/nvidia/skyhook/agentless + dexter|1.2.3: + name: dexter + stage: post-interrupt + state: complete + version: 1.2.3 + image: ghcr.io/nvidia/skyhook/agentless + foobar|1.2: + name: foobar + stage: config + state: complete + version: "1.2" + image: ghcr.io/nvidia/skyhook/agentless + spencer|3.2.3: + name: spencer + stage: config + state: complete + version: 3.2.3 + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + ## parallel: configmaps are distinct steady-state resources - assert: - file: assert-b.yaml + file: assert-cm-b.yaml diff --git a/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml index 46bae90e..6bd8976b 100644 --- a/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/pod-finalizer/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # @@ -23,7 +23,9 @@ spec: timeouts: assert: 180s steps: - - try: + - name: finalization + description: Create a pod with a skyhook finalizer and verify it updates the node annotation on completion + try: - script: content: | ## remove annotation from last run diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/README.md b/k8s-tests/chainsaw/skyhook/runtime-required/README.md index 24daab58..cca8857c 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/README.md +++ b/k8s-tests/chainsaw/skyhook/runtime-required/README.md @@ -38,11 +38,7 @@ The test explicitly validates node isolation by blocking one node while the othe ## Files -- `chainsaw-test.yaml` - Main test configuration with multi-phase flow -- `skyhook.yaml` - Runtime-required skyhook definition -- `assert-node1-complete-node2-blocked.yaml` - Sequential assertions: node1 complete, node2 blocked (proves isolation) -- `assert-node2-complete.yaml` - Assertion: node2 complete after unblocking -- `assert.yaml` - Final validation: both nodes complete +- `chainsaw-test.yaml` - Main test configuration with all assertions inline (pods, nodes, skyhook status) for sequential ordering through the multi-phase flow; the skyhook resource is also defined inline ## Notes diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/assert-node1-complete-node2-blocked.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/assert-node1-complete-node2-blocked.yaml deleted file mode 100644 index 681a3298..00000000 --- a/k8s-tests/chainsaw/skyhook/runtime-required/assert-node1-complete-node2-blocked.yaml +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Sequential assertions proving per-node isolation -# Pods run on kind-worker while kind-worker2 remains blocked - -# 1. Apply stage pod on kind-worker -apiVersion: v1 -kind: Pod -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: runtime-required - skyhook.nvidia.com/package: spencer-3.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'apply'): true -spec: - nodeName: kind-worker ---- -# 2. Config stage pod on kind-worker -apiVersion: v1 -kind: Pod -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: runtime-required - skyhook.nvidia.com/package: spencer-3.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'config'): true -spec: - nodeName: kind-worker ---- -# 3. kind-worker node complete with taint removed -apiVersion: v1 -kind: Node -metadata: - name: kind-worker - labels: - skyhook.nvidia.com/status_runtime-required: complete - annotations: - skyhook.nvidia.com/status_runtime-required: complete -spec: - (!taints || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true ---- -# 4. 
kind-worker2 still blocked with taint present (proves per-node isolation) -apiVersion: v1 -kind: Node -metadata: - name: kind-worker2 - labels: - skyhook.nvidia.com/status_runtime-required: blocked - annotations: - skyhook.nvidia.com/status_runtime-required: blocked -spec: - taints: - - key: test-block - value: "true" - effect: NoSchedule - - key: skyhook.nvidia.com - value: runtime-required - effect: NoSchedule diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/assert-node2-complete.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/assert-node2-complete.yaml deleted file mode 100644 index 9b84f510..00000000 --- a/k8s-tests/chainsaw/skyhook/runtime-required/assert-node2-complete.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Assert kind-worker2 completes after unblocking - -# 1. Apply stage pod on kind-worker2 -apiVersion: v1 -kind: Pod -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: runtime-required - skyhook.nvidia.com/package: spencer-3.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'apply'): true -spec: - nodeName: kind-worker2 ---- -# 2. 
Config stage pod on kind-worker2 -apiVersion: v1 -kind: Pod -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: runtime-required - skyhook.nvidia.com/package: spencer-3.2.3 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'config'): true -spec: - nodeName: kind-worker2 ---- -# 3. kind-worker2 node complete with taint removed -apiVersion: v1 -kind: Node -metadata: - name: kind-worker2 - labels: - skyhook.nvidia.com/status_runtime-required: complete - annotations: - skyhook.nvidia.com/status_runtime-required: complete -spec: - (!taints || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml deleted file mode 100644 index 88f0237e..00000000 --- a/k8s-tests/chainsaw/skyhook/runtime-required/assert.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# This assertion validates per-node runtime-required behavior: -# - Each node with the test label should have status=complete -# - Each node's runtime-required taint should be removed independently -# - Both kind-worker and kind-worker2 are tested (multi-node isolation) -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/runtime-required-test: "true" - skyhook.nvidia.com/status_runtime-required: complete - annotations: - skyhook.nvidia.com/status_runtime-required: complete -spec: - # Verify the runtime-required taint has been removed from this node - (!taints || length(taints)==`0` || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml index 9752782b..4b2da6c0 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml @@ -21,7 +21,7 @@ metadata: name: runtime-required spec: timeouts: - assert: 240s + assert: 120s catch: ## if errors, print the most important info - get: apiVersion: v1 @@ -53,13 +53,92 @@ spec: - name: apply-skyhook try: - create: - file: skyhook.yaml + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: runtime-required + spec: + runtimeRequired: true + nodeSelectors: + matchLabels: + skyhook.nvidia.com/runtime-required-test: "true" + packages: + spencer: + version: "3.2.3" + image: ghcr.io/nvidia/skyhook/agentless + env: + - name: SLEEP_LEN + value: "2" + # Phase 3: Assert node1 complete sequence (apply pod -> config pod -> node complete -> node2 blocked) - name: assert-node1-complete-node2-blocked try: + ## sequential: pods get cleaned up between stages on kind-worker + - assert: + ## spencer apply pod 
on kind-worker + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: runtime-required + skyhook.nvidia.com/package: spencer-3.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'apply'): true + spec: + nodeName: kind-worker + - assert: + ## spencer config pod on kind-worker + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: runtime-required + skyhook.nvidia.com/package: spencer-3.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'config'): true + spec: + nodeName: kind-worker + - assert: + ## kind-worker complete, runtime-required taint removed + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker + labels: + skyhook.nvidia.com/status_runtime-required: complete + annotations: + skyhook.nvidia.com/status_runtime-required: complete + spec: + (!taints || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true - assert: - file: assert-node1-complete-node2-blocked.yaml + ## kind-worker2 still blocked with taint (proves per-node isolation) + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker2 + labels: + skyhook.nvidia.com/status_runtime-required: blocked + annotations: + skyhook.nvidia.com/status_runtime-required: blocked + spec: + taints: + - key: test-block + value: "true" + effect: NoSchedule + - key: skyhook.nvidia.com + value: runtime-required + effect: NoSchedule # Phase 4: Unblock kind-worker2 - name: unblock-node2 @@ -72,14 +151,70 @@ spec: # Phase 5: Assert node2 complete sequence (apply pod -> config pod -> node complete) - name: assert-node2-complete try: + ## sequential: pods get cleaned up between stages on kind-worker2 - assert: - file: assert-node2-complete.yaml + ## spencer apply pod on kind-worker2 + resource: + apiVersion: v1 + kind: 
Pod + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: runtime-required + skyhook.nvidia.com/package: spencer-3.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'apply'): true + spec: + nodeName: kind-worker2 + - assert: + ## spencer config pod on kind-worker2 + resource: + apiVersion: v1 + kind: Pod + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: runtime-required + skyhook.nvidia.com/package: spencer-3.2.3 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package").stage == 'config'): true + spec: + nodeName: kind-worker2 + - assert: + ## kind-worker2 complete, runtime-required taint removed + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker2 + labels: + skyhook.nvidia.com/status_runtime-required: complete + annotations: + skyhook.nvidia.com/status_runtime-required: complete + spec: + (!taints || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true # Phase 6: Assert final state (both nodes complete) - name: assert-final-state try: - assert: - file: assert.yaml + resource: + # This assertion validates per-node runtime-required behavior: + # - Each node with the test label should have status=complete + # - Each node's runtime-required taint should be removed independently + # - Both kind-worker and kind-worker2 are tested (multi-node isolation) + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/runtime-required-test: "true" + skyhook.nvidia.com/status_runtime-required: complete + annotations: + skyhook.nvidia.com/status_runtime-required: complete + spec: + # Verify the runtime-required taint has been removed from this node + (!taints || length(taints)==`0` || length(taints[?key == 'skyhook.nvidia.com' && effect == 'NoSchedule' && value == 'runtime-required'])==`0`): true + finally: - script: content: | diff --git 
a/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml deleted file mode 100644 index 390d9c16..00000000 --- a/k8s-tests/chainsaw/skyhook/runtime-required/skyhook.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - labels: - app.kubernetes.io/part-of: skyhook-operator - app.kubernetes.io/created-by: skyhook-operator - name: runtime-required -spec: - runtimeRequired: true - nodeSelectors: - matchLabels: - skyhook.nvidia.com/runtime-required-test: "true" - packages: - spencer: - version: "3.2.3" - image: ghcr.io/nvidia/skyhook/agentless - env: - - name: SLEEP_LEN - value: "2" diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml index cb279ba0..1070a126 100644 --- a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # @@ -24,7 +24,9 @@ spec: assert: 240s exec: 90s steps: - - try: + - name: deploy + description: Reset state, apply a skyhook with three packages, and verify all complete with correct metrics + try: - script: content: | ## remove annotation from last run diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml index 9f740e9a..b49eca90 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # @@ -34,7 +34,9 @@ spec: name: simple-update-skyhook format: yaml steps: - - try: + - name: deploy + description: Reset state, apply the skyhook, and assert it completes successfully + try: - script: content: | ## remove annotation/labels from last run @@ -43,7 +45,9 @@ spec: file: skyhook.yaml - assert: file: assert.yaml - - try: + - name: update + description: Patch the skyhook with new packages and assert the update completes + try: - patch: ## this test is very odd, it adds stuff... 
not makes it so file: update.yaml diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/README.md b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/README.md index 7453b2d6..2dc65d2b 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/README.md +++ b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/README.md @@ -21,7 +21,7 @@ Tests the operator's ability to handle state migration during operator version u ## Files -- `chainsaw-test.yaml` - Main test configuration +- `chainsaw-test.yaml` - Main test configuration with all assertions inline (nodes, skyhook status) for sequential ordering through pre-migration and post-migration states - `skyhook.yaml` - Skyhook definition ## Notes diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml deleted file mode 100644 index 62938c31..00000000 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/assert.yaml +++ /dev/null @@ -1,85 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - annotations: - skyhook.nvidia.com/status_skyhook-upgrade: complete - skyhook.nvidia.com/version_skyhook-upgrade: v0.5.0 - ("skyhook.nvidia.com/nodeState_skyhook-upgrade" && parse_json("skyhook.nvidia.com/nodeState_skyhook-upgrade")): - { - "foobar": { ## assert this bad format exists before migration - "name": "foobar", - "version": "1.1", - "stage": "config", - "state": "complete" - }, - "foobar|1.2": { - "name": "foobar", - "version": "1.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - } - } ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_skyhook-upgrade: complete - annotations: - ("skyhook.nvidia.com/nodeState_skyhook-upgrade" && parse_json("skyhook.nvidia.com/nodeState_skyhook-upgrade")): - { - "foobar|1.2": { - "name": "foobar", - "version": "1.2", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "config", - "state": "complete" - } - } - skyhook.nvidia.com/status_skyhook-upgrade: complete -status: - (conditions[?type == 'skyhook.nvidia.com/skyhook-upgrade/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/skyhook-upgrade/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: skyhook-upgrade -status: - status: complete - nodeState: - (values(@)): - - foobar|1.2: - name: foobar - state: complete - version: '1.2' - image: ghcr.io/nvidia/skyhook/agentless - stage: config - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml index 078e1cd5..f0c60d07 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml +++ 
b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml @@ -22,9 +22,11 @@ metadata: spec: skip: true timeouts: - assert: 300s + assert: 120s steps: - - try: + - name: upgrade migration + description: Seed nodes with old-format annotations, create skyhook, and verify the annotation migration completes + try: - script: content: | ## remove annotation from last run @@ -38,5 +40,79 @@ spec: skyhook.nvidia.com/version_skyhook-upgrade=v0.5.0 - create: file: skyhook.yaml + ## sequential: same node transitions from pre-migration to post-migration state - assert: - file: assert.yaml + ## node has old-format annotation before migration + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + annotations: + skyhook.nvidia.com/status_skyhook-upgrade: complete + skyhook.nvidia.com/version_skyhook-upgrade: v0.5.0 + ("skyhook.nvidia.com/nodeState_skyhook-upgrade" && parse_json("skyhook.nvidia.com/nodeState_skyhook-upgrade")): + { + "foobar": { + "name": "foobar", + "version": "1.1", + "stage": "config", + "state": "complete" + }, + "foobar|1.2": { + "name": "foobar", + "version": "1.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + } + } + - assert: + ## node post-migration: old format removed, only new format remains + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_skyhook-upgrade: complete + annotations: + ("skyhook.nvidia.com/nodeState_skyhook-upgrade" && parse_json("skyhook.nvidia.com/nodeState_skyhook-upgrade")): + { + "foobar|1.2": { + "name": "foobar", + "version": "1.2", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "config", + "state": "complete" + } + } + skyhook.nvidia.com/status_skyhook-upgrade: complete + status: + (conditions[?type == 'skyhook.nvidia.com/skyhook-upgrade/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 
'skyhook.nvidia.com/skyhook-upgrade/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: skyhook-upgrade + status: + status: complete + nodeState: + (values(@)): + - foobar|1.2: + name: foobar + state: complete + version: '1.2' + image: ghcr.io/nvidia/skyhook/agentless + stage: config + nodeStatus: + (values(@)): + - complete diff --git a/k8s-tests/chainsaw/skyhook/strict-order/README.md b/k8s-tests/chainsaw/skyhook/strict-order/README.md index ab13ce93..0185fb53 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/README.md +++ b/k8s-tests/chainsaw/skyhook/strict-order/README.md @@ -1,75 +1,77 @@ # Strict Order Test -## Purpose - -Validates per-node priority ordering where nodes process skyhooks in priority order **independently**. This test proves that the ordering is **per-node, not global** - one node can progress through multiple priorities while another node is still on an earlier priority. - -## Test Scenario - -The test uses **2 nodes throughout** to validate per-node behavior for all features (priority, pause, disable, waiting): - -### Phase 1: Setup and Block Node 2 -1. Reset state from previous runs -2. Label both `kind-worker` and `kind-worker2` with `strict-order-test=true` -3. Temporarily block `kind-worker2` with `test-block=true:NoSchedule` taint (to create timing difference) - -### Phase 2: Apply Skyhooks (Both Nodes Targeted, Worker Runs First) -4. Apply skyhooks: - - Priority 1: `strict-order-skyhook-zzz` (not paused, not disabled) - - Priority 2: `strict-order-skyhook-b` (paused initially) - - Priority 2: `strict-order-skyhook-c` (not paused, not disabled) - - Priority 2: `strict-order-skyhook-d` (disabled) -5. Validate metrics: 2 nodes targeted per skyhook -6. 
Result: worker completes priority 1 and reaches priority 2 (paused at b), worker2 can't start (blocked by taint) - -### Phase 3: Assert Worker at Priority 2, Worker2 Blocked at Priority 1 -7. **KEY ASSERTION with pod checks**: - - Worker has zzz pods (priority 1 complete) and is paused at b (priority 2) - - Worker2 has NO pods yet (blocked by taint at priority 1) - - This proves worker reached priority 2 while worker2 stuck at priority 1 -8. Validate metrics: per-node status counts showing one node complete, one blocked - -### Phase 4: Unpause and Unblock Simultaneously (CRITICAL TEST) -9. Unpause `strict-order-skyhook-b` via patch -10. Remove `test-block` taint from worker2 in same step -11. Both nodes now running concurrently at different priorities - -### Phase 5: Assert Concurrent Different Priorities (CRITICAL) -12. **CRITICAL ASSERTION with pod checks**: - - Worker has pods for priority 2 skyhooks (b or c) - - Worker2 has pods for priority 1 skyhook (zzz) - - **This definitively proves per-node ordering**: worker is ahead in priority queue during concurrent execution -13. In old global ordering, ALL nodes would need to complete priority 1 before ANY node could start priority 2 - -### Phase 6: Assert Both Nodes Complete -14. Wait for both nodes to complete all skyhooks -15. Assert final state: both nodes have zzz, b, c complete; d disabled -16. 
Validate metrics: 2 nodes complete for each skyhook - -## Key Features Tested - -- **Per-node priority ordering** (nodes don't wait for each other between priorities) -- **Concurrent execution at different priorities** (proves ordering is per-node, not global) -- **Alphabetical ordering for same-priority skyhooks** (b before c at priority 2) -- **Per-node pause behavior** (both nodes pause at priority 2, then both unpause independently) -- **Per-node disable behavior** (disabled skyhook doesn't block other skyhooks on any node) -- **Per-node waiting status** (based on completion of higher-priority skyhooks on THAT node) -- **Blocked status** (node can't start due to external conditions like taints) +Tests both `sequencing: node` (default, per-node ordering) and `sequencing: all` (opt-in, global ordering). + +## Skyhook Layout + +``` +Priority 1: zzz sequencing: node (default) +Priority 2: gate sequencing: all (global sync point) +Priority 3: aa, b (paused), c, d (disabled) +``` + +## Timeline + +A taint on `kind-worker2` blocks it from running pods initially. Time flows left to right. Both nodes run in parallel after the taint is removed. 
+ +``` + assert 3 assert 3b rm taint assert 5 unpause b assert 7 + ▼ ▼ ▼ ▼ ▼ ▼ + +kind-worker zzz ■■■■■■■■■■■ gate ■■■■■■■ ···· waiting (gate holds) ···· b ■■■■ c ■■■■ ✓ + ╲ +kind-worker2 ░░░░░░ blocked ░░░░░░░ zzz ■■■■■■■■■ gate ■■■■■ b ■■■■ c ■■■■ ✓ + ▲ + gate on worker + zzz on worker2 + run CONCURRENTLY here + (per-node ordering in action) + +■ = running/complete ░ = blocked (taint) · = waiting (gate sequencing:all) +``` + +**Observed pod execution order** (from `kubectl get pods -w`): + +``` + worker: zzz apply → config → interrupt → post-interrupt + worker: gate apply ──┐ per-node: worker starts + worker2: zzz apply ──┘ concurrent after taint removed gate while worker2 on zzz + worker2: zzz config ──┐ + worker: gate config ──┘ concurrent + worker2: zzz interrupt → post-interrupt + worker2: gate apply → config worker waits (sequencing:all) + worker: b apply ──┐ + worker2: b apply ──┘ BOTH start b at same time gate released! + (b and c complete on both nodes) +``` + +**What each assertion checks:** + +- **Phase 3**: worker completed zzz, worker2 still blocked (taint). Proves `sequencing: node` — worker moved ahead. +- **Phase 3b**: worker past gate, b paused, worker2 still blocked. Proves `sequencing: all` — gate holds worker from priority 3. +- **Phase 5**: both nodes completed zzz + gate. b still paused. **Stable state** — no race. Proves gate released once both cleared it. +- **Phase 7**: everything complete. b before c (alphabetical). d disabled/skipped. 
+ +## What This Proves + +| Phase | Behavior | How | +|-------|----------|-----| +| 3 | **Per-node ordering** | worker moves from prio 1 → 2 while worker2 is stuck on prio 1 | +| 3b | **`sequencing: all` blocks** | worker finished gate but can't start prio 3 until worker2 catches up | +| 5 | **Gate releases** | both nodes past gate, b still paused = stable assertion point | +| 6-7 | **Alphabetical ordering** | b processes before c at priority 3 | +| 7 | **Disabled skip** | d is skipped on both nodes | ## Files -- `chainsaw-test.yaml` - Main test configuration with 6 phases -- `skyhook.yaml` - Multiple skyhooks with different priorities targeting both nodes -- `skyhook-pause-update.yaml` - Skyhooks with pause annotation removed -- `skyhook-disable-update.yaml` - Skyhooks with disable annotation (optional, not used in current test) -- `assert-node1-priority1-complete-node2-blocked.yaml` - Phase 3 assertion: worker at priority 2 (paused), worker2 blocked at priority 1 -- `assert-concurrent-different-priorities.yaml` - Phase 5 CRITICAL assertion: worker completed priority 1, worker2 still on priority 1 (proves per-node ordering via node annotations) -- `assert-multiple-skyhooks-in-progress.yaml` - Phase 5 CRITICAL assertion: zzz (priority 1) and b (priority 2) both in_progress simultaneously (proves concurrent execution at different priorities - impossible in old global ordering) -- `assert-both-nodes-complete.yaml` - Phase 6 assertion: both nodes complete all skyhooks - -## Notes - -- Uses dedicated label `skyhook.nvidia.com/strict-order-test=true` on both worker nodes throughout entire test -- Worker2 is blocked only temporarily to create timing difference, not excluded from test -- This is fundamentally a **per-node ordering test** using 2 nodes throughout -- **Critical proof**: Pod assertions show worker running priority 2 pods while worker2 runs priority 1 pods concurrently +| File | Phase | Purpose | +|------|-------|---------| +| `chainsaw-test.yaml` | — | Main 
test, 7 phases | +| `skyhook.yaml` | 2 | Skyhook definitions (zzz, gate, aa, b, c, d) | +| `skyhook-pause-update.yaml` | 6 | Unpause b | +| `skyhook-disable-update.yaml` | — | Removed in this change (disable state patches were unused) | +| `assert-node1-priority1-complete-node2-blocked.yaml` | 3 | worker past zzz, worker2 blocked | +| `assert-gate-blocks-worker.yaml` | 3b | worker past gate, b paused, worker2 still blocked | +| `assert-gate-released-b-paused.yaml` | 5 | Both past gate, b still paused (stable) | +| `assert-both-nodes-complete.yaml` | 7 | Everything complete | +| `assert-concurrent-different-priorities.yaml` | — | Renamed to `assert-gate-released-b-paused.yaml` | +| `assert-multiple-skyhooks-in-progress.yaml` | — | Removed, no longer referenced by the test | diff --git a/k8s-tests/chainsaw/skyhook/strict-order/assert-both-nodes-complete.yaml b/k8s-tests/chainsaw/skyhook/strict-order/assert-both-nodes-complete.yaml index 9965a4f2..0f227952 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/assert-both-nodes-complete.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/assert-both-nodes-complete.yaml @@ -23,6 +23,8 @@ metadata: skyhook.nvidia.com/strict-order-test: "true" annotations: skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: complete skyhook.nvidia.com/status_strict-order-skyhook-b: complete skyhook.nvidia.com/status_strict-order-skyhook-c: complete skyhook.nvidia.com/status_strict-order-skyhook-d: disabled @@ -36,6 +38,8 @@ metadata: skyhook.nvidia.com/strict-order-test: "true" annotations: skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: complete skyhook.nvidia.com/status_strict-order-skyhook-b: complete skyhook.nvidia.com/status_strict-order-skyhook-c: complete skyhook.nvidia.com/status_strict-order-skyhook-d: 
disabled diff --git a/k8s-tests/chainsaw/skyhook/strict-order/assert-gate-blocks-worker.yaml b/k8s-tests/chainsaw/skyhook/strict-order/assert-gate-blocks-worker.yaml new file mode 100644 index 00000000..181117fb --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/strict-order/assert-gate-blocks-worker.yaml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Phase 3b ASSERTION: proves sequencing:all blocks priority 3 +# +# aa is the FIRST skyhook alphabetically at priority 3, NOT paused, NOT disabled, +# yet its skyhook-level status is "waiting". The ONLY possible reason aa is waiting +# is because gate (priority 2, sequencing:all) is not globally complete — worker2 +# hasn't finished gate yet. This is the unambiguous proof that sequencing:all works. + +# 1. Worker: completed zzz, gate is in progress, aa is waiting, b is paused +apiVersion: v1 +kind: Node +metadata: + name: kind-worker + labels: + skyhook.nvidia.com/strict-order-test: "true" + annotations: + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: in_progress + skyhook.nvidia.com/status_strict-order-skyhook-aa: waiting + skyhook.nvidia.com/status_strict-order-skyhook-b: paused + +--- +# 2. 
PROOF: aa (priority 3, first alphabetically, NOT paused, NOT disabled) is waiting +# because gate's sequencing:all blocks it — no other explanation possible +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: strict-order-skyhook-aa +status: + status: waiting diff --git a/k8s-tests/chainsaw/skyhook/strict-order/assert-concurrent-different-priorities.yaml b/k8s-tests/chainsaw/skyhook/strict-order/assert-gate-released-b-paused.yaml similarity index 52% rename from k8s-tests/chainsaw/skyhook/strict-order/assert-concurrent-different-priorities.yaml rename to k8s-tests/chainsaw/skyhook/strict-order/assert-gate-released-b-paused.yaml index 8523f20a..5dc5026f 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/assert-concurrent-different-priorities.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/assert-gate-released-b-paused.yaml @@ -14,15 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 +# Phase 5 CRITICAL ASSERTION: sequencing: all gate has released, b still paused +# Both nodes completed zzz and gate. The gate's sequencing: all held worker back +# until worker2 caught up. Now both are past the gate, but b is still paused. +# This is a stable, deterministic state — no race conditions. -# Phase 5 CRITICAL ASSERTION: Node annotations proving per-node ordering during concurrent execution -# Worker has completed priority 1 and reached priority 2 -# Worker2 is still on priority 1 (at any stage) -# This definitively proves ordering is per-node, not global - -# 1. Worker node has completed priority 1 (zzz) +# 1. 
Worker node: completed zzz and gate, aa is complete, b is paused apiVersion: v1 kind: Node metadata: @@ -31,8 +28,11 @@ metadata: skyhook.nvidia.com/strict-order-test: "true" annotations: skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: complete + skyhook.nvidia.com/status_strict-order-skyhook-b: paused --- -# 2. Worker2 node has NOT completed priority 1 yet (proves it's behind worker) +# 2. Worker2 node: completed zzz and gate, aa is complete, b is paused apiVersion: v1 kind: Node metadata: @@ -40,4 +40,7 @@ metadata: labels: skyhook.nvidia.com/strict-order-test: "true" annotations: - ("skyhook.nvidia.com/status_strict-order-skyhook-zzz" != "complete"): true + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: complete + skyhook.nvidia.com/status_strict-order-skyhook-b: paused diff --git a/k8s-tests/chainsaw/skyhook/strict-order/assert-multiple-skyhooks-in-progress.yaml b/k8s-tests/chainsaw/skyhook/strict-order/assert-multiple-skyhooks-in-progress.yaml deleted file mode 100644 index 2ae6145c..00000000 --- a/k8s-tests/chainsaw/skyhook/strict-order/assert-multiple-skyhooks-in-progress.yaml +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# CRITICAL PROOF: Priority 1 and Priority 2 skyhooks running concurrently -# This proves per-node ordering - in the old global ordering system, ALL nodes had to finish priority 1 before ANY node could start priority 2 -# Now we can have zzz (priority 1) and b (priority 2) running at the same time because nodes work independently -# Worker2 is on priority 1 (zzz) while Worker is on priority 2 (b) - -# Skyhook zzz (priority 1) transitioning to in_progress (worker2 was just unblocked) -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-zzz -status: - (status == 'blocked' || status == 'in_progress'): true ---- -# Skyhook b (priority 2) has nodes in_progress - concurrent with zzz! 
-apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-b -status: - status: in_progress ---- -# Skyhook c (priority 2) is waiting - all nodes waiting for b to complete (alphabetical ordering) -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-c -status: - status: waiting ---- -# Skyhook d (priority 2) is disabled - doesn't block anything -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-d -status: - status: disabled ---- -# Skyhook c (priority 2) is waiting - alphabetically after b within same priority -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-c -status: - status: waiting ---- -# Skyhook d (priority 2) is disabled - doesn't block other skyhooks -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: strict-order-skyhook-d -status: - status: disabled diff --git a/k8s-tests/chainsaw/skyhook/strict-order/assert-node1-priority1-complete-node2-blocked.yaml b/k8s-tests/chainsaw/skyhook/strict-order/assert-node1-priority1-complete-node2-blocked.yaml index ecd2587e..876c04d2 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/assert-node1-priority1-complete-node2-blocked.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/assert-node1-priority1-complete-node2-blocked.yaml @@ -17,10 +17,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -# Phase 2 CRITICAL ASSERTION: Node annotations proving per-node priority ordering -# Worker reaches priority 2 (paused at b) while worker2 is blocked at priority 1 +# Phase 3 CRITICAL ASSERTION: Node annotations proving per-node priority ordering +# Worker reaches priority 2 (gate) while worker2 is blocked at priority 1 -# 1. Worker node at priority 2 (paused at b) +# 1. 
Worker node completed priority 1 (zzz) and moved to priority 2 (gate) apiVersion: v1 kind: Node metadata: @@ -29,7 +29,6 @@ metadata: skyhook.nvidia.com/strict-order-test: "true" annotations: skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete - skyhook.nvidia.com/status_strict-order-skyhook-b: paused --- # 2. Worker2 blocked on priority 1 (proves per-node ordering) apiVersion: v1 diff --git a/k8s-tests/chainsaw/skyhook/strict-order/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/strict-order/chainsaw-test.yaml index 549f7030..c09c634c 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/chainsaw-test.yaml @@ -21,7 +21,7 @@ metadata: name: strict-order-skyhook spec: timeouts: - assert: 60s + assert: 90s exec: 90s catch: ## if errors, print the most important info - get: @@ -43,6 +43,8 @@ spec: fi ## remove annotation/labels from last run ../skyhook-cli reset strict-order-skyhook-zzz --confirm 2>/dev/null || true + ../skyhook-cli reset strict-order-skyhook-gate --confirm 2>/dev/null || true + ../skyhook-cli reset strict-order-skyhook-aa --confirm 2>/dev/null || true ../skyhook-cli reset strict-order-skyhook-b --confirm 2>/dev/null || true ../skyhook-cli reset strict-order-skyhook-c --confirm 2>/dev/null || true ../skyhook-cli reset strict-order-skyhook-d --confirm 2>/dev/null || true @@ -61,6 +63,8 @@ spec: content: | ## Validate 2 nodes are targeted per skyhook ../../metrics_test.py skyhook_node_target_count 2 -t skyhook_name=strict-order-skyhook-zzz + ../../metrics_test.py skyhook_node_target_count 2 -t skyhook_name=strict-order-skyhook-gate + ../../metrics_test.py skyhook_node_target_count 2 -t skyhook_name=strict-order-skyhook-aa ../../metrics_test.py skyhook_node_target_count 2 -t skyhook_name=strict-order-skyhook-b ../../metrics_test.py skyhook_node_target_count 2 -t skyhook_name=strict-order-skyhook-c ../../metrics_test.py skyhook_node_target_count 2 -t 
skyhook_name=strict-order-skyhook-d @@ -68,7 +72,7 @@ spec: ../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-d -t status=disabled ../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-b -t status=paused - # Phase 3: Assert per-node ordering - worker at priority 2 (paused), worker2 blocked at priority 1 + # Phase 3: Assert per-node ordering - worker completes zzz, moves to gate; worker2 blocked at priority 1 - name: assert-worker-priority2-worker2-blocked try: - sleep: @@ -80,37 +84,109 @@ spec: ## Validate per-node status counts ../../metrics_test.py skyhook_node_status_count 1 -t skyhook_name=strict-order-skyhook-zzz -t status=complete ../../metrics_test.py skyhook_node_status_count 1 -t skyhook_name=strict-order-skyhook-zzz -t status=blocked - ../../metrics_test.py skyhook_node_status_count 1 -t skyhook_name=strict-order-skyhook-b -t status=paused ../../metrics_test.py skyhook_package_state_count 1 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-zzz -t state=complete - - # Phase 4: Unpause b AND unblock worker2 simultaneously (CRITICAL TEST) - - name: unpause-and-unblock-simultaneously + + + # Unblock worker2 only (keep b paused) so it can catch up through zzz and gate + - name: unblock-worker2 try: - - patch: - file: skyhook-pause-update.yaml + - assert: + file: assert-gate-blocks-worker.yaml - script: content: | - ## Remove blocking taint from worker2 - both nodes now running at different priorities + ## Remove blocking taint from worker2 - it will catch up through zzz then gate kubectl taint node kind-worker2 test-block- 2>/dev/null || true - - sleep: - duration: 2s - - # Phase 5: Assert concurrent different priorities (CRITICAL - proves per-node ordering) - - name: assert-concurrent-different-priorities + + + # Phase 3b: Assert sequencing: all gate blocks worker from priority 3 + # Worker completes gate (priority 2) but must WAIT for worker2 to also complete gate + # This 
proves sequencing: all enforces global ordering + - name: assert-gate-blocks-worker + timeouts: + assert: 120s ## zzz needs time to complete before gate state can be verified + try: + - script: + content: | + ## b is paused, aa is waiting (blocked by gate's sequencing: all — the key proof) + ../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-b -t status=paused + ../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-aa -t status=waiting + - assert: + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker2 + annotations: + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: in_progress + skyhook.nvidia.com/status_strict-order-skyhook-aa: waiting + skyhook.nvidia.com/status_strict-order-skyhook-b: paused + - assert: + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker + annotations: + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: waiting + skyhook.nvidia.com/status_strict-order-skyhook-b: paused + - assert: + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker + annotations: + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: in_progress + skyhook.nvidia.com/status_strict-order-skyhook-b: paused + - assert: + resource: + apiVersion: v1 + kind: Node + metadata: + name: kind-worker2 + labels: + skyhook.nvidia.com/strict-order-test: "true" + annotations: + skyhook.nvidia.com/status_strict-order-skyhook-zzz: complete + skyhook.nvidia.com/status_strict-order-skyhook-gate: complete + skyhook.nvidia.com/status_strict-order-skyhook-aa: in_progress + skyhook.nvidia.com/status_strict-order-skyhook-b: paused + + # Phase 5: Assert gate releases - both nodes 
complete gate, b still paused (STABLE STATE) + # This is the key assertion: sequencing: all held worker back until worker2 caught up, + # and now both have cleared the gate. b is still paused so this state is deterministic. + - name: assert-gate-released-b-paused timeouts: assert: 120s try: - assert: - file: assert-concurrent-different-priorities.yaml + file: assert-gate-released-b-paused.yaml + - sleep: + duration: 5s - assert: - file: assert-multiple-skyhooks-in-progress.yaml + file: assert-gate-released-b-paused.yaml - script: content: | - ## Validate metrics showing nodes at different priorities concurrently - ../../metrics_test.py skyhook_status 0 -t skyhook_name=strict-order-skyhook-b -t status=paused - - # Phase 6: Assert both nodes complete all skyhooks + ## Both nodes should have completed gate + ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-gate -t status=complete + ## b should still be paused + ../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-b -t status=paused + + # Phase 6: Unpause b - both nodes can now process priority 3 + - name: unpause-b + try: + - patch: + file: skyhook-pause-update.yaml + + # Phase 7: Assert both nodes complete all skyhooks - name: assert-both-nodes-complete + timeouts: + assert: 180s try: - assert: file: assert-both-nodes-complete.yaml @@ -118,10 +194,14 @@ spec: content: | ## Validate final state - both nodes complete ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-zzz -t status=complete + ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-gate -t status=complete + ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-aa -t status=complete ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-b -t status=complete ../../metrics_test.py skyhook_node_status_count 2 -t skyhook_name=strict-order-skyhook-c -t status=complete 
../../metrics_test.py skyhook_status 1 -t skyhook_name=strict-order-skyhook-d -t status=disabled ../../metrics_test.py skyhook_package_state_count 2 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-zzz -t state=complete + ../../metrics_test.py skyhook_package_state_count 2 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-gate -t state=complete + ../../metrics_test.py skyhook_package_state_count 2 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-aa -t state=complete ../../metrics_test.py skyhook_package_state_count 2 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-b -t state=complete ../../metrics_test.py skyhook_package_state_count 2 -t package_name=foobar -t package_version=1.2 -t skyhook_name=strict-order-skyhook-c -t state=complete finally: diff --git a/k8s-tests/chainsaw/skyhook/strict-order/skyhook-disable-update.yaml b/k8s-tests/chainsaw/skyhook/strict-order/skyhook-disable-update.yaml deleted file mode 100644 index 1349230a..00000000 --- a/k8s-tests/chainsaw/skyhook/strict-order/skyhook-disable-update.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - labels: - app.kubernetes.io/part-of: skyhook-operator - app.kubernetes.io/created-by: skyhook-operator - name: strict-order-skyhook-zzz - annotations: - skyhook.nvidia.com/disable: "false" - skyhook.nvidia.com/pause: "false" -spec: - priority: 1 - nodeSelectors: - matchLabels: - skyhook.nvidia.com/strict-order-test: "true" - packages: - foobar: - interrupt: - type: service - services: [cron] - version: "1.2" - image: ghcr.io/nvidia/skyhook/agentless - env: - - name: SLEEP_LEN - value: "3" ## making faster so the test works for asserting node condition ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - labels: - app.kubernetes.io/part-of: skyhook-operator - app.kubernetes.io/created-by: skyhook-operator - name: strict-order-skyhook-b - annotations: - skyhook.nvidia.com/disable: "false" - skyhook.nvidia.com/pause: "false" -spec: - priority: 2 - nodeSelectors: - matchLabels: - skyhook.nvidia.com/strict-order-test: "true" - packages: - foobar: - interrupt: - type: service - services: [cron] - version: "1.2" - image: ghcr.io/nvidia/skyhook/agentless - env: - - name: SLEEP_LEN - value: "3" ## making faster so the test works for asserting node condition ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - labels: - app.kubernetes.io/part-of: skyhook-operator - app.kubernetes.io/created-by: skyhook-operator - name: strict-order-skyhook-c - annotations: - skyhook.nvidia.com/disable: "false" - skyhook.nvidia.com/pause: "false" -spec: - priority: 2 - nodeSelectors: - matchLabels: - skyhook.nvidia.com/strict-order-test: "true" - packages: - foobar: - version: "1.2" - image: ghcr.io/nvidia/skyhook/agentless - env: - - name: SLEEP_LEN - value: "1" ## making faster so the test works for asserting node condition ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - labels: - app.kubernetes.io/part-of: skyhook-operator - 
app.kubernetes.io/created-by: skyhook-operator - name: strict-order-skyhook-d - annotations: - skyhook.nvidia.com/disable: "false" - skyhook.nvidia.com/pause: "false" -spec: - priority: 2 - nodeSelectors: - matchLabels: - skyhook.nvidia.com/strict-order-test: "true" - packages: - foobar: - version: "1.2" - image: ghcr.io/nvidia/skyhook/agentless - env: - - name: SLEEP_LEN - value: "1" ## making faster so the test works for asserting node condition diff --git a/k8s-tests/chainsaw/skyhook/strict-order/skyhook-pause-update.yaml b/k8s-tests/chainsaw/skyhook/strict-order/skyhook-pause-update.yaml index c6b67140..bfa2ffd7 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/skyhook-pause-update.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/skyhook-pause-update.yaml @@ -42,6 +42,29 @@ spec: --- apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: strict-order-skyhook-aa + annotations: + skyhook.nvidia.com/disable: "false" + skyhook.nvidia.com/pause: "false" +spec: + priority: 3 + nodeSelectors: + matchLabels: + skyhook.nvidia.com/strict-order-test: "true" + packages: + foobar: + version: "1.2" + image: ghcr.io/nvidia/skyhook/agentless + env: + - name: SLEEP_LEN + value: "1" +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook metadata: labels: app.kubernetes.io/part-of: skyhook-operator @@ -51,7 +74,7 @@ metadata: skyhook.nvidia.com/disable: "false" skyhook.nvidia.com/pause: "false" spec: - priority: 2 + priority: 3 nodeSelectors: matchLabels: skyhook.nvidia.com/strict-order-test: "true" @@ -62,7 +85,7 @@ spec: services: [cron] version: "1.2" image: ghcr.io/nvidia/skyhook/agentless - env: + env: - name: SLEEP_LEN value: "3" ## making faster so the test works for asserting node condition --- @@ -77,7 +100,7 @@ metadata: skyhook.nvidia.com/disable: "false" skyhook.nvidia.com/pause: "false" spec: - priority: 2 + priority: 3 nodeSelectors: 
matchLabels: skyhook.nvidia.com/strict-order-test: "true" @@ -100,7 +123,7 @@ metadata: skyhook.nvidia.com/disable: "true" skyhook.nvidia.com/pause: "false" spec: - priority: 2 + priority: 3 nodeSelectors: matchLabels: skyhook.nvidia.com/strict-order-test: "true" diff --git a/k8s-tests/chainsaw/skyhook/strict-order/skyhook.yaml b/k8s-tests/chainsaw/skyhook/strict-order/skyhook.yaml index 9f2e4c96..64ae51a8 100644 --- a/k8s-tests/chainsaw/skyhook/strict-order/skyhook.yaml +++ b/k8s-tests/chainsaw/skyhook/strict-order/skyhook.yaml @@ -37,7 +37,53 @@ spec: image: ghcr.io/nvidia/skyhook/agentless env: - name: SLEEP_LEN - value: "1" ## making faster so the test works for asserting node condition + value: "3" ## making faster so the test works for asserting node condition +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: strict-order-skyhook-gate + annotations: + skyhook.nvidia.com/disable: "false" +spec: + priority: 2 + sequencing: all ## global sync point: all nodes must complete before priority 3 starts + nodeSelectors: + matchLabels: + skyhook.nvidia.com/strict-order-test: "true" + packages: + foobar: + version: "1.2" + image: ghcr.io/nvidia/skyhook/agentless + env: + - name: SLEEP_LEN + value: "3" ## fast so the test works for asserting node condition +--- +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + labels: + app.kubernetes.io/part-of: skyhook-operator + app.kubernetes.io/created-by: skyhook-operator + name: strict-order-skyhook-aa + annotations: + skyhook.nvidia.com/disable: "false" + skyhook.nvidia.com/pause: "false" +spec: + priority: 3 + nodeSelectors: + matchLabels: + skyhook.nvidia.com/strict-order-test: "true" + packages: + foobar: + version: "1.2" + image: ghcr.io/nvidia/skyhook/agentless + env: + - name: SLEEP_LEN + value: "1" --- apiVersion: skyhook.nvidia.com/v1alpha1 kind: Skyhook @@ -50,7 +96,7 
@@ metadata: skyhook.nvidia.com/disable: "false" skyhook.nvidia.com/pause: "true" spec: - priority: 2 + priority: 3 nodeSelectors: matchLabels: skyhook.nvidia.com/strict-order-test: "true" @@ -61,7 +107,7 @@ spec: services: [cron] version: "1.2" image: ghcr.io/nvidia/skyhook/agentless - env: + env: - name: SLEEP_LEN value: "5" ## making faster so the test works for asserting node condition --- @@ -76,7 +122,7 @@ metadata: skyhook.nvidia.com/disable: "false" skyhook.nvidia.com/pause: "false" spec: - priority: 2 + priority: 3 nodeSelectors: matchLabels: skyhook.nvidia.com/strict-order-test: "true" @@ -84,7 +130,7 @@ spec: foobar: version: "1.2" image: ghcr.io/nvidia/skyhook/agentless - env: + env: - name: SLEEP_LEN value: "5" ## making faster so the test works for asserting node condition --- @@ -99,7 +145,7 @@ metadata: skyhook.nvidia.com/disable: "true" skyhook.nvidia.com/pause: "false" spec: - priority: 2 + priority: 3 nodeSelectors: matchLabels: skyhook.nvidia.com/strict-order-test: "true" @@ -107,6 +153,6 @@ spec: foobar: version: "1.2" image: ghcr.io/nvidia/skyhook/agentless - env: + env: - name: SLEEP_LEN value: "5" ## making faster so the test works for asserting node condition diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml index d52807d2..2c166df0 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 # # @@ -34,7 +34,9 @@ spec: name: taint-scheduling format: yaml steps: - - try: + - name: taint scheduling + description: Taint nodes, create skyhook, verify it is blocked, then update to tolerate taints and verify completion + try: - script: content: | ## remove annotation from last run diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/README.md b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/README.md index 62dc292f..5caa03b9 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/README.md +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/README.md @@ -30,8 +30,9 @@ Validates that uninstall and upgrade modes work correctly when packages are remo ## Files -- `chainsaw-test.yaml` - Main test configuration +- `chainsaw-test.yaml` - Main test configuration with lifecycle assertions inline (nodes, pods, skyhook status) for sequential ordering - `skyhook.yaml` - Initial skyhook with packages - `update.yaml` - Update with version changes - `update-no-packages.yaml` - Final update removing all packages -- `assert*.yaml` - State assertions for each phase +- `assert-cm-install.yaml` - ConfigMap assertions after initial install (parallel) +- `assert-cm-update.yaml` - ConfigMap assertions after upgrade (parallel) diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-cm-update.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-cm-update.yaml new file mode 100644 index 00000000..3568d025 --- /dev/null +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-cm-update.yaml @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ConfigMaps after upgrade (parallel: distinct steady-state resources) +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: uninstall-upgrade-skyhook-nullptr-2.0.1 + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: uninstall-upgrade-skyhook +data: + game.properties: | + changed + ui.properties: | + changed +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: uninstall-upgrade-skyhook-dogs-1.2.5 + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + blockOwnerDeletion: true + controller: true + kind: Skyhook + name: uninstall-upgrade-skyhook +data: + game.properties: | + changed + ui.properties: | + changed diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-install.yaml similarity index 100% rename from k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert.yaml rename to k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-install.yaml diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml deleted file mode 100644 index 9b61bfa0..00000000 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update-no-packages.yaml +++ /dev/null @@ -1,151 +0,0 @@ -# 
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress - annotations: - ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): - { - "dogs|1.2.5": { - "name": "dogs", - "version": "1.2.5", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "uninstall", - "state": "in_progress" - }, - "nullptr|2.0.1": { - "name": "nullptr", - "version": "2.0.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "uninstall", - "state": "in_progress" - }, - } - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress - (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist -status: - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - skyhook.nvidia.com/package: dogs-1.2.5 - annotations: - ("skyhook.nvidia.com/package" && 
parse_json("skyhook.nvidia.com/package")): - { - "name": "dogs", - "version": "1.2.5", - "skyhook": "uninstall-upgrade-skyhook", - "stage": "uninstall", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: uninstall-upgrade-skyhook -spec: - initContainers: - - name: dogs-init - - name: dogs-uninstall - args: - ([0]): uninstall - ([1]): /root - (length(@)): 3 - - name: dogs-uninstallcheck - args: - ([0]): uninstall-check - ([1]): /root - (length(@)): 3 ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - skyhook.nvidia.com/package: nullptr-2.0.1 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "nullptr", - "version": "2.0.1", - "skyhook": "uninstall-upgrade-skyhook", - "stage": "uninstall", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: uninstall-upgrade-skyhook -spec: - initContainers: - - name: nullptr-init - - name: nullptr-uninstall - args: - ([0]): uninstall - ([1]): /root - (length(@)): 3 - - name: nullptr-uninstallcheck - args: - ([0]): uninstall-check - ([1]): /root - (length(@)): 3 ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete - annotations: - skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook: '{}' - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete -status: - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: uninstall-upgrade-skyhook -status: - status: 
complete - observedGeneration: 6 - completeNodes: 1/1 - packageList: "" - nodesInProgress: 0 - nodeState: - (values(@)): {} - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml deleted file mode 100644 index db0454f2..00000000 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/assert-update.yaml +++ /dev/null @@ -1,268 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress - annotations: - ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): - { - "dogs|1.2.5": { - "name": "dogs", - "version": "1.2.5", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "uninstall", - "state": "skipped" - }, - "dogs|1.2.6": { - "name": "dogs", - "version": "1.2.6", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "uninstall", - "state": "in_progress" - }, - "nullptr|2.0.1": { - "name": "nullptr", - "version": "2.0.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "upgrade", - "state": "in_progress" - }, - "cats|6.2.0": { - "name": "cats", - "version": "6.2.0", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "uninstall", - "state": "in_progress" - } - } - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress -spec: - taints: - - effect: NoSchedule - key: node.kubernetes.io/unschedulable -status: - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): - - reason: "Incomplete" - status: "True" - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - skyhook.nvidia.com/package: cats-6.2.0 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "cats", - "version": "6.2.0", - "skyhook": "uninstall-upgrade-skyhook", - "stage": "uninstall", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: uninstall-upgrade-skyhook -spec: - initContainers: - - name: cats-init - - name: cats-uninstall - args: - ([0]): uninstall - ([1]): 
/root - (length(@)): 3 - - name: cats-uninstallcheck - args: - ([0]): uninstall-check - ([1]): /root - (length(@)): 3 ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - skyhook.nvidia.com/package: dogs-1.2.6 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "dogs", - "version": "1.2.6", - "skyhook": "uninstall-upgrade-skyhook", - "stage": "uninstall", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: uninstall-upgrade-skyhook -spec: - initContainers: - - name: dogs-init - - name: dogs-uninstall - args: - ([0]): uninstall - ([1]): /root - (length(@)): 3 - - name: dogs-uninstallcheck - args: - ([0]): uninstall-check - ([1]): /root - (length(@)): 3 ---- -kind: Pod -apiVersion: v1 -metadata: - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - skyhook.nvidia.com/package: nullptr-2.0.1 - annotations: - ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): - { - "name": "nullptr", - "version": "2.0.1", - "skyhook": "uninstall-upgrade-skyhook", - "stage": "upgrade", - "image": "ghcr.io/nvidia/skyhook/agentless" - } - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - kind: Skyhook - name: uninstall-upgrade-skyhook -spec: - initContainers: - - name: nullptr-init - - name: nullptr-upgrade - args: - ([0]): upgrade - ([1]): /root - (length(@)): 3 - - name: nullptr-upgradecheck - args: - ([0]): upgrade-check - ([1]): /root - (length(@)): 3 ---- -apiVersion: v1 -kind: Node -metadata: - labels: - skyhook.nvidia.com/test-node: skyhooke2e - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete - annotations: - ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): - { - "dogs|1.2.5": { - "name": "dogs", - 
"version": "1.2.5", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - }, - "nullptr|2.0.1": { - "name": "nullptr", - "version": "2.0.1", - "image": "ghcr.io/nvidia/skyhook/agentless", - "stage": "post-interrupt", - "state": "complete" - } - } - skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete -status: - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): - - reason: "Complete" - status: "False" - (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): - - reason: "Not Erroring" - status: "False" ---- -apiVersion: skyhook.nvidia.com/v1alpha1 -kind: Skyhook -metadata: - name: uninstall-upgrade-skyhook -status: - status: complete - observedGeneration: 4 - completeNodes: 1/1 - packageList: dogs:1.2.5,nullptr:2.0.1 - nodesInProgress: 0 - nodeState: - (values(@)): - - dogs|1.2.5: - name: dogs - state: complete - version: '1.2.5' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - nullptr|2.0.1: - name: nullptr - state: complete - version: '2.0.1' - stage: post-interrupt - image: ghcr.io/nvidia/skyhook/agentless - nodeStatus: - # grab values should be one and is complete - (values(@)): - - complete ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: uninstall-upgrade-skyhook-nullptr-2.0.1 - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: uninstall-upgrade-skyhook -data: - game.properties: | - changed - ui.properties: | - changed ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: uninstall-upgrade-skyhook-dogs-1.2.5 - namespace: skyhook - labels: - skyhook.nvidia.com/name: uninstall-upgrade-skyhook - ownerReferences: - - apiVersion: skyhook.nvidia.com/v1alpha1 - blockOwnerDeletion: true - controller: true - kind: Skyhook - name: uninstall-upgrade-skyhook -data: - 
game.properties: | - changed - ui.properties: | - changed diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml index e2f31630..a1c8c22e 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml @@ -21,7 +21,7 @@ metadata: name: uninstall-upgrade-skyhook spec: timeouts: - assert: 420s + assert: 180s catch: ## if errors, print the most important info - get: apiVersion: v1 @@ -34,24 +34,405 @@ spec: name: uninstall-upgrade-skyhook format: yaml steps: - - try: + - name: initial install + description: Reset node state, create skyhook with packages, and verify initial installation completes + try: - script: content: | ## remove annotation/labels from last run ../skyhook-cli reset uninstall-upgrade-skyhook --confirm 2>/dev/null || true - create: file: skyhook.yaml + ## parallel: all docs checked simultaneously — in_progress and complete Node checks + ## match the same node at different lifecycle points, works because Chainsaw retries + ## until all docs match at once - assert: - file: assert.yaml - - try: + file: assert-install.yaml + - name: upgrade + description: Update skyhook with new package versions and verify the upgrade completes + try: - update: file: update.yaml + ## sequential: node/pods transition through uninstall/upgrade stages - assert: - file: assert-update.yaml - - try: + ## node in_progress with uninstall/upgrade stages + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "dogs|1.2.5": { + "name": "dogs", + "version": "1.2.5", + "image": "ghcr.io/nvidia/skyhook/agentless", + 
"stage": "uninstall", + "state": "skipped" + }, + "dogs|1.2.6": { + "name": "dogs", + "version": "1.2.6", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + "nullptr|2.0.1": { + "name": "nullptr", + "version": "2.0.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "upgrade", + "state": "in_progress" + }, + "cats|6.2.0": { + "name": "cats", + "version": "6.2.0", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + } + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + spec: + taints: + - effect: NoSchedule + key: node.kubernetes.io/unschedulable + status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## cats uninstall pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + skyhook.nvidia.com/package: cats-6.2.0 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "cats", + "version": "6.2.0", + "skyhook": "uninstall-upgrade-skyhook", + "stage": "uninstall", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: uninstall-upgrade-skyhook + spec: + initContainers: + - name: cats-init + - name: cats-uninstall + args: + ([0]): uninstall + ([1]): /root + (length(@)): 3 + - name: cats-uninstallcheck + args: + ([0]): uninstall-check + ([1]): /root + (length(@)): 3 + - assert: + ## dogs uninstall pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + skyhook.nvidia.com/package: dogs-1.2.6 + annotations: + 
("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "dogs", + "version": "1.2.6", + "skyhook": "uninstall-upgrade-skyhook", + "stage": "uninstall", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: uninstall-upgrade-skyhook + spec: + initContainers: + - name: dogs-init + - name: dogs-uninstall + args: + ([0]): uninstall + ([1]): /root + (length(@)): 3 + - name: dogs-uninstallcheck + args: + ([0]): uninstall-check + ([1]): /root + (length(@)): 3 + - assert: + ## nullptr upgrade pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + skyhook.nvidia.com/package: nullptr-2.0.1 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "nullptr", + "version": "2.0.1", + "skyhook": "uninstall-upgrade-skyhook", + "stage": "upgrade", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: uninstall-upgrade-skyhook + spec: + initContainers: + - name: nullptr-init + - name: nullptr-upgrade + args: + ([0]): upgrade + ([1]): /root + (length(@)): 3 + - name: nullptr-upgradecheck + args: + ([0]): upgrade-check + ([1]): /root + (length(@)): 3 + - assert: + ## node complete after upgrade + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "dogs|1.2.5": { + "name": "dogs", + "version": "1.2.5", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + }, + "nullptr|2.0.1": { + "name": "nullptr", + "version": "2.0.1", + "image": 
"ghcr.io/nvidia/skyhook/agentless", + "stage": "post-interrupt", + "state": "complete" + } + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete + status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## skyhook complete after upgrade + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: uninstall-upgrade-skyhook + status: + status: complete + observedGeneration: 4 + completeNodes: 1/1 + packageList: dogs:1.2.5,nullptr:2.0.1 + nodesInProgress: 0 + nodeState: + (values(@)): + - dogs|1.2.5: + name: dogs + state: complete + version: '1.2.5' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + nullptr|2.0.1: + name: nullptr + state: complete + version: '2.0.1' + stage: post-interrupt + image: ghcr.io/nvidia/skyhook/agentless + nodeStatus: + (values(@)): + - complete + ## parallel: configmaps are distinct steady-state resources + - assert: + file: assert-cm-update.yaml + - name: uninstall + description: Remove all packages from the skyhook and verify the uninstall completes + try: - sleep: duration: 5s - update: file: update-no-packages.yaml + ## sequential: node/pods transition through uninstall to empty state + - assert: + ## node in_progress with packages being uninstalled + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + annotations: + ("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook" && parse_json("skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook")): + { + "dogs|1.2.5": { + "name": "dogs", + "version": "1.2.5", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + "nullptr|2.0.1": { + "name": 
"nullptr", + "version": "2.0.1", + "image": "ghcr.io/nvidia/skyhook/agentless", + "stage": "uninstall", + "state": "in_progress" + }, + } + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: in_progress + (!taints || length(taints)==`0` || (taints && !not_null(taints))): true ## taints should be empty or not exist + status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Incomplete" + status: "True" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" + - assert: + ## dogs uninstall pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + skyhook.nvidia.com/package: dogs-1.2.5 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "dogs", + "version": "1.2.5", + "skyhook": "uninstall-upgrade-skyhook", + "stage": "uninstall", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: uninstall-upgrade-skyhook + spec: + initContainers: + - name: dogs-init + - name: dogs-uninstall + args: + ([0]): uninstall + ([1]): /root + (length(@)): 3 + - name: dogs-uninstallcheck + args: + ([0]): uninstall-check + ([1]): /root + (length(@)): 3 + - assert: + ## nullptr uninstall pod + resource: + kind: Pod + apiVersion: v1 + metadata: + namespace: skyhook + labels: + skyhook.nvidia.com/name: uninstall-upgrade-skyhook + skyhook.nvidia.com/package: nullptr-2.0.1 + annotations: + ("skyhook.nvidia.com/package" && parse_json("skyhook.nvidia.com/package")): + { + "name": "nullptr", + "version": "2.0.1", + "skyhook": "uninstall-upgrade-skyhook", + "stage": "uninstall", + "image": "ghcr.io/nvidia/skyhook/agentless" + } + ownerReferences: + - apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + name: uninstall-upgrade-skyhook + spec: + initContainers: 
+ - name: nullptr-init + - name: nullptr-uninstall + args: + ([0]): uninstall + ([1]): /root + (length(@)): 3 + - name: nullptr-uninstallcheck + args: + ([0]): uninstall-check + ([1]): /root + (length(@)): 3 + - assert: + ## node complete with empty nodeState + resource: + apiVersion: v1 + kind: Node + metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete + annotations: + skyhook.nvidia.com/nodeState_uninstall-upgrade-skyhook: '{}' + skyhook.nvidia.com/status_uninstall-upgrade-skyhook: complete + status: + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/NotReady']): + - reason: "Complete" + status: "False" + (conditions[?type == 'skyhook.nvidia.com/uninstall-upgrade-skyhook/Erroring']): + - reason: "Not Erroring" + status: "False" - assert: - file: assert-update-no-packages.yaml + ## skyhook complete with no packages + resource: + apiVersion: skyhook.nvidia.com/v1alpha1 + kind: Skyhook + metadata: + name: uninstall-upgrade-skyhook + status: + status: complete + observedGeneration: 6 + completeNodes: 1/1 + packageList: "" + nodesInProgress: 0 + nodeState: + (values(@)): {} + nodeStatus: + (values(@)): + - complete diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml index 1b4cf3a3..aaeac785 100644 --- a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml @@ -34,7 +34,9 @@ spec: name: validate-packages format: yaml steps: - - try: + - name: invalid package + description: Create skyhook with an invalid package and verify the validation error is reported + try: - script: content: | ## remove annotation/labels from last run @@ -45,7 +47,9 @@ spec: duration: 5s - assert: file: assert.yaml - - try: + - name: fix package + description: Update skyhook with a valid package and verify it completes successfully + try: 
- update: file: update.yaml - assert: diff --git a/operator/api/v1alpha1/skyhook_types.go b/operator/api/v1alpha1/skyhook_types.go index a9e07f28..3ef756fd 100644 --- a/operator/api/v1alpha1/skyhook_types.go +++ b/operator/api/v1alpha1/skyhook_types.go @@ -41,15 +41,10 @@ type SkyhookSpec struct { // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster // Important: Run "make" to regenerate code after modifying this file - // Serial tells skyhook if it allowed to run in parallel or not when applying packages + // Serial tells skyhook whether packages must be applied serially rather than in parallel. If true, the operator will run one package at a time. //+kubebuilder:default=false Serial bool `json:"serial,omitempty"` - // Pause halt the operator from proceeding. THIS is for admin use to stop skyhook if there is an issue or - // concert without needing to delete to ad in discovery of the issue. - //+kubebuilder:default=false - Pause bool `json:"pause,omitempty"` - // PodNonInterruptLabels are a set of labels we want to monitor pods for whether they Interruptible PodNonInterruptLabels metav1.LabelSelector `json:"podNonInterruptLabels,omitempty"` @@ -87,6 +82,30 @@ type SkyhookSpec struct { //+kubebuilder:validation:Minimum=1 //+kubebuilder:default=200 Priority int `json:"priority,omitempty"` + + // Sequencing controls whether priority ordering is enforced globally or per-node. + // "node" (default): a node can proceed past this skyhook independently once it completes on that node. + // "all": all nodes must complete this skyhook before any node starts the next priority.
+ //+kubebuilder:validation:Enum=all;node + //+kubebuilder:default="node" + Sequencing SequencingMode `json:"sequencing,omitempty"` +} + +// SequencingMode controls whether priority ordering is enforced globally or per-node +type SequencingMode string + +const ( + // SequencingNode allows each node to progress past this skyhook independently + // as soon as it completes on that node (per-node ordering) + SequencingNode SequencingMode = "node" + // SequencingAll requires all nodes to complete this skyhook before any node + // starts the next priority level (global ordering) + SequencingAll SequencingMode = "all" +) + +// IsPerNodeSequencing returns true if this skyhook uses per-node priority ordering +func (spec *SkyhookSpec) IsPerNodeSequencing() bool { + return spec.Sequencing != SequencingAll } // BuildGraph turns packages in the a graph of dependencies diff --git a/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml b/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml index 82a617ab..89f2f3bb 100644 --- a/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml +++ b/operator/config/crd/bases/skyhook.nvidia.com_skyhooks.yaml @@ -495,12 +495,6 @@ spec: type: object description: Packages are the DAG of packages to be applied to nodes. type: object - pause: - default: false - description: |- - Pause halt the operator from proceeding. THIS is for admin use to stop skyhook if there is an issue or - concert without needing to delete to ad in discovery of the issue. - type: boolean podNonInterruptLabels: description: PodNonInterruptLabels are a set of labels we want to monitor pods for whether they Interruptible @@ -559,10 +553,20 @@ spec: description: This skyhook is required to have been completed before any workloads can start type: boolean + sequencing: + default: node + description: |- + Sequencing controls whether priority ordering is enforced globally or per-node.
+ "node" (default): a node can proceed past this skyhook independently once it completes on that node. + "all": all nodes must complete this skyhook before any node starts the next priority. + enum: + - all + - node + type: string serial: default: false - description: Serial tells skyhook if it allowed to run in parallel - or not when applying packages + description: Serial tells skyhook whether packages must be applied serially + rather than in parallel. If true, the operator will run one package at a time. type: boolean type: object status: diff --git a/operator/internal/controller/cluster_state_v2.go b/operator/internal/controller/cluster_state_v2.go index 1734a7b0..802f70e3 100644 --- a/operator/internal/controller/cluster_state_v2.go +++ b/operator/internal/controller/cluster_state_v2.go @@ -324,8 +324,9 @@ func GetNextSkyhook(skyhooks []SkyhookNodes) SkyhookNodes { } // IsNodeReadyForSkyhook checks if a node has completed all higher-priority skyhooks. -// This enables per-node priority ordering: a node can proceed to lower-priority skyhooks -// as soon as higher-priority ones complete on that specific node, regardless of other nodes.
+// The check depends on each predecessor's sequencing mode: +// - sequencing: node (default) — checks per-node completion on that specific node +// - sequencing: all — checks global completion (all nodes must be done) func IsNodeReadyForSkyhook(nodeName string, skyhook SkyhookNodes, allSkyhooks []SkyhookNodes) bool { targetPriority := skyhook.GetSkyhook().Spec.Priority targetName := skyhook.GetSkyhook().Name @@ -346,15 +347,49 @@ func IsNodeReadyForSkyhook(nodeName string, skyhook SkyhookNodes, allSkyhooks [] continue } - // Check if this higher-priority skyhook targets this node and is incomplete on this node - _, nodeInOther := other.GetNode(nodeName) - if nodeInOther != nil && !nodeInOther.IsComplete() { - return false + if other.GetSkyhook().Spec.IsPerNodeSequencing() { + // Per-node: check if THIS node completed the predecessor + _, nodeInOther := other.GetNode(nodeName) + if nodeInOther != nil && !nodeInOther.IsComplete() { + return false + } + } else { + // Global (sequencing: all): predecessor must be globally complete + if !other.IsComplete() { + return false + } } } return true } +// isBlockedByGlobalPredecessor checks if any higher-priority skyhook with sequencing: all +// is not yet globally complete, which would block this skyhook. 
+func isBlockedByGlobalPredecessor(skyhook SkyhookNodes, allSkyhooks []SkyhookNodes) bool { + targetPriority := skyhook.GetSkyhook().Spec.Priority + targetName := skyhook.GetSkyhook().Name + + for _, other := range allSkyhooks { + if other.IsDisabled() { + continue + } + + otherPriority := other.GetSkyhook().Spec.Priority + otherName := other.GetSkyhook().Name + + if otherPriority > targetPriority || + (otherPriority == targetPriority && otherName >= targetName) { + continue + } + + // Only sequencing: all predecessors create skyhook-level waiting + if !other.GetSkyhook().Spec.IsPerNodeSequencing() && !other.IsComplete() { + return true + } + } + return false +} + // SkyhookNodes wraps the skyhook and nodes that it pertains too type SkyhookNodes interface { CollectNodeStatus() v1alpha1.Status @@ -817,8 +852,12 @@ func IntrospectSkyhook(skyhook SkyhookNodes, allSkyhooks []SkyhookNodes) bool { case hasMissingPolicy: collectNodeStatus = v1alpha1.StatusBlocked - // Per-node priority: Don't set skyhook-level waiting status - // Individual nodes will be set to waiting in IntrospectNode if they're blocked by higher-priority skyhooks + default: + // Check if any higher-priority skyhook with sequencing: all blocks this skyhook globally + if isBlockedByGlobalPredecessor(skyhook, allSkyhooks) { + collectNodeStatus = v1alpha1.StatusWaiting + } + // Per-node waiting (for sequencing: node predecessors) is handled in IntrospectNode } } else if hasMissingPolicy { // Even if all nodes are complete, if policy is missing, we should still be blocked @@ -934,7 +973,8 @@ func IntrospectNode(node wrapper.SkyhookNode, skyhook SkyhookNodes, allSkyhooks } // Check per-node priority: if this node is waiting on higher-priority skyhooks - if !node.IsComplete() && !IsNodeReadyForSkyhook(node.GetNode().Name, skyhook, allSkyhooks) { + // Skip when skyhook is already globally waiting (sequencing: all) — nodes inherit via isSkyhookControlledNodeStatus + if !node.IsComplete() && skyhookStatus != 
v1alpha1.StatusWaiting && !IsNodeReadyForSkyhook(node.GetNode().Name, skyhook, allSkyhooks) { if nodeStatus != v1alpha1.StatusWaiting { node.SetStatus(v1alpha1.StatusWaiting) } diff --git a/operator/internal/controller/cluster_state_v2_test.go b/operator/internal/controller/cluster_state_v2_test.go index 6e54c092..f582a946 100644 --- a/operator/internal/controller/cluster_state_v2_test.go +++ b/operator/internal/controller/cluster_state_v2_test.go @@ -276,6 +276,111 @@ var _ = Describe("IsNodeReadyForSkyhook", func() { // skyhook_a should be ready (no higher priority or earlier name) Expect(IsNodeReadyForSkyhook("node-1", skyhookA, allSkyhooks)).To(BeTrue()) }) + + // sequencing: all tests + makeSkyhookNodesMockWithSequencing := func(name string, priority int, sequencing v1alpha1.SequencingMode, nodeCompletions map[string]bool, complete bool) *skyhookNodesMock.MockSkyhookNodes { + mock := skyhookNodesMock.NewMockSkyhookNodes(GinkgoT()) + + skyhook := &v1alpha1.Skyhook{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.SkyhookSpec{Priority: priority, Sequencing: sequencing}, + } + mock.EXPECT().GetSkyhook().Return(wrapper.NewSkyhookWrapper(skyhook)).Maybe() + mock.EXPECT().IsDisabled().Return(false).Maybe() + mock.EXPECT().IsComplete().Return(complete).Maybe() + + for nodeName, isComplete := range nodeCompletions { + nodeWrapper := wrapperMock.NewMockSkyhookNode(GinkgoT()) + nodeWrapper.EXPECT().IsComplete().Return(isComplete).Maybe() + mock.EXPECT().GetNode(nodeName).Return(v1alpha1.StatusComplete, wrapper.SkyhookNode(nodeWrapper)).Maybe() + } + + return mock + } + + It("sequencing: all should block when predecessor is not globally complete", func() { + // skyhook_a (priority 1, sequencing: all): node-1 complete, node-2 incomplete → not globally complete + skyhookA := makeSkyhookNodesMockWithSequencing("skyhook-a", 1, v1alpha1.SequencingAll, map[string]bool{"node-1": true, "node-2": false}, false) + skyhookB := 
makeSkyhookNodesMockWithSequencing("skyhook-b", 2, v1alpha1.SequencingNode, map[string]bool{"node-1": false, "node-2": false}, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + + // node-1 completed skyhook_a, but sequencing: all means ALL nodes must complete + // Since node-2 is incomplete on skyhook_a, node-1 should NOT be ready for skyhook_b + Expect(IsNodeReadyForSkyhook("node-1", skyhookB, allSkyhooks)).To(BeFalse()) + }) + + It("sequencing: all should allow when predecessor is globally complete", func() { + skyhookA := makeSkyhookNodesMockWithSequencing("skyhook-a", 1, v1alpha1.SequencingAll, map[string]bool{"node-1": true, "node-2": true}, true) + skyhookB := makeSkyhookNodesMockWithSequencing("skyhook-b", 2, v1alpha1.SequencingNode, map[string]bool{"node-1": false, "node-2": false}, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + + // Both nodes complete on skyhook_a, so node-1 should be ready for skyhook_b + Expect(IsNodeReadyForSkyhook("node-1", skyhookB, allSkyhooks)).To(BeTrue()) + }) + + It("mixed sequencing: node predecessor allows per-node, all predecessor blocks globally", func() { + // skyhook_a (priority 1, sequencing: node): node-1 complete, node-2 incomplete + skyhookA := makeSkyhookNodesMockWithSequencing("skyhook-a", 1, v1alpha1.SequencingNode, map[string]bool{"node-1": true, "node-2": false}, false) + // skyhook_gate (priority 2, sequencing: all): node-1 complete, node-2 incomplete → not globally complete + skyhookGate := makeSkyhookNodesMockWithSequencing("skyhook-gate", 2, v1alpha1.SequencingAll, map[string]bool{"node-1": true, "node-2": false}, false) + // skyhook_c (priority 3) + skyhookC := makeSkyhookNodesMockWithSequencing("skyhook-c", 3, v1alpha1.SequencingNode, map[string]bool{"node-1": false, "node-2": false}, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookGate, skyhookC} + + // node-1 is ready for skyhook_gate (per-node on skyhook_a, and node-1 completed it) + 
Expect(IsNodeReadyForSkyhook("node-1", skyhookGate, allSkyhooks)).To(BeTrue()) + // node-1 is NOT ready for skyhook_c (sequencing: all on gate, node-2 incomplete on gate) + Expect(IsNodeReadyForSkyhook("node-1", skyhookC, allSkyhooks)).To(BeFalse()) + }) +}) + +var _ = Describe("isBlockedByGlobalPredecessor", func() { + makeSkyhookNodesMockForBlocked := func(name string, priority int, sequencing v1alpha1.SequencingMode, complete bool, disabled bool) *skyhookNodesMock.MockSkyhookNodes { + mock := skyhookNodesMock.NewMockSkyhookNodes(GinkgoT()) + skyhook := &v1alpha1.Skyhook{ + ObjectMeta: metav1.ObjectMeta{Name: name}, + Spec: v1alpha1.SkyhookSpec{Priority: priority, Sequencing: sequencing}, + } + mock.EXPECT().GetSkyhook().Return(wrapper.NewSkyhookWrapper(skyhook)).Maybe() + mock.EXPECT().IsDisabled().Return(disabled).Maybe() + mock.EXPECT().IsComplete().Return(complete).Maybe() + return mock + } + + It("should return true when a sequencing: all predecessor is incomplete", func() { + skyhookA := makeSkyhookNodesMockForBlocked("skyhook-a", 1, v1alpha1.SequencingAll, false, false) + skyhookB := makeSkyhookNodesMockForBlocked("skyhook-b", 2, v1alpha1.SequencingNode, false, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + Expect(isBlockedByGlobalPredecessor(skyhookB, allSkyhooks)).To(BeTrue()) + }) + + It("should return false when a sequencing: all predecessor is complete", func() { + skyhookA := makeSkyhookNodesMockForBlocked("skyhook-a", 1, v1alpha1.SequencingAll, true, false) + skyhookB := makeSkyhookNodesMockForBlocked("skyhook-b", 2, v1alpha1.SequencingNode, false, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + Expect(isBlockedByGlobalPredecessor(skyhookB, allSkyhooks)).To(BeFalse()) + }) + + It("should return false when predecessor uses sequencing: node", func() { + skyhookA := makeSkyhookNodesMockForBlocked("skyhook-a", 1, v1alpha1.SequencingNode, false, false) + skyhookB := makeSkyhookNodesMockForBlocked("skyhook-b", 2, 
v1alpha1.SequencingNode, false, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + Expect(isBlockedByGlobalPredecessor(skyhookB, allSkyhooks)).To(BeFalse()) + }) + + It("should skip disabled sequencing: all predecessors", func() { + skyhookA := makeSkyhookNodesMockForBlocked("skyhook-a", 1, v1alpha1.SequencingAll, false, true) + skyhookB := makeSkyhookNodesMockForBlocked("skyhook-b", 2, v1alpha1.SequencingNode, false, false) + + allSkyhooks := []SkyhookNodes{skyhookA, skyhookB} + Expect(isBlockedByGlobalPredecessor(skyhookB, allSkyhooks)).To(BeFalse()) + }) }) var _ = Describe("BuildState ordering", func() { diff --git a/operator/internal/wrapper/node.go b/operator/internal/wrapper/node.go index b241de6f..910796db 100644 --- a/operator/internal/wrapper/node.go +++ b/operator/internal/wrapper/node.go @@ -32,44 +32,75 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// there are 2 interface to reflect functions that need a skyhook and node -// and ones that just need a node +// There are two interfaces: one for code that needs both a Skyhook and a Node, +// and one for code that only needs a Node (e.g. to avoid extra API calls). -// SkyhookNode wraps a node with a supporting skyhook +// SkyhookNode wraps a Node with its associated Skyhook. Use it when you need +// full Skyhook spec and graph to drive sequencing, status, and conditions. type SkyhookNode interface { SkyhookNodeOnly + + // GetSkyhook returns the Skyhook associated with this node, or nil if only a name was set. GetSkyhook() *Skyhook + // GetComplete returns the list of package names that are complete on this node. GetComplete() []string + // SetStatus updates the node's Skyhook status in annotations/labels and on the Skyhook; uncordons if Complete. SetStatus(status v1alpha1.Status) + // IsComplete reports whether all packages for this Skyhook are complete on this node. IsComplete() bool + // ProgressSkipped marks progress as skipped for sequencing (e.g. 
when dependencies are not run). ProgressSkipped() + // IsPackageComplete reports whether the given package is complete on this node (considering interrupts and updates). IsPackageComplete(_package v1alpha1.Package) bool + // RunNext returns the next package(s) that should run according to the dependency graph and current completion. RunNext() ([]*v1alpha1.Package, error) + // NextStage returns the next stage for the given package given its current state and config. NextStage(_package *v1alpha1.Package) *v1alpha1.Stage + // HasInterrupt reports whether the package has an interrupt (e.g. wait-for-input) that blocks progression. HasInterrupt(_package v1alpha1.Package) bool + // UpdateCondition refreshes Skyhook-related node conditions (NotReady and Erroring) from current package state. UpdateCondition() + // HasSkyhookAnnotations reports whether the node has any Skyhook operator annotations. HasSkyhookAnnotations() bool } -// SkyhookNodeOnly wraps the node with just a skyhook name +// SkyhookNodeOnly wraps a Node with only a Skyhook name. Use it when you need +// node-level operations (state, taints, cordon, version) without loading the +// full Skyhook; helps reduce API calls and avoids stubbing full Skyhooks. type SkyhookNodeOnly interface { + // Status returns the current Skyhook status for this node from annotations, or StatusUnknown if unset. Status() v1alpha1.Status - // SetStatus is in both interfaces, does more if skyhook is not nil + // SetStatus updates the node's Skyhook status in annotations/labels and on the Skyhook; uncordons if Complete. SetStatus(status v1alpha1.Status) + // PackageStatus returns the status for the named package if present in node state. PackageStatus(name string) (*v1alpha1.PackageStatus, bool) + // SetVersion writes the current operator version into the node's annotations for this Skyhook. SetVersion() + // GetVersion returns the operator version stored in the node's annotations for this Skyhook. 
GetVersion() string + // Migrate updates stored node state/annotations to the current schema when the operator version changes. Migrate(logger logr.Logger) error + // State returns the persisted NodeState for this node (from memory or annotations). State() (v1alpha1.NodeState, error) + // SetState persists the given NodeState to the node's annotations and in-memory state. SetState(state v1alpha1.NodeState) error + // RemoveState removes persisted state for the given package ref and updates annotations. RemoveState(_package v1alpha1.PackageRef) error + // Upsert creates or updates state for a package (image, state, stage, restarts, containerSHA) and persists it. Upsert(_package v1alpha1.PackageRef, image string, state v1alpha1.State, stage v1alpha1.Stage, restarts int32, containerSHA string) error + // GetNode returns the underlying Kubernetes Node. GetNode() *corev1.Node + // Taint adds a NoSchedule taint with the given key and the Skyhook name as value. Taint(key string) + // RemoveTaint removes the taint with the given key from the node. RemoveTaint(key string) + // Cordon marks the node unschedulable and records the cordon in annotations for this Skyhook. Cordon() + // Uncordon marks the node schedulable and removes this Skyhook's cordon annotation if present. Uncordon() + // Reset clears Skyhook-related state and annotations so the node can be reconfigured from scratch. Reset() + // Changed reports whether the node has in-memory changes that need to be written back to the API. Changed() bool } @@ -90,7 +121,7 @@ func NewSkyhookNodeOnly(node *corev1.Node, skyhookName string) (SkyhookNodeOnly, return ret, nil } -// Convert will upgrade this to be the full interface if you have a skyhook +// Convert upgrades a SkyhookNodeOnly to a full SkyhookNode when a Skyhook object is available. 
func Convert(node SkyhookNodeOnly, skyhook *v1alpha1.Skyhook) (SkyhookNode, error) { ret := node.(*skyhookNode) ret.skyhook = &Skyhook{Skyhook: skyhook} @@ -105,6 +136,7 @@ func Convert(node SkyhookNodeOnly, skyhook *v1alpha1.Skyhook) (SkyhookNode, erro return ret, nil } +// NewSkyhookNode creates a full SkyhookNode from a Node and a Skyhook (node + graph + name). func NewSkyhookNode(node *corev1.Node, skyhook *v1alpha1.Skyhook) (SkyhookNode, error) { t, err := NewSkyhookNodeOnly(node, skyhook.Name) @@ -124,16 +156,17 @@ type skyhookNode struct { updated bool } -// GetSkyhook implements sskyhookNode. +// GetSkyhook returns the Skyhook associated with this node, or nil if only a name was set. func (node *skyhookNode) GetSkyhook() *Skyhook { return node.skyhook } -// GetNode implements sskyhookNode. +// GetNode returns the underlying Kubernetes Node. func (node *skyhookNode) GetNode() *corev1.Node { return node.Node } +// SetStatus updates the node's Skyhook status in annotations/labels and on the Skyhook status; also uncordons if status is Complete. func (node *skyhookNode) SetStatus(status v1alpha1.Status) { s, ok := node.Annotations[fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhookName)] @@ -159,6 +192,7 @@ func (node *skyhookNode) SetStatus(status v1alpha1.Status) { } } +// Status returns the current Skyhook status for this node from annotations, or StatusUnknown if unset. func (node *skyhookNode) Status() v1alpha1.Status { status, ok := node.Annotations[fmt.Sprintf("%s/status_%s", v1alpha1.METADATA_PREFIX, node.skyhookName)] if !ok { @@ -167,6 +201,7 @@ func (node *skyhookNode) Status() v1alpha1.Status { return v1alpha1.GetStatus(status) } +// State returns the persisted NodeState for this node (from memory or annotations). 
func (node *skyhookNode) State() (v1alpha1.NodeState, error) { if node.nodeState != nil { @@ -190,6 +225,7 @@ func (node *skyhookNode) State() (v1alpha1.NodeState, error) { return ret, nil } +// PackageStatus returns the status for the named package if present in node state. func (node *skyhookNode) PackageStatus(name string) (*v1alpha1.PackageStatus, bool) { packageStatus := node.nodeState.Get(name) if packageStatus != nil { @@ -199,6 +235,7 @@ func (node *skyhookNode) PackageStatus(name string) (*v1alpha1.PackageStatus, bo return nil, false } +// SetVersion writes the current operator version into the node's annotations for this Skyhook. func (node *skyhookNode) SetVersion() { current := node.GetVersion() @@ -218,6 +255,7 @@ func (node *skyhookNode) SetVersion() { node.updated = true } +// GetVersion returns the operator version stored in the node's annotations for this Skyhook. func (node *skyhookNode) GetVersion() string { version, ok := node.Annotations[fmt.Sprintf("%s/version_%s", v1alpha1.METADATA_PREFIX, node.skyhookName)] if !ok { @@ -226,6 +264,7 @@ func (node *skyhookNode) GetVersion() string { return version } +// Migrate updates stored node state/annotations to the current schema when the operator version changes. func (node *skyhookNode) Migrate(logger logr.Logger) error { from := node.GetVersion() @@ -251,6 +290,7 @@ func (node *skyhookNode) Migrate(logger logr.Logger) error { return nil } +// SetState persists the given NodeState to the node's annotations and in-memory state. func (node *skyhookNode) SetState(state v1alpha1.NodeState) error { if node == nil || state == nil { return nil @@ -275,6 +315,7 @@ func (node *skyhookNode) SetState(state v1alpha1.NodeState) error { return nil } +// RemoveState removes persisted state for the given package ref and updates annotations. 
func (node *skyhookNode) RemoveState(_package v1alpha1.PackageRef) error { changed := node.nodeState.RemoveState(_package) if changed { @@ -284,6 +325,7 @@ func (node *skyhookNode) RemoveState(_package v1alpha1.PackageRef) error { return nil } +// Upsert creates or updates state for a package (image, state, stage, restarts, containerSHA) and persists it. func (node *skyhookNode) Upsert(_package v1alpha1.PackageRef, image string, state v1alpha1.State, stage v1alpha1.Stage, restarts int32, containerSHA string) error { changed := node.nodeState.Upsert(_package, image, state, stage, restarts, containerSHA) if changed { @@ -296,18 +338,22 @@ func (node *skyhookNode) Upsert(_package v1alpha1.PackageRef, image string, stat return nil } +// IsPackageComplete reports whether the given package is complete on this node (considering interrupts and updates). func (node *skyhookNode) IsPackageComplete(_package v1alpha1.Package) bool { return node.nodeState.IsPackageComplete(_package, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) } +// IsComplete reports whether all packages for this Skyhook are complete on this node. func (node *skyhookNode) IsComplete() bool { return node.nodeState.IsComplete(node.skyhook.Spec.Packages, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) } +// GetComplete returns the list of package names that are complete on this node. func (node *skyhookNode) GetComplete() []string { return node.nodeState.GetComplete(node.skyhook.Spec.Packages, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) } +// ProgressSkipped marks progress as skipped for sequencing (e.g. when dependencies are not run). 
func (node *skyhookNode) ProgressSkipped() { if node.nodeState.ProgressSkipped(node.skyhook.Spec.Packages, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) { node.skyhook.Updated = true @@ -315,6 +361,7 @@ func (node *skyhookNode) ProgressSkipped() { } } +// RunNext returns the next package(s) that should run according to the dependency graph and current completion. func (node *skyhookNode) RunNext() ([]*v1alpha1.Package, error) { complete := node.GetComplete() @@ -334,18 +381,22 @@ func (node *skyhookNode) RunNext() ([]*v1alpha1.Package, error) { return toRun, nil } +// NextStage returns the next stage for the given package given its current state and config. func (node *skyhookNode) NextStage(_package *v1alpha1.Package) *v1alpha1.Stage { return node.nodeState.NextStage(_package, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) } +// Changed reports whether the node has in-memory changes that need to be written back to the API. func (node *skyhookNode) Changed() bool { return node.updated } +// HasInterrupt reports whether the package has an interrupt (e.g. wait-for-input) that blocks progression. func (node *skyhookNode) HasInterrupt(_package v1alpha1.Package) bool { return node.nodeState.HasInterrupt(_package, node.skyhook.GetConfigInterrupts(), node.skyhook.GetConfigUpdates()) } +// Taint adds a NoSchedule taint with the given key and the Skyhook name as value. func (node *skyhookNode) Taint(key string) { // dont add it if it exists already, dups will error @@ -367,6 +418,7 @@ func (node *skyhookNode) Taint(key string) { node.updated = true } +// RemoveTaint removes the taint with the given key from the node. func (node *skyhookNode) RemoveTaint(key string) { if len(node.Spec.Taints) == 0 { @@ -397,6 +449,7 @@ func (node *skyhookNode) HasSkyhookAnnotations() bool { return false } +// Cordon marks the node unschedulable and records the cordon in annotations for this Skyhook. 
func (node *skyhookNode) Cordon() { _, ok := node.Annotations[fmt.Sprintf("%s/cordon_%s", v1alpha1.METADATA_PREFIX, node.skyhookName)] if !node.Spec.Unschedulable || !ok { @@ -406,6 +459,7 @@ func (node *skyhookNode) Cordon() { } } +// Uncordon marks the node schedulable and removes this Skyhook's cordon annotation if present. func (node *skyhookNode) Uncordon() { // if we hold a cordon remove it, also we dont want to remove a cordon if we dont have one... @@ -417,6 +471,7 @@ func (node *skyhookNode) Uncordon() { } } +// Reset clears Skyhook-related state and annotations so the node can be reconfigured from scratch. func (node *skyhookNode) Reset() { delete(node.skyhook.Status.NodeState, node.Name) @@ -432,6 +487,7 @@ func (node *skyhookNode) Reset() { node.updated = true } +// UpdateCondition refreshes Skyhook-related node conditions (NotReady and Erroring) from current package state. func (node *skyhookNode) UpdateCondition() { readyReason, errorReason := "Incomplete", "Not Erroring" errorCondFound, condFound := false, false diff --git a/operator/internal/wrapper/skyhook.go b/operator/internal/wrapper/skyhook.go index ef16a742..45c1b8cf 100644 --- a/operator/internal/wrapper/skyhook.go +++ b/operator/internal/wrapper/skyhook.go @@ -38,6 +38,9 @@ func NewSkyhookWrapper(s *v1alpha1.Skyhook) *Skyhook { type Skyhook struct { *v1alpha1.Skyhook // nodes []*corev1.Node + // Updated is set to true when the skyhook has been changed and needs to be + // written back to the API. Tracking this avoids unnecessary API calls when + // nothing has changed. Updated bool }