From fb33aefc939dab4d4c0181cf8ec1dc3833ab54ca Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Thu, 14 May 2026 18:51:37 +0800 Subject: [PATCH 1/5] [docs]: snapshot single-delete documentation set (overview, scenarios, P0 design) Add the snapshot-single-delete documentation tree under docs/: 00-overview.md .. 14-limitations-and-todos.md - top-level walkthrough of the single snapshot delete path: APIs/fields, call chain, direction resolution, scope/stepDelete, commit DB swap, pull DB rewrite, group passthrough, hypervisor online commit, agent qemu-img, storage backend matrix, sibling rebase, full rebase & cleanup, premium/CDP, limitations & TODOs. scenarios/00-index.md, 01-multi-children-stepDelete.md, 02-local-running-delete-mid-with-3-children.md - end-to-end traces keyed on real ZSV captures. P0-fix-isOnline-decoupling/ - design notes for the P0 fix that decouples alive-chain membership from vmState in VolumeTree: 00-overview, 01-control-plane-reconciler, 02-data-plane-validation, 03-flowchain-recovery, 04-testing-strategy, 05-rollout-plan, 06-invariants-and-scope. 
Resolves: ZSV-10538 Change-Id: I75666b6a646a64796163756f6f766c7368777569 --- docs/snapshot-single-delete/00-overview.md | 50 ++++ .../01-api-and-fields.md | 56 ++++ docs/snapshot-single-delete/02-call-chain.md | 47 ++++ .../03-direction-resolution.md | 101 +++++++ .../04-scope-and-stepDelete.md | 102 +++++++ .../05-commit-db-swap.md | 105 +++++++ .../06-pull-db-rewrite.md | 119 ++++++++ .../07-group-passthrough.md | 98 +++++++ .../08-hypervisor-online-commit.md | 128 +++++++++ .../09-agent-qemu-img.md | 100 +++++++ .../10-storage-backend-matrix.md | 108 ++++++++ .../11-sibling-rebase.md | 113 ++++++++ .../12-fullrebase-and-cleanup.md | 96 +++++++ .../13-premium-and-cdp.md | 53 ++++ .../14-limitations-and-todos.md | 77 ++++++ .../scenarios/00-index.md | 23 ++ .../scenarios/01-multi-children-stepDelete.md | 128 +++++++++ ...ocal-running-delete-mid-with-3-children.md | 259 +++++++++++++++++ .../00-overview.md | 93 +++++++ .../01-control-plane-reconciler.md | 157 +++++++++++ .../02-data-plane-validation.md | 260 ++++++++++++++++++ .../03-flowchain-recovery.md | 76 +++++ .../04-testing-strategy.md | 63 +++++ .../05-rollout-plan.md | 55 ++++ .../06-invariants-and-scope.md | 23 ++ 25 files changed, 2490 insertions(+) create mode 100644 docs/snapshot-single-delete/00-overview.md create mode 100644 docs/snapshot-single-delete/01-api-and-fields.md create mode 100644 docs/snapshot-single-delete/02-call-chain.md create mode 100644 docs/snapshot-single-delete/03-direction-resolution.md create mode 100644 docs/snapshot-single-delete/04-scope-and-stepDelete.md create mode 100644 docs/snapshot-single-delete/05-commit-db-swap.md create mode 100644 docs/snapshot-single-delete/06-pull-db-rewrite.md create mode 100644 docs/snapshot-single-delete/07-group-passthrough.md create mode 100644 docs/snapshot-single-delete/08-hypervisor-online-commit.md create mode 100644 docs/snapshot-single-delete/09-agent-qemu-img.md create mode 100644 
docs/snapshot-single-delete/10-storage-backend-matrix.md create mode 100644 docs/snapshot-single-delete/11-sibling-rebase.md create mode 100644 docs/snapshot-single-delete/12-fullrebase-and-cleanup.md create mode 100644 docs/snapshot-single-delete/13-premium-and-cdp.md create mode 100644 docs/snapshot-single-delete/14-limitations-and-todos.md create mode 100644 docs/snapshot-single-delete/scenarios/00-index.md create mode 100644 docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md create mode 100644 docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md create mode 100644 docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md diff --git a/docs/snapshot-single-delete/00-overview.md b/docs/snapshot-single-delete/00-overview.md new file mode 100644 index 00000000000..9dbddf12309 --- /dev/null +++ b/docs/snapshot-single-delete/00-overview.md @@ -0,0 +1,50 @@ +# 单快照节点删除(scope=single)— 总览 + +> 需求:ZSV-5799 "支持删除快照不删除链" +> 关联 MR:zstack#7674 / premium#10776 / zstack-utility#5743 +> 入口 API:`APIDeleteVolumeSnapshotGroupMsg`(含 `direction` + `scope` 字段) + +--- + +## 文档索引 + +| 文档 | 内容 | +|---|---| +| [01-api-and-fields.md](01-api-and-fields.md) | API 入口、字段、枚举定义 | +| [02-call-chain.md](02-call-chain.md) | 处理链路总览(Group → Tree → Storage) | +| 
[03-direction-resolution.md](03-direction-resolution.md) | `resolveDirection()` 决策表与 fromVOs 构建 | +| [04-scope-and-stepDelete.md](04-scope-and-stepDelete.md) | scope 分支与 stepDelete 递归 | +| [05-commit-db-swap.md](05-commit-db-swap.md) | Commit 路径 DB 翻转(最关键) | +| [06-pull-db-rewrite.md](06-pull-db-rewrite.md) | Pull / pullToVolume DB 改写 | +| [07-group-passthrough.md](07-group-passthrough.md) | Group 透传与并发、失败聚合 | +| [08-hypervisor-online-commit.md](08-hypervisor-online-commit.md) | 在线 libvirt blockCommit + pivot | +| [09-agent-qemu-img.md](09-agent-qemu-img.md) | agent 端 qemu-img 三种命令对比 | +| [10-storage-backend-matrix.md](10-storage-backend-matrix.md) | Local/NFS/SMP/SharedBlock/Ceph 后端差异 | +| [11-sibling-rebase.md](11-sibling-rebase.md) | 分叉链兄弟节点 rebase | +| [12-fullrebase-and-cleanup.md](12-fullrebase-and-cleanup.md) | fullRebase 树根删除与残留清理 | +| [13-premium-and-cdp.md](13-premium-and-cdp.md) | Premium / CDP / 灾备兼容性 | +| [14-limitations-and-todos.md](14-limitations-and-todos.md) | 已知限制 / TODO / FIXME | + +--- + +## 一图概览 + +``` +[祖父] ── [待删节点 X] ── [子 Y] ── ... 
+ │ + ┌──────────┴───────────┐ + │ scope=single │ + │ direction=commit │ 在线VM 且 X≠latest + │ → Y 差量写入 X 文件 │ + │ → DB: 互换 path, Y.parent=X.parent + │ + │ direction=pull │ 离线 或 X=latest + │ → X 差量合并入 Y,Y 的 backing 改指祖父(rebase) + │ → DB: Y.parent = X.parent +``` + +## 仓库根 + +- `/d/0zw/zw/zstack/` —— 开源主库 +- `/d/0zw/zw/premium/` —— Premium(独立 git) +- `/d/0zw/zw/zstack-utility/` —— Python agent diff --git a/docs/snapshot-single-delete/01-api-and-fields.md b/docs/snapshot-single-delete/01-api-and-fields.md new file mode 100644 index 00000000000..fd7a6f34034 --- /dev/null +++ b/docs/snapshot-single-delete/01-api-and-fields.md @@ -0,0 +1,56 @@ +# 01 — API 入口与字段定义 + +## 1.1 `APIDeleteVolumeSnapshotGroupMsg`(快照组删除) + +**文件**:`header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java:24` + +```java +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; + +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; // 默认保留旧行为 +``` + +REST 路径:`DELETE /volume-snapshots/group/{uuid}` + +## 1.2 `APIDeleteVolumeSnapshotMsg`(单快照删除) + +**文件**:`header/.../APIDeleteVolumeSnapshotMsg.java:49` + +```java +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; + +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; // 默认 chain,向后兼容 +``` + +REST 路径:`DELETE /volume-snapshots/{uuid}` + +## 1.3 枚举类 + +### `DeleteVolumeSnapshotDirection` — `header/.../DeleteVolumeSnapshotDirection.java:3` + +| 值 | 语义 | +|---|---| +| `Pull("pull")` | 下拉方向:父快照内容合入子快照 | +| `Commit("commit")` | 上提方向:子快照内容合入父快照 | +| `Auto("auto")` | 系统自动判断 | + +### `DeleteVolumeSnapshotScope` — `header/.../DeleteVolumeSnapshotScope.java:3` + +| 值 | 语义 | +|---|---| +| `Single("single")` | 只删除当前单节点,保留整条链 | +| `Chain("chain")` | 删除当前节点及所有后代(旧默认) | +| `Auto("auto")` | 系统自动判断(实际等同 single) | + +## 1.4 传递结构体 + 
+`VolumeSnapshotDeletionStructs` — `header/.../VolumeSnapshotDeletionStructs.java:5` +跨层透传 `direction + scope + 快照列表`。 + +## 1.5 兼容性 + +API 默认值 `scope = "chain"` 保持向后兼容;**必须显式传 `scope=single`** 才会触发新功能。 diff --git a/docs/snapshot-single-delete/02-call-chain.md b/docs/snapshot-single-delete/02-call-chain.md new file mode 100644 index 00000000000..54f98cd1c17 --- /dev/null +++ b/docs/snapshot-single-delete/02-call-chain.md @@ -0,0 +1,47 @@ +# 02 — 处理链路总览 + +## 2.1 快照组删除链路 + +``` +APIDeleteVolumeSnapshotGroupMsg + └─ VolumeSnapshotGroupBase.handle() GroupBase.java:163 + └─ handleDelete() GroupBase.java:187 + └─ DeleteVolumeSnapshotGroupInnerMsg (携带 scope/direction) + └─ While 循环每个 VolumeSnapshotVO GroupBase.java:212 + └─ DeleteVolumeSnapshotMsg(scope,direction) + └─ VolumeSnapshotTreeBase + └─ deletion() TreeBase.java:358 + ├─ scope=chain → deleteChainFlows() :487 + └─ scope=single → deleteSingleFlows() :828 + └─ stepDelete() :875 + ├─ 叶节点 → deleteVolumeSnapshotAndSyncVolumeSize + ├─ 单子节点 → resolveDirection → commit() / pull() + └─ 多子节点 → pull() (强制) +``` + +## 2.2 关键透传点 + +`VolumeSnapshotGroupBase.java:221-228`: +```java +DeleteVolumeSnapshotMsg rmsg = new DeleteVolumeSnapshotMsg(); +rmsg.setScope(msg.getScope()); +rmsg.setDirection(msg.getDirection()); +bus.makeTargetServiceIdByResourceUuid(rmsg, VolumeSnapshotConstant.SERVICE_ID, ...); +``` + +## 2.3 关键类索引 + +| 文件 | 作用 | +|---|---| +| `header/.../APIDeleteVolumeSnapshotMsg.java:49` | 单快照 API 入口 | +| `header/.../APIDeleteVolumeSnapshotGroupMsg.java:24` | 快照组 API 入口 | +| `storage/.../group/VolumeSnapshotGroupBase.java:212` | Group → 单快照消息分发 | +| `storage/.../VolumeSnapshotTreeBase.java:473` | scope 分支点 | +| `storage/.../VolumeSnapshotTreeBase.java:875` | stepDelete 递归 | +| `storage/.../VolumeSnapshotTreeBase.java:921` | commit() 流程 | +| `storage/.../VolumeSnapshotTreeBase.java:1097` | pull() 流程 | +| `storage/.../VolumeTree.java:364` | resolveDirection 决策 | +| `storage/.../VolumeTree.java:418/471` | 
updateDatabaseAfter Pull/Commit | +| `plugin/kvm/.../KVMHost.java:1043/1159` | 在线 commit/pull | +| `kvmagent/plugins/vm_plugin.py:3915` | libvirt blockCommit 核心 | +| `zstacklib/utils/linux.py:1389` | qcow2 工具函数 | diff --git a/docs/snapshot-single-delete/03-direction-resolution.md b/docs/snapshot-single-delete/03-direction-resolution.md new file mode 100644 index 00000000000..898a3f99cd4 --- /dev/null +++ b/docs/snapshot-single-delete/03-direction-resolution.md @@ -0,0 +1,101 @@ +# 03 — direction 决策(resolveDirection) + +## 3.1 核心代码 + +**文件**:`storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java:364` + +```java +public DeleteVolumeSnapshotDirection resolveDirection( + String targetSnapshotUuid, // 待删节点(dst, 老节点) + String childSnapshotUuid, // 待删节点的子节点(src, 新节点) + String initialDirection, // 用户传入的 direction + boolean targetSnapshotIsLatest, // 待删节点是否 latest + VmInstanceState vmState) { + + boolean online = + (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) + && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) + && getAliveChainSnapshotUuids().contains(childSnapshotUuid); + + boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + + if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Pull.toString()) + && shouldUseCommitStrategy) { + throw new IllegalArgumentException( + "the snapshot will be deleted by block 'commit', but the direction is 'pull', " + + "change the direction to 'commit' or 'auto'."); + } + + if (initialDirection == null) return DeleteVolumeSnapshotDirection.Commit; + + if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Auto.toString())) { + return shouldUseCommitStrategy + ? 
DeleteVolumeSnapshotDirection.Commit + : DeleteVolumeSnapshotDirection.Pull; + } + + return DeleteVolumeSnapshotDirection.fromString(initialDirection); +} +``` + +## 3.2 决策表 + +| current | targetIsLatest | online | initialDirection | 结果 | +|---|---|---|---|---| +| 任意 | 任意 | 任意 | `null` | **Commit**(兜底) | +| true | false | true | `pull` | **抛 IllegalArgumentException** | +| true | false | true | `auto` | **Commit** | +| 其它组合 | — | — | `auto` | **Pull** | +| 任意 | 任意 | 任意 | `commit` | **Commit** | +| 任意 | 任意 | 任意 | `pull`(合法) | **Pull** | + +## 3.3 关键字段含义 + +| 字段 | 含义 | +|---|---| +| `current` (`VolumeTree.current`,第38行) | 来自 `VolumeSnapshotTreeVO.current`,true 表示快照链尾连着活跃 volume | +| `targetSnapshotIsLatest` | 来自 `VolumeSnapshotVO.latest = 1`,调用方传 `currentRoot.isLatest()` | +| `aliveChain` | volume 沿 backing chain 上溯到根的所有节点,代表"qemu 当前持有的文件链" | + +## 3.4 调用方 + +`VolumeSnapshotTreeBase.java:904`: +```java +DeleteVolumeSnapshotDirection direction = volumeTree.resolveDirection( + currentRoot.getUuid(), // 待删节点 + child.getUuid(), // 子节点 + msg.getDirection(), // 用户传入 + currentRoot.isLatest(), // 来自 DB + vmState); +``` + +## 3.5 `VolumeTree.fromVOs()` 构建过程 + +`VolumeTree.java:260-327`: + +1. 校验:至多一个根(`parentUuid == null`)、至多一个 latest +2. 若 `current && 有 latest`,把 **volume 自身作为虚拟叶节点** 挂到 latest 之后(uuid = volume uuid) +3. HashMap 还原 parent/children +4. 从 volume 虚拟节点向上收集 `aliveChain` + +```java +// 步骤 3:构建树 +Map<String, VolumeSnapshotLeaf> map = new HashMap<>(); +for (VolumeSnapshotInventory inv : invs) { + VolumeSnapshotLeaf leaf = map.computeIfAbsent(inv.getUuid(), k -> new VolumeSnapshotLeaf()); + leaf.inventory = inv; + if (inv.getParentUuid() != null) { + VolumeSnapshotLeaf parent = map.computeIfAbsent(inv.getParentUuid(), k -> new VolumeSnapshotLeaf()); + parent.children.add(leaf); + leaf.parent = parent; + } else { + tree.root = leaf; + } +} + +// 步骤 4:计算 aliveChain +if (tree.current) { + VolumeSnapshotLeaf leaf = tree.getSnapshotLeaf(volumeInv.getUuid()); + tree.aliveChain = leaf != null ? 
leaf.getAncestors() : new ArrayList<>(); +} +``` diff --git a/docs/snapshot-single-delete/04-scope-and-stepDelete.md b/docs/snapshot-single-delete/04-scope-and-stepDelete.md new file mode 100644 index 00000000000..09fc5722479 --- /dev/null +++ b/docs/snapshot-single-delete/04-scope-and-stepDelete.md @@ -0,0 +1,102 @@ +# 04 — scope 分支与 stepDelete 递归 + +## 4.1 scope 分支点 + +**文件**:`VolumeSnapshotTreeBase.java:473` + +```java +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + deleteChainFlows(); // 旧行为:删当前 + 所有后代 +} else { + deleteSingleFlows(); // single/auto:只删当前节点 +} +``` + +注意:`scope=auto` 也走 `deleteSingleFlows()` 分支;只有显式 `chain` 走级联删除。 + +## 4.2 stepDelete 完整代码 + +**文件**:`VolumeSnapshotTreeBase.java:875-918` + +```java +private void stepDelete(Completion completion) { + // 1) 从 DB 拉取整棵树最新状态 + List vos = Q.New(VolumeSnapshotVO.class) + .eq(VolumeSnapshotVO_.treeUuid, currentRoot.getTreeUuid()).list(); + boolean current = Q.New(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, currentRoot.getTreeUuid()) + .select(VolumeSnapshotTreeVO_.current).findValue(); + + // 2) 重建内存树 + VolumeTree volumeTree = VolumeTree.fromVOs(vos, current, VolumeInventory.valueOf(volume)); + List children = + volumeTree.getSnapshotLeaf(currentRoot.getUuid()).getChildren(); + + // 3) 终止条件:无子节点 + if (children.isEmpty()) { + deleteVolumeSnapshotAndSyncVolumeSize(completion); + return; + } + + // 4) 递归 completion + Completion comp = new Completion(completion) { + @Override public void success() { stepDelete(completion); } + @Override public void fail(ErrorCode e) { completion.fail(e); } + }; + + // 5) 找 online 子节点(vm running/paused 且在 aliveChain) + VolumeSnapshotLeaf onlineChild = children.stream() + .filter(c -> volumeTree.isOnline(current, currentRoot.getUuid(), c.getUuid(), vmState)) + .findFirst().orElse(null); + + VolumeSnapshotLeaf child = children.get(0); + + if (children.size() == 1) { + DeleteVolumeSnapshotDirection direction = 
volumeTree.resolveDirection( + currentRoot.getUuid(), child.getUuid(), + msg.getDirection(), currentRoot.isLatest(), vmState); + boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); + if (direction == Commit) commit(child, volumeTree, online, comp); + else pull(child, volumeTree, online, comp); + } else { + // 多子节点(分叉链) + if (onlineChild != null && child.getUuid().equals(onlineChild.getUuid())) { + child = children.get(1); // 优先处理非 online 子节点 + } + boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); + pull(child, volumeTree, online, comp); // 多子节点统一 pull + } +} +``` + +## 4.3 递归特性 + +| 维度 | 说明 | +|---|---| +| 终止条件 | `children.isEmpty()` | +| 每次递归 | 处理一个子节点;commit/pull 后子节点数 -1 | +| 最坏深度 | 子节点总数(**不是链深度**) | +| 多子节点策略 | 强制 pull;优先非 online 子节点 | +| 失败处理 | `comp.fail()` 直接上抛,**已完成的中间步骤不回滚**,依赖存储幂等 | + +## 4.4 多子节点优先非 online 原因 + +online 子节点的 backing file 正在被 qemu 持有写 I/O,修改它有风险; +先处理非 online 子节点,把它们逐个 pull 掉;最后 online 子节点剩一个,落入"单子节点"分支正常处理。 + +## 4.5 特殊短路 + +`VolumeSnapshotTreeBase.java:836`: +```java +if (VolumeSnapshotConstant.STORAGE_SNAPSHOT_TYPE.toString().equals(currentRoot.getType()) + || Objects.equals(currentRoot.getVolumeType(), VolumeType.Memory.toString())) { + deleteVolumeSnapshotAndSyncVolumeSize(completion); + return; +} +``` + +CDP / 存储快照 / 内存快照绕过 commit/pull,直接调用存储删除。 + +## 4.6 VmState 限制 + +`:854` 仅允许 `Running / Paused / Destroyed / Stopped / Destroying`,其它(如 Migrating / Unknown)直接失败。 diff --git a/docs/snapshot-single-delete/05-commit-db-swap.md b/docs/snapshot-single-delete/05-commit-db-swap.md new file mode 100644 index 00000000000..56240534b1c --- /dev/null +++ b/docs/snapshot-single-delete/05-commit-db-swap.md @@ -0,0 +1,105 @@ +# 05 — Commit DB 翻转(最关键) + +## 5.1 物理时序图 + +``` +Commit 前: + dst.qcow2 ←backing— src.qcow2 ←backing— grandchild.qcow2 + 父 子 孙 + +blockCommit(top=src, base=dst) 完成后: + dst.qcow2 内容 = 原 src 内容(src 的 delta 已 flush 进 dst 文件) + src.qcow2 已被 
DELETE(VIR_DOMAIN_BLOCK_COMMIT_DELETE)或将被回收 + +期望逻辑: + src(保留) ← grandchild ← 但 uuid 不变,所以用 path 互换实现: + +DB 互换: + dst.installPath ← src 旧 path (dst 记录改"指"待回收的旧 src 文件) + src.installPath ← dst 旧 path (src 记录改"指"已合并的 dst 文件) + src.parentUuid ← dst.parentUuid (跨过 dst) + src.distance -= 1 +``` + +## 5.2 为什么互换 path? + +- `blockCommit` 落地的物理文件是 dst 的路径,但数据是 src 的 +- 用户视角"保留的是子节点(src)" +- 互换后:src 这条 DB 记录指向已合并文件(旧 dst 路径),dst 这条 DB 记录指向旧 src 文件路径(即将被 `deleteVolumeSnapshotAndSyncVolumeSize` 删除) +- `cleanupAfterDeleteSingleSnapshot` 接下来按 `currentRoot.uuid`(dst 的 uuid)逻辑层删除,但物理文件路径已是旧 src 文件,被回收 + +## 5.3 完整 SQL 操作(`VolumeTree.java:471-545`) + +```java +new SQLBatch() { + @Override + protected void scripts() { + // 1) src 的所有后代 distance -1(src 自身在第 5 步单独更新) + List<String> descendantsUuid = srcLeaf.getDescendants().stream() + .map(...uuid) + .filter(u -> !u.equals(srcLeaf.uuid) && !u.equals(volume.uuid)) + .toList(); + List<VolumeSnapshotVO> vos = Q.New(VolumeSnapshotVO.class) + .in(VolumeSnapshotVO_.uuid, descendantsUuid).list(); + vos.forEach(vo -> vo.setDistance(vo.getDistance() - 1)); + + // 2) dst 是树根 → 新建 VolumeSnapshotTreeVO + VolumeSnapshotTreeVO newTree = null; + if (dstSnapshotInv.getParentUuid() == null) { + newTree = new VolumeSnapshotTreeVO(); + newTree.setUuid(Platform.getUuid()); + newTree.setVolumeUuid(volume.getUuid()); + newTree.setStatus(VolumeSnapshotTreeStatus.Completed); + newTree.setCurrent(descendantsUuid.contains(volume.getUuid())); + if (getAliveChainSnapshotUuids().contains(srcSnapshotInv.getUuid())) { + newTree.setCurrent(true); + } + dbf.persist(newTree); + } + if (!vos.isEmpty() && newTree != null) { + VolumeSnapshotTreeVO finalNewTree = newTree; + vos.forEach(vo -> vo.setTreeUuid(finalNewTree.getUuid())); + } + + // 3) dst 互换 installPath, size + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, dstSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.primaryStorageInstallPath, srcSnapshotInv.getPrimaryStorageInstallPath()) + .set(VolumeSnapshotVO_.size, srcSnapshotInv.getSize()) + .update(); + + // 4) GroupRef 
同步 installPath + if (dstSnapshotInv.getGroupUuid() != null) { + sql(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, dstSnapshotInv.getGroupUuid()) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, dstSnapshotInv.getUuid()) + .set(VolumeSnapshotGroupRefVO_.volumeSnapshotInstallPath, + srcSnapshotInv.getPrimaryStorageInstallPath()) + .update(); + } + + // 5) src 互换 installPath,parentUuid 跨过 dst,distance -1 + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.primaryStorageInstallPath, dstSnapshotInv.getPrimaryStorageInstallPath()) + .set(VolumeSnapshotVO_.size, newInstallPathSize) + .set(VolumeSnapshotVO_.distance, srcSnapshotInv.getDistance() - 1) + .set(VolumeSnapshotVO_.parentUuid, dstSnapshotInv.getParentUuid()) + .set(VolumeSnapshotVO_.treeUuid, + newTree != null ? newTree.getUuid() : srcSnapshotInv.getTreeUuid()) + .update(); + + dbf.updateCollection(vos); + } +}.execute(); // 单事务原子提交 +``` + +## 5.4 commit() 主流程概览 + +`VolumeSnapshotTreeBase.java:921-1094`: + +1. `AllocatePrimaryStorageSpaceMsg` —— 预分配空间 +2. 分支: + - 在线 → `CommitVolumeSnapshotOnHypervisorMsg` → KVMHost → libvirt blockCommit + - 离线 → `CommitVolumeSnapshotOnPrimaryStorageMsg` → 存储后端 → qemu-img commit +3. 透传 `srcChildrenInstallPathInDb`(兄弟节点列表,见 11 节) +4. `updateDatabaseAfterCommit` —— DB 翻转 +5. 
失败 rollback:通过 FlowChain 释放已分配存储空间 diff --git a/docs/snapshot-single-delete/06-pull-db-rewrite.md b/docs/snapshot-single-delete/06-pull-db-rewrite.md new file mode 100644 index 00000000000..027e6f8bd04 --- /dev/null +++ b/docs/snapshot-single-delete/06-pull-db-rewrite.md @@ -0,0 +1,119 @@ +# 06 — Pull / pullToVolume DB 改写 + +## 6.1 Pull 物理语义 + +``` +Pull 前: + grandparent ← src(待删) ← dst(子) ← descendants + +qemu-img rebase(dst → grandparent) 完成后: + dst.qcow2 文件中数据 = 原 dst delta + 原 src delta(合并) + dst 的 backing file = grandparent + src.qcow2 待删除 + +DB 改写: + dst.parentUuid ← src.parentUuid (跨过 src) + dst.distance -= 1 + dst.size = 合并后的实际大小 + 所有后代 distance -1 +``` + +## 6.2 `updateDatabaseAfterPull()` — `VolumeTree.java:418-469` + +```java +public void updateDatabaseAfterPull(VolumeSnapshotInventory srcSnapshotInv, + VolumeSnapshotLeaf dstSnapshotLeaf, long newInstallPathSize) { + + VolumeSnapshotInventory dstSnapshotInv = dstSnapshotLeaf.getInventory(); + + new SQLBatch() { + @Override + protected void scripts() { + // 1) 收集 dst 及所有后代(不含 volume 虚拟节点) + List descendantsUuid = dstSnapshotLeaf.getDescendants().stream() + .map(...uuid) + .filter(u -> !u.equals(volume.uuid)) + .toList(); + List vos = q(VolumeSnapshotVO.class) + .in(VolumeSnapshotVO_.uuid, descendantsUuid).list(); + + // 2) distance -1;dst 节点特殊处理 + vos.forEach(vo -> { + vo.setDistance(vo.getDistance() - 1); + if (vo.getUuid().equals(dstSnapshotInv.getUuid())) { + vo.setParentUuid(srcSnapshotInv.getParentUuid()); + vo.setSize(newInstallPathSize); + } + }); + + // 3) src 是树根 → 新建 VolumeSnapshotTreeVO,后代迁移 + VolumeSnapshotTreeVO newTree = null; + if (srcSnapshotInv.getParentUuid() == null) { + newTree = new VolumeSnapshotTreeVO(); + newTree.setCurrent(descendantsUuid.contains(volume.getUuid())); + newTree.setVolumeUuid(volume.getUuid()); + newTree.setUuid(Platform.getUuid()); + newTree.setStatus(VolumeSnapshotTreeStatus.Completed); + if (getAliveChainSnapshotUuids().contains(dstSnapshotInv.getUuid())) { + 
newTree.setCurrent(true); + } + dbf.persist(newTree); + VolumeSnapshotTreeVO finalNewTree = newTree; + vos.forEach(vo -> vo.setTreeUuid(finalNewTree.getUuid())); + } + + dbf.updateCollection(vos); + + // 4) 新树建好且 dst 就是 volume 自身(pull-to-volume 边界)→ 原树标记非 current + if (newTree != null && dstSnapshotInv.getUuid().equals(volume.getUuid()) + && q(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()).count() == 1) { + sql(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()) + .set(VolumeSnapshotTreeVO_.current, false).update(); + } + } + }.execute(); +} +``` + +## 6.3 `updateDatabaseAfterPullToVolume()` — `VolumeTree.java:396-416` + +特殊场景:dst 是 volume 自身(即 latest 快照被合并进活跃 volume 文件)。 + +```java +public void updateDatabaseAfterPullToVolume(VolumeSnapshotInventory srcSnapshotInv) { + new SQLBatch() { + @Override + protected void scripts() { + // 1) src(latest)标记为非 latest + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.latest, false).update(); + + // 2) src 的父节点成为新的 latest + if (srcSnapshotInv.getParentUuid() != null) { + sql(VolumeSnapshotVO.class) + .eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getParentUuid()) + .set(VolumeSnapshotVO_.latest, true).update(); + } + + // 3) src 是树根 → 整棵树 current=false(链空了) + if (srcSnapshotInv.getParentUuid() == null) { + sql(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()) + .set(VolumeSnapshotTreeVO_.current, false).update(); + } + } + }.execute(); +} +``` + +## 6.4 pull() 主流程概览(`VolumeSnapshotTreeBase.java:1097-1304`) + +1. `GetVolumeBackingChainFromPrimaryStorageMsg` —— 取祖父路径 +2. `AllocatePrimaryStorageSpaceMsg` +3. 分支: + - 在线 → `PullVolumeSnapshotOnHypervisorMsg` → libvirt block stream + - 离线 → `PullVolumeSnapshotOnPrimaryStorageMsg` → `qemu-img rebase` +4. `updateDatabaseAfterPull` / `updateDatabaseAfterPullToVolume` +5. 
失败 rollback:释放分配空间 diff --git a/docs/snapshot-single-delete/07-group-passthrough.md b/docs/snapshot-single-delete/07-group-passthrough.md new file mode 100644 index 00000000000..e052063d211 --- /dev/null +++ b/docs/snapshot-single-delete/07-group-passthrough.md @@ -0,0 +1,98 @@ +# 07 — Group 透传与并发、失败聚合 + +## 7.1 入口排队 + +**`VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg)`** — `:163` + +```java +private void handle(APIDeleteVolumeSnapshotGroupMsg msg) { + thdf.chainSubmit(new ChainTask(msg) { + @Override + public String getSyncSignature() { return id; } // "volumeSnapshotGroup-<uuid>" + @Override + public void run(SyncTaskChain chain) { + handleDelete(msg, new NoErrorCompletion(chain) { + @Override public void done() { chain.next(); } + }); + } + }); +} +``` + +按 group uuid 串行排队,防止同一 group 并发删除。 + +## 7.2 API → Inner 转发 + +**`handleDelete`** — `:187-210` + +```java +DeleteVolumeSnapshotGroupInnerMsg imsg = new DeleteVolumeSnapshotGroupInnerMsg(); +imsg.setUuid(msg.getUuid()); +imsg.setDeletionMode(msg.getDeletionMode()); +imsg.setScope(msg.getScope()); // ← 透传 +imsg.setDirection(msg.getDirection()); // ← 透传 + +overlaySend(imsg, new CloudBusCallBack(msg) { ... 
}); +// overlaySend:包成 VolumeSnapshotGroupOverlayMsg,路由到 VmInstance mailbox +// 保证"快照组删除"与"VM 状态变更"互斥 +``` + +## 7.3 真正的并行循环 + +**`handle(DeleteVolumeSnapshotGroupInnerMsg)`** — `:212-254` + +```java +SimpleFlowChain.of("delete-volume-snapshot-group") + .then("delete-volume-snapshots", trigger -> + new While<>(snapshots).step((snapshot, compl) -> { + DeleteVolumeSnapshotMsg rmsg = new DeleteVolumeSnapshotMsg(); + rmsg.setSnapshotUuid(snapshot.getUuid()); + rmsg.setVolumeUuid(snapshot.getVolumeUuid()); + rmsg.setTreeUuid(snapshot.getTreeUuid()); + rmsg.setDeletionMode(msg.getDeletionMode()); + rmsg.setScope(msg.getScope()); // ← 逐快照透传 + rmsg.setDirection(msg.getDirection()); // ← 逐快照透传 + + bus.makeTargetServiceIdByResourceUuid(rmsg, VolumeSnapshotConstant.SERVICE_ID, + getResourceIdToRouteMsg(snapshot)); + + bus.send(rmsg, new CloudBusCallBack(compl) { + @Override + public void run(MessageReply r) { + reply.addResult(new DeleteSnapshotGroupResult( + rmsg.getSnapshotUuid(), + rmsg.getVolumeUuid(), + r.getError())); + compl.done(); // 不短路 + } + }); + }, 5) // ← 并发度 5 + .run(new WhileDoneCompletion(msg) { + @Override + public void done(ErrorCodeList errs) { + trigger.next(); // 错误聚合在 reply.results + } + })) + .then("delete-vm-host-backup-files", trigger -> { + vmHostFileManager.cleanVmHostBackupFile(self.getUuid()); + trigger.next(); + }) + .done(() -> bus.reply(msg, reply)) + .error(errorCode -> { + reply.setError(errorCode); + bus.reply(msg, reply); + }) + .start(); +``` + +## 7.4 关键设计点 + +| 维度 | 说明 | +|---|---| +| 按卷分组 | `getEffectiveSnapshots()` 过滤出当前 VM 各卷的快照 | +| 并发度 | **5**(`While.step(..., 5)`) | +| 失败处理 | 每条独立 `compl.done()`,**不短路** | +| 错误聚合 | `reply.addResult(snapshotUuid, volumeUuid, errorCode)` | +| 整体回滚 | **无**;部分成功保留,返回结果列表 | +| 前置检查 | 删除流程**不**检查 `VolumeSnapshotGroupAvailability` | +| 入口唯一性 | `APIDeleteVolumeSnapshotGroupMsg` 与 `DeleteVolumeSnapshotGroupInnerMsg` 都只在此类处理 | diff --git a/docs/snapshot-single-delete/08-hypervisor-online-commit.md 
b/docs/snapshot-single-delete/08-hypervisor-online-commit.md new file mode 100644 index 00000000000..16e1ff0e9ee --- /dev/null +++ b/docs/snapshot-single-delete/08-hypervisor-online-commit.md @@ -0,0 +1,128 @@ +# 08 — Hypervisor 在线 commit(libvirt blockCommit + pivot) + +## 8.1 入口 + +**HTTP**:`POST /vm/volume/blockcommit`(`KVMConstant.KVM_BLOCK_COMMIT_VOLUME_PATH`) + +**Python**:`kvmagent/kvmagent/plugins/vm_plugin.py:9845` + +## 8.2 `do_block_commit()` 完整流程 + +`vm_plugin.py:3915-3983`: + +```python +def do_block_commit(self, task_spec, volume): + def do_block_commit_disk(task_spec, disk_name, top, base, active_commit): + def wait_job(_): + return not self._wait_for_block_job(disk_name, abort_on_error=True) + + def check_overlay_file(path): + if not active_commit: + return True + return self._check_target_disk_existing_by_path(path, True) + + def abort_block_commit_job(_): + flag = libvirt.VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC + if active_commit: + flag = libvirt.VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT + try: + if not self.domain.blockJobInfo(disk_name, 0): + return True + self.domain.blockJobAbort(disk_name, flag) + return True + except Exception as e: + logger.warn("pivot active layer failed, %s" % e) + return False + + # flags 组合 + if active_commit: + flags = libvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE + flags |= libvirt.VIR_DOMAIN_BLOCK_COMMIT_ACTIVE + else: + flags = libvirt.VIR_DOMAIN_BLOCK_COMMIT_DELETE + + # 发起 blockCommit + self.domain.blockCommit(disk_name, base, top, 0, flags) + touchQmpSocketWhenExists(task_spec.vmUuid) + + # 等数据同步 + if not linux.wait_callback_success(wait_job, timeout=d.get_remaining_timeout(), + ignore_exception_in_callback=True): + if not check_overlay_file(base): + raise kvmagent.KvmError('block commit failed') + + # pivot 或普通结束 + if not linux.wait_callback_success(abort_block_commit_job, d.get_remaining_timeout(), + ignore_exception_in_callback=True): + raise kvmagent.KvmError('block commit abort failed') + + # 确认 overlay(top)消失 + if not 
linux.wait_callback_success(check_overlay_file, base, d.get_remaining_timeout(), + ignore_exception_in_callback=True): + raise kvmagent.KvmError('block commit succeeded, but overlay file is not cleared') + + return base + + target_disk, disk_name = self._get_target_disk(volume) + top = get_volume_actual_installpath(task_spec.top) + base = get_volume_actual_installpath(task_spec.base) + install_path = VmPlugin.get_source_file_by_disk(target_disk) + active_commit = (top == install_path) # ← 关键判定 + + with BlockCommitDaemon(task_spec, self, disk_name, top=top, base=base, + active_commit=active_commit) as d: + return do_block_commit_disk(task_spec, disk_name, task_spec.top, + task_spec.base, active_commit) +``` + +## 8.3 libvirt flags 矩阵 + +| Flag | 作用 | +|---|---| +| `VIR_DOMAIN_BLOCK_COMMIT_DELETE` | 完成后自动删除 top 文件(非 active commit) | +| `VIR_DOMAIN_BLOCK_COMMIT_ACTIVE` | top 是活跃层,两阶段模式(需 pivot) | +| `VIR_DOMAIN_BLOCK_COMMIT_RELATIVE` | backing 用相对路径 | +| `VIR_DOMAIN_BLOCK_COMMIT_SHALLOW` | 只提交一层(**本代码未使用**) | + +## 8.4 Active commit 双阶段 pivot 流程 + +``` +Phase 1(数据同步): + blockCommit() → qemu 把 top delta 写进 base + VM 持续写 top,qemu 增量同步 + 轮询 blockJobInfo 直到 ready + +Phase 2(pivot): + blockJobAbort(VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) + → qemu 原子切换活跃层 top → base + → top 变游离,VM 后续写直接落 base + +最后 check_overlay_file 确认 pivot 成功 +``` + +**为什么需要 pivot**:VM 正在运行,top 文件实时被写;不能直接删 top,必须先让 qemu 把活跃层切到 base。 + +## 8.5 关键辅助函数 + +`_get_snapshot_size()` — `vm_plugin.py:8946`: +```python +@staticmethod +def _get_snapshot_size(install_path): + size = linux.get_local_file_disk_usage(install_path) # du -sb(actual size) + if size is None or size == 0: + if install_path.startswith("/dev/"): + size = int(lvm.get_lv_size(install_path)) # LV 场景 + else: + size = linux.qcow2_virtualsize(install_path) # 兜底 + return size +``` + +返回 **actual size**(实际占用),SharedBlock 走 LV 大小。 + +## 8.6 active_commit 判定 + +```python +active_commit = (top == install_path) +``` + +`install_path` 是 libvirt domain XML 中 
disk 当前的 source file,等于活跃层路径。当 `top` 等于活跃层时即 active commit。 diff --git a/docs/snapshot-single-delete/09-agent-qemu-img.md b/docs/snapshot-single-delete/09-agent-qemu-img.md new file mode 100644 index 00000000000..0d2f4398baf --- /dev/null +++ b/docs/snapshot-single-delete/09-agent-qemu-img.md @@ -0,0 +1,100 @@ +# 09 — agent 端 qemu-img 三种命令对比 + +## 9.1 三个函数定义 + +**文件**:`zstacklib/zstacklib/utils/linux.py:1389-1432` + +```python +# 1389:qcow2_commit +def qcow2_commit(top, base): + shell.call('%s -f qcow2 -b %s %s' % (qemu_img.subcmd('commit'), base, top)) + # qemu-img commit -f qcow2 -b + # 语义:top delta → base,base 内容更新,top 不被自动删 + +# 1395:qcow2_rebase(安全 rebase) +def qcow2_rebase(backing_file, target): + if backing_file: + fmt = get_img_fmt(backing_file) + backing_option = '-F %s -b "%s"' % (fmt, backing_file) + else: + backing_option = '-b "%s"' % backing_file + + # virtual size 一致性自动扩容 + top_virtual_size = int(qcow2_get_virtual_size(target)) + backing_chain = qcow2_get_backing_chain(target) + for idx, bf in enumerate(backing_chain): + if idx == len(backing_chain)-1 and get_img_fmt(bf) != 'qcow2': + break + bf_virtual_size = int(qcow2_get_virtual_size(bf)) + if bf_virtual_size < top_virtual_size: + qemu_img_resize(bf, top_virtual_size) + if bf == backing_file: + break + + with TempAccessible(target): + shell.call('%s -f qcow2 %s %s' % (qemu_img.subcmd('rebase'), backing_option, target)) + # qemu-img rebase -f qcow2 -F -b "" + +# 1416:qcow2_rebase_no_check(unsafe rebase) +def qcow2_rebase_no_check(backing_file, target, backing_fmt=None): + fmt = backing_fmt if backing_fmt else get_img_fmt(backing_file) + with TempAccessible(target): + shell.call('%s -F %s -u -f qcow2 -b "%s" %s' % ( + qemu_img.subcmd('rebase'), fmt, backing_file, target)) + # qemu-img rebase -F -u -f qcow2 -b "" +``` + +## 9.2 精确差异对比 + +| 函数 | 命令模板 | -u | 读旧 backing | 重写 delta | 用途 | +|---|---|---|---|---|---| +| `qcow2_commit` | `qemu-img commit -f qcow2 -b ` | — | 读 top | 否(合并) | top delta 合入 base 
| +| `qcow2_rebase` | `qemu-img rebase -f qcow2 -F FMT -b BACKING TARGET` | 无 | **读旧/新 backing** | **是** | 安全换 backing | +| `qcow2_rebase_no_check` | `qemu-img rebase -F FMT -u -f qcow2 -b BACKING TARGET` | **有** | 否 | 否 | 只改头部指针 | + +## 9.3 Unsafe rebase 数据语义 + +`-u`(unsafe): +- **不读取**旧 / 新 backing file 数据 +- **直接修改** target 文件 QCOW2 header 中的 `backing_file` 字段 +- 前提:在 target 自身未分配、需要回落到 backing 读取的簇上,新旧 backing **数据一致**(否则读出错误数据) + +在 single 删除场景,commit 完成后 base 的内容 = 原 src 内容,所以兄弟节点把 backing 从 src 改到 base 是**安全的**。 + +## 9.4 安全 rebase 的自动扩容 + +`qcow2_rebase` 遍历 backing chain,发现 backing 的 virtual size 比 target 小时,调用 `qemu_img_resize` 自动扩容,防止 rebase 后读越界。 + +## 9.5 SharedBlock LV 扩容(pull 时) + +**文件**:`shared_block_plugin.py:1247-1285` + +```python +total_required_size = self.get_total_required_size(dst_abs_path) +current_size = int(lvm.get_lv_size(dst_abs_path)) +if not cmd.fullRebase: + if current_size < total_required_size: + lvm.extend_lv_from_cmd(dst_abs_path, total_required_size, cmd, + extend_thin_by_specified_size=True) + with lvm.RecursiveOperateLv(src_abs_path, shared=True): + linux.qcow2_rebase(src_abs_path, dst_abs_path) +``` + +```python +# get_total_required_size — shared_block_plugin.py:967 +@staticmethod +def get_total_required_size(abs_path): + virtual_size = linux.qcow2_virtualsize(abs_path) + total_size = -1 + if linux.get_img_fmt(abs_path) == "qcow2": + try: + total_size = linux.qcow2_measure_required_size(abs_path) + # qemu-img measure:预测完整合并后的最小大小 + except Exception as e: + logger.warn(...) 
+ if total_size > virtual_size or total_size == -1: + total_size = virtual_size + return total_size +``` + +**为什么 pull 需要扩 LV**:pull 把 src 数据合并进 dst,dst 物理占用上升;如果当前 LV 容量不够,提前扩容避免写入失败。 diff --git a/docs/snapshot-single-delete/10-storage-backend-matrix.md b/docs/snapshot-single-delete/10-storage-backend-matrix.md new file mode 100644 index 00000000000..a4789d22fa0 --- /dev/null +++ b/docs/snapshot-single-delete/10-storage-backend-matrix.md @@ -0,0 +1,108 @@ +# 10 — 存储后端支持矩阵 + +## 10.1 支持情况汇总 + +| 存储类型 | scope=single | 在线 commit | 离线 commit | pull | 备注 | +|---|---|---|---|---|---| +| **LocalStorage** | ✅ | KVMHost | `/localstorage/snapshot/offlinecommit` | `/localstorage/snapshot/offlinemerge` | qcow2 文件 | +| **NFS** | ✅ | KVMHost | `/nfsprimarystorage/offlinesnapshotcommit` | `/nfsprimarystorage/offlinesnapshotmerge` | qcow2 文件 | +| **SMP** | ✅ | KVMHost | `OFFLINE_COMMIT_SNAPSHOT_PATH` | `OFFLINE_MERGE_SNAPSHOT_PATH` | 共享挂载点 | +| **SharedBlock** | ✅ | KVMHost | 同 + 扩 LV | 同 + 扩 LV | LVM + qcow2 | +| **Ceph (RBD)** | ⚠️ 受限 | ❌ | ❌ | ❌ | RBD snapshot 不支持合并 | + +## 10.2 LocalStorage + +**Java**:`LocalStorageKvmBackend.java:3825/3846` + +```java +// 离线 commit +postRequest("/localstorage/snapshot/offlinecommit", cmd); +// 离线 pull +postRequest("/localstorage/snapshot/offlinemerge", cmd); +``` + +**Python**:`kvmagent/plugins/localstorage.py:835/859` + +```python +# offline_commit_snapshot +if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + +if cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +``` + +```python +# offline_merge_snapshot +src_path = cmd.srcPath if not cmd.fullRebase else "" +if linux.qcow2_get_backing_file(cmd.destPath) == src_path: + return # 幂等 +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath, cmd.destPath) +else: + tmp = .../%s.qcow2 % 
uuid + qcow2.create_template_with_task_daemon(cmd.destPath, tmp, task_spec=cmd) + shell.call("mv %s %s" % (tmp, cmd.destPath)) +``` + +## 10.3 NFS + +**Java**:`NfsPrimaryStorageKVMBackend.java:1996/2031` + +**Python**:`nfs_primarystorage_plugin.py:601/625` + +逻辑与 LocalStorage 几乎一致(同样用 qcow2_commit / qcow2_rebase)。 + +## 10.4 SMP(SharedMountPoint) + +**Java**:`smp/KvmBackend.java:2443/2466` + +**Python**:`shared_mountpoint_plugin.py:483/506` + +逻辑同 NFS。 + +## 10.5 SharedBlock + +**Python**:`shared_block_plugin.py:1247/1285` + +```python +# offline_merge:扩 LV + 激活 LV + rebase +total_required_size = self.get_total_required_size(dst_abs_path) +if current_size < total_required_size: + lvm.extend_lv_from_cmd(dst, total_required_size, cmd, extend_thin_by_specified_size=True) +with lvm.RecursiveOperateLv(src_abs_path, shared=True): + linux.qcow2_rebase(src_abs_path, dst_abs_path) +``` + +```python +# offline_commit:commit 后清理 base 元数据 +with lvm.RecursiveOperateLv(top, shared=True): + if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + if cmd.topChildrenInstallPathInDb: + for c in cmd.topChildrenInstallPathInDb: + with lvm.RecursiveOperateLv(c, shared=True): + if linux.qcow2_get_backing_file(c) != base: + linux.qcow2_rebase_no_check(base, c) +lvm.delete_lv_meta(base) +``` + +## 10.6 Ceph + +`CephPrimaryStorageBase` **未实现** `CommitVolumeSnapshotOnPrimaryStorageMsg` / `PullVolumeSnapshotOnPrimaryStorageMsg`。 + +例外:`CephPrimaryStorageBase.java:2984` 临时快照删除时硬编码 `scope=Single, direction=Commit`,但仅用于撤销临时快照场景。 + +普通 RBD 快照:`cephdriver.py:87` 的 `delete_snapshot` 直接调 `rbd snap rm`,**不支持中间节点合并**。 + +**结论**:Ceph 普通快照不支持 `scope=single`。 + +## 10.7 在线场景统一走 KVMHost + +`plugin/kvm/.../KVMHost.java:1043/1159`: +- `commitVolumeSnapshot` → `POST /vm/volume/blockcommit` +- `pullVolumeSnapshot` → `POST /vm/volume/blockpull` + +所有支持的存储类型在 VM 在线时都走 libvirt blockCommit / blockPull,由 KVMHost 统一处理。 diff --git 
a/docs/snapshot-single-delete/11-sibling-rebase.md b/docs/snapshot-single-delete/11-sibling-rebase.md new file mode 100644 index 00000000000..e3269317bba --- /dev/null +++ b/docs/snapshot-single-delete/11-sibling-rebase.md @@ -0,0 +1,113 @@ +# 11 — 兄弟节点 rebase(分叉链关键) + +## 11.1 问题背景 + +当 commit 完成后,待删节点 X 还可能有除 src 外的其他子节点(兄弟节点),它们的 backing file 仍然指向 X 的旧物理路径,必须重新指向 base(dst)才能继续访问。 + +``` +分叉链示例: + X (待删) + / \ + src sibling1 + | | + descend ... + +commit(src → X) 完成后: + X 物理文件内容已变成 src 数据 + sibling1 的 backing 仍指向 X 旧路径 → 必须 rebase 到 base +``` + +## 11.2 Java 侧收集兄弟节点路径 + +**文件**:`VolumeSnapshotTreeBase.java:1012-1024` + +```java +// commit flow 内部 +List childrenInstallPath = child.getChildren().stream() + .map(c -> c.getInventory().getPrimaryStorageInstallPath()) + .collect(Collectors.toList()); +// child = src 节点 +// child.getChildren() = src 的所有子节点 + +// 在线消息 +CommitVolumeSnapshotOnHypervisorMsg cmsg = new CommitVolumeSnapshotOnHypervisorMsg(); +cmsg.setSrcChildrenInstallPathInDb(childrenInstallPath); + +// 离线消息(1044 行) +cmsg.setSrcChildrenInstallPathInDb(childrenInstallPath); +``` + +**注意**:变量名 `childrenInstallPath` 表面上像 src 的子节点,但实际语义是"待删节点 X(top)的子节点除 src 之外的兄弟节点"。代码命名上稍混乱,但 `topChildrenInstallPathInDb` 在 agent 侧含义明确:top(待删节点)所有子节点 → 它们的 backing 都需要 rebase 到 base。 + +KVMHost 透传:`KVMHost.java:1052` +```java +cmd.setTopChildrenInstallPathInDb(msg.getSrcChildrenInstallPathInDb()); +``` + +## 11.3 agent 侧循环 unsafe rebase + +### 在线 — `vm_plugin.py:9857` + +```python +vm.do_block_commit(cmd, cmd.volume) +if cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +rsp.size = VmPlugin._get_snapshot_size(cmd.base) +``` + +### 离线 LocalStorage — `localstorage.py:864-869` + +```python +if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + +if 
cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +``` + +### 离线 SharedBlock — `shared_block_plugin.py:1299-1308` + +```python +with lvm.RecursiveOperateLv(top, shared=True): + if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + if cmd.topChildrenInstallPathInDb: + for c in cmd.topChildrenInstallPathInDb: + with lvm.RecursiveOperateLv(c, shared=True): + if linux.qcow2_get_backing_file(c) != base: + linux.qcow2_rebase_no_check(base, c) +``` + +## 11.4 兄弟节点 parentUuid 何时更新? + +**关键事实**:兄弟节点的 `parentUuid` **不在** `updateDatabaseAfterCommit` 里更新。 + +`updateDatabaseAfterCommit` 只更新: +- dst 的 path(互换) +- src 的 path、size、distance、parentUuid +- src 的所有后代的 distance + +兄弟节点(src 的兄弟,即 X 的其他子节点)的 DB `parentUuid` 仍指向 X。 + +**后续递归处理**: +- 下次 `stepDelete` 重新从 DB 构建 `VolumeTree` +- 此时 X 节点对应的物理文件路径已经是 src 数据(互换后) +- 但 DB 中兄弟节点仍挂在 X 下 → 物理 vs DB 不一致 + +这是 `VolumeTree.java:258` 注释中标记的 TODO: + +```java +// TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// it is necessary to consider the dependency relationships of all snapshot nodes in the +// current snapshot tree within the VolumeSnapshotReferenceVO. 
+``` + +## 11.5 风险 + +- 分叉链中删中间节点时,兄弟节点物理 backing 与 DB parentUuid 暂时不一致 +- 若此时发生异常重启或并发操作,可能导致快照树状态混乱 +- 当前依赖"删除 X 后兄弟节点自然变成 X.parent 的子节点"这一物理事实,DB 修复留待后续操作 diff --git a/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md b/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md new file mode 100644 index 00000000000..5a17dfc955d --- /dev/null +++ b/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md @@ -0,0 +1,96 @@ +# 12 — fullRebase 与残留文件清理 + +## 12.1 fullRebase 触发 + +**触发条件**:src 快照是树根(`srcSnapshotInv.getParentUuid() == null`),即 src 没有 backing file。 +此时 pull 操作不能简单 rebase(没有新 backing 可指),必须把 dst 文件 flatten 成独立 qcow2。 + +Java 侧构造 `OfflineMergeSnapshotCmd` 时设置 `fullRebase = true`。 + +## 12.2 agent 侧实现(`localstorage.py:835-857`) + +```python +src_path = cmd.srcPath if not cmd.fullRebase else "" + +if linux.qcow2_get_backing_file(cmd.destPath) == src_path: + return # 幂等 + +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath, cmd.destPath) +else: + tmp = os.path.join(os.path.dirname(cmd.destPath), + '%s.qcow2' % uuidhelper.uuid()) + qcow2.create_template_with_task_daemon(cmd.destPath, tmp, task_spec=cmd) + shell.call("mv %s %s" % (tmp, cmd.destPath)) +``` + +## 12.3 `create_template_with_task_daemon` + +**文件**:`zstacklib/zstacklib/utils/qcow2.py:10` + +```python +def create_template_with_task_daemon(src, dst, task_spec, dst_format='qcow2', opts=None, **daemonargs): + t_shell = traceable_shell.get_shell(task_spec) + p_file = tempfile.mktemp() + + class ConvertTaskDaemon(plugin.TaskDaemon): + def _cancel(self): + traceable_shell.cancel_job_by_api(self.api_id) + linux.rm_file_force(self.dst_path) + + def _get_percent(self): + p = linux.tail_1(p_file, split=b"\r") + ... 
+ + with ConvertTaskDaemon(dst, task_spec): + linux.create_template(src, dst, dst_format=dst_format, shell=t_shell, + progress_output=p_file, opts=opts) + # qemu-img convert -f qcow2 -O qcow2 -p +``` + +特性: +- 遍历整条 backing chain,输出独立 qcow2 +- 支持进度上报(`-p`) +- 流式转换,无内存限制 +- 通过 `TaskDaemon` 支持取消(取消时删临时文件) + +## 12.4 mv 替换的并发安全 + +- **文件系统场景**:`mv` 同 FS 内是 `rename(2)` 原子操作 +- **LVM 场景**:`lvm.lv_rename` 元数据级原子 +- **读取并发**:rename 前后读到的是旧/新文件,无半态损坏 +- 上层依赖 `chainSubmit` 串行化同一树的操作,避免读到中间状态 + +## 12.5 残留文件清理责任 + +| 场景 | 清理者 | +|---|---| +| 在线非 active commit | `VIR_DOMAIN_BLOCK_COMMIT_DELETE` 自动删 top | +| 在线 active commit | pivot 后 top 游离,由 `deleteVolumeSnapshotAndSyncVolumeSize` 清理 | +| 离线 commit/pull | `deleteVolumeSnapshotAndSyncVolumeSize` 下发 `VolumeSnapshotPrimaryStorageDeletionMsg` | +| SharedBlock commit | `lvm.delete_lv_meta(base)` 删元数据;LV 真删走 `delete_bits` → `lvm.delete_lv` | + +## 12.6 物理删除入口(`VolumeSnapshotTreeBase.java:1307`) + +```java +private void deleteVolumeSnapshotAndSyncVolumeSize(Completion completion) { + VolumeSnapshotPrimaryStorageDeletionMsg pmsg = new VolumeSnapshotPrimaryStorageDeletionMsg(); + pmsg.setUuid(currentRoot.getUuid()); + bus.makeTargetServiceIdByResourceUuid(pmsg, VolumeSnapshotConstant.SERVICE_ID, + currentRoot.getPrimaryStorageUuid()); + bus.send(pmsg, ...); +} +``` + +各存储后端处理 `VolumeSnapshotPrimaryStorageDeletionMsg`,调用各自的 `delete_bits` HTTP 端点。 + +## 12.7 失败补偿 TODO + +`VolumeSnapshotTreeBase.java:1325`: + +```java +//TODO add gc +logger.warn(String.format("failed to delete snapshot[uuid:%s] on primary storage[uuid:%s], ...")); +``` + +物理文件删除失败仅 warn 日志,**无 GC 补偿**,存在文件/LV 泄露风险。 diff --git a/docs/snapshot-single-delete/13-premium-and-cdp.md b/docs/snapshot-single-delete/13-premium-and-cdp.md new file mode 100644 index 00000000000..4bee8388f84 --- /dev/null +++ b/docs/snapshot-single-delete/13-premium-and-cdp.md @@ -0,0 +1,53 @@ +# 13 — Premium / CDP / 灾备兼容性 + +## 13.1 Premium 侧改动 + +搜索 `/d/0zw/zw/premium/` 中与单节点快照删除直接相关的代码: + +| 文件 | 
说明 | +|---|---| +| `mevoco/.../VolumeSnapshotDeletionOverlayVmMsg.java`(第6行) | 6 行 OverlayMessage 壳,**无 scope/direction 业务逻辑** | +| `CreateDataVolumeFromVolumeSnapshotGroupFlow.java` | 创建数据卷流程,与删除无关 | +| `CreateRootTemplateFromVolumeSnapshotFlow.java` | 创建模板流程,与删除无关 | +| 阿里云 Hybrid | `AliyunSnapshotCascadeExtension`,**不走** single 路径 | + +**结论**:Premium **未重写** `VolumeSnapshotTreeBase` / `VolumeTree` / `VolumeSnapshotGroupBase`。single 删除完全由开源主库实现,Premium 无额外扩展。 + +## 13.2 CDP / StorageSnapshot 类型 + +`VolumeSnapshotTreeBase.java:836`: + +```java +if (VolumeSnapshotConstant.STORAGE_SNAPSHOT_TYPE.toString().equals(currentRoot.getType()) + || Objects.equals(currentRoot.getVolumeType(), VolumeType.Memory.toString())) { + deleteVolumeSnapshotAndSyncVolumeSize(new Completion(completion) { ... }); + return; +} +``` + +CDP / StorageSnapshot 类型 / Memory 快照绕过整个 commit/pull 逻辑,**直接调用存储层删除**。 + +原因: +- StorageSnapshot 是存储后端原生快照(如 RBD snapshot),ZStack 不掌握其链结构 +- Memory 快照不是 qcow2 文件链 +- 都不需要 commit/pull 合并 + +## 13.3 Ceph 不兼容 + +`CephPrimaryStorageBase` 未实现: +- `CommitVolumeSnapshotOnPrimaryStorageMsg` +- `PullVolumeSnapshotOnPrimaryStorageMsg` + +普通 RBD 快照在 `cephdriver.py:87` 通过 `rbd snap rm` 删除,**无中间节点合并能力**。 + +例外:`CephPrimaryStorageBase.java:2984` 临时快照场景硬编码 `scope=Single, direction=Commit`,但这只是 ZStack 层面的删除消息标志,实际不走 commit 逻辑。 + +## 13.4 灾备 / 备份 + +经搜索: +- **未发现**灾备/CDP/Backup 调用链直接发 `DeleteVolumeSnapshotGroupInnerMsg` +- **未发现**对 single 模式的额外 cascade / 索引同步逻辑 + +## 13.5 OverlayMsg 串行化 + +`VolumeSnapshotDeletionOverlayVmMsg` 作用:把删除消息包裹后路由到 `VmInstance` 的 mailbox,保证与 VM 状态变更操作互斥。Premium 侧的 OverlayMsg 与开源侧一致,无额外业务。 diff --git a/docs/snapshot-single-delete/14-limitations-and-todos.md b/docs/snapshot-single-delete/14-limitations-and-todos.md new file mode 100644 index 00000000000..dc8e4ddcf57 --- /dev/null +++ b/docs/snapshot-single-delete/14-limitations-and-todos.md @@ -0,0 +1,77 @@ +# 14 — 已知限制 / TODO / FIXME + +## 14.1 代码注释中的 TODO + +### `VolumeTree.java:258` +```java +// 
TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// it is necessary to consider the dependency relationships of all snapshot nodes in the +// current snapshot tree within the VolumeSnapshotReferenceVO. +``` +链克隆 + single 删除同时启用时,`VolumeSnapshotReferenceVO` 依赖关系未处理。 + +### `VolumeTree.java:394` +```java +// TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// the following three functions must take into account the dependencies within the snapshot chain. +``` +针对 `updateDatabaseAfterPullToVolume`、`updateDatabaseAfterPull`、`updateDatabaseAfterCommit`。 + +### `VolumeSnapshotTreeBase.java:355` +```java +// TODO: BUG FIX, when deleting a volume the cascade extension will send messages to all snapshots +// of this volume, which the oldest snapshot will delete descendant snapshots and set the volumeUuid +// to NULL for all snapshots, so the after messages are useless +``` +卷删除时级联消息冗余。 + +### `VolumeSnapshotTreeBase.java:1325` +```java +//TODO add gc +``` +物理文件删除失败无 GC 补偿。 + +### `VolumeSnapshotTreeBase.java:1520` +```java +//TODO: remove this +``` + +### `VolumeSnapshotTreeBase.java:2169` +```java +// TODO: refactor this: VolumeSnapshotGroupVO should has its own cascade extensions! 
+``` + +## 14.2 限制汇总 + +| 限制 | 位置 | 影响 | +|---|---|---| +| 链克隆 + single 不兼容 | `VolumeTree.java:258, 394` | `VolumeSnapshotReferenceVO` 依赖未维护,可能误删共享数据 | +| 物理删除无 GC | `VolumeSnapshotTreeBase.java:1325` | 文件/LV 泄露 | +| 卷删除级联消息冗余 | `VolumeSnapshotTreeBase.java:355` | 性能浪费,无功能影响 | +| 兄弟节点 parentUuid 暂不一致 | `updateDatabaseAfterCommit` | DB 与物理短暂不一致,依赖后续递归修复 | +| pull 但需 commit 抛 RuntimeException | `VolumeTree.java:371` | 未封装 ErrorCode,前端体验差 | +| VmState 限制 | `VolumeSnapshotTreeBase.java:854` | Migrating / Unknown 状态直接失败 | +| Ceph RBD 不支持 | `CephPrimaryStorageBase` | 无 commit/pull 实现,普通 RBD 快照无法 single 删除 | +| Group 无 Availability 检查 | `VolumeSnapshotGroupBase.java:212` | 删除前不检查组成员状态 | +| Group 并发度固定 5 | `VolumeSnapshotGroupBase.java:243` | 大组删除可能耗时长,但不可调 | +| Group 无整体回滚 | `VolumeSnapshotGroupBase.java:212-254` | 部分成功保留,需调用方处理错误列表 | + +## 14.3 设计取舍 + +| 决策 | 理由 | +|---|---| +| 默认 `scope=chain` | 保持向后兼容,避免老 API 调用方行为突变 | +| 多子节点强制 pull | commit 会改 dst 路径,破坏其它兄弟语义 | +| 优先非 online 子节点 | 避开 qemu 持有的活跃 backing 链 | +| commit 用 path 互换 | 避免修改快照 uuid,保持外部引用稳定 | +| 失败不回滚 | 存储操作不可逆,靠幂等性支持重试 | +| 删除前不查 GroupAvailability | 由下层 `isOperationAllowed` 自校验,避免重复 | + +## 14.4 后续改进建议(基于代码) + +1. **Ceph 支持**:考虑用 RBD clone + `rbd flatten` 实现单节点删除 +2. **GC 机制**:为 `deleteVolumeSnapshotAndSyncVolumeSize` 失败的物理文件加 GC 任务 +3. **错误码封装**:`resolveDirection` 的 `IllegalArgumentException` 换成 `ErrorCode` +4. **链克隆兼容**:`VolumeSnapshotReferenceVO` 在 commit/pull DB 更新时同步处理 +5. **并发度可配**:Group 删除并发度做成 GlobalConfig +6. 
**VmState 扩展**:评估 Migrating 等状态的支持 diff --git a/docs/snapshot-single-delete/scenarios/00-index.md b/docs/snapshot-single-delete/scenarios/00-index.md new file mode 100644 index 00000000000..bf9c2aaf1f3 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/00-index.md @@ -0,0 +1,23 @@ +# 单盘快照删除 — 场景梳理索引 + +本目录收录"现状代码逻辑梳理"性质的场景文档(与加固设计 spec 隔离)。 +每个文件聚焦一种具体的(存储类型 × VM 状态 × 树结构)组合,按 stepDelete 轮次推演当前实现行为。 + +| 文件 | 存储 | VM 状态 | 树结构 / 待删节点 | +|---|---|---|---| +| `01-multi-children-stepDelete.md` | 通用 | 通用 | 抽象骨架:X→A→{B,C,D},待删 A,多子节点 stepDelete 决策算法 | +| `02-local-running-delete-mid-with-3-children.md` | LocalStorage | Running | 1→2→{3,4,5→vol},待删快照2,含在线 commit + vol.installPath 同步 | +| `03-local-stopped-delete-mid-with-3-children.md` | LocalStorage | Stopped | 同上树结构,全程离线 pull → `offline_merge_snapshot` → `qcow2_rebase`,差量散到每个 child,无 libvirt,无 path 互换 | +| `04-deleteSingleFlows-online-offline-decision.md` | 通用 | 通用 | `deleteSingleFlows` / `stepDelete` / `resolveDirection` / `isOnline` / `commit` / `pull` 中 online 与 direction 的判定时序、四象限到 agent 入口映射 | +| `05-local-stopped-direction-commit-actual.md` | LocalStorage | Stopped | 1→2→{3,4,5→vol},待删快照2,**实测**记录(ZSV 真实环境抓 API uuid 全程 agent POST),direction=Commit + scope=single;轮 1/2 `offlinemerge`,轮 3 `offlinecommit`,轮 4 `delete`;修正源码推演 3 处偏差(child 顺序、VO_2 直接删、vol.installPath 不互换)| + +> 当前实现 Bug 清单已独立成档:`../bugs.md`(位于 `docs/snapshot-single-delete/bugs.md`)。**P0 修复已落地**(拆 `isOnline` 为 `isOnAliveChain` + `isHypervisorOperation`,`resolveDirection` 解耦 vmState),覆盖 Bug 0/1/3/7;剩余 P0/P1(Bug 2/4/5/6)见 bugs.md。 + +> API 参数(`scope` / `direction`)重构提案:`../proposals/scope-direction-api-redesign.md`,覆盖 Bug 2 / Bug 8 / Bug 9。 + +> 待补场景候选(按需追加): +> - NFS / SMP / SharedBlock + 在线 / 离线 各组合 +> - 删根节点(dst 是树根,触发 newTree 创建) +> - 分叉链 + 在线 active commit 链上有多级 snapshot +> - fullRebase 路径(pull 大文件) +> - 快照组(多卷并发) diff --git a/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md 
b/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md new file mode 100644 index 00000000000..dc3b371eeb7 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md @@ -0,0 +1,128 @@ +# 15. 多子节点 stepDelete 处理逻辑(现状梳理) + +> 本文档属于"当前实现梳理",与 `04-scope-and-stepDelete.md` 互补:04 讲 scope/递归框架,本文聚焦 **currentRoot 有多个直接子节点时** 的具体决策与执行顺序。 +> 源码:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java` `deleteSingleFlows()` / `stepDelete()`,行号 828–919(5.5.6 基线)。 + +--- + +## 15.1 调用入口 + +``` +deleteSingleFlows() (行 828) + └─ flow "delete-single-volume-snapshot" + ├─ 类型分流:StorageSnapshot / Memory → 直接 deleteVolumeSnapshotAndSyncVolumeSize + ├─ vmState 校验(Running / Paused / Destroyed / Stopped / Destroying) + └─ stepDelete() ◄── 递归核心 +``` + +`currentRoot` = 待删快照(不是它的兄弟)。后续讨论的 children 都是 **currentRoot 的直接子节点**。 + +--- + +## 15.2 stepDelete 单轮决策表(行 875–919) + +每轮重读 DB 重建 `VolumeTree`,处理一个子节点后递归再调一次 stepDelete。 + +``` +1. vos = Q(VolumeSnapshotVO).eq(treeUuid).list() +2. volumeTree = VolumeTree.fromVOs(vos, current, volumeInv) +3. 
children = volumeTree.getSnapshotLeaf(currentRoot.uuid).getChildren() + +┌──────────────────────────┬────────────────────────────────────────────────────┐ +│ children.size() │ 行为 │ +├──────────────────────────┼────────────────────────────────────────────────────┤ +│ 0 │ deleteVolumeSnapshotAndSyncVolumeSize(终态,删自身) │ +│ 1 │ resolveDirection → commit 或 pull │ +│ ≥ 2 (多子节点) │ 选一个非 alive chain 上的 child → 离线 pull │ +└──────────────────────────┴────────────────────────────────────────────────────┘ +``` + +### 多子节点选择算法(行 912–918) + +```java +onlineChild = children.stream() + .filter(c -> volumeTree.isOnline(current, currentRoot.uuid, c.uuid, vmState)) + .findFirst().orElse(null); + +child = children.get(0); +if (onlineChild != null && Objects.equals(child.uuid, onlineChild.uuid)) { + child = children.get(1); // 避开 alive 子节点,挑下一个 +} +boolean online = volumeTree.isOnline(current, currentRoot.uuid, child.uuid, vmState); +pull(child, volumeTree, online, comp); +``` + +要点: +- **永远先离线 pull 非 alive 的子节点**:alive chain 上的子节点最后一轮才处理(届时 children.size() 已收敛到 1) +- **只挑 children.get(0) 或 children.get(1)**:每轮处理一个,下一轮再选 +- direction 强制为 pull:多子节点路径不调 resolveDirection,直接 `pull(...)` + +--- + +## 15.3 pull 对一个子节点的物理 + DB 影响 + +设 currentRoot=X,要 pull 的子节点=Y: + +| 层 | 变化 | +|---|---| +| 物理 qcow2 | `qcow2_rebase(X.parent, Y)`(离线走 `offline_merge_snapshot`):X 相对 X.parent 的差量被写进 Y;Y 的 backing 从 X 改指 X.parent | +| DB(updateDatabaseAfterPull)| Y.parentUuid = X.parentUuid;Y.distance--;其它 X 的子节点不动 | + +效果:Y 不再依赖 X,从 currentRoot 的 children 列表中"脱离";下一轮 stepDelete 重读 DB 时 Y 已不在 children 里。 + +--- + +## 15.4 完整执行轨迹示例 + +快照树: + +``` + X + └─ A (待删, currentRoot) + ├─ B + ├─ C + └─ D ── vol ← alive chain 末端 +``` + +待删的是 **A**(currentRoot=A,children=[B, C, D])。 + +| 轮 | children 重读 | onlineChild | 选中 child | 决策 | 行为 | +|---|---|---|---|---|---| +| 1 | [B, C, D] | D | B(首个非 alive)| 离线 pull | qcow2_rebase(X, B), B.parentUuid=X | +| 2 | [C, D] | D | C | 离线 pull | qcow2_rebase(X, C), C.parentUuid=X | +| 3 | [D] (size=1) | D | D | resolveDirection → Commit 
(latest+online) | 在线 blockCommit(top=D, base=A) + pivot | +| 4 | [] | — | — | terminal | deleteVolumeSnapshotAndSyncVolumeSize(A) | + +最终: +- 物理:A 的快照记录所对应的文件被删;B/C 的 backing 直接指 X;D 的差量经在线 commit 合入 A 原文件,之后 DB 互换 path(详见 05-commit-db-swap) +- DB:A 的 VO 删除;B/C/D 的 parentUuid 全部跳过 A 直接指向 X + +--- + +## 15.5 关键不变量与代码对应 + +| 不变量 | 代码位置 | 作用 | +|---|---|---| +| 每轮重读 DB | 行 876 `Q.New(VolumeSnapshotVO).eq(treeUuid).list()` | 上一轮 DB 翻转后下一轮决策基于最新状态,避免基于陈旧子节点列表做错决定 | +| 多子节点先 pull 非 alive | 行 913–915 `if (child == onlineChild) child = children.get(1)` | 保证 alive chain 上的活跃文件不被离线操作打断 | +| alive 子节点最后处理 | 多轮 pull 后 children.size() 收敛到 1,进入 commit 分支 | 在线 commit 走 libvirt blockCommit,与 alive VM 协同 | +| 同步递归(comp.success → stepDelete)| 行 891–895 | 全程在 chainSubmit 锁内,无并发;reconciler 可同步介入每轮 | +| 终态收敛 | children.isEmpty() → deleteVolumeSnapshotAndSyncVolumeSize | 数据已全部搬走,自身物理 + DB 真删 | + +--- + +## 15.6 对 children 顺序的依赖 + +代码使用 `children.get(0)` / `children.get(1)`,依赖 `VolumeTree.fromVOs` 返回 children 的顺序。该顺序由 DB 查询顺序决定(无显式 ORDER BY),实践上稳定,但不应赋予该顺序任何语义。`onlineChild` 通过 `isOnline` 判定,与 children 顺序无关——这保证了"避开 alive"逻辑不会因 DB 顺序波动而失效。 + +--- + +## 15.7 与 commit 单子节点路径的差别 + +| 场景 | direction | 物理操作 | DB 翻转 | +|---|---|---|---| +| 单子节点 + commit | child(src) → currentRoot(dst) | qcow2_commit(src→dst) + 兄弟 rebase 到 dst | dst 移入 src 位置(详见 05-commit-db-swap)| +| 单子节点 + pull | currentRoot(src) → child(dst) | qcow2_rebase(currentRoot 的父, child);currentRoot 为树根时走 fullRebase(flatten) | dst.parentUuid = src.parentUuid(详见 06-pull-db-rewrite)| +| 多子节点 | 强制 pull | 选一个非 alive child 做 pull | 该 child.parentUuid = currentRoot.parentUuid,其余 children 不动 | + +多子节点本质上是**对前 N−1 个 child 依次应用 pull**,把多分叉树逐步收敛为单分支,最后回归到"单子节点 commit"路径完成 alive 合并。 diff --git a/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md b/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md new file mode 100644 index 00000000000..bd63251cd11 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md 
@@ -0,0 +1,259 @@ +# 场景 02:local + 在线 VM + 删除中间节点(快照2,3 个子节点其中 1 个 alive) + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 源码:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java`、`VolumeTree.java`、`kvmagent/.../vm_plugin.py`、`kvmagent/.../localstorage_plugin.py`。 + +--- + +## 前提 + +- 主存储类型:**LocalStorage** +- VM 状态:**Running**(active commit 路径) +- 待删快照:**快照2**(中间节点,3 个直接子节点,其中 1 个在 alive chain 上) + +## 快照树 + +``` + 快照1 + └─ 快照2 ◄── 待删 currentRoot + ├─ 快照3 + ├─ 快照4 + └─ 快照5 ── vol ← alive chain(VM 当前盘) +``` + +## 物理 backing chain(alive 这条线) + +``` +1.qcow2 ← 2.qcow2 ← 5.qcow2 ← vol +``` + +兄弟分支: + +``` +2.qcow2 ← 3.qcow2 +2.qcow2 ← 4.qcow2 +``` + +--- + +## 总轮次(4 轮 stepDelete) + +| 轮 | currentRoot=2 的 children | 选中 | online? | direction | 物理操作 | DB 关键变更 | +|---|---|---|---|---|---|---| +| 1 | [3, 4, 5] | 3 | false | **强制 pull** | `offline_merge_snapshot` → `qcow2_rebase(1.qcow2, 3.qcow2)`(差量进 3) | 3.parentUuid=1, 3.distance-- | +| 2 | [4, 5] | 4 | false | **强制 pull** | `offline_merge_snapshot` → `qcow2_rebase(1.qcow2, 4.qcow2)`(差量进 4) | 4.parentUuid=1, 4.distance-- | +| 3 | [5] | 5 | **true** | resolveDirection → Commit | libvirt blockCommit(top=5, base=2) + pivot | VO_5 接管 2.qcow2 且 parentUuid=1;VO_2 记录在同一 SQLBatch 中删除 | +| 4 | [] | — | — | terminal | agent 删除 5.qcow2 物理文件(pivot 后已无引用) | 无 VO 删除(VO_2 已在轮 3 删除);syncVolumeSize 更新 vol size | + +--- + +## 轮 1:离线 pull 快照3 + +代码 `VolumeSnapshotTreeBase.java:912-918`,进入 `children.size() ≥ 2` 分支: + +```java +aliveChild = 5 // 唯一 vol 链上(isOnAliveChain 命中;修复后术语) +child = children.get(0) = 3 // 3 != aliveChild → 不替换 +online = isOnline(2, 3, Running) = false +pull(3, ..., online=false) +``` + +**消息**:`PullVolumeSnapshotOnPrimaryStorageMsg`(local 走主存储路径,不经 hypervisor) + +**后端 → agent 映射**:`LocalStorageKvmBackend.handle(PullVolumeSnapshotOnPrimaryStorageMsg)` 行 3845-3865 → `OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=3.qcow2, fullRebase=false}` → **`OFFLINE_MERGE_PATH = "/localstorage/snapshot/offlinemerge"`** + +**agent 物理动作**(`localstorage.py` `offline_merge_snapshot` 行 834-856): + +```
+linux.qcow2_rebase(srcPath=1.qcow2, destPath=3.qcow2) +# qemu-img rebase 默认(非 -u): +# 把 3.qcow2 旧 backing(2.qcow2) 与新 backing(1.qcow2) 之间的差异 +# 写入 3.qcow2 数据区,然后改写头部 backing 字段为 1.qcow2 +``` + +**DB 翻转**(`updateDatabaseAfterPull`,详见 `../06-pull-db-rewrite.md`): + +``` +VO_3.parentUuid = 1 +VO_3.distance -= 1 +VO_3.installPath 不变 +其它 VO 不动 +``` + +VM 状态:完全无感(3 不在 alive chain)。 + +--- + +## 轮 2:离线 pull 快照4 + +与轮 1 完全对称。 + +**结果**: + +``` +VO_4.parentUuid = 1 +VO_4.distance -= 1 +4.qcow2 物理 backing → 1.qcow2 +``` + +此时 currentRoot=2 在 DB 中的 children 只剩 [5]。 + +--- + +## 轮 3:在线 commit 快照5 → 快照2(最复杂的一轮) + +```java +direction = volumeTree.resolveDirection(2, 5, msg.direction, currentRoot.isLatest, Running) + → Commit (5 在 alive chain + Running) +online = isOnline(2, 5, Running) = true +commit(5, volumeTree, online=true, comp) +``` + +### 3.1 控制面 flow(`commit()` 行 921-1094) + +``` +flow chain: + 1. (条件) SyncVolumeSizeOnPrimaryStorage 仅当 srcSnapshot.uuid == volume.uuid;本例 src=5 ≠ vol → 跳过 + 2. AllocatePrimaryStorageSpaceMsg 预占 size + 3. CommitVolumeSnapshotOnHypervisorMsg online → 走 hypervisor + ├─ srcSnapshot = 5 inventory + ├─ dstSnapshot = 2 inventory + └─ srcChildrenInstallPathInDb = [vol.installPath] # 5 的子节点是 vol leaf + 4. updateDatabaseAfterCommit DB 互换(SQLBatch 单事务) +``` + +### 3.2 数据面(`vm_plugin.py do_block_commit`) + +``` +top = src = 5.qcow2(VM 当前活跃盘) +base = dst = 2.qcow2 + +步骤: + 1. virDomainBlockCommit(disk, base=2.qcow2, top=5.qcow2, + flags=VIR_DOMAIN_BLOCK_COMMIT_ACTIVE | SHALLOW) + → libvirt 把 5 中尚未在 2 的数据 flush 到 2.qcow2 + → 进入 READY 态(active commit 特征) + 2. _wait_for_block_job → READY + 3. virDomainBlockJobAbort(disk, flags=VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) + → VM disk source 从 5.qcow2 → 切到 2.qcow2 + 4. 
for child in srcChildrenInstallPathInDb=[vol.installPath]: + if qcow2_get_backing_file(child) != base: + qcow2_rebase_no_check(base, child) + → 本例 vol 即 5.qcow2 自身,pivot 后 VM 已切到 2.qcow2,通常 noop +``` + +完成后物理: + +``` +2.qcow2 内容:原 5 的全部数据已合并进来 +5.qcow2 物理文件:libvirt 在 pivot 时删除(VIR_DOMAIN_BLOCK_COMMIT_DELETE) +VM 活跃盘 source:2.qcow2 +``` + +### 3.3 DB 翻转(参考 `../05-commit-db-swap.md` §5.3) + +``` +src=5, dst=2 + +互换前: + VO_5.installPath = 5.qcow2 parentUuid = 2 distance = N + VO_2.installPath = 2.qcow2 parentUuid = 1 distance = N-1 + +互换后(**实测修订** —— 见场景 05 §6): + VO_2 **整条 DB 记录被删除**(不是"互换后保留至轮 4") + VO_5.installPath = 2.qcow2 ← 接管旧 dst 文件(含合并数据) + VO_5.parentUuid = 1 ← 跨过 2 + VO_5.distance -= 1 + VO_5.treeUuid = 不变(dst=2 不是树根;若 dst 是根则迁到新 tree) + +GroupRef 同步:被删者(2) 的 GroupRef 一并删除(VO_2 被 DELETE) + +distance 递减:src=5 的所有后代 distance -= 1(本例无更深 snapshot,只有 vol leaf) +``` + +### 3.4 vol.installPath 的同步 + +**实测结论**(场景 05 §5.2 / §6):commit 路径下 `vol.installPath` 字段在 DB 中**不变**。vol 之前挂 5.qcow2(VO_5 旧 installPath),commit + pivot 后物理上 vol 实际挂 2.qcow2(含合并数据的文件),但这个切换通过两个步骤实现: +- **物理层**:libvirt blockCommit pivot 后 vm domain 已经在用 2.qcow2 作为 backing;同步路径里 sibling 的 `qcow2_rebase_no_check(base=2.qcow2, child)` 把 vol 的 backing 链改写到 2.qcow2 +- **DB 层**:vol VO 的 installPath 字段保留原值,但 VO_5 的 installPath 字段被改为 2.qcow2(VO_5 接管 dst 物理文件),vol → VO_5 的 backing 关系仍然指向同一物理文件 + +因此"vol 跟着合并数据走"不是靠 `UPDATE VolumeVO SET installPath=...`,而是靠物理 backing 链 + VO 文件接管的组合。这是 alive 末端 commit 的关键行为(与中间节点 commit 不同:中间节点 commit 没有 vol 需要跟踪)。 + +### 3.5 互换后链状态 + +``` +DB 视角: + vol.installPath = 5.qcow2(**不变**,但物理 backing 已切到 2.qcow2) + VO_5.installPath = 2.qcow2 parentUuid = 1 ← 接管原 dst 文件 + VO_2 已删除 + VO_3.installPath = 3.qcow2 parentUuid = 1 + VO_4.installPath = 4.qcow2 parentUuid = 1 + +物理 backing chain: + vol → 2.qcow2(含合并数据)→ 1.qcow2 + 3.qcow2 → 1.qcow2 + 4.qcow2 → 1.qcow2 + 5.qcow2:libvirt 已删 +``` + +--- + +## 轮 4:物理清扫 5.qcow2 + +> ⚠ **实测修订**:VO_2 在轮 3 的 SQLBatch 中已被直接删除(不是"互换 path 保留至轮 4")。轮 4 的 
`children=[]` 是因为 VO_5.parentUuid 已跨过 2 指向 1,VO_2 在树中已不可见。 + +```java +children = [] +deleteVolumeSnapshotAndSyncVolumeSize(comp) +``` + +**消息**:`DeleteVolumeSnapshotOnPrimaryStorageMsg` + +**agent 物理动作**(实测):删 **5.qcow2 物理文件**(即原 VO_5 的旧 installPath;libvirt 在 pivot 时已逻辑解除引用,此处 agent 真正删盘)。注意:传给 agent 的 path 来自 stepDelete 调用栈记住的 currentRoot 物理路径,而非已删除的 VO_2 VO。 + +**DB**:syncVolumeSize 更新 vol 的 size。VO_2 已在轮 3 删除,本轮 DB 无 VO 删除。 + +--- + +## 终态 + +``` +快照树(DB): + 快照1 + ├─ 快照3 installPath=3.qcow2 backing=1.qcow2 + ├─ 快照4 installPath=4.qcow2 backing=1.qcow2 + └─ 快照5 ── vol VO_5.installPath=2.qcow2(接管旧 dst 文件) + vol.installPath=5.qcow2(DB 字段不变,但物理 backing 已切到 2.qcow2) + +VO_2 已删除(轮 3 SQLBatch 中 DELETE) + +物理: + 1.qcow2 ← 2.qcow2 ← vol(VM 活跃,物理上 vol.backing = 2.qcow2,2.qcow2 含原 5+2 合并数据) + 1.qcow2 ← 3.qcow2 + 1.qcow2 ← 4.qcow2 + 5.qcow2 文件已删(轮 4) +``` + +> "VO_5 接管 2.qcow2 / VO_2 直接删 / vol.installPath 不变"这三条对应实测验证记录详见场景 05 §5、§6。 + +--- + +## 全程关键脆弱点(仅梳理,不含加固) + +| 轮 | 失败类型 | 当前后果 | +|---|---|---| +| 1 / 2 | `qcow2_rebase` 失败(agent crash 或 IO 错) | 3 / 4 backing 可能部分改写但未完成;DB 翻转尚未发生,幂等可重试 | +| 1 / 2 | `qcow2_rebase` 成功 + DB 翻转 SQL 失败 | 物理 child.backing=1,DB child.parentUuid=2 → 不一致 | +| 3 | blockCommit 卡住 / pivot 前 agent 死 | VM 可能仍指 5.qcow2,DB 未翻转 | +| 3 | blockCommit 成功但 reply 丢失 / SQLBatch 失败 | 物理已切到 2.qcow2,DB 仍旧态(VO_2 未删 / VO_5.installPath 仍 5.qcow2),重启会按 DB 读 5.qcow2 而 libvirt 已删它 | +| 3 | DB 翻转成功,但 vol 物理 backing 改写失败 | vol.qcow2 头部 backing 仍指 5.qcow2(已删)→ VM 重启失败 | +| 4 | 删 5.qcow2 失败 | 孤儿文件残留 | + +--- + +## 与其它场景对照 + +| 场景 | 轮数 | 核心特征 | +|---|---|---| +| `01-multi-children-stepDelete.md` | 4 | 通用多子节点骨架;以 X→A→{B,C,D} 抽象演示 | +| **本场景 02**(local + Running + 删快照2) | 4 | 落到具体存储 + 在线 + alive 子节点是 vol 直接父;最后一轮在线 commit + vol.installPath 同步是关键差异 | diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md new file mode 100644 index 
00000000000..2b8178990d4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md @@ -0,0 +1,93 @@ +# 单盘快照删除一致性加固设计 - 总览 + +- 状态:Draft +- 日期:2026-05-13 +- 关联:ZSV-5799;MR zstack#7674 / premium#10776 / utility#5743 +- 调研基线:`docs/snapshot-single-delete/00-overview.md` + +## 文档拆分 + +| 文件 | 内容 | +|---|---| +| `00-overview.md` | 背景 / 目标 / 约束 / 整体架构(本文) | +| `01-control-plane-reconciler.md` | 控制面 VolumeSnapshotTreeReconciler 设计 | +| `02-data-plane-validation.md` | 数据面 4 层 L1-L4 校验 | +| `03-flowchain-recovery.md` | FlowChain 混合恢复策略与异常场景 | +| `04-testing-strategy.md` | 测试金字塔与用例清单 | +| `05-rollout-plan.md` | 灰度 / 监控 / 回滚 / 风险登记 | +| `06-invariants-and-scope.md` | 不变量护栏总结 / 范围之外 | + +--- + +## 1. 背景 + +ZSV-5799 引入了 `scope=single` 单节点快照删除(commit/pull 路径)。现有实现的关键不足(详见 `docs/snapshot-single-delete/14-limitations-and-todos.md`): + +- **物理文件泄漏**:commit/pull/delete 物理失败后只 warn,文件/LV 残留 +- **DB 不一致**:DB 翻转后失败留下错位 path、悬空 parentUuid、兄弟节点 backing 与 DB parentUuid 不一致 +- **重试不幂等**:失败后中间状态可能让重试失败 +- **节点孤立**:分叉链兄弟节点物理 rebase 完成、DB 未更新 +- **在线 VM**:active commit pivot 状态机不严谨 + +## 2. 目标 + +加固现有删除单盘快照逻辑,确保: + +1. **不变量 1**:操作结束后 DB `(uuid, installPath, parentUuid, distance, treeUuid)` 与物理 qcow2 backing chain 必须一致 +2. **不变量 2**:失败重试可从任意中间状态推进到目标态,**不依赖任何额外状态字段** +3. **不变量 3**:物理删除失败不破坏不变量 1(VO 删,孤儿文件由 warn 记录) + +## 3. 约束与决策 + +| 维度 | 决策 | +|---|---| +| 一致性范围 | 物理泄漏 + DB 一致 + 重试幂等 + 在线 VM 安全,全部覆盖 | +| 状态机 | **不加新表 / 不加新字段**,靠扫描 + qcow2 物理状态推断 | +| GC 触发 | **只在操作完成 / 失败后** 跑当前快照树的局部对账 | +| 控制面预检 | 不做;首次执行走轻量路径 | +| 数据面校验 | L1 dump + L2 verify + L3 check + L4 blockJob 状态机加固,全开 | +| 物理删除失败 | 维持现状(VO 删 + warn) | +| 失败恢复 | 混合策略:可逆 flow rollback;不可逆 flow 由 reconciler 前进式补全 | + +## 4. 
整体架构 + +``` + ┌─────────────────────────────────────────────┐ + 用户 / API │ 控制面(zstack management) │ + APIDeleteVolumeSnapshotMsg │ + │ │ ┌─────────────────────────┐ │ + ▼ │ │ VolumeSnapshotTreeBase │ │ + VolumeSnapshotTreeBase │ │ deletion() │ │ + │ │ │ stepDelete() │ │ + │ commit/pull/del │ └────────┬─────────────────┘ │ + ▼ │ │ success/fail │ + FlowChain │ ▼ │ + │ │ ┌─────────────────────────┐ │ + │ each step ends │ │ VolumeSnapshotTreeReconciler (新) │ + └────────────────►│ │ reconcile(treeUuid) │ + │ │ 1) 拉物理 backing chain │ + │ │ 2) 与 DB 比对 │ + │ │ 3) 输出 fix actions(受限动作集) │ + │ │ 4) 顺序执行;记 remaining │ + │ └────────┬─────────────────┘ │ + │ │ │ + └───────────┼────────────────────────────────────┘ + │ GetVolumeBackingChainFromPrimaryStorageMsg + ▼ + ┌─────────────────────────────────────────────┐ + │ 数据面(kvm agent) │ + │ │ + │ vm_plugin.py / *_plugin.py │ + │ ├─ L1 操作前 dump chain → recovery file │ + │ ├─ qemu-img commit/rebase 主操作 │ + │ ├─ L2 操作后 verify_backing_chain │ + │ ├─ L3 异常路径 qemu-img check │ + │ └─ L4 _wait_for_block_job 状态机加固 │ + └─────────────────────────────────────────────┘ +``` + +**核心组件三处**: + +1. **控制面**:抽出 `VolumeSnapshotTreeReconciler`(新类,不是新服务),负责 DB ↔ 物理对账 +2. **数据面**:4 层校验工具集中在 `kvmagent/zstacklib/utils/snapshot_recovery.py`(新建),所有存储后端共享 +3. 
**FlowChain**:success / fail 回调都先调 reconciler diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md new file mode 100644 index 00000000000..0742ba329b9 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md @@ -0,0 +1,157 @@ +# 控制面:VolumeSnapshotTreeReconciler + +## 5.1 类设计 + +**位置**:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeReconciler.java` + +```java +public class VolumeSnapshotTreeReconciler { + @Autowired private CloudBus bus; + @Autowired private DatabaseFacade dbf; + + public ReconcileResult reconcile(String treeUuid, String volumeUuid, + ReconcileTrigger trigger); +} + +public class ReconcileResult { + boolean consistent; + List<FixAction> appliedActions; + List<FixAction> remaining; +} + +public enum ReconcileTrigger { + AfterCommitSuccess, AfterCommitFail, + AfterPullSuccess, AfterPullFail, + AfterDeleteSuccess, AfterDeleteFail, +} +``` + +## 5.2 工作流程 + +``` +reconcile(treeUuid, volumeUuid, trigger): + 1. 读 DB:Q.New(VolumeSnapshotVO).eq(treeUuid).list() + 若结果为空(dst 是树根的 commit 已完成切换到新 treeUuid 场景) + → 通过 volumeUuid 查 latest VO,反推真实 treeUuid 重新加载 + 2. 读物理:对每个 alive 叶节点发 GetVolumeBackingChainFromPrimaryStorageMsg + (分叉链时多发,合并去重得到全树物理 chains) + + GetSnapshotInstalledPathExistenceMsg + 3. 比对 → InconsistencyReport[] + 4. 翻译为 FixAction(受限动作集) + 5.
顺序执行;失败的进 remaining +``` + +**注**:step 2 对分叉链需遍历所有 alive 叶节点,而不是仅当前 volume.installPath 这条线性 chain, +否则 I4(installPath 错位到非当前叶所在分支)会漏检。 + +## 5.3 不一致检测(5 类) + +| ID | 名称 | 检测 | 修复 | +|---|---|---|---| +| **I1** | 物理已不存在 / DB 仍有 | `physical.exists=false && dbVO != null` | DELETE_DB_VO + 重算 distance/parent | +| **I2** | DB 已删 / 物理仍在 | `physical.exists=true && dbVO=null` | SCHEDULE_GC_ORPHAN_FILE | +| **I3** | parentUuid 不一致 | `db.parent != null && physical.backing != db.parent.installPath`(必须先排除悬空 → I3b 优先评估)| UPDATE_DB_PARENT_UUID + distance | +| **I3b** | 悬空 parentUuid | `db.parentUuid != null && Q(VolumeSnapshotVO).eq(uuid, parentUuid) == null`(兄弟 rebase 完成后 parent VO 已被删,自身 parentUuid 仍指向已删 UUID)| 三种子情形:(a) `physical.backing` 能反查到树内某 alive VO → UPDATE_DB_PARENT_UUID = 该 VO.uuid;(b) `physical.backing == null`(已 rebase 到卷 base)→ UPDATE_DB_PARENT_UUID(null);(c) `physical.backing` 存在但反查不到任何 alive VO(指向已被 stepDelete 的 VO 物理路径,物理 rebase 尚未发生)→ 不动 DB,记 remaining 由下次重试推动物理 rebase 后再修 | +| **I4** | installPath 不一致 | DB.installPath 物理不存在但能在树内任一 alive 叶 backing chain 中找到该 uuid 对应物理位置 | UPDATE_DB_INSTALL_PATH + size | +| **I5** | latest 标志错位 | aliveChain 末端 latest=false 或非末端 latest=true | UPDATE_DB_LATEST_FLAG | + +## 5.4 受限动作集 + +```java +public enum FixActionType { + DELETE_DB_VO, + UPDATE_DB_PARENT_UUID, + UPDATE_DB_INSTALL_PATH, + UPDATE_DB_LATEST_FLAG, + SCHEDULE_GC_ORPHAN_FILE +} +``` + +**显式禁止**:reconciler 不发 Commit/Pull/Delete*Msg、不调 agent rebase。修物理的责任全部在 agent 层。 + +**评估顺序**(强制): +1. I1(自身物理不存在) +2. I3b(parent 悬空)— 必须先于 I3,避免 `db.parent` 为 null 时 I3 NPE +3. I3(parent 存在但 installPath 不一致)— 仅在 `db.parent != null` 时评估 +4. I4(自身 installPath 错位) +5. I5(latest flag 错位) +6. 
I2(孤儿物理文件)— 最后处理,避免误删与 I1/I4 修复相关文件 + +## 5.5 调用点 + +`VolumeSnapshotTreeBase.java` 修改: + +```java +private void commit(VolumeSnapshotLeaf child, VolumeTree tree, boolean online, Completion comp) { + final String treeUuid = currentRoot.getTreeUuid(); + final String volumeUuid = volume.getUuid(); + final boolean dstIsRoot = (dstSnapshotInv.getParentUuid() == null); + + FlowChain chain = ... .done(new FlowDoneHandler(comp) { + public void handle(Map data) { + logReconcile(reconciler.reconcile(treeUuid, volumeUuid, AfterCommitSuccess)); + // dst 是根节点:updateDatabaseAfterCommit 会创建新 treeUuid 并迁移 VO + // 此时旧 treeUuid 下已无 VO,需对账新 treeUuid(reconciler 内部通过 volumeUuid 反查) + // 此处显式再调一次以护栏 + if (dstIsRoot) { + logReconcile(reconciler.reconcile(null, volumeUuid, AfterCommitSuccess)); + } + comp.success(); + } + }).error(new FlowErrorHandler(comp) { + public void handle(ErrorCode err, Map data) { + try { logReconcile(reconciler.reconcile(treeUuid, volumeUuid, AfterCommitFail)); } + catch (Throwable t) { logger.warn("reconcile failed", t); } + comp.fail(err); + } + }); + chain.start(); +} +``` + +`pull()` 与 `deleteVolumeSnapshotAndSyncVolumeSize()` 同结构改造。 + +**dst-is-root 双树对账**:commit 根节点时 SQLBatch 会 `persist(newTree)` 并把 src 子树迁到新 treeUuid(详见 `docs/snapshot-single-delete/05-commit-db-swap.md` §5.3)。 +若调用方持有的是旧 treeUuid,reconciler step 1 会扫到空集合 → 通过 volumeUuid 反查 latest VO 即可拿到新 treeUuid, +所以传 `null` treeUuid 是合法签名,由 reconciler 自动解析。 + +**成功路径触发策略**: +- Phase 1-2(灰度观察期):`done` 和 `error` 都触发,验证 reconciler 检测准确率 +- Phase 4(默认开启后):保留双触发。理由:L2 失败抛 `PostOpVerifyError` 已走 `error` 分支; + 但 SQLBatch 成功 + agent reply 路径也可能由于"agent 实际成功 reply 误标 fail"(场景 1 镜像)使 DB 与物理静默漂移, + 成功路径对账可在低概率下捕获这种漏报。每次成功操作的对账代价由 ISSUE 1 的锁外异步采样化解。 + +## 5.6 设计不变量 + +- **幂等收敛**:多次调用结果相同;不会把已一致状态修坏 +- **不抛异常给调用方**:reconciler 失败不让 commit/pull 的成功变失败 +- **同步运行在 chainSubmit 锁内**:reconciler 在 commit/pull 的 done/error 回调内同步执行,期间持 chainSubmit 锁;不引入额外锁、不做 CAS。串行性由外层 vm 队列 + chainSubmit 双重保证(见 §5.6.1) +- **SQLBatch 
单事务**:所有 DB 修补原子 + +### 5.6.1 串行性来源(不需要额外锁的依据) + +`APIDeleteVolumeSnapshotGroupMsg` → `VolumeSnapshotGroupBase.handleDelete` 通过 `overlaySend(DeleteVolumeSnapshotGroupInnerMsg)` 把请求排到 vm 队列;`completion.done()` 在 `overlaySend` 回调内调用,回调返回前下一个排队请求无法进入。叠加同一棵快照树的 `chainSubmit` 串行: + +``` +vm 队列 ──► chainSubmit ──► commit/pull flow ──► done/error ──► reconciler ──► comp.success/fail ──► chainSubmit 释放 ──► vm 队列释放 +``` + +因此 reconciler 跑完前不会有任何同卷 / 同组的新请求观察到中间状态。原计划的"段 2 释放 chainSubmit + CAS"为冗余设计,已废弃。reconciler 内部仍然按 §5.2 顺序"读 DB → 拉物理 → SQLBatch 修补"线性执行,全程持锁。 + +## 5.7 熔断与降级 + +| GlobalConfig | 默认 | 含义 | +|---|---|---| +| `volumeSnapshot.reconciler.enabled` | true | 总开关 | +| `volumeSnapshot.reconciler.timeout.sec` | 30 | 拉物理 chain 超时 | +| `volumeSnapshot.reconciler.maxFixActions` | 50 | 单次最多修补数(熔断)| + +## 5.8 可观测性 + +``` +[VolumeSnapshotTreeReconciler] tree=<treeUuid> trigger=AfterCommitSuccess + inconsistencies: I3(snap-a parentUuid mismatch), I2(orphan-file /xxx.qcow2) + applied: UPDATE_DB_PARENT_UUID(snap-a), SCHEDULE_GC_ORPHAN_FILE(/xxx.qcow2) + remaining: [] + duration_ms: 152 +``` diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md new file mode 100644 index 00000000000..43e471a9eed --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md @@ -0,0 +1,260 @@ +# 数据面:四层校验 + +## 6.1 共享工具模块 + +**新建** `zstacklib/zstacklib/utils/snapshot_recovery.py`: + +```python +class ChainSnapshot: + path: str + backing_file: str | None + virtual_size: int + actual_size: int + file_format: str + md5_header: str + +class ChainSnapshotSet: + operation: str + timestamp: float + snapshots: dict[str, ChainSnapshot] + def dump_to_file(self, path): ... + @staticmethod + def load_from_file(path) -> "ChainSnapshotSet": ... + +def take_chain_snapshot(paths: list[str]) -> ChainSnapshotSet: ...
+def verify_post_op(before: ChainSnapshotSet, expected: dict[str, str]) -> VerifyResult: ... +``` + +**扩展** `linux.py`: + +```python +def qcow2_get_backing_chain_strict(path) -> list[str]: + """读 qcow2 backing chain,遇错抛 QcowReadError""" + +def qemu_img_check(path, repair=None) -> CheckResult: + """qemu-img check,结构化结果""" +``` + +## 6.2 L1 — 操作前 chain 快照 + +**目的**:进程崩溃 / 宿主机断电后,重启能根据 dump 判断"上次进度" + +**dump 路径**:`/var/lib/zstack/snapshot-recovery/<volumeUuid>-<uuid>.json` + +接入示例(`vm_plugin.py` block_commit handler): + +```python +@kvmagent.replyerror +def block_commit(self, req): + cmd = jsonobject.loads(req[http.REQUEST_BODY]) + + # L1:dump pre-op chain + paths = [cmd.top, cmd.base] + (cmd.topChildrenInstallPathInDb or []) + pre_snap = take_chain_snapshot(paths) + pre_snap.operation = 'commit' + recovery_file = "/var/lib/zstack/snapshot-recovery/%s-%s.json" % ( + cmd.volumeUuid, uuidhelper.uuid()) + pre_snap.dump_to_file(recovery_file) + + try: + vm = get_vm_by_uuid(cmd.vmUuid) + vm.do_block_commit(cmd, cmd.volume) + for child in (cmd.topChildrenInstallPathInDb or []): + if linux.qcow2_get_backing_file(child) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, child) + + # L2:post-op verify + verify_post_commit(pre_snap, cmd.base) + + linux.rm_file_force(recovery_file) + return jsonobject.dumps(rsp) + except Exception: + raise # 失败保留 recovery 文件 +``` + +**生命周期**: +- 成功 → 删除 +- 失败 → 保留供下次操作 / 启动恢复消费 +- 超 24h → kvmagent 启动时清理 + +### 6.2.1 其它路径 L1 接入模板 + +**离线 commit**(`localstorage.py:859` / `nfs:.625` / `smp:.506` / `sb:.1285`): + +```python +# paths = top + base + 兄弟节点(commit 后兄弟需 rebase 到 base) +paths = [cmd.top, cmd.base] + (cmd.topChildrenInstallPathInDb or []) +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'offline-commit' +recovery_file = ".../%s-%s.json" % (cmd.volumeUuid, uuidhelper.uuid()) +pre_snap.dump_to_file(recovery_file) +try: + linux.qcow2_commit(cmd.top, cmd.base) + for child in (cmd.topChildrenInstallPathInDb or []): + if
linux.qcow2_get_backing_file(child) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, child) + verify_post_commit(pre_snap, cmd.base) + linux.rm_file_force(recovery_file) +except Exception: + raise +``` + +**离线 pull**(`localstorage.py:835` 等): + +```python +# paths = src + dst + dst.children(pull 后 dst.children 需 rebase 到 src) +paths = [cmd.srcPath, cmd.destPath] + (cmd.dstChildrenInstallPathInDb or []) +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'offline-pull' +... +linux.qcow2_commit(cmd.destPath, cmd.srcPath) # pull = reverse commit +verify_post_pull(cmd.srcPath, expected_backing=pre_snap.snapshots[cmd.srcPath].backing_file, + full_rebase=False) +``` + +**fullRebase**(`create_template_with_task_daemon` + mv,详见 `docs/snapshot-single-delete/12-fullrebase-and-cleanup.md`): + +```python +# paths = dst + dst 整条 backing chain(fullRebase 会全部展平进 tmp) +chain = linux.qcow2_get_backing_chain_strict(cmd.destPath) +paths = [cmd.destPath] + chain +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'fullRebase' +pre_snap.metadata['tmp_path'] = cmd.destPath + '.tmp' # 登记临时文件路径 +recovery_file = ... +pre_snap.dump_to_file(recovery_file) +try: + create_template_with_task_daemon(cmd.destPath, cmd.destPath + '.tmp') + linux.mv(cmd.destPath + '.tmp', cmd.destPath) + verify_post_pull(cmd.destPath, expected_backing=None, full_rebase=True) + linux.rm_file_force(recovery_file) +except Exception: + # 若 tmp 残留,启动恢复扫到 metadata.tmp_path 即可清理 + raise +``` + +**SharedBlock**:`paths` 用 LV 设备路径(`/dev/<vg>/<lv>`),`take_chain_snapshot` 内部对 LV 路径做 `qemu-img info` 即可,无需特殊分支。 + +## 6.3 L2 — 操作后自检 + +```python +def verify_post_commit(pre, base): + actual_backing = linux.qcow2_get_backing_chain_strict(base)[0:1] + expected_backing = pre.snapshots[base].backing_file + if actual_backing and actual_backing[0] != expected_backing: + raise PostOpVerifyError(...)
+ # size 检查降级为 warn:commit src 可能是零差量、qcow2 压缩、稀疏文件,不能强制断言增大 + # 阈值与容差由 [snapshot_recovery] size_check_threshold_bytes / size_check_tolerance_ratio 配置(见 6.7) + if pre.snapshots[base].actual_size > config.size_check_threshold_bytes: + new_size = linux.get_local_file_disk_usage(base) + if new_size < pre.snapshots[base].actual_size * config.size_check_tolerance_ratio: + logger.warn("base %s disk usage shrank from %d to %d after commit, " + "possibly compression/sparse, verify backing OK" % + (base, pre.snapshots[base].actual_size, new_size)) + +def verify_post_rebase(target, expected_backing): + actual = linux.qcow2_get_backing_file(target) + if actual != expected_backing: + raise PostOpVerifyError(...) + +def verify_post_pull(dst, expected_backing, full_rebase): + actual = linux.qcow2_get_backing_file(dst) + if full_rebase and actual: + raise PostOpVerifyError(...) + if not full_rebase and actual != expected_backing: + raise PostOpVerifyError(...) +``` + +接入点: + +| 操作 | 文件位置 | 验证 | +|---|---|---| +| 在线 blockCommit | `vm_plugin.py:9845` 主操作完成 | verify_post_commit | +| 在线兄弟 rebase | `vm_plugin.py:9857` 循环内 | verify_post_rebase | +| 离线 commit | `localstorage.py:859` / `nfs:.625` / `smp:.506` / `sb:.1285` | commit + rebase | +| 离线 pull | `localstorage.py:835` 等 | verify_post_pull | +| fullRebase mv 后 | 同上 | verify_post_pull(full_rebase=True) | + +失败抛 `PostOpVerifyError`(继承 `kvmagent.KvmError`)→ HTTP 500 → 控制面 FlowChain error → reconciler 介入 + +## 6.4 L3 — qemu-img check(异常路径) + +```python +def qemu_img_check(path, repair=None): + args = ['check', '-f', 'qcow2'] + if repair: args += ['-r', repair] + args.append(path) + out = shell.call(qemu_img.cmd(args)) + return parse_check_output(out) +``` + +触发: +1. L2 失败前先跑一次区分"qemu-img 静默错误" vs "文件已损坏" +2. 启动恢复诊断时 +3. 
控制面 `CheckSnapshotIntegrityMsg` 显式触发 + +**仅检测,不自动修复**。`-r` 修复仅在控制面 API 显式批准时使用。 + +## 6.5 L4 — blockJob 状态机加固 + +```python +class BlockJobState(enum.Enum): + NOT_STARTED, RUNNING, READY, COMPLETED, PIVOTED, CANCELLED, FAILED + +class BlockJobMonitor: + def __init__(self, domain, disk_name, active_commit, timeout_sec): ... + def poll(self) -> BlockJobState: ... + def wait_until(self, target_states: set, timeout: int) -> BlockJobState: ... +``` + +**active commit 状态机**: + +``` +NOT_STARTED → RUNNING ──(timeout)──► FAILED → raise + │ ready event + ▼ + READY ──(timeout)──► FAILED → blockJobAbort(no pivot) → CANCELLED → raise + │ blockJobAbort(PIVOT) + ▼ + PIVOTED ──verify domain XML source==base──► COMPLETED + │ no + ▼ FAILED → raise +``` + +**改造点**: +1. 用 `wait_until({READY})` 替换"轮询 job 不在" +2. pivot 前必须确认 READY +3. 任何超时显式 CANCELLED +4. 终态通过读 domain XML disk source 二次确认 + +## 6.6 启动恢复 + +```python +def on_kvmagent_startup(): + for f in glob('/var/lib/zstack/snapshot-recovery/*.json'): + snap = ChainSnapshotSet.load_from_file(f) + if time.time() - snap.timestamp > 86400: + linux.rm_file_force(f); continue + for path in snap.snapshots: + if not os.path.exists(path): continue + result = qemu_img_check(path) + if result.image_corrupted: + logger.error("recovery: corrupted file %s" % path) + write_diagnostic_report(snap, f.replace('.json', '.report.json')) +``` + +**只诊断不改文件**。控制面通过 `GET /snapshot-recovery/report` 端点读取诊断报告。 + +## 6.7 配置 + +```ini +[snapshot_recovery] +enable_l1_dump = true +enable_l2_verify = true +enable_l3_check_on_error = true +recovery_dir = /var/lib/zstack/snapshot-recovery +recovery_max_age_hours = 24 +blockjob_timeout_sec = 3600 +size_check_threshold_bytes = 104857600 # 100 MiB;base.actual_size 大于此值才做 size warn +size_check_tolerance_ratio = 0.9 # 允许新尺寸不低于旧尺寸 * 此比值,否则记 warn +``` diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md 
b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md new file mode 100644 index 00000000000..c4e159fb14f --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md @@ -0,0 +1,76 @@ +# FlowChain 改造(混合恢复策略) + +## 7.1 可逆性分类 + +| Flow | 可逆性 | 失败策略 | +|---|---|---| +| AllocatePrimaryStorageSpaceMsg | ✅ 可逆 | FlowChain 自带 rollback | +| CommitOnHypervisor/PrimaryStorage | ❌ 不可逆 | 不回滚,reconciler 前进式补全 | +| updateDatabaseAfterCommit | ✅ 事务回滚 | SQLBatch 自然回滚 + reconciler 二次对账 | +| 兄弟节点 rebase | ❌ 不可逆 | agent L2 抛错 → reconciler 修 DB parentUuid | + +## 7.2 改造模板 + +见 `01-control-plane-reconciler.md` 5.5 节代码示例:error 回调先 reconcile 再 fail,原错误向上抛但 DB 已尽力收敛。 + +## 7.3 子流程失败处理 + +- **doCommitOnHypervisorOrPrimaryStorageFlow**:agent 抛错 → flow fail → reconciler 反查物理实际状态 → 修 DB;agent 实际成功但回复丢失的场景,reconciler 把 DB 推到"成功后状态",但**仍返回原错误**给用户 +- **updateDatabaseAfterCommitFlow**:SQLBatch 失败 → 物理已变 DB 未变 → reconciler 反推应有 DB 状态 → 重新 SQL;二次失败进 remaining +- **兄弟节点 rebase**:agent 单个 child 失败立即抛 → reconciler 比对每个 child backing 与 DB parentUuid 逐个修 + +## 7.4 异常场景验证(手算) + +**场景 1:在线 active commit pivot 后 agent 进程死** + +1. agent L1 dump 已写盘 +2. 控制面 commit flow 超时 → error 回调 +3. reconciler.reconcile(AfterCommitFail) + - 拉物理 chain:base 已合并完成、top 已删 + - 检 I4(installPath 不一致)→ UPDATE_DB_INSTALL_PATH + - 检 I3(src.parentUuid 仍指 dst)→ UPDATE_DB_PARENT_UUID +4. 用户收到原错误(commit timeout) +5. 重试删除 → DB 已收敛 → 走快速路径直接 deleteVolumeSnapshotAndSyncVolumeSize + +✅ 闭环 + +**场景 2:DB 翻转 SQL 失败** + +1. 物理已 commit 完成,updateDatabaseAfterCommitFlow 失败 +2. reconciler 反推修 DB + +✅ 闭环 + +**场景 3:兄弟节点 rebase 中途失败(5 个兄弟 rebase 完 2 个失败)** + +1. agent L2 在第 3 个兄弟报错 → flow fail +2. 
reconciler 读所有兄弟 backing: + - 已 rebase 的 2 个:`physical.backing` 已变(指向 base),DB `parentUuid` 仍指 dst(被删 VO) + → I3b 子情形 (a) 触发:physical.backing 反查到 base.uuid → UPDATE_DB_PARENT_UUID = base.uuid + - 未 rebase 的 3 个:`physical.backing` 仍指 dst.installPath(dst VO 已删,反查不到 alive VO) + → I3b 子情形 (c) 触发:不动 DB,记 remaining,等下次重试推动物理 rebase + - dst 自身:物理仍存在 + DB VO 已被 stepDelete 删 → I2 触发,SCHEDULE_GC_ORPHAN_FILE + - 注:因 I2 评估顺序最末(见 `01` §5.4),不会误删尚被未 rebase 兄弟引用的 dst.installPath; + SCHEDULE_GC 内部会再检物理是否仍被引用,若是则放弃删除 +3. 重试删除请求 → reconciler 第二轮:未 rebase 的 3 个仍是 I3b(c),agent 重做 rebase 后变 (a);最后 dst 失去引用,GC 才真清 + +✅ 闭环(依赖 I3b 三子情形 + I2 末位评估,详见 `01-control-plane-reconciler.md` §5.3 / §5.4) + +**场景 4:reconciler 自身 SQL 失败** + +1. remaining[] 记录 + warn 日志 +2. 下次任何对该树操作再次触发对账 +3. 持续不一致 → 运维介入 + +✅ 至少不越修越坏 + +## 7.5 并发与锁 + +- reconciler 在 chainSubmit 锁内同步执行(commit/pull 的 done/error 仍持锁,期间不释放) +- **不引入额外锁、不做 CAS**:串行性由外层双重保护—— + - vm 队列:`APIDeleteVolumeSnapshotGroupMsg` 通过 `overlaySend` 排到 vm 队列,`completion.done()` 在 reconciler 跑完后才执行,下一个请求才能出队(见 `01` §5.6.1) + - chainSubmit:同一棵快照树的所有 commit/pull 已串行 +- 跨树并发:reconciler 只动当前 treeUuid VO,无冲突 +- GC 异步框架自身去重,与新业务并发无影响 + +**代价权衡**:reconciler 期间持 chainSubmit + vm 队列锁,意味着同卷 / 同组下一个请求最多等待一次 reconcile(含 `GetVolumeBackingChainFromPrimaryStorageMsg` 网络往返,超时由 `volumeSnapshot.reconciler.timeout.sec=30` 兜底)。但用户调用本来就是串行排队,等待落在原本要排队的请求上,没有放大延迟。 diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md new file mode 100644 index 00000000000..1d8c70ca69d --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md @@ -0,0 +1,63 @@ +# 测试策略 + +## 8.1 测试金字塔 + +``` + ┌──────────────────────┐ + │ E2E (~5 cases) │ + ├──────────────────────┤ + │ Integration (~30) │ + ├──────────────────────┤ + │ Unit (~100) │ + └──────────────────────┘ +``` + +## 8.2 单元测试(控制面) + 
+`storage/src/test/.../VolumeSnapshotTreeReconcilerTest.java`: + +- test_I1_physical_missing_db_present +- test_I2_orphan_file +- test_I3_parent_uuid_mismatch +- test_I4_install_path_swap +- test_I5_latest_flag_wrong +- test_idempotent_double_call +- test_max_fix_actions_circuit_breaker +- test_physical_unreachable +- test_sql_batch_fail +- **test_no_business_action_dispatched**(不变量护栏:spy CloudBus 验证从未发 Commit/Pull/Delete*Msg) + +## 8.3 单元测试(数据面) + +`kvmagent/test/test_snapshot_recovery.py`: + +- test_chain_snapshot_dump_load +- test_take_chain_snapshot_with_missing_file +- test_verify_post_commit_backing_unchanged +- test_verify_post_commit_size_shrank +- test_verify_post_rebase_mismatch +- test_verify_post_pull_full_rebase +- test_qemu_img_check_corrupted +- test_blockjob_state_machine_pivot_path +- test_blockjob_timeout_cancellation +- test_recovery_file_lifecycle + +## 8.4 集成测试(ZSTACK_SIMULATOR) + +- TestSingleSnapshotDeleteCommitSuccess +- TestSingleSnapshotDeleteCommitFailReconcile +- TestSingleSnapshotDeletePullForkChain +- TestSingleSnapshotDeleteSqlBatchFail +- TestSingleSnapshotDeleteRetryIdempotent +- TestSingleSnapshotDeleteOrphanGc +- TestSingleSnapshotDeleteSiblingDbCorrection + +## 8.5 E2E 测试 + +| 编号 | 步骤 | +|---|---| +| E1 | 5 层链 → 删中间快照(在线 commit)→ 验证文件链与 DB | +| E2 | 同 E1 + 中途 `kill -9 kvmagent` → 重启 → 验证 reconcile + 重试 | +| E3 | 分叉链(2 子节点)→ 删根节点 → 验证两子各自 backing 与 DB | +| E4 | 离线 pull 大文件(10 GB qcow2 fullRebase)→ 中途断电 → 启动恢复诊断 | +| E5 | 快照组(3 卷),其中 1 卷 reconcile 失败 → 验证其它卷不受影响 | diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md new file mode 100644 index 00000000000..11388f13b74 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md @@ -0,0 +1,55 @@ +# 上线计划 + +## 9.1 灰度 + +``` +Phase 1 (周 1):默认 false 上线 + - 仅日志旁路:reconcile 跑但不执行 FixAction + - 验证检测准确率 + +Phase 2 
(周 2):测试环境开启 + - 全测试集群 enabled=true,跑 E2E + 压力 + +Phase 3 (周 3-4):开发/UAT 集群灰度 + - 一台真实业务集群打开,观察一周 + +Phase 4 (周 5+):默认开启 + - release notes,保留 GlobalConfig 关闭通道 +``` + +## 9.2 监控告警 + +| 日志 grep | 阈值 | +|---|---| +| `[VolumeSnapshotTreeReconciler] applied:` | > 10/h | +| `[VolumeSnapshotTreeReconciler] remaining:` | > 0 | +| `[VolumeSnapshotTreeReconciler] circuit-breaker triggered` | 立即 | +| `PostOpVerifyError` | > 5/h | +| `recovery: corrupted file` | 立即 | + +## 9.3 文档 + +| 产出 | 位置 | +|---|---| +| 设计 spec | 本目录 | +| 运维手册 | `docs/snapshot-single-delete/15-operation-runbook.md` | +| Reconciler 排错指南 | 同上附录 | +| GlobalConfig | release notes | + +## 9.4 回滚预案 + +1. **快速止血**:`updateGlobalConfig volumeSnapshot reconciler.enabled false` +2. **代码回滚**:reconciler 调用全 try-catch,关闭等价于现状 +3. **数据修复**:reconciler 只动 DB 不动物理,最坏 SQL 反向恢复 + +agent 侧 L1/L2/L4 经 `kvmagent.conf` 开关独立回滚。 + +## 9.5 风险登记 + +| 风险 | 等级 | 缓解 | +|---|---|---| +| reconciler 误判物理状态错改 DB | 高 | 单元测试 + 灰度日志旁路 + circuit-breaker | +| L1 dump 文件累积撑爆磁盘 | 中 | 24h 自动清理 + 磁盘监控 | +| L4 状态机改造引入回归 | 中 | 单元测试 + fallback 开关 | +| 对账 SQL 与并发新建快照冲突 | 低 | chainSubmit 已串行 | +| GCJob 入队过多 | 低 | 现有框架去重 | diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md new file mode 100644 index 00000000000..1b79fe9a045 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md @@ -0,0 +1,23 @@ +# 不变量护栏总结 / 范围之外 + +## 10. 不变量护栏总结 + +设计的核心护栏(任意一项被破坏即视为设计失败): + +1. reconciler 永远不发 Commit/Pull/Delete*Msg(单元测试强制) +2. reconciler 不抛异常给调用方 +3. reconciler 多次调用结果一致(幂等收敛) +4. agent L2 失败必抛 PostOpVerifyError,不静默 +5. L1 dump 文件成功必删,失败必留 +6. FlowChain error 路径必先 reconcile 后 fail +7. maxFixActions 熔断保护(默认 50) +8. 所有 GlobalConfig / kvmagent.conf 开关可独立关闭 + +## 11. 
范围之外 + +- Ceph RBD:本设计不涉及(普通 RBD 快照不支持 commit/pull,超出 single 删除范围) +- **StorageSnapshot / Memory 快照 / CDP**:在 `VolumeSnapshotTreeBase.java:836` 提前 return,绕过 commit/pull 路径,由 `deleteVolumeSnapshotAndSyncVolumeSize` 直接处理,无需加固(详见 `docs/snapshot-single-delete/13-premium-and-cdp.md` §13.2) +- 链克隆 + single 删除并存(VolumeSnapshotReferenceVO TODO):独立议题 +- 全量定时 GC:本设计不引入;只做"操作后局部对账" +- VmState 扩展(如 Migrating):独立议题 +- 快照组并发度可配:独立议题 From 9b05c73dadb8440abcf2b4e18ca486d1f1c8628b Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Thu, 14 May 2026 18:51:37 +0800 Subject: [PATCH 2/5] [storage]: decouple alive-chain membership from vmState in VolumeTree VolumeTree previously conflated two independent concepts in isOnline / resolveDirection: (a) is the snapshot on the volume's live backing chain (a structural property of the tree) (b) is the VM currently routed through the hypervisor (Running/Paused) Two consequences: 1. In VolumeSnapshotTreeBase.stepDelete, the multi-children "defer the alive-chain child to the final round" guard used isOnline, which returned false for every child when the VM was Stopped. The guard silently disappeared and child selection fell back to children.get(0), which could land on the volume's backing chain and corrupt the live chain on rebase failure. 2. resolveDirection's shouldUseCommitStrategy was likewise gated on vmState, so direction=Auto + Stopped degraded to Pull (writing N copies of the (target - parent) delta into each child file) instead of a single-file Commit. Split into two predicates: - isOnAliveChain(snapshotUuid): pure tree-structure query, used by stepDelete to identify the alive child regardless of vmState. - isHypervisorOperation(vmState): pure run-state predicate, used to pick libvirt vs primary-storage agent. isOnline is rewritten as the compound (treeIsCurrent && isHypervisorOperation(vmState) && both endpoints on alive chain), preserving its existing semantics for current callers. 
resolveDirection now derives shouldUseCommitStrategy purely from tree structure, so Auto + Stopped + alive-chain target now picks Commit. stepDelete renames onlineChild to aliveChild and queries isOnAliveChain, so the alive-child deferral protection now applies to Stopped VMs as well. Resolves: ZSV-10538 Change-Id: I6e63786d6d6b616f7a766b6c7463617572786969 --- .../snapshot/VolumeSnapshotTreeBase.java | 9 ++-- .../zstack/storage/snapshot/VolumeTree.java | 48 ++++++++++++++++--- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java index beb8b044d1a..5781bc56308 100755 --- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java @@ -884,8 +884,8 @@ private void stepDelete(Completion completion) { return; } - VolumeTree.VolumeSnapshotLeaf onlineChild = children.stream() - .filter(child -> volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState)) + VolumeTree.VolumeSnapshotLeaf aliveChild = children.stream() + .filter(child -> volumeTree.isOnAliveChain(child.getUuid())) .findFirst().orElse(null); Completion comp = new Completion(completion) { @@ -910,7 +910,10 @@ public void fail(ErrorCode errorCode) { pull(child, volumeTree, online, comp); } } else { - if (onlineChild != null && Objects.equals(child.getUuid(), onlineChild.getUuid())) { + // Multi-children: defer the alive-chain child to the final round so that any in-flight failure + // on a non-alive sibling does not corrupt the volume's live backing chain. This guard now + // applies to Stopped VMs as well, because isOnAliveChain is vmState-independent. 
+ if (aliveChild != null && Objects.equals(child.getUuid(), aliveChild.getUuid())) { child = children.get(1); } boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java index ea669fd6202..3f23b637408 100644 --- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java @@ -361,12 +361,38 @@ public List getAliveChainSnapshotUuids() { return aliveChain.stream().map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); } + /** + * Pure alive-chain membership query, VM-state-independent. + * A snapshot is "on the alive chain" iff this tree is the current tree of the volume + * AND the snapshot is one of the ancestors of the live volume node. + *

+ * This is intentionally decoupled from {@link #isHypervisorOperation(VmInstanceState)}; + * the two were previously conflated in {@link #isOnline(boolean, String, String, VmInstanceState)}, + * causing the multi-children "avoid alive child" protection in + * {@code VolumeSnapshotTreeBase.stepDelete} to silently fail when the VM is Stopped. + */ + public boolean isOnAliveChain(String snapshotUuid) { + return current && getAliveChainSnapshotUuids().contains(snapshotUuid); + } + + /** + * Whether physical snapshot operations should be routed through the hypervisor (libvirt blockCommit/blockPull) + * instead of the primary storage agent (qemu-img). Purely a function of VM run-state. + */ + public static boolean isHypervisorOperation(VmInstanceState vmState) { + return vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused; + } + public DeleteVolumeSnapshotDirection resolveDirection(String targetSnapshotUuid, String childSnapshotUuid, String initialDirection, boolean targetSnapshotIsLatest, VmInstanceState vmState) { - boolean online = (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) - && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) && getAliveChainSnapshotUuids().contains(childSnapshotUuid); - - boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + // shouldUseCommitStrategy reflects "would the commit path move data along the live chain", which is purely + // a property of the tree structure (vol's ancestor chain) and should not depend on whether the VM is currently + // running. Previously this was conjoined with vmState ∈ {Running, Paused}, which caused Stopped + Auto to + // silently degrade to Pull (writing N copies of (target - parent) delta to each child file) even when the + // commit path would have produced a single merged file. 
+ boolean targetOnAliveChain = isOnAliveChain(targetSnapshotUuid); + boolean childOnAliveChain = isOnAliveChain(childSnapshotUuid); + boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && targetOnAliveChain && childOnAliveChain; if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Pull.toString()) && shouldUseCommitStrategy) { throw new IllegalArgumentException("the snapshot will be deleted by block 'commit', but the direction is 'pull', " + @@ -386,9 +412,19 @@ public DeleteVolumeSnapshotDirection resolveDirection(String targetSnapshotUuid, return DeleteVolumeSnapshotDirection.fromString(initialDirection); } + /** + * Compound predicate: target and child are both on the alive chain AND the VM is hypervisor-managed (Running or Paused). + * Used to decide whether to route through the hypervisor (libvirt) path vs the primary storage agent path. + *

+ * Equivalent to {@code treeIsCurrent && isHypervisorOperation(vmState)} combined with both UUIDs being + * contained in {@link #getAliveChainSnapshotUuids()}. Tree currency is taken from the caller-supplied + * {@code treeIsCurrent} argument here, not from the {@code current} field consulted by {@code isOnAliveChain}. + */ public boolean isOnline(boolean treeIsCurrent, String targetSnapshotUuid, String childSnapshotUuid, VmInstanceState vmState) { - return treeIsCurrent && (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) - && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) && getAliveChainSnapshotUuids().contains(childSnapshotUuid); + return treeIsCurrent + && isHypervisorOperation(vmState) + && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) + && getAliveChainSnapshotUuids().contains(childSnapshotUuid); } // TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, From b93a0852c6f7bac86d2e7d7a5d254519674bf118 Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Thu, 14 May 2026 18:22:47 +0800 Subject: [PATCH 3/5] [docs]: snapshot single-delete scenarios, bugs, and proposals Add the snapshot-single-delete documentation set under docs/: scenarios/ 03-local-stopped-delete-mid-with-3-children.md - LocalStorage, VM Stopped, mid-tree delete with 3 children; full offline pull walk-through. 04-deleteSingleFlows-online-offline-decision.md - decision matrix for deleteSingleFlows / stepDelete / resolveDirection / isOnline, mapping vmState x direction x scope to agent entry points. 05-local-stopped-direction-commit-actual.md - real ZSV environment capture for VM Stopped + scope=single + direction=Commit on a 5-leaf tree; corrects 3 source-reading mistakes. _query_tree.py - helper for inspecting snapshot tree state. bugs.md - consolidated bug ledger for the single-delete path (Bug 0..7 plus newly added Bug 8/9). proposals/ scope-direction-api-redesign.md - API parameter cleanup for APIDeleteVolumeSnapshot[Group]Msg (scope/direction defaults, enum normalize, internal msg defaults, dead-code warn). 
group-disband-symmetry-and-integrity-check.md - design behind the symmetric-disband (A) and VM integrity-check (C) fixes. Resolves: ZSV-10538 Change-Id: I7468736c7763656d62666d727971716e6d646375 --- docs/snapshot-single-delete/bugs.md | 390 ++++++++++++++++++ ...up-disband-symmetry-and-integrity-check.md | 295 +++++++++++++ .../proposals/scope-direction-api-redesign.md | 283 +++++++++++++ ...ocal-stopped-delete-mid-with-3-children.md | 304 ++++++++++++++ ...leteSingleFlows-online-offline-decision.md | 349 ++++++++++++++++ ...5-local-stopped-direction-commit-actual.md | 295 +++++++++++++ .../scenarios/_query_tree.py | 93 +++++ 7 files changed, 2009 insertions(+) create mode 100644 docs/snapshot-single-delete/bugs.md create mode 100644 docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md create mode 100644 docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md create mode 100644 docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md create mode 100644 docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md create mode 100644 docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md create mode 100644 docs/snapshot-single-delete/scenarios/_query_tree.py diff --git a/docs/snapshot-single-delete/bugs.md b/docs/snapshot-single-delete/bugs.md new file mode 100644 index 00000000000..c2ca518beb5 --- /dev/null +++ b/docs/snapshot-single-delete/bugs.md @@ -0,0 +1,390 @@ +# 单盘快照删除(scope=single) — 当前实现 Bug 清单 + +> 5.5.6 基线,基于场景 02 / 03 / 04 / 05 的源码梳理 + ZSV 真实环境实测整理。 +> 排序:先按"根本性 vs 派生",再按严重度。 +> 加固设计应优先覆盖 🔴 项;🟡 项作为语义修正;🟢 项作为代码质量改进。 + +## ✅ 修复进度(最新) + +| Bug | 状态 | 修复方式 | +|---|---|---| +| Bug 0 | ✅ 已修复 | `VolumeTree.isOnline` 拆为 `isOnAliveChain`(VM 状态无关)+ `isHypervisorOperation`;`stepDelete` 改用 `isOnAliveChain` 选 `aliveChild`,保护对 Running/Stopped 都生效 | +| Bug 1 | ✅ 已修复(顺带) | `resolveDirection` 中 `shouldUseCommitStrategy` 解耦 vmState,Stopped + Auto 
现在按结构走 Commit | +| Bug 3 | ✅ 失去影响 | `aliveChild` 显式识别后,`children.get(0)` 顺序不再影响保护 | +| Bug 7 | ✅ 失去影响 | 同上 | +| Bug 5 | 🟢 降级(中→低) | 互换路径变可预测,但仍建议显式记录 "要删的物理路径" | +| Bug 2 | ⚠ 待修复 | `direction=null → Commit` 与"不传 = Auto"惯例不符(1 行可改) | +| Bug 4 | ⚠ 待修复(P0) | 物理推进 + DB 未推进的幽灵态,需 reconciler + 意图日志 | +| Bug 6 | ⚠ 待修复 | 删除期间 VM 状态锁 | +| **Bug 8** | ⚠ 待修复(P0) | API `scope="chain"` 默认与 UI 直觉相反;`auto` 取值文档承诺但未实现;含一段死代码 warn | +| Bug 9 | ⚠ 待修复 | 内部 `DeleteVolumeSnapshotMsg.direction` 无默认 `auto`,cascade 路径退化为 Commit | + +--- + +## Bug 0(根本性 / 🔴 高):`isOnline` 把"alive chain 归属"与"是否走 hypervisor"耦合在同一布尔值 — ✅ **已修复** + +> **修复**:拆 `isOnline` 为 `isOnAliveChain(uuid)` + 静态 `isHypervisorOperation(vmState)`;`stepDelete` 多子节点段改用 `isOnAliveChain` 识别 `aliveChild`,对 Running/Stopped 都生效。原 `isOnline` 签名保留,内部组合两个新方法,行为等价于"既在 alive chain 又走 hypervisor"。`resolveDirection` 中 `shouldUseCommitStrategy` 同步解耦 vmState(顺带修 Bug 1)。 + +### 现状 + +```java +// VolumeTree.java 行 389-392 +public boolean isOnline(boolean current, target, child, VmInstanceState vmState) { + return current + && (vmState == Running || vmState == Paused) // ← 把 vmState 当作 aliveChain 判定 + && aliveChain.contains(target) + && aliveChain.contains(child); +} +``` + +### 问题 + +"alive chain"的真正含义是 **vol 当前依赖的快照链路**(vol.installPath → parentUuid 反向递归),这条链路在 VM Stopped 时**仍然真实存在**,仅仅是 VM 没在跑而已。重启时 libvirt 会照样按这条链拉起。 + +当前代码把两个语义合并: +- 通道选择("用 libvirt 还是 qemu-img"):**由 vmState 决定** +- 链路归属("哪个 child 是 vol 所在的那条链,应该最后处理"):**由 vol.installPath 链决定,与 vmState 无关** + +把这两件事压在一个 `isOnline` 返回值里 → Stopped 时 `isOnline` 永远返回 false → `stepDelete` 多子节点段的"避开 alive 子节点"保护**完全失效**。 + +### 影响范围 + +1. **直接派生** Bug 3(顺序未定义):Stopped 时 `onlineChild = null`,换位 if 进不去,`child = children.get(0)` 由底层 collection 顺序决定 +2. **放大** Bug 4(幽灵态)的爆炸半径:若 vol 所在链被任意一轮选中,半完成态会直接波及 VM 启动链路 +3. 
加固设计 reconciler 失去"vol 链是最后被动"这个不变式 + +### 实测证据 + +场景 05(VM Stopped + Commit)的 children=[3,4,5],实测 `children.get(0)` 返回 **4**,不是 distance 最小的 3,也不是 vol 所在的 5。本次"5 最后处理"是 collection 顺序的运气,不是代码语义保证。 + +### 修复方向 + +```java +// 拆开两个独立判定 +public boolean isOnAliveChain(String snapshotUuid) { + return aliveChain.contains(snapshotUuid); // 与 vmState 无关 +} + +public boolean isHypervisorOperation(VmInstanceState vmState) { + return vmState == Running || vmState == Paused; +} + +// stepDelete 改写 +SnapshotInventory aliveChild = children.firstMatch(c -> volumeTree.isOnAliveChain(c.getUuid())); +SnapshotInventory child = children.get(0); +if (aliveChild != null && child == aliveChild) { + child = children.get(1); // 对 Running / Stopped 都生效 +} +boolean online = volumeTree.isOnAliveChain(child) && volumeTree.isHypervisorOperation(vmState); +``` + +效果:Stopped 时 vol 所在 child(如 5)被识别为 aliveChild → 强制最后处理 → 失败半径只到旁支。 + +--- + +## Bug 1(语义错误 / 🟡 中):`direction=Auto` 在 Stopped 下退化为 Pull — ✅ **已修复(随 Bug 0)** + +> **修复**:`resolveDirection` 中 `shouldUseCommitStrategy = current && !targetSnapshotIsLatest && isOnAliveChain(target) && isOnAliveChain(child)`,不再要求 VM Running/Paused。Stopped + Auto + 待删/child 都在 vol 链上 → 返回 Commit,磁盘占用回归单份合并文件。 + +### 现状 + +`VolumeTree.resolveDirection`(行 364-387): + +```java +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; +if ("Auto".equals(initial)) { + return shouldUseCommitStrategy ? 
Commit : Pull; +} +``` + +VM Stopped → `online=false` → `shouldUseCommitStrategy=false` → Auto 返回 **Pull**。 + +### 问题 + +"Auto"的用户预期是"按最优策略走",但 Stopped 下 Auto = Pull 的代价: + +| 路径 | 物理操作 | 磁盘占用 | +|---|---|---| +| Stopped + Commit | `offline_commit_snapshot` 单次 qcow2_commit | 单份合并文件 | +| **Stopped + Auto/Pull** | N 次 `offline_merge_snapshot`(每 child 一次 qcow2_rebase) | **N 份 (target - parent) 差量副本** | + +N = currentRoot 的 children 数。N=3 时磁盘占用接近 3 倍。 + +### 修复方向 + +`resolveDirection` 里 Auto 在离线场景下也允许返回 Commit。可选规则: +- 简单:`Auto + Stopped + !targetIsLatest` 总是返回 Commit +- 复杂:根据 children 数量 / 差量大小做容量评估 + +### 影响 + +不影响正确性,影响**容量预期**。生产环境如果客户期望"删快照能释放空间",Auto 路径反而把空间放大。 + +--- + +## Bug 2(API 语义不一致 / 🟡 中):`direction=null` 当作 Commit,不是当作 Auto + +### 现状 + +`VolumeTree.resolveDirection` 第一行: + +```java +if (initial == null) { + return DeleteVolumeSnapshotDirection.Commit; +} +``` + +### 问题 + +- 大部分 ZStack API "字段不传 = 默认 = Auto" 是惯例 +- 这里 "字段不传 = 强制 Commit" —— 行为与 `direction=Auto` 显式传入完全不同(参考 Bug 1) + +后果: +- 前端调用方调试时不传 direction,意外触发离线 commit(DB 互换、VO 直接 DELETE) +- 自动化脚本若按 "省略 = 默认" 风格写,行为不可预测 + +### 修复方向 + +任一即可: +- API 入口校验 `direction != null`,否则报错 +- `resolveDirection` 里 `null` 当 Auto 处理(再结合 Bug 1 修复) + +--- + +## Bug 3(派生 / 🟢 低):`children.get(0)` 顺序未定义 — ✅ **失去影响(随 Bug 0)** + +> **修复后**:不管 collection 返回 [3,4,5] / [4,3,5] / [5,3,4],`aliveChild=5` 都会被 `isOnAliveChain` 显式识别并放最后处理。顺序假设不再是行为前提。 + +### 现状 + +`stepDelete` 多子节点段直接 `children.get(0)`,children 来自 `tree.snapshotLeaf(currentRoot).children` 的 Collection。 + +### 实测 + +场景 05 树 [3,4,5] 取出顺序为 [4,3,5],非按 distance 也非按 createDate。 + +### 问题 + +Stopped 时 Bug 0 让换位保护失效 → 任意顺序都可能选中 vol 所在 child。 + +### 与 Bug 0 关系 + +**Bug 0 是因,Bug 3 是果**。修了 Bug 0(按 alive chain 归属避开 vol 链),children 顺序就不重要了 —— 不管返回 [3,4,5] / [4,3,5] / [5,3,4],aliveChild=5 都会被识别并放到最后。 + +如果只想做"小步修复",可单独排序 children(按 distance 或按"是否在 vol 链上"),但根治还是修 Bug 0。 + +--- + +## Bug 4(崩溃半完成态 / 🔴 高):轮 3 `offline_commit` 物理成功 + DB SQLBatch 失败 → 幽灵态 + +### 触发 + +Stopped + 
Commit + scope=single + 待删节点有子节点(即 commit 路径生效): +- agent `qcow2_commit(top=5, base=2)` + `qcow2_rebase_no_check(vol)` 完成(物理已合并、vol backing 已切) +- Java 端 `updateDatabaseAfterCommit` 的 SQLBatch 失败(DB 死锁 / 连接断 / JVM crash) + +### 物理 vs DB 不一致 + +``` +物理: + vol.qcow2 头部 backing = 2.qcow2(aa72…e70c) + 2.qcow2 含 5+2 合并数据 + 5.qcow2 已被抽空但文件未删(轮 4 还没执行) + +DB(仍是互换前状态): + VO_2.installPath = 2.qcow2 → 仍存在 + VO_5.installPath = 5.qcow2 → 指向已被抽空的文件 + vol.installPath = 5.qcow2(DB 字段一直不变) +``` + +### 后果 + +1. **VM 启动**:libvirt 读 vol.qcow2 头部找 backing → 找到 2.qcow2 → 能启动 → 但 DB 视图错乱 +2. **后续删除请求**:若用户再次发起删 VO_2 / VO_5,stepDelete 会按 DB 推演,与物理状态对不上 +3. **reconciler 误判**:看到 VO_5.installPath=5.qcow2 文件被抽空,可能误判为"5 损坏需要修复",触发重建覆盖已合并数据 + +### 修复方向 + +- 物理操作前写"操作意图日志"(CommitVolumeSnapshotIntentVO 或类似),记录 src/dst/topChildren/target DB 状态 +- 重启时按日志做幂等推进(物理已成功 → 补 DB;DB 已成功 → 跳过) +- 物理 + DB 的对应关系通过日志显式追踪,不依赖内存 inventory + +--- + +## Bug 5(隐式状态传递 / 🟢 低 — 修 Bug 0 后从 🟡 降级):轮 4 删除路径依赖未文档化的内存对象状态 + +### 现状 + +轮 3 互换后 VO_2 整条 DELETE,但 `stepDelete` 调用栈仍持有 currentRoot 的内存 inventory。轮 4 进入 `deleteVolumeSnapshotAndSyncVolumeSize`,传给 agent 的物理路径来自这个内存 inventory。 + +实测(场景 05)轮 4 删的是 `0cab…cd1a.qcow2`(原 VO_5 物理文件),不是 `aa72…e70c.qcow2`(原 VO_2 物理文件,已被 VO_5 接管)—— **删对了**。 + +### 问题 + +这个"删对了"靠的是某处把内存 inventory 的 installPath 字段在互换时改写为了"被删者旧的 src 文件路径"(5 的旧文件)—— 但这个状态传递**没有显式记录**,全靠 SQLBatch 旁的内存写。 + +任何重构(比如把互换改成只动 DB 不动内存对象)都可能让轮 4 删错对象: +- **删错为 `aa72…e70c.qcow2`** → 把含合并数据的文件删掉 → vol 启动失败、真实数据丢失 + +### 修复方向 + +互换 + 物理删的对应关系显式记录: +```java +SwapResult result = updateDatabaseAfterCommit(src, dst); +// result.physicalFileToDelete = "5.qcow2 的物理路径" +// 显式传给轮 4,不靠内存 inventory +``` + +--- + +## Bug 8(API 默认值 / 🔴 高):`scope = "chain"` 默认值与 UI 直觉相反;`auto` 取值文档承诺但未实现 + +### 现状 + +```java +// APIDeleteVolumeSnapshotMsg.java 行 70-71 +// APIDeleteVolumeSnapshotGroupMsg.java 行 31-32 +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; +``` + +```java +// 
VolumeSnapshotTreeBase.java 行 473-490 +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + if (msg.getScope() == null) { // ← 死代码:上一行已 false + logger.warn("snapshot deletion scope is null, default to Chain scope"); + } + ... + deleteChainFlows(); // 删 currentLeaf 及其所有 descendants +} else { + deleteSingleFlows(); // 仅删该节点 + merge +} +``` + +### 问题 + +1. **默认 `chain` 与"删快照"UI 直觉不符**:用户在快照管理页面点"删除",预期是 single("只删这一个,别动旁支/后代")。默认 chain 会**雪崩删整棵子树**,CLI/SDK 用户漏传 scope 即触发,恢复成本极高。 +2. **快照组(Group)默认 `chain` 风险更大**:一个 group 含多盘,每盘按 chain 默认 → 单次 API 调用可能删几十个 snapshot。 +3. **`auto` 是死字符串**:`Objects.equals(scope, "Chain")` 是硬比较,传 `"auto"` 实际进 else 分支等价于 `single`。文档(validValues)承诺 auto 智能判断,实现完全没有。 +4. **死代码 warn**:`if (msg.getScope() == null) logger.warn(...)` 永远进不来 —— 第一行 `Objects.equals(null, "Chain")` 已返回 false。表明原作者意图"null → Chain"但被 API 层默认值掩盖。 + +### 修复方向 + +- **改默认为 `single`**:单盘 API 默认 single(与 UI 直觉一致);Group 的默认建议同步改 single 或前端强制确认弹窗 +- **实现 `auto` 分支**:如"无 children → single;有 children 且全是叶子 → single;否则按用户场景";或直接从 validValues 移除 `auto` +- **删除死代码 warn**:替换为真正的 null 防御 `if (scope == null) scope = Single;` +- **统一 enum 比较**:用 `DeleteVolumeSnapshotScope.valueOf(scope) == Chain` 而不是字符串硬比,避免大小写 / 拼写漂移 + +### 影响 + +- 误删风险:CLI / 自动化脚本漏传 scope → 整棵子树消失 +- 文档与实现脱节:开放给用户的 `auto` 取值名义存在、行为不存在 +- 加固设计若依赖 scope 语义(如 reconciler 区分单点/链)会被字符串硬比的实现绊倒 + +--- + +## Bug 9(API → 内部 msg 默认值脱钩 / 🟡 中):内部 `DeleteVolumeSnapshotMsg.direction` 没默认 `auto`,cascade 路径退化为 Commit + +### 现状 + +```java +// APIDeleteVolumeSnapshotMsg.java +private String direction = "auto"; // ✅ API 层有默认 + +// DeleteVolumeSnapshotMsg.java +private String direction; // ❌ 内部 msg 无默认 +private String scope; // ❌ 同上 + +// VolumeSnapshotDeletionMsg.java +private String direction; // ❌ 同上 +private String scope; +``` + +### 问题 + +任何**非 API 入口**的调用路径(cascade 删 volume 时联动删 snapshot、snapshot group 内部 split 派发到单盘 msg、定时清理任务等),如果不显式 `setDirection("auto")`,直接传 null 进 
`VolumeTree.resolveDirection`,会落到 Bug 2 路径 → 强制 Commit。 + +后果: +- 用户从 UI 操作 = `direction=auto` 路径 +- 系统级联(删 vm/volume 联动)= `direction=null → Commit` 路径 +- **同样的快照树,两条入口行为完全不同**,对账 / 复现困难 + +### 修复方向 + +- 内部 msg 字段也给 `= "auto"` 默认(一行) +- 或在 `VolumeSnapshotTreeBase.handleDeletionMsg` 入口统一兜底:`if (direction == null) direction = "auto";` +- 与 Bug 2 一并修复("resolveDirection 中 null 当 Auto")即可顺带解决,但更稳妥是 msg 层和处理层双兜底 + +### 与 Bug 2 关系 + +Bug 2 是"resolveDirection 把 null 当 Commit";Bug 9 是"为什么内部 msg 会把 null 传进来"。修 Bug 2 解决症状,修 Bug 9 解决源头。两条都修最稳。 + +--- + +## Bug 6(顶替原 Risk 6 / 🟡 中):删除过程中 vmState 无锁,可能与 VM 启动竞争 + +### 触发 + +`deleteSingleFlows` 行 852-859 一次性查 vmState,整个递归 stepDelete 复用该值。期间若 VM 被并发启动(API / 调度器 / autoStart): +- agent 正在做 `qcow2_commit` / `qcow2_rebase` +- libvirt 同时尝试启动 VM,qemu 探测 backing 链 + +后果难以预测:qemu-img 与 qemu 进程对同一文件加锁冲突、或 qemu 读到半完成的 backing 头部。 + +### 修复方向 + +- 删除操作期间在 VM 上加状态锁(如 `LockVmInstanceMsg`) +- 或每轮重新校验 vmState,发现变动即终止 + +--- + +## Bug 7(次要):`children` 排序行为依赖底层实现 — ✅ **失去影响(随 Bug 0)** + +修复后测试不再受 collection 实现顺序影响,因为 `aliveChild` 选择是基于内容(uuid 是否在 aliveChain 中)而非位置。但**为了测试稳定性**,仍建议未来给 children 加确定排序。 + +--- + +## 严重度汇总表 + +| # | Bug | 严重度 | 类型 | 根因 / 派生 | 修复状态 | +|---|---|---|---|---|---| +| **Bug 0** | `isOnline` 耦合 vmState 与 aliveChain | 🔴 高 | 设计层 | 根因 | ✅ 已修复 | +| Bug 1 | Auto 在 Stopped 退化为 Pull,磁盘放大 N 倍 | 🟡 中 | 语义错误 | 独立 | ✅ 随 Bug 0 修复 | +| Bug 2 | direction=null 当作 Commit 而非 Auto | 🟡 中 | API 语义不一致 | 独立 | ⚠ 待修复 | +| Bug 3 | children.get(0) 顺序未定义 | 🟢 低 | 实现细节 | 派生自 Bug 0 | ✅ 失去影响 | +| **Bug 4** | offline commit 物理成功 + SQLBatch 失败 → 幽灵态 | 🔴 高 | 崩溃原子性 | 独立 | ⚠ 待修复(P0) | +| Bug 5 | 轮 4 删除路径靠内存 inventory 传递 | 🟢 低(修 Bug 0 后降级) | 代码质量 | 重构风险 | ⚠ 待修复(P1) | +| Bug 6 | vmState 无锁,删除与 VM 启动可竞争 | 🟡 中 | 并发 | 独立 | ⚠ 待修复 | +| Bug 7 | children 顺序依赖底层 collection 实现 | 🟢 低 | 测试稳定性 | 派生自 Bug 0 | ✅ 失去影响 | +| **Bug 8** | API `scope="chain"` 默认值 + `auto` 取值未实现 + 死代码 warn | 🔴 高 | API 契约 | 独立 | ⚠ 待修复(P0) | +| Bug 9 | 内部 `DeleteVolumeSnapshotMsg.direction` 无默认 `auto` | 🟡 
中 | 入口一致性 | 与 Bug 2 同源 | ⚠ 待修复 | + +--- + +## 加固设计优先级建议(剩余项) + +| 优先级 | 任务 | 覆盖 Bug | +|---|---|---| +| ~~P0~~ | ~~拆 `isOnline` 为 `isOnAliveChain` + `isHypervisorOperation`~~ | ~~Bug 0、1、3、7(降级 5)~~ ✅ 已完成 | +| **P0** | reconciler 检测"物理推进 + DB 未推进"幽灵态 + 操作意图日志 | Bug 4 | +| **P0** | API `scope` 默认改 `single`、实现 `auto` 分支或下线 `auto` validValue、删死代码 warn | Bug 8 | +| P1 | `direction=null` 当 Auto + 内部 msg 默认值同步为 `auto` | Bug 2、Bug 9 | +| P1 | 互换 + 物理删的对应关系显式化 | Bug 5 | +| P2 | 删除操作期间 VM 状态锁 | Bug 6 | + +--- + +## Bug → 场景对应 + +| Bug | 在哪些场景文档可见 | +|---|---| +| Bug 0 | 03(口径说明)、04(决策矩阵)、05(实测顺序异常) | +| Bug 1 | 03(Auto/Pull 路径写出磁盘 N 份差量)、04(决策矩阵) | +| Bug 2 | 04("initial=null → Commit"决策表) | +| Bug 3 | 05 §6(实测 children.get(0)=4 非 3) | +| Bug 4 | 05 §7(脆弱点表,Stopped + Commit 最严重故障) | +| Bug 5 | 05 §4 轮 4 / §6 与推演的差异 | +| Bug 6 | 04(vmState 一次性读取) | +| Bug 7 | 05 §6(顺序差异) | +| Bug 8 | 04(scope 决策入口;当前文档未覆盖 chain 路径,建议补一段) | +| Bug 9 | 04(direction 入口路径,cascade / group split 未列出) | diff --git a/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md b/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md new file mode 100644 index 00000000000..85fb23dd395 --- /dev/null +++ b/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md @@ -0,0 +1,295 @@ +# 快照组解散对称化 + VM 级完整性拦截(A+C 组合方案) + +> 范围:`VolumeSnapshotTreeBase.ungroupAfter*`、`VolumeSnapshotGroupBase`、`VolumeSnapshotGroupChecker`、VM 删除 cascade、Attach/Detach 卷 +> 关联 Bug:bugs.md 中 **Bug 11 / Bug 12 / Bug 13**(待登记) +> 基线:5.5.6 +> 状态:提案(未实施) +> 决策点已确认:拦截 = **VM 级**;VM destroy 时 incomplete = **cascade 自动清理**;force = **API 字段** + +--- + +## 1. 
背景 + +`VolumeSnapshotGroupVO` 表示"VM 上多盘一致性快照集",每盘一条 `VolumeSnapshotGroupRefVO`。当前删除快照时存在两条不对称的解散路径: + +| 路径 | 入口 | 触发条件 | 解散行为 | +|---|---|---|---| +| `ungroupAfterDeleteSingleSnapshot`(行 1427-1443) | scope=single 删单快照 | 该快照属于某 group | 仅 `ref.snapshotDeleted=true`;**所有 ref 都 deleted 才删 group VO** | +| `ungroupAfterDeleted`(行 2148-2169) | scope=chain 删子树 | 待删 snapshot 的根 volume 是 **Root** | **立即删除整个 group VO**,data 盘 ref 变孤儿 | + +后果: +- root 盘单删 chain → group VO 消失,data 盘 ref 还指向已不存在的 group → 残留孤儿 +- data 盘单删 chain → group VO 仍在,ref.snapshotDeleted=true → 组 incomplete +- 后续对该 VM 删组 / 建组 / 删 VM / 挂卸盘 → 没有任何拦截,所有操作"看起来正常"实际带病前进 + +本提案双管齐下: +- **A**:解散逻辑统一对称(消除孤儿源头) +- **C**:VM 级完整性拦截(让残留 incomplete 组成为后续操作的硬阻断点) + +--- + +## 2. 方案 A — 解散对称化 + +### 2.1 改动 + +`VolumeSnapshotTreeBase.ungroupAfterDeleted` 行 2148-2169 移除 `Root` 特例: + +```java +private void ungroupAfterDeleted(List snapshots) { + List uuids = snapshots.stream() + .map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); + + SQL.New(VolumeSnapshotGroupRefVO.class) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) + .set(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).update(); + + // 不再区分 root / data,统一查"全 ref deleted 才解散整组" + Set groupUuids = Q.New(VolumeSnapshotGroupRefVO.class) + .select(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) + .listValues().stream().map(Object::toString).collect(Collectors.toSet()); + + for (String groupUuid : groupUuids) { + long remaining = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, false).count(); + if (remaining == 0) { + vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid); + cleanVmHostBackupFilesForGroup(Collections.singletonList(groupUuid)); + dbf.removeByPrimaryKey(groupUuid, VolumeSnapshotGroupVO.class); + } + } +} +``` + +### 2.2 收益 + +- root 盘 chain 删除不再立即删 group VO,与 
data 盘行为对齐 +- 不再产生"group 已不存在 / ref 仍在"的孤儿 +- `ungroupAfterDeleteSingleSnapshot` 与 `ungroupAfterDeleted` 行为合并,可后续重构为同一私有方法 + +### 2.3 兼容性 + +- 旧 root 单删 chain 后立即解散的"快"行为消失:仍要等 data 盘 ref 也清理才解散 +- 实际上历史路径就是 bug —— 旧行为留下孤儿 ref,新行为留下 incomplete 组(被 C 拦截后用户必须清理) + +--- + +## 3. 方案 C — VM 级完整性拦截 + +### 3.1 拦截入口 + +| 入口 API | 拦截条件 | 错误信息 | force 字段 | +|---|---|---|---| +| `APIDeleteVolumeSnapshotGroupMsg`(其他组) | VM 上有 incomplete 组(exclude 自身) | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再删除其他快照组` | ✅ | +| `APICreateVolumeSnapshotGroupMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再创建新快照组` | ❌(不应允许) | +| `APIAttachDataVolumeToVmMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再挂载磁盘` | ❌ | +| `APIDetachDataVolumeFromVmMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再卸载磁盘` | ❌ | +| `APIDestroyVmInstanceMsg` | VM 上有 incomplete 组 | **不拦截**(cascade 自动清理) | ❌ | + +**豁免**: +- 删 incomplete 组**自身** → 放行(exclude 当前 group_uuid) +- 单快照 API(`APIDeleteVolumeSnapshotMsg`) → 放行(清债途径) + +### 3.2 incomplete 检测 + +在 `VolumeSnapshotGroupChecker` 新增静态方法: + +```java +public class VolumeSnapshotGroupChecker { + /** + * 返回 VM 上所有 incomplete 组(部分 ref 已 snapshotDeleted=true 但仍存在未删的 ref)。 + * @param excludeGroupUuid 排除指定 group(如删自身时不算违例),null 表示不排除 + */ + public static List findIncompleteGroupsOnVm(String vmUuid, String excludeGroupUuid) { + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .eq(VolumeSnapshotGroupVO_.vmInstanceUuid, vmUuid) + .listValues(); + + List incomplete = new ArrayList<>(); + for (Object o : groupUuids) { + String guuid = o.toString(); + if (guuid.equals(excludeGroupUuid)) continue; + long deletedRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).count(); + long totalRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid).count(); + if (deletedRefs > 0 
&& deletedRefs < totalRefs) { + incomplete.add(guuid); + } + } + return incomplete; + } +} +``` + +### 3.3 拦截织入示例 + +#### 3.3.1 删除其他组 + +```java +// VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg) +private void handle(APIDeleteVolumeSnapshotGroupMsg msg) { + APIDeleteVolumeSnapshotGroupEvent evt = new APIDeleteVolumeSnapshotGroupEvent(msg.getId()); + String vmUuid = self.getVmInstanceUuid(); + if (!msg.isForce()) { + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(vmUuid, self.getUuid()); + if (!incomplete.isEmpty()) { + evt.setError(operr("VM[uuid=%s] 存在不完整快照组%s,请先清理后再删除其他快照组", + vmUuid, incomplete)); + bus.publish(evt); + return; + } + } + // ... 原逻辑 +} +``` + +#### 3.3.2 创建新组 / 挂卸盘 + +各 API handle 入口: + +```java +List incomplete = VolumeSnapshotGroupChecker.findIncompleteGroupsOnVm(vmUuid, null); +if (!incomplete.isEmpty()) { + bus.replyErrorByMessageType(msg, operr("VM[uuid=%s] 存在不完整快照组%s,请先清理后再 ...", + vmUuid, incomplete)); + return; +} +``` + +#### 3.3.3 VM destroy — cascade 自动清理(不拦截) + +`VolumeSnapshotGroupCascadeExtension`: + +```java +@Override +public void asyncCascade(CascadeAction action, Completion completion) { + if (CascadeConstant.DELETION_CHECK_CODE.equals(action.getActionCode())) { + // VM destroy 不拦截 incomplete 组,由后续 cleanup 阶段处理 + completion.success(); + return; + } + + if (CascadeConstant.DELETION_CLEANUP_CODE.equals(action.getActionCode())) { + String vmUuid = ((VmInstanceInventory) action.getParentIssuer().get(0)).getUuid(); + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(vmUuid, null); + if (!incomplete.isEmpty()) { + // force 删除所有 incomplete 组(包括其残留 ref) + forceDeleteGroups(incomplete, completion); + return; + } + completion.success(); + } +} +``` + +`forceDeleteGroups`:直接 SQLBatch 删 `VolumeSnapshotGroupRefVO` + `VolumeSnapshotGroupVO`,然后调 `vidm.deleteArchiveVmInstanceResourceMetadataGroup` + `cleanVmHostBackupFilesForGroup`。**不再走 chain 删快照** —— VM 销毁时 volume 也会被销毁,对应 
snapshot tree 通过各 PS cascade 清理。 + +### 3.4 force 字段(仅 API 层) + +```java +// APIDeleteVolumeSnapshotGroupMsg.java +@APIParam(required = false, + description = "true = 跳过 VM 完整性检查(运维兜底);默认 false") +private boolean force = false; +``` + +仅 `APIDeleteVolumeSnapshotGroupMsg` 加 `force`。其他 API(建组/挂卸盘)不应允许带病前进,不开 force。 + +--- + +## 4. 用户清债的两条路径 + +| 场景 | 操作 | 结果 | +|---|---|---| +| 整组清理 | `APIDeleteVolumeSnapshotGroupMsg(group_uuid=incomplete)` | 走 chain 删剩余快照 → A 解散逻辑收尾 → group VO 删除 | +| 个体清理 | 对每个残留 ref 对应的 snapshot 调 `APIDeleteVolumeSnapshotMsg` | 同上路径触发 A 解散收尾 | +| 紧急绕过 | `APIDeleteVolumeSnapshotGroupMsg(group_uuid=other, force=true)` | 跳过完整性检查删其他组(incomplete 组留待事后处理) | + +--- + +## 5. 行为矩阵 + +| T0 状态 | T1 操作 | T1 结果 | T2 操作 | T2 结果 | +|---|---|---|---|---| +| 组1 完整(root + data 各一) | 删组1 root 单快照 (single) | 组1 ref 一个 deleted;**组1 VO 保留** | 删组2 | C 拦截 | +| 同上 | 同上 | 同上 | 删组1(自身) | 放行(exclude) | +| 同上 | 同上 | 同上 | 删组1 data ref 对应 snapshot | 放行 → 触发 A 解散 | +| 组1 完整 | 删组1 整组 (chain) | 全 ref deleted → 组1 VO 删 | 删组2 | 放行 | +| 组1 incomplete | 升级 management 重启 | 状态持久 | 删 VM | **放行**(cascade 自动清 incomplete) | +| 组1 incomplete | — | — | 删组2 force=true | 放行(带病删,组1 仍在) | +| 组1 incomplete | — | — | 建新组 / 挂盘 / 卸盘 | C 拦截,无 force 兜底 | + +--- + +## 6. 
改动清单 + +| # | 文件 | 改动 | +|---|---|---| +| 1 | `storage/.../VolumeSnapshotTreeBase.java` 行 2148-2169 | 移除 root 特例,统一"全 ref deleted 才解散" | +| 2 | `storage/.../group/VolumeSnapshotGroupChecker.java` | 新增 `findIncompleteGroupsOnVm(vmUuid, excludeGroupUuid)` | +| 3 | `storage/.../group/VolumeSnapshotGroupBase.java handle(APIDeleteVolumeSnapshotGroupMsg)` | 入口加 incomplete 检查 + force 旁路 | +| 4 | `storage/.../VolumeSnapshotManagerImpl.java handle(APICreateVolumeSnapshotGroupMsg)` | 入口加 incomplete 检查 | +| 5 | VM Attach/Detach DataVolume API handle | 入口加 incomplete 检查 | +| 6 | `storage/.../group/VolumeSnapshotGroupCascadeExtension.java` | DELETION_CLEANUP 阶段 force 清 incomplete 组(VM destroy 路径) | +| 7 | `header/.../group/APIDeleteVolumeSnapshotGroupMsg.java` | 新增 `boolean force = false` | +| 8 | i18n 错误码表 | 新增 `GROUP_INCOMPLETE_BLOCK_*` 系列 | +| 9 | API 文档 / changelog / 升级公告 | 提示历史 incomplete 组将首次拦截,提供清债指引 | + +--- + +## 7. 兼容矩阵 + +| 场景 | 旧行为 | 新行为 | 兼容 | +|---|---|---|---| +| root chain 删除 | 立即删 group VO,留 data ref 孤儿 | 仅 mark deleted;等 data ref 齐删 | ⚠ break(更合理) | +| data chain 删除 | mark deleted,等齐 | 同(不变) | ✅ | +| single 删除 | mark deleted,等齐 | 同(不变) | ✅ | +| 升级前已存在的 incomplete 组 | 后续操作无任何提示 | 首次触发拦截 | ⚠ break(运维需清债 / force) | +| 升级前正常组 | 正常 | 正常 | ✅ | +| VM destroy 时存在 incomplete 组 | 走旧 cascade,行为不确定 | cascade 自动 force 清 | ✅ 改善 | + +--- + +## 8. 测试要点 + +| 场景 | 预期 | +|---|---| +| root chain 单删 → 不立即解散 | group VO 仍在,root ref deleted=true | +| data chain 单删 → 不解散 | 同上 | +| 全部 ref 都删完 → 自动解散 | group VO 消失 + vidm 调用 + backup file 清 | +| 组1 incomplete → 删组2 | argerr/operr,提示组1 incomplete | +| 组1 incomplete → 删组1 自身 | 放行 | +| 组1 incomplete → 删组1 剩余 snapshot(个体 API) | 放行 → A 收尾解散 | +| 组1 incomplete → 建新组 | operr 拦截 | +| 组1 incomplete → attach/detach data volume | operr 拦截 | +| 组1 incomplete → 删 VM | 放行,cascade 自动清 incomplete 组 | +| 删组2 force=true,组1 incomplete | 放行,组1 保留 | +| 升级旧库 → 已存在 incomplete 组 → 任意操作首次触发拦截 | 报错信息可指导清债 | + +--- + +## 9. 
与 bugs.md 的对应 + +| Bug(待登记) | 描述 | 闭环来源 | +|---|---|---| +| Bug 11 | 解散非对称:root 立即删 vs data 等齐 | 方案 A | +| Bug 12 | incomplete 组持续污染后续操作,无任何检测 | 方案 C | +| Bug 13 | `getEffectiveSnapshots` 不过滤 `ref.snapshotDeleted=false` | A 间接缓解 + C 阻断后续触发场景 | + +--- + +## 10. 风险与决策点 + +| 决策点 | 已确认 | 备注 | +|---|---|---| +| 拦截层级 | **VM 级** | 同 VM 上任一 incomplete 组阻断 VM 上其他组操作 | +| VM destroy 时 incomplete 处理 | **cascade 自动清理** | 不拦截,cleanup 阶段 force 删 | +| force 字段位置 | **API 字段** | 仅 `APIDeleteVolumeSnapshotGroupMsg`,建组/挂卸盘不开 force | +| `findIncompleteGroupsOnVm` 性能 | 待 review | 每次 N+1 查询;如 VM 上组数多可改单 SQL JOIN + GROUP BY HAVING | +| 升级公告 | 必须有 | 升级前需提供 SQL 检测脚本:`SELECT vmInstanceUuid, volumeSnapshotGroupUuid FROM VolumeSnapshotGroupRefVO r1 WHERE snapshotDeleted=1 GROUP BY volumeSnapshotGroupUuid HAVING COUNT(*) < (SELECT COUNT(*) FROM VolumeSnapshotGroupRefVO r2 WHERE r2.volumeSnapshotGroupUuid=r1.volumeSnapshotGroupUuid)` | diff --git a/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md b/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md new file mode 100644 index 00000000000..1df09b4b0d0 --- /dev/null +++ b/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md @@ -0,0 +1,283 @@ +# 快照删除 API 参数(`scope` / `direction`)重构提案 + +> 范围:`APIDeleteVolumeSnapshotMsg`、`APIDeleteVolumeSnapshotGroupMsg` 及其内部派生 msg +> 关联 Bug:bugs.md 中 **Bug 2 / Bug 8 / Bug 9** +> 基线:5.5.6 +> 状态:提案(未实施) + +--- + +## 1. 
背景 + +历史上"删除快照"的语义只有一种 —— **删除待删节点 + 所有子孙节点**(子树雪崩删)。后来引入"单点删除"(只删该节点本身,子孙 merge 到 parent),通过 `scope` 入参区分两种行为: + +```java +// APIDeleteVolumeSnapshotMsg.java / APIDeleteVolumeSnapshotGroupMsg.java +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; + +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; +``` + +```java +// VolumeSnapshotTreeBase.handleDeletionMsg 行 473-490 +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + if (msg.getScope() == null) { + logger.warn("snapshot deletion scope is null, default to Chain scope"); + } + ... + deleteChainFlows(); +} else { + deleteSingleFlows(); +} +``` + +--- + +## 2. 当前设计的问题 + +### 2.1 `scope` 相关 + +| # | 问题 | 影响 | +|---|---|---| +| S1 | `validValues` 列了 `"auto"`,但代码用 `Objects.equals(scope, "Chain")` 字符串硬比,`auto` 实际等价于 `single` | 文档承诺 ≠ 实现;调用方误判 | +| S2 | `if (msg.getScope() == null) logger.warn(...)` 是死代码 —— 上一行 `Objects.equals(null, "Chain")` 已 false,永远进不来 | warn 永远打不出,作者意图("null → Chain 兜底")未生效 | +| S3 | "凡是非 Chain 字符串都默默走 single" —— 拼错 / 大小写漂移 / 老 `auto` 全部静默走 single | 异常值无法被发现,潜在数据破坏 | +| S4 | `chain` 命名容易被误读为"alive chain"或"整棵 tree" | 文档与实现差异,新人误读 | +| S5 | Group API 默认 `chain` 风险高一个量级(多盘 × 子树) | 一次 API 调用可能删几十个 snapshot | + +### 2.2 `direction` 相关 + +| # | 问题 | 影响 | +|---|---|---| +| D1 | API 层默认 `"auto"`,但内部 `DeleteVolumeSnapshotMsg.direction` / `VolumeSnapshotDeletionMsg.direction` 没有默认值(null) | cascade、group split、定时清理路径若不显式 set 即传 null | +| D2 | `VolumeTree.resolveDirection(null) → Commit`,与"不传 = Auto"惯例相反 | 同棵树两条入口(API vs cascade)行为分叉;Stopped 下意外走 offline_commit | + +### 2.3 语义对清 + +为避免"chain"再被歧义解读,先固化术语: + +| 术语 | 定义 | +|---|---| +| **chain(本提案中)** | 以待删节点为根的子树(`currentLeaf.getDescendants()`),含所有子孙、旁支、分叉。**不是** alive chain,**不是**整棵 tree。 | +| **single** | 仅待删节点本身;子孙保留并 merge 到 parent。 | +| **alive chain**(不在本提案 scope 中) | vol 当前依赖的快照链路(vol.installPath → parentUuid 
反向递归)。仅出现在 `VolumeTree.aliveChain` 内部判定,与 API `scope` 无关。 | + +--- + +## 3. 设计目标 + +1. **保留默认 `chain`** —— 与老 API 行为兼容,避免 5.x.x 升级断老脚本 / cascade 路径 +2. **删除 `auto` 死值** —— 清理 validValues 中无实现的取值 +3. **enum 显式校验** —— 非法字符串抛 argerr,不再"任意非 Chain 都按 single" +4. **修复死代码 warn** —— 真正生效的 null 兜底分支 +5. **内部 msg 默认值与 API 对齐** —— cascade 路径与 API 路径行为一致 +6. **API 描述明确雪崩删语义** —— 让用户一眼看清"chain = 子树删" + +--- + +## 4. 详细方案 + +### 4.1 入参定义改写 + +```java +// APIDeleteVolumeSnapshotMsg.java +@APIParam(required = false, validValues = {"single", "chain"}, + description = "chain (默认) = 删除该节点及其所有子孙节点(子树删);" + + "single = 仅删除该节点本身,子孙节点 merge 到 parent") +private String scope = "chain"; + +@APIParam(required = false, validValues = {"pull", "commit", "auto"}, + description = "auto (默认) = 按 VM 状态与链路结构自适应选择 commit 或 pull") +private String direction = "auto"; +``` + +`APIDeleteVolumeSnapshotGroupMsg.java` 同步改写(参数定义相同)。 + +变化点: +- `scope` validValues 移除 `"auto"` +- `description` 写清 chain 是雪崩删 +- `direction` validValues 不变(`auto` 是真实实现,与 scope 的死值不同) + +### 4.2 后端处理改写 + +```java +// VolumeSnapshotTreeBase.handleDeletionMsg 行 473 附近 +DeleteVolumeSnapshotScope parsedScope; +if (msg.getScope() == null) { + parsedScope = DeleteVolumeSnapshotScope.Chain; + logger.warn(String.format( + "snapshot[uuid=%s] deletion scope is null, default to Chain (subtree delete)", + msg.getSnapshotUuid())); +} else { + try { + parsedScope = DeleteVolumeSnapshotScope.valueOf(StringUtils.capitalize(msg.getScope())); + } catch (IllegalArgumentException e) { + throw new OperationFailureException(argerr( + "invalid scope[%s], expect one of: single, chain", msg.getScope())); + } +} + +if (parsedScope == DeleteVolumeSnapshotScope.Chain) { + long size = 0; + for (VolumeSnapshotInventory inv : currentLeaf.getDescendants()) { + if (inv.isLatest()) ancestorOfLatest = true; + size += inv.getSize(); + } + requiredSize = Math.min(size, volume.getSize()); + deleteChainFlows(); +} else { + deleteSingleFlows(); +} +``` + +修复点: +- **S1 / 
S3**:enum 校验,非法字符串(含老 `auto`、拼错)被 argerr 拦截;大小写漂移(如 `CHAIN`)按 §6 兼容矩阵应 normalize 到 chain 而非拒绝 —— 注意上面示例中的 `StringUtils.capitalize` 只大写首字母,`CHAIN` 会在 `valueOf` 处抛异常,落地时需先 `toLowerCase()` 再 capitalize +- **S2**:死代码 warn 挪到真正的 null 兜底分支 +- **隐式分支风险**:`else` 不再"凡是非 chain 都按 single",仅 `Single` enum 值进 single 路径(这里 enum 二选一,等价于显式 switch;如未来加第三种值需改 switch) + +### 4.3 内部 msg 默认值同步 + +```java +// header/.../DeleteVolumeSnapshotMsg.java +private String direction = "auto"; // 修 D1 +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); // 与 API 默认对齐 + +// header/.../VolumeSnapshotDeletionMsg.java +private String direction = "auto"; +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); + +// header/.../group/DeleteVolumeSnapshotGroupInnerMsg.java +private String direction = "auto"; +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); +``` + +效果:cascade、group split、定时清理任何路径若不显式 set,行为与 API 默认一致(chain + auto),不再退化为 Commit(Bug 9 闭环)。 + +### 4.4 `direction=null` 兜底(修 Bug 2) + +`VolumeTree.resolveDirection` 第一行: + +```java +// 修改前 +if (initial == null) { + return VolumeSnapshotDeletionDirection.Commit; +} + +// 修改后 +if (initial == null) { + initial = VolumeSnapshotDeletionDirection.Auto.toString(); // 与 API 默认一致 +} +``` + +修了内部 msg 默认值后这条仍是双保险:万一某条 cascade 路径用旧 builder 不带默认值构造 msg,仍能在 resolveDirection 入口兜住。 + +--- + +## 5. Group API 单独评估 + +`APIDeleteVolumeSnapshotGroupMsg` 的 scope 透传给每盘 single msg: + +```java +// VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg) +imsg.setScope(msg.getScope()); // 行 192 / 227 +imsg.setDirection(msg.getDirection()); +``` + +Group + chain 默认风险:**多盘 × 子树**,单次 API 可删几十个 snapshot,回滚成本极高。 + +| 选项 | 描述 | 推荐度 | 兼容性 | +|---|---|---|---| +| A. Group 保留默认 `chain` | 与单盘一致 + 与老脚本兼容;UI/文档单独警示风险 | ⭐⭐⭐ | ✅ 完全兼容 | +| B. Group 默认改 `single` | 与单盘默认拉开,强调"按盘点删" | ⭐⭐⭐ | ⚠ 老脚本行为变化 | +| C. Group `required = true` | 强制用户显式选择 | ⭐⭐⭐⭐ 最安全 | ⚠ 老脚本断 | + +**推荐 A**(最小改动):UI 层 + 文档显著警示,老脚本不动。如果业务上确认"快照组 = 一致性快照集,几乎无人对它做子树删",再走 C 在下个大版本下线默认值。 + +--- + +## 6. 
兼容矩阵 + +| 调用方传参 | 旧行为 | 新行为 | 兼容 | +|---|---|---|---| +| 不传 `scope` | chain | chain(默认) | ✅ 等价 | +| `scope=chain` | chain | chain | ✅ 等价 | +| `scope=Chain` | chain(大写恰好等于 enum.toString) | chain(normalize) | ✅ 等价 | +| `scope=CHAIN` | 走 single(字符串非精确 "Chain") | chain(normalize) | ⚠ 行为变化但更合理 | +| `scope=single` | single | single | ✅ 等价 | +| `scope=auto` | 走 single(字符串非 "Chain") | argerr 拒绝 | ⚠ **break** | +| `scope=garbage` | 走 single | argerr 拒绝 | ⚠ **break** | +| 不传 `direction` | auto(API 层默认) | auto | ✅ 等价 | +| 内部 msg 不 set `direction` | null → resolveDirection 返回 Commit | auto → 按 vmState/链路 | ⚠ **行为变化**(更合理) | +| 内部 msg 不 set `scope` | null → 走 single 分支("非 Chain") | chain(默认) → 走 chain 分支 | ⚠ **行为变化**(与 API 默认对齐) | + +### break 项处理 + +1. **`scope=auto` break**:当前实际行为是"伪装成智能、其实落 single",调用方若依赖此行为本身就是 bug 用法。可选过渡:保留 `auto` 在 validValues 一个版本,内部 alias 到 chain(或 single,按实际调用方调研结果决定),warn `"scope=auto is deprecated"`。 +2. **`scope=garbage` break**:原本静默 single,新行为 argerr。这是**好的 break** —— 之前的隐藏 bug 暴露出来。 +3. **内部 msg 默认行为变化**:cascade / group split 路径如果原本依赖 "null=Commit / null=single" 隐含语义,会变。需要全量审查内部 msg 的所有调用点: + +```bash +# 搜索 cascade 路径 +rg "new DeleteVolumeSnapshotMsg\(\)" -l +rg "new VolumeSnapshotDeletionMsg\(\)" -l +``` + +凡是不显式 setDirection / setScope 的,确认是否需要保留旧的隐含语义;如需要,应在该路径显式 setDirection("commit") / setScope("single") 而不是依赖默认。 + +--- + +## 7. 
改动清单 + +| # | 文件 | 改动 | +|---|---|---| +| 1 | `header/src/main/java/org/zstack/header/storage/snapshot/APIDeleteVolumeSnapshotMsg.java` | `scope` validValues 移除 `auto`;description 写清子树语义 | +| 2 | `header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java` | 同 1 | +| 3 | `header/src/main/java/org/zstack/header/storage/snapshot/DeleteVolumeSnapshotMsg.java` | `direction = "auto"`、`scope = "Chain"` 默认 | +| 4 | `header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotDeletionMsg.java` | 同 3 | +| 5 | `header/src/main/java/org/zstack/header/storage/snapshot/group/DeleteVolumeSnapshotGroupInnerMsg.java` | 同 3 | +| 6 | `storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java` 行 473 | enum normalize + 死代码 warn 修复 + argerr | +| 7 | `storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java` `resolveDirection` | `null → Auto` 兜底 | +| 8 | API 文档 / changelog | 兼容矩阵公告;UI 建议默认选 single;Group 警示 | +| 9 | 调用点审查 | `rg "new DeleteVolumeSnapshotMsg"` 验证内部 msg 默认变化的影响 | + +--- + +## 8. 测试要点 + +| 测试场景 | 预期 | +|---|---| +| API 不传 scope → 走 chain(兼容老行为) | ✅ | +| API 传 `scope=chain` 删多分支节点 | 子树全删 | +| API 传 `scope=single` 删多分支节点 | 仅该节点删,子孙 merge | +| API 传 `scope=auto` | argerr,不再静默走 single | +| API 传 `scope=GARBAGE` | argerr | +| API 不传 direction → resolveDirection 走 Auto 分支 | ✅ | +| Cascade 路径(删 vm/volume 联动)→ 内部 msg 走 chain + auto | 与 API 一致 | +| Group API 不传 scope → 多盘均走 chain | 兼容老行为 | +| 老脚本传 `scope=Chain`(首字母大写)| 走 chain(兼容) | +| `scope=CHAIN`(全大写)| 走 chain(normalize 后) | + +--- + +## 9. 与 bugs.md 的对应 + +本提案落地后,bugs.md 中的修复进度更新: + +| Bug | 当前状态 | 提案落地后 | +|---|---|---| +| Bug 2 (`direction=null → Commit`) | ⚠ 待修复 | ✅ resolveDirection 兜底为 Auto | +| Bug 8 (`scope` validValues / 死代码 / 默认风险) | ⚠ 待修复(P0) | ✅ enum normalize + argerr + 死代码修复 | +| Bug 9 (内部 msg `direction` 无默认) | ⚠ 待修复 | ✅ 三个内部 msg 同步默认 `auto` + `chain` | + +--- + +## 10. 
风险与决策点 + +| 决策点 | 选项 | 备注 | +|---|---|---| +| `scope=auto` 是否保留过渡期 | 直接 break / 一版本 deprecated | 取决于调用方调研:有无脚本真传 auto | +| Group 默认是否改 `single` | A 保持兼容 / B 改 single / C required | 推荐 A,激进可走 C | +| 内部 msg 默认值变化是否需要 cascade 路径全审 | 是 | 必须全量 grep `new DeleteVolumeSnapshotMsg()` 确认无依赖 null 行为 | +| `chain` 是否重命名 `subtree` | 保留 / 重命名 + alias | 已决定保留:term 历史包袱 + 重命名收益小 | diff --git a/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md b/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md new file mode 100644 index 00000000000..0186fd84512 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md @@ -0,0 +1,304 @@ +# 场景 03:local + 关机 VM + 删除中间节点(快照2,3 个子节点其中 1 个 alive) + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 与 `02-local-running-delete-mid-with-3-children.md` 对照阅读:树结构相同,唯一差别是 VM 状态从 Running 变为 Stopped。 +> 源码: +> - `VolumeSnapshotTreeBase.java` 行 828-919(stepDelete)/ 1097-1290(pull) +> - `VolumeTree.java` 行 364-392(resolveDirection / isOnline) +> - `LocalStorageKvmBackend.java` 行 3845-3865(PullVolumeSnapshotOnPrimaryStorageMsg → OFFLINE_MERGE_PATH) +> - `localstorage.py` 行 834-856(`offline_merge_snapshot`) + +> ⚠ **本场景按 `initial.direction ∈ {Auto, Pull}` 的口径推演**(即最后一轮也走离线 pull)。 +> 如果 API 入参 `direction=Commit`(或前端不传 → resolveDirection 默认返回 Commit),最后一轮的行为完全不同:会走 `offline_commit_snapshot`(数据 5→2,DB 互换 path,VO_2 直接 DELETE),**请参考实测记录 `05-local-stopped-direction-commit-actual.md`**。 +> 决策矩阵见 `04-deleteSingleFlows-online-offline-decision.md` §"与场景 02 / 03 的对应"。 +> +> ⚠ **Bug 0 修复后**(参考 `../bugs.md`):`direction=Auto` 在 Stopped 下不再退化为 Pull —— `shouldUseCommitStrategy` 解耦 vmState 后,Auto + 待删/child 都在 vol 链上时返回 **Commit**,行为等价于场景 05。本场景 03 现在仅适用于 `direction=Pull` 显式入参,且需满足 `shouldCommit=false`(即不在 vol 链上)。 + +--- + +## 前提 + +- 主存储类型:**LocalStorage** +- VM 状态:**Stopped**(合法状态之一,校验在 `deleteSingleFlows()` 行 854-858) +- 待删快照:**快照2** + +## 快照树(与场景 02 完全相同) + +``` + 快照1 + └─ 快照2 ◄── 
待删 currentRoot + ├─ 快照3 + ├─ 快照4 + └─ 快照5 ── vol ← alive chain +``` + +## 物理 backing chain + +``` +1.qcow2 ← 2.qcow2 ← 5.qcow2 ← vol(VM 关机,文件无人持有) +2.qcow2 ← 3.qcow2 +2.qcow2 ← 4.qcow2 +``` + +--- + +## 关键差异:所有轮都走"离线 pull" + +`VolumeTree.resolveDirection`(行 364-387): + +```java +boolean online = (vmState == Running || vmState == Paused) && alive(target) && alive(child); +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; +``` + +VM=Stopped → `online=false` → `shouldUseCommitStrategy=false` → Auto / null / Pull 全部解析为 **Pull**;`isOnline`(行 389-392)同样要求 Running/Paused。 + +后果:哪怕快照5 在 alive chain 上,VM 关机时它也走**离线 pull**,agent 不调 libvirt blockCommit,全部 qemu-img 离线操作。 + +**离线 pull 的真实控制面 / 数据面**: + +| 层 | 实现 | +|---|---| +| Java 控制面 | `pull()` 行 1250-1268 → 构造 `PullVolumeSnapshotOnPrimaryStorageMsg`,参数 `srcSnapshotParentPath`(= 快照1.qcow2)、`srcSnapshot`(= 被删的 currentRoot=快照2)、`dstSnapshot`(= 选中 child) | +| 后端转发 | `LocalStorageKvmBackend.handle(PullVolumeSnapshotOnPrimaryStorageMsg)`(行 3845-3865)→ 构造 `OfflineMergeSnapshotCmd { srcPath = srcSnapshotParentPath, destPath = dst.installPath, fullRebase = (srcPath == null) }` → 走 **`OFFLINE_MERGE_PATH = "/localstorage/snapshot/offlinemerge"`** | +| Agent 数据面 | `offline_merge_snapshot`(`localstorage.py` 行 834-856):核心一行 `linux.qcow2_rebase(cmd.srcPath, cmd.destPath)`(fullRebase 时改走 `qcow2.create_template` 扁平化) | + +**关键澄清**:场景 03 的 pull 走的是 `offline_merge_snapshot`,**不是** `offline_commit_snapshot`(后者由 commit 离线分支 `CommitVolumeSnapshotOnPrimaryStorageMsg` 调用)。`qcow2_rebase(backing=快照1, file=child)` 的语义是把 child 的 backing 从原快照2 改成快照1,并**把"快照2 与 快照1 之间的差量数据"复制进 child 文件**(因为快照1 作为基线只读不可写,只能往 child 写)。 + +--- + +## 总轮次(4 轮 stepDelete,与场景 02 同结构但全离线) + +| 轮 | currentRoot=2 的 children | 选中 | online? 
| direction | 物理操作 | DB 关键变更 | +|---|---|---|---|---|---|---| +| 1 | [3, 4, 5] | 3 | false | 强制 pull | `qcow2_rebase(1.qcow2, 3.qcow2)`(差量进 3.qcow2) | 3.parentUuid=1, distance-- | +| 2 | [4, 5] | 4 | false | 强制 pull | `qcow2_rebase(1.qcow2, 4.qcow2)`(差量进 4.qcow2) | 4.parentUuid=1, distance-- | +| 3 | [5] | 5 | **false** | resolveDirection → **Pull**(不再是 Commit) | `qcow2_rebase(1.qcow2, 5.qcow2)`(差量进 5.qcow2) | **5.parentUuid=1, distance--,不互换 path** | +| 4 | [] | — | — | terminal | 删 VO_2 + 物理 2.qcow2 | VO_2 删除 | + +**全程数据落地**:每一轮把"快照2 相对于快照1 的增量"**复制进当前选中的 child**(3 / 4 / 5 各拿一份独立副本)。快照1.qcow2 内容**不变**,快照2.qcow2 内容也**不变**,直到轮 4 整文件删除。 + +--- + +## 轮 1 / 轮 2:与场景 02 完全相同 + +`stepDelete` 多子节点分支(行 912-918)不依赖 vmState,只依赖 children.size 与 onlineChild 选择算法。Stopped 时 `isOnline` 全部返回 false,`onlineChild = null`,`child = children.get(0)`,不需要"避开 alive 子节点"的替换。 + +```java +onlineChild = null // VM Stopped,没有 alive child +child = children.get(0) = 3 +// if 块未触发 +online = isOnline(2, 3, Stopped) = false +pull(3, ..., online=false) +``` + +控制面 → 后端 → agent: + +``` +PullVolumeSnapshotOnPrimaryStorageMsg{ + srcSnapshotParentPath = "1.qcow2", + srcSnapshot = VO_2, + dstSnapshot = VO_3 +} + → LocalStorageKvmBackend.handle → OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=3.qcow2, fullRebase=false} + → offline_merge_snapshot: + linux.qcow2_rebase(srcPath=1.qcow2, destPath=3.qcow2) + # 物理:3.qcow2 backing 改写为 1.qcow2,差量数据合并入 3.qcow2 +DB: VO_3.parentUuid=1, distance-- +``` + +轮 2 同理对快照4。 + +--- + +## 轮 3:离线 pull 快照5(与场景 02 的根本差别) + +`children.size() == 1` 分支(行 903-911): + +```java +direction = volumeTree.resolveDirection(2, 5, msg.direction, currentRoot.isLatest, Stopped) + → online=false → shouldUseCommitStrategy=false → 解析为 Pull +online = isOnline(2, 5, Stopped) = false +pull(5, volumeTree, online=false, comp) // 离线 pull,不进 commit 分支 +``` + +**关键差异**:场景 02 在轮 3 走在线 commit(libvirt blockCommit + pivot + DB 互换 path);场景 03 走离线 pull,**不互换 path**,DB 修改路径完全不同。 + +### 3.1 控制面 flow(`pull()` 行 
1097-1290) + +``` +flow chain: + 1. get-snapshot-backing-chain 获取 srcSnapshotParentPath(= 1.qcow2) + 2. allocate-primary-storage-capacity 预占 size + 3. (条件) get-volume-current-size 仅 dst.uuid == volume.uuid 时;本例 dst=5 ≠ vol → 跳过 + 4. pull-volume-snapshot-on-primary-storage online=false → PullVolumeSnapshotOnPrimaryStorageMsg + online=true 才走 PullVolumeSnapshotOnHypervisorMsg + 5. updateDatabaseAfterPull +``` + +`PullVolumeSnapshotOnHypervisorMsg` 在本场景**完全不会被构造**,因为 `online=false`。所以 hypervisor 端 vm_plugin 的 do_block_commit 路径在场景 03 整个删除过程中**一次都不调用**。 + +### 3.2 数据面(`offline_merge_snapshot`) + +``` +src=快照2, dst=快照5 +OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=5.qcow2, fullRebase=false} + +if linux.qcow2_get_backing_file(destPath=5.qcow2) == srcPath=1.qcow2: + return(已经挂在 1.qcow2,幂等 noop) + +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath=1.qcow2, cmd.destPath=5.qcow2) + # qemu-img rebase 默认(非 -u): + # 把 5.qcow2 旧 backing(2.qcow2) 与新 backing(1.qcow2) 之间的差异 + # 写入 5.qcow2 的数据区,然后改写 5.qcow2 头部 backing 字段为 1.qcow2 +else: + # fullRebase 路径:扁平化(srcPath 为 null 时触发,本例不触发) + qcow2.create_template(cmd.destPath, tmp) → mv tmp cmd.destPath +``` + +**与之对比的 `offline_commit_snapshot`(commit 离线分支用)**: + +``` +top=child, base=parent # 由 LocalStorageKvmBackend.java:3827-3829 注入 +linux.qcow2_commit(top=child, base=parent) # 把 child flush 进 parent +for c in topChildrenInstallPathInDb: + linux.qcow2_rebase_no_check(base=parent, c) # child 的 children 重挂 parent +``` + +**两者方向相反**: +- `offline_merge_snapshot`(pull 用):数据从 dropped 节点 **流入 child**(每个 child 独立拷一份),dropped 文件不动 +- `offline_commit_snapshot`(commit 用):数据从 src(child) **流入 dst(被删 currentRoot)**,DB 后续会互换 installPath + +场景 03 全程使用前者。 + +### 3.3 DB 翻转(`updateDatabaseAfterPull`,对照 `../06-pull-db-rewrite.md`) + +``` +src=2, dst=5 + +更新前: + VO_5.installPath = 5.qcow2 parentUuid = 2 distance = N + VO_2.installPath = 2.qcow2 parentUuid = 1 distance = N-1 + +更新后: + VO_5.parentUuid = 1 ← 跨过 2 + VO_5.distance -= 1 + 
VO_5.installPath 不变(仍 5.qcow2,物理上含合并入的 2-vs-1 差量) + VO_5.size = newInstallPathSize(agent 返回,因合并入差量略增) + VO_2 不变(待轮 4 真删) +``` + +**与场景 02 的对照**: + +| 维度 | 场景 02(Running,commit) | 场景 03(Stopped,pull) | +|---|---|---| +| Agent 路径 | `CommitVolumeSnapshotOnHypervisorMsg` → libvirt blockCommit | `PullVolumeSnapshotOnPrimaryStorageMsg` → `offline_merge_snapshot` → `qcow2_rebase` | +| 物理操作位置 | child 数据进入被删者 | 被删者数据复制进 child(每个 child 各一份) | +| dst.installPath | **互换**:VO_2 ↔ VO_5 path 互换 | **不变**:VO_5 path 仍 5.qcow2 | +| vol.installPath | 同步切到 2.qcow2(关键脆弱点) | 不变(仍指 5.qcow2,VM 关机也不影响)| +| treeUuid 迁移 | dst=2 不是根 → 不迁移;若是根则新建 newTree | pull 路径不涉及 treeUuid 迁移 | +| GroupRef installPath | 同步互换 | 不变 | +| libvirt 调用 | blockCommit + pivot + sibling rebase | 完全不调 | +| 被删快照文件何时清 | libvirt pivot 自动删(VIR_DOMAIN_BLOCK_COMMIT_DELETE,文件名是 5.qcow2) | 轮 4 显式删(文件名是 2.qcow2) | + +### 3.4 翻转后链状态 + +``` +DB 视角: + vol.installPath = 5.qcow2(不变,VM 关机重启时按此 backing chain 启动) + VO_5.installPath = 5.qcow2 parentUuid = 1 ← 含合并入的 2-vs-1 差量 + VO_2.installPath = 2.qcow2 parentUuid = 1 ← 待删 + VO_3.installPath = 3.qcow2 parentUuid = 1 + VO_4.installPath = 4.qcow2 parentUuid = 1 + +物理 backing chain: + vol → 5.qcow2 → 1.qcow2 + 3.qcow2 → 1.qcow2 + 4.qcow2 → 1.qcow2 + 2.qcow2:仍存在但已无人引用(待轮 4 删) +``` + +--- + +## 轮 4:删 VO_2 自身 + +```java +children = [] // VO_5.parentUuid 已跨过 2 指向 1 +deleteVolumeSnapshotAndSyncVolumeSize(comp) +``` + +**消息**:`DeleteVolumeSnapshotOnPrimaryStorageMsg` + +**agent 物理动作**:删 VO_2.installPath = **2.qcow2**(场景 02 删的是 5.qcow2,是因为互换后 VO_2 指向 5;本场景未互换,VO_2 仍指 2.qcow2)。 + +**DB**:VO_2 删除,syncVolumeSize 更新 vol 的 size。 + +--- + +## 终态 + +``` +快照树: + 快照1 + ├─ 快照3 installPath=3.qcow2 backing=1.qcow2 含 (2-1) 差量 + ├─ 快照4 installPath=4.qcow2 backing=1.qcow2 含 (2-1) 差量 + └─ 快照5 ── vol installPath=5.qcow2 backing=1.qcow2 含 (2-1) 差量 + +物理: + 1.qcow2 ← 5.qcow2 ← vol + 1.qcow2 ← 3.qcow2 + 1.qcow2 ← 4.qcow2 + 2.qcow2 已删 +``` + +**注意"差量被复制 3 份"**:场景 03 由于走 pull,被删快照(2)与父(1)之间的差量数据会被分别复制到 3、4、5 
三个文件中,磁盘占用相比场景 02 偏高(场景 02 只有一份合并文件)。这是 commit-vs-pull 的固有差异,与是否在线无关。 + +与场景 02 终态对比: + +| 维度 | 场景 02 终态 | 场景 03 终态 | +|---|---|---| +| 含合并数据的物理文件 | 单个 2.qcow2(VO_5 占用,含 5+2 全合并) | 3.qcow2 / 4.qcow2 / 5.qcow2 各含一份 (2-1) 差量 | +| vol.installPath 指向 | 2.qcow2 | 5.qcow2 | +| 删除掉的物理文件 | 5.qcow2(libvirt 在 pivot 时删)+ 2.qcow2 实际名(互换后归 VO_2,轮 4 走 delete)| 2.qcow2 | +| 总磁盘占用 | 较低(差量只一份) | 较高(差量 N 份,N=child 数) | + +**功能等价**:vol 拉起的 backing chain 长度都是 2 层(`vol → child → 1.qcow2`),用户视角"快照2 已删,3/4/5 仍在"完全一致。 + +--- + +## 全程关键脆弱点(仅梳理,不含加固) + +| 轮 | 失败类型 | 当前后果 | +|---|---|---| +| 1 / 2 | qcow2_rebase 失败(agent crash 或 IO 错) | 该 child 的 backing 可能已部分改写但数据未完成;DB 翻转尚未发生 → 物理仍指 2 / DB 仍指 2,幂等可重试 | +| 1 / 2 | qcow2_rebase 成功 + DB 翻转 SQL 失败 | 物理 child.backing=1,DB child.parentUuid=2 → 不一致 | +| 3 | 同上(对快照5) | 同上 | +| 3 | DB 翻转 SQL 失败 | 物理 5.qcow2 已挂 1,DB 仍记 parentUuid=2 | +| 4 | 删 2.qcow2 失败 | 孤儿文件残留 | + +注意:场景 03 没有 active commit pivot 的状态机问题,也没有 vol.installPath 必须同步切的脆弱点;最大风险只剩"qcow2_rebase 与 DB 翻转两步非原子"。 + +--- + +## 与场景 02 的核心结论 + +1. **agent 入口完全不同**:Stopped → `OFFLINE_MERGE_PATH` (`offline_merge_snapshot` → `qcow2_rebase`);Running → `CommitVolumeSnapshotOnHypervisorMsg` (libvirt blockCommit) 或 `OFFLINE_COMMIT_PATH` (`offline_commit_snapshot` → `qcow2_commit`) +2. **物理数据落地的文件不同**:场景 02 落到 dst(被删者的 path,单一文件);场景 03 落到每个 child(多份副本) +3. **DB 是否互换 path 不同**:commit 互换、pull 不互换;这直接决定加固设计 reconciler I4(installPath 不一致)的检测要在两条路径上分别考虑 +4. **vol.installPath 同步要求不同**:场景 02 必须切(脆弱点),场景 03 不动(天然安全) +5. 
**失败模式不同**:场景 03 没有 active commit pivot 状态机问题,但 "qcow2_rebase + DB 翻转" 仍是两步非原子操作;且失败会以"3/4/5 中某些已 rebase、某些未 rebase"的部分推进态出现 + +--- + +## 附:用户直觉表述与代码事实的对应 + +用户口头描述:"把快照2 的内容合并到快照1,删除快照2,快照 3/4/5 重新指定父节点为 1"。 + +按代码事实拆解: + +| 用户语 | 代码事实 | +|---|---| +| "3/4/5 重新指定父节点为 1" | ✅ `qcow2_rebase(1.qcow2, child.qcow2)` 改写 child 头部 backing 字段;DB `VO_child.parentUuid=1` | +| "把快照2 的内容合并到快照1" | ⚠ 严格意义上 1.qcow2 不被写(只读基线)。等效效果:(快照2 - 快照1) 的差量数据被分别**复制进每个 child**,使每个 child 在新的 1.qcow2 backing 下行为等价于原先在 2.qcow2 backing 下 | +| "删除快照2" | ✅ 轮 4 真正物理删 2.qcow2 | diff --git a/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md b/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md new file mode 100644 index 00000000000..e8117418d49 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md @@ -0,0 +1,349 @@ +# 场景 04:`deleteSingleFlows()` 中 online / offline 分支的判定时序 + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 源码: +> - `VolumeSnapshotTreeBase.java` 行 828-1290(`deleteSingleFlows` / `stepDelete` / `commit` / `pull`) +> - `VolumeTree.java` 行 364-392(`resolveDirection` / `isOnline`) + +--- + +## 总览:online / direction 在哪两步被决定 + +### 极简决策图 + +``` + ┌────────────────────────┐ + │ deleteSingleFlows() │ + │ 查 vmState (一次) │ + └───────────┬────────────┘ + │ + ▼ + ┌────────────────────────┐ + │ stepDelete() (每轮) │ + │ children = ? │ + └───────────┬────────────┘ + │ + ┌─────────────────┼──────────────────┐ + │ │ │ + ▼ ▼ ▼ + children=0 children≥2 children=1 + │ │ │ + │ │ ┌─────┴────────┐ + │ │ │ resolveDir │ + │ │ │ +isOnline │ + │ │ └─────┬────────┘ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌─────────┐ + │ deleteVO │ │ pull │ │ commit │ or pull + │ (终结) │ │ (强制) │ │ │ + └──────────┘ └────┬─────┘ └────┬────┘ + │ │ + └────────┬─────────┘ + ▼ + ┌────────────────┐ + │ online? 
│ + └───┬────────┬───┘ + true │ │ false + ▼ ▼ + Hypervisor PrimaryStorage + Msg Msg +``` + +### 四象限:(direction × online) → agent 入口(一图速查) + +``` + ┌─────────────────────┬─────────────────────┐ + │ online = true │ online = false │ + │ (Running/Paused │ (Stopped/Destroy │ + │ + alive chain) │ 或非 alive) │ + ┌─────────────┼─────────────────────┼─────────────────────┤ + │ Commit │ libvirt blockCommit │ qemu-img commit │ + │ │ + pivot (active) │ child→parent + 子节 │ + │ (默认 / null)│ vm_plugin │ 点 rebase │ + │ │ do_block_commit │ offline_commit_ │ + │ │ │ snapshot │ + ├─────────────┼─────────────────────┼─────────────────────┤ + │ Pull │ block-stream / pull │ qemu-img rebase │ + │ │ on hypervisor │ (parent, child) │ + │ (Auto 在线) │ vm_plugin │ offline_merge_ │ + │ │ do_pull │ snapshot │ + └─────────────┴─────────────────────┴─────────────────────┘ + ↑ ↑ + 场景 02 最后一轮 场景 02 轮 1/2 + 场景 03 全程 +``` + +### "请求 → 多轮 stepDelete → agent 入口"时间线 + +``` +APIDeleteVolumeSnapshotMsg (direction=null/Auto/Pull/Commit) + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ deleteSingleFlows() │ +│ vmState = query (一次, 整请求复用) │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮1 stepDelete children=[3,4,5] │ +│ 多子节点段 → 强制 pull → child=3 → online? 
│ +│ Running+3∈alive → 在线 pull (但本例 3 非 alive → offline)│ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮2 stepDelete children=[4,5] │ +│ 多子节点段 → 强制 pull → child=4 → offline │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮3 stepDelete children=[5] │ +│ 单子节点段 → resolveDirection → Commit/Pull │ +│ → isOnline → true/false │ +│ → commit() or pull() → hypervisor / PS msg │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮4 stepDelete children=[] │ +│ 终结 → deleteVolumeSnapshotAndSyncVolumeSize │ +└──────────────────────────────────────────────────────────────┘ +``` + +--- + +## 详细判定表(保留供查阅) + +整个删除请求只关心**两个布尔 / 枚举判定**: + +| 判定 | 取值 | 决定时机 | 决定位置 | 决定输入 | +|---|---|---|---|---| +| `vmState` | Running / Paused / Stopped / Destroyed / Destroying | `deleteSingleFlows()` flow 开头 | 行 852-859 | `VmInstanceVO.state`(如果 volume 没挂 vm,`vmState=null`) | +| `direction` | Commit / Pull / Auto / null | `stepDelete()` 仅在 **children.size()==1** 时计算 | 行 904-905 | `msg.getDirection()`(API 入参)+ `currentRoot.isLatest()` + `vmState` | +| `online` | true / false | `stepDelete()` 每一轮选完 child 后立即算 | 行 906 / 行 916 | `tree.current` + `vmState` + `target/child ∈ aliveChain` | + +`commit()` / `pull()` 内部再用一次 `online`(参数透传)决定走 hypervisor 消息还是 primary storage 消息。 + +--- + +## 第一步:`vmState` 校验(行 852-859) + +```java +if (volume.getVmInstanceUuid() != null) { + vmState = Q.New(VmInstanceVO.class)...select(state).findValue(); + if (vmState != Running && vmState != Paused + && vmState != Destroyed && vmState != Stopped && vmState != Destroying) { + trigger.fail("vm is not Running/Paused/Destroyed/Stopped/Destroying"); + return; + } +} +``` + +要点: +- volume 未挂 VM → `vmState = null`,后续所有 `online` 计算返回 false(Pull 全走离线) +- 
合法的 vmState:5 种,其中 **Running / Paused 才有可能 online**;Stopped / Destroying / Destroyed 一定 offline +- `vmState` 仅查一次,整个删除请求过程中**复用同一快照值**(不在 stepDelete 每轮重查) + +--- + +## 第二步(每轮):`stepDelete()` 选 child 并判定 online / direction + +行 875-919 的伪流程: + +``` +stepDelete(): + children = tree.getSnapshotLeaf(currentRoot.uuid).getChildren() + + if children.isEmpty(): + deleteVolumeSnapshotAndSyncVolumeSize() # 终结分支,无 online/direction 判定 + return + + onlineChild = children.firstMatch(c -> isOnline(currentRoot, c, vmState)) # ⚠ Bug 0 已修复:改为 isOnAliveChain(c),命名也改为 aliveChild + + if children.size() == 1: + child = children.get(0) + direction = tree.resolveDirection(currentRoot, child, msg.direction, currentRoot.isLatest, vmState) + online = tree.isOnline(current, currentRoot, child, vmState) + if direction == Commit: + commit(child, tree, online, comp) + else: + pull(child, tree, online, comp) + else: + # 多子节点:避开 alive child(让它最后一轮单独跑 commit) + if onlineChild != null && children.get(0) == onlineChild: + child = children.get(1) + else: + child = children.get(0) + online = tree.isOnline(current, currentRoot, child, vmState) + pull(child, tree, online, comp) # 多子节点段恒走 pull,不判定 direction +``` + +### 2.1 `direction` 判定的"作用域" + +`direction` **只在 children.size()==1 时计算并使用**。多子节点段恒走 pull(行 917 `pull(...)`,不调 `resolveDirection`)。也就是说: + +- 多子节点段:`msg.getDirection()` 即使是 Commit,也**被忽略**,强制 pull +- 多子节点段最终把所有非 alive 子节点都推下去后,剩 1 个子节点(通常是 alive child)→ 才进入"判 direction"分支 + +### 2.2 `resolveDirection`(`VolumeTree.java` 行 364-387) + +```java +boolean online = (vmState == Running || Paused) + && aliveChain.contains(target) && aliveChain.contains(child); +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + +if (initialDirection == "Pull" && shouldUseCommitStrategy) + throw "the snapshot will be deleted by block 'commit', but the direction is 'pull'"; + +if (initialDirection == null) return Commit; // 默认 Commit +if (initialDirection == "Auto") return 
shouldUseCommitStrategy ? Commit : Pull; +return DeleteVolumeSnapshotDirection.fromString(initialDirection); // 显式 Commit / Pull +``` + +输入到决策的真值表(current 树 + child=alive child 的常见情形): + +| `vmState` | `targetIsLatest` | online | shouldCommit | initial=Auto | initial=null | initial=Pull | initial=Commit | +|---|---|---|---|---|---|---|---| +| Running | false | true | **true** | **Commit** | Commit | ❌ throw | Commit | +| Running | true | true | false | Pull | Commit | Pull | Commit | +| Stopped | * | false | false | Pull | Commit | Pull | Commit | +| Paused | false | true | true | Commit | Commit | ❌ throw | Commit | + +> ⚠ **Bug 0 修复后**(参考 `../bugs.md`):`shouldUseCommitStrategy` 已解耦 vmState。新规则只看 "target/child 是否都在 aliveChain"。修复后 `Stopped + target/child∈aliveChain` 行:`shouldCommit=true`、`Auto → Commit`、`Pull → ❌ throw`。Stopped + Auto + 待删/child 都在 vol 链上 → 走 offline commit(与场景 05 路径一致),不再写出 N 份差量。 + +注意三个反直觉点: +1. `initial=null` 总是返回 Commit(不看 online) —— Commit 路径在离线下会落到 `CommitVolumeSnapshotOnPrimaryStorageMsg → offline_commit_snapshot` +2. `initial=Pull` 在 shouldCommit 时直接 throw —— API 拒绝 +3. 
`initial=Auto` 才会真正按 online 切换;这是 `APIDeleteVolumeSnapshotMsg` 默认值(前端通常不显式指定 → 走 Auto) + +### 2.3 `isOnline`(`VolumeTree.java` 行 389-392) + +```java +return treeIsCurrent + && (vmState == Running || Paused) + && aliveChain.contains(target) && aliveChain.contains(child); +``` + +四个条件全 true 才返回 true: +- `treeIsCurrent`:该 snapshot 树当前挂在 volume 上(VolumeSnapshotTreeVO.current=true) +- `vmState ∈ {Running, Paused}` +- `target`(被删者)在 aliveChain 上 +- `child`(被选中合并方)在 aliveChain 上 + +**关键观察**:`shouldUseCommitStrategy` 的 online 子句**与 `isOnline` 对 `target/child` 的判定本质相同**(除 `treeIsCurrent` 外)。所以 `direction == Commit` 几乎一定意味着 `online == true`(仅"非 current 树"是反例 —— 但非 current 树通常也不在 aliveChain)。 + +--- + +## 第三步:`commit()` / `pull()` 用 `online` 选 hypervisor 还是 primary storage 消息 + +### 3.1 `commit()` 行 1006-1080 + +```java +if (online) { + String hostUuid = ...VmInstanceVO.hostUuid; + CommitVolumeSnapshotOnHypervisorMsg cmsg = new CommitVolumeSnapshotOnHypervisorMsg(); + ... + bus.send(cmsg); // → KVMHost → vm_plugin do_block_commit (libvirt blockCommit + pivot) +} else { + CommitVolumeSnapshotOnPrimaryStorageMsg cmsg = new CommitVolumeSnapshotOnPrimaryStorageMsg(); + ... + bus.send(cmsg); // → LocalStorageKvmBackend.handle → OFFLINE_COMMIT_PATH → offline_commit_snapshot +} +``` + +### 3.2 `pull()` 行 1227-1268 + +```java +if (online) { + PullVolumeSnapshotOnHypervisorMsg pmsg = new PullVolumeSnapshotOnHypervisorMsg(); + ... + bus.send(pmsg); // → KVMHost → vm_plugin do_block_stream / do_block_commit (取决于 hypervisor 实现) +} else { + PullVolumeSnapshotOnPrimaryStorageMsg pmsg = new PullVolumeSnapshotOnPrimaryStorageMsg(); + ... 
+ bus.send(pmsg); // → LocalStorageKvmBackend.handle → OFFLINE_MERGE_PATH → offline_merge_snapshot +} +``` + +### 3.3 (direction × online) 四象限到 agent 入口 + +| direction | online | Java 消息 | Agent 入口 | 物理操作 | +|---|---|---|---|---| +| Commit | true | `CommitVolumeSnapshotOnHypervisorMsg` | KVM `vm_plugin` `do_block_commit` | libvirt blockCommit (active) + pivot | +| Commit | false | `CommitVolumeSnapshotOnPrimaryStorageMsg` | local `offline_commit_snapshot` | `qcow2_commit(child→parent)` + 给 child 的 children 重 rebase 到 parent | +| Pull | true | `PullVolumeSnapshotOnHypervisorMsg` | KVM `vm_plugin`(pull-on-hypervisor 路径,存储具体逻辑因 backend 而异) | online block-stream / commit 子型 | +| Pull | false | `PullVolumeSnapshotOnPrimaryStorageMsg` | local `offline_merge_snapshot` | `qcow2_rebase(parent, child)`(差量进 child) | + +注意第 2 行(Commit + offline)几乎只在 `initial=null`(前端不传 direction)+ Stopped 下被走到。多子节点段被强制 pull 不会落到这里。 + +--- + +## 第四步:判定时序时间线(一次 stepDelete 调用) + +``` +[控制面入口] + deleteSingleFlows() flow start + │ + ├─ Storage / Memory 类型短路 → deleteVolumeSnapshotAndSyncVolumeSize → end + │ + ├─ vmState = query VmInstanceVO.state # 仅一次 + │ 不在 5 种合法状态 → fail + │ + └─ stepDelete() # 递归入口 + │ + ├─ children = tree.snapshotLeaf(currentRoot).children + ├─ if empty → deleteVolumeSnapshotAndSyncVolumeSize → comp.success → 收敛 + │ + ├─ onlineChild = children.firstMatch(isOnline) # 选 alive child + │ + ├─ if size == 1: + │ direction = resolveDirection(target, child, msg.dir, isLatest, vmState) # ★direction 判定★ + │ online = isOnline(current, target, child, vmState) # ★online 判定★ + │ if Commit → commit(child, tree, online, comp) + │ └─ commit 内: if online → CommitOnHypervisor; else → CommitOnPS + │ else → pull(child, tree, online, comp) + │ └─ pull 内: if online → PullOnHypervisor; else → PullOnPS + │ + └─ if size >= 2: + if onlineChild != null && children.get(0) == onlineChild: + child = children.get(1) # 避开 alive,让它最后做 + online = isOnline(current, target, child, vmState) # ★online 判定★(无 
direction 判定) + pull(child, tree, online, comp) + └─ pull 内: if online → PullOnHypervisor; else → PullOnPS + +[每轮 child 处理完成后] + comp.success() → stepDelete(comp) # 重新拉一轮 children,递归直至 empty +``` + +每轮 stepDelete 至多产生一条 commit 或 pull 消息;vmState 在整个递归中复用,online 每轮单独算(树结构在变,但 vmState 不变 → online 实际由 "child 是否仍在 aliveChain" 决定)。 + +--- + +## 关键结论速查 + +1. **online / offline 不是请求级开关,是"每轮 × 该轮选中 child"级开关** +2. **direction 仅在最后一轮(children.size==1)才参与决策**;多子节点段恒走 pull +3. **vmState ∈ {Stopped, Destroyed, Destroying} → 整个请求所有轮全 offline**(无论 child 是否在 aliveChain) +4. **vmState ∈ {Running, Paused} 但被删快照不在 aliveChain → 仍 offline**(典型如:删的是分叉的 sibling 而非主链) +5. **`initial=null`(前端不传) → direction 一定 Commit**:在 Stopped 时会把 commit 路径打到 `offline_commit_snapshot`;前端如果想 Auto 行为必须显式传 `direction=Auto` +6. **`initial=Pull` 但 shouldCommit → API 直接 throw**:这是个白名单校验,避免在线 alive chain 被强制走 pull 导致 VM 被踢出 + +--- + +## 与场景 02 / 03 / 05 的对应 + +| 场景 | vmState | initial.direction | 多子节点段(轮 1-2) | 最后一轮(children.size=1) | 类型 | +|---|---|---|---|---|---| +| 02 (Running, 删快照2) | Running | Commit / Auto | online=false → 离线 pull (`offlinemerge`) | direction=Commit + online=true → **在线** commit (libvirt blockCommit + pivot) | 源码推演 | +| 03 (Stopped, 删快照2) | Stopped | Auto / Pull | online=false → 离线 pull (`offlinemerge`) | direction=Pull + online=false → 离线 pull (`offlinemerge`,差量进 5.qcow2,DB 不互换) | 源码推演 | +| **05** (Stopped, 删快照2) | Stopped | **Commit** | online=false → 离线 pull (`offlinemerge`) | direction=Commit + online=false → **离线 commit (`offlinecommit`,数据 5→2,DB 互换 + VO_2 DELETE)** | **实测** | + +注:场景 03 的"最后一轮"实际行为取决于 API 入参的 `direction`: +- `direction=null`(无入参)→ resolveDirection 返回 Commit → **同场景 05 路径** +- `direction=Auto` → 因 `online=false` 返回 Pull → `PullVolumeSnapshotOnPrimaryStorageMsg` → `offline_merge_snapshot`((2-1) 差量写入 5.qcow2,5 的 backing 改指 1.qcow2,DB 不互换) +- `direction=Pull` → 不 throw(因 `shouldCommit=false`)→ 同 Auto +- `direction=Commit` → 落到场景 05 实测路径 + +**`03-...stopped-...md` 按 `initial=Auto/Pull` 
口径写**;**`05-...actual.md` 按 `initial=Commit` 实测**。两者覆盖 Stopped 路径的两种 direction 分支。加固设计的"入参矩阵"必须分别覆盖。 diff --git a/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md b/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md new file mode 100644 index 00000000000..78ef39ae929 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md @@ -0,0 +1,295 @@ +# 场景 05:local + 关机 VM + `direction=Commit` + 删 group 2(**实测**) + +> 实测于 5.5.6 基线 ZSV 环境(管理节点 172.26.53.180)。 +> 与 `02-...running-...md` / `03-...stopped-...md` 对照阅读:本文件是**实测真值**,前两个是源码推演。 +> 实测时间:2026-05-13 16:54:56 ~ 16:54:58(总耗时 ~2s)。 + +--- + +## 1. 环境与入参 + +| 项 | 值 | +|---|---| +| VM uuid | `fa51c9637c024d94a556dd474a5cd74e` | +| VM 状态(操作时) | **Stopped** | +| Host | `69a7844559844d7193c42e78095911e2` | +| 主存储 | LocalStorage `a9222f7b445e4d2ebd1f1f958dec2f7c`(`/vms_ds`) | +| Root volume | `8dea4b2bb57b402e90beb510c8784507` | +| 快照树 | `08ab32b181644617bb4f8cd32804a6dd`(current=1) | +| API | `APIDeleteVolumeSnapshotGroupMsg` | +| API uuid | `e56623d94e294f9bbabd7a1a9eaf31f2` | +| Group uuid | `ee59701943554014a95d2badb0b2b98d`(snap-group "2") | +| 入参 direction | **`Commit`** | +| 入参 scope | `single` | +| 结果 | `success=true`,操作 1 个 snapshot:`59897f45b2d841e98ec588da025dc841`(即"快照2") | + +## 2. 
操作前树结构 + +### 2.1 快照 VO 表 + +| 显示名 | snapshot.uuid | parentUuid | distance | latest | installPath 文件名 | +|---|---|---|---|---|---| +| 1 | `aa7290b5…e70c` | NULL | 1 | 0 | `8dea4b2b…4507.qcow2` | +| 2 | `59897f45…c841` | `aa72…e70c` (=1) | 2 | 0 | `aa7290b5…e70c.qcow2` | +| 3 | `92e8b9bc…bc5c` | `59897…c841` (=2) | 3 | 0 | `59897f45…c841.qcow2` | +| 4 | `0baccfe6…d49c` | `59897…c841` (=2) | 3 | 0 | `596c7400…cb54.qcow2` | +| 5 | `be2680f7…5452` | `59897…c841` (=2) | 3 | **1** | `0cabc0f3…cd1a.qcow2` | +| (vol) | volume = `8dea4b2b…4507` | — | — | — | `be2680f7…5452.qcow2` | + +> **命名错位提醒**:ZStack 实现"做快照"为"冻结当前 + 新建当前",所以 snapshot.installPath 的物理**文件名**通常是它**父辈被冻结时的旧文件名**,与该 snapshot 自身的 uuid 不一致。下面用"X.qcow2"代指 VO_X 的物理文件,文件名用括号注明。 + +### 2.2 物理 backing chain(操作前) + +``` +imagecache/template/e4e3cca9…e5c.qcow2 (镜像基线,只读) + ↑ +1.qcow2 (文件: 8dea…4507.qcow2) + ↑ +2.qcow2 (文件: aa72…e70c.qcow2) + ↑ ↑ ↑ +3.qcow2 4.qcow2 5.qcow2 +(59897…c841) (596c…cb54) (0cab…cd1a) + ↑ + vol.qcow2 (be26…5452.qcow2) +``` + +--- + +## 3. 实测 Agent HTTP POST 序列(6 次) + +抓取自 `management-server.log`(`grep 'api=e56623d94e294f9bbabd7a1a9eaf31f2'`)。 + +| # | 时间 | path | 关键参数 | 含义 | +|---|---|---|---|---| +| 1 | 16:54:56.483 | `/localstorage/volume/getbackingchain` | installPath=2.qcow2 | 查"被删者(2)的 backing"→ 得 `srcSnapshotParentPath = 1.qcow2` | +| 2 | 16:54:56.642 | **`/localstorage/snapshot/offlinemerge`** | srcPath=**1.qcow2**
destPath=**4.qcow2** (`596c…cb54`) | **轮 1:离线 pull,4 的 backing 改指 1**(`qcow2_rebase(1, 4)`,(2-1) 差量写入 4)| +| 3 | 16:54:56.949 | **`/localstorage/snapshot/offlinemerge`** | srcPath=**1.qcow2**
destPath=**3.qcow2** (`59897…c841`) | **轮 2:离线 pull,3 的 backing 改指 1**(`qcow2_rebase(1, 3)`,(2-1) 差量写入 3)| +| 4 | 16:54:57.236 | **`/localstorage/snapshot/offlinecommit`** | top=**5.qcow2** (`0cab…cd1a`)
base=**2.qcow2** (`aa72…e70c`)
topChildrenInstallPathInDb=[vol] | **轮 3:离线 commit 5 → 2**(`qcow2_commit(5, 2)` + 给 5 的子节点 rebase 到 2)| +| 5 | 16:54:57.589 | `/localstorage/delete` | path=**5.qcow2** (`0cab…cd1a`) | **轮 4:删除"5 物理文件"**(commit 后被抽空的 top) | +| 6 | 16:54:57.898 | `/localstorage/volume/getsize` | installPath=vol | syncVolumeSize 收尾 | + +> 全程**无** `/kvm/vm/*`(即未调 libvirt blockCommit)—— 关机路径不经 hypervisor。 + +--- + +## 4. 4 轮 stepDelete 对应 + +`VolumeSnapshotTreeBase.stepDelete()` 行 875-919 的执行展开: + +### 轮 1:children = [3, 4, 5],多子节点段(强制 pull,忽略 `direction=Commit`) + +``` +onlineChild = null (Stopped → isOnline 全 false) +child = children.get(0) = 4 ★ 实测选 4,不是 3 +online = false +pull(4, tree, online=false, comp) + → PullVolumeSnapshotOnPrimaryStorageMsg + → LocalStorageKvmBackend.handle → OFFLINE_MERGE_PATH + → OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=4.qcow2, fullRebase=false} + → agent: linux.qcow2_rebase(1.qcow2, 4.qcow2) + # 4.qcow2 backing: 2.qcow2 → 1.qcow2,(2-1) 差量写入 4.qcow2 +DB: VO_4.parentUuid = 1, distance-- +``` + +⚠️ **修订源码推演**:之前 `02 / 03` 文档假设 `children.get(0) = 3`(按 distance/createDate 升序),实测**选到 4**。说明 `VolumeTree.SnapshotLeaf.getChildren()` 返回顺序**不保证按 distance/createDate**,由底层 collection 实现决定。对最终行为无影响(3、4 均非 alive,谁先谁后等价),但加固设计若依赖"3 一定先于 4"应避免此假设。 + +### 轮 2:children = [3, 5],多子节点段 + +``` +child = children.get(0) = 3 +online = false +pull(3, tree, online=false, comp) + → qcow2_rebase(1.qcow2, 3.qcow2) +DB: VO_3.parentUuid = 1, distance-- +``` + +### 轮 3:children = [5],单子节点段(`direction=Commit` 终于生效) + +``` +direction = resolveDirection(2, 5, "Commit", isLatest=true, Stopped) + → return fromString("Commit") = Commit + (initial=Commit 非 Pull、非 null、非 Auto,原样返回; + shouldUseCommitStrategy=false 仅影响 Pull 是否被拒,不拒 Commit) +online = isOnline(current=true, 2, 5, Stopped) = false + (Stopped → 第二个条件失败) +commit(5, tree, online=false, comp) + → online=false 分支 → CommitVolumeSnapshotOnPrimaryStorageMsg + → LocalStorageKvmBackend.handle → OFFLINE_COMMIT_PATH + → 
OfflineCommitSnapshotCmd{ + top = srcSnapshot(=5).installPath = 5.qcow2 (0cab…cd1a), + base= dstSnapshot(=2).installPath = 2.qcow2 (aa72…e70c), + topChildrenInstallPathInDb = [vol.installPath = be26…5452.qcow2] + } + → agent (offline_commit_snapshot): + if qcow2_get_backing_file(5.qcow2) != qcow2_get_backing_file(2.qcow2): + # 5 backing=2, 2 backing=1,两者不同 → 进合并 + linux.qcow2_commit(top=5.qcow2, base=2.qcow2) + # 把 5 的差量 flush 进 2;2 仍 backing=1 + for child in [vol = be26…5452.qcow2]: + if qcow2_get_backing_file(vol) != 2.qcow2: + # vol.backing 当前是 5.qcow2(0cab…cd1a)→ 不等 + linux.qcow2_rebase_no_check(base=2.qcow2, vol) + # vol.backing: 5.qcow2 → 2.qcow2 +``` + +物理结束态: +- `2.qcow2` (aa72…) 内含原 2 + 5 的合并数据,backing 仍是 1.qcow2 +- `5.qcow2` (0cab…) 已被抽空(数据已合并入 2),但**文件还在** +- `vol.qcow2` (be26…) backing 改写为 `2.qcow2` + +### 轮 3 DB 互换(SQLBatch 单事务,与场景 02 同结构) + +``` +src=5 (be26…5452), dst=2 (59897…c841) + +互换前: + VO_5.installPath = 0cab…cd1a.qcow2 parentUuid = 2 distance = 3 + VO_2.installPath = aa72…e70c.qcow2 parentUuid = 1 distance = 2 + vol.installPath = be26…5452.qcow2 + +互换后: + VO_2 整条 DB 记录删除(commit 路径"dst 即被删者",DB 不再保留旧 path) + VO_5.installPath = aa72…e70c.qcow2 parentUuid = 1 distance = 2 ← 接管 2 的物理文件 + vol.installPath = be26…5452.qcow2 (不变,但物理 backing 已切到 aa72…e70c) +``` + +⚠️ **与之前推演的差异**:源码注释推断"VO_2.installPath 互换为 5 的旧文件名",实测**直接删 VO_2**(连同 Group "2"),VO_2 没有保留任何 path 记录。互换发生在 VO_5 这一侧(VO_5 接管原 2 的文件),同时 VO_2 整条删除。 + +### 轮 4:children=[],物理清扫 + +``` +children = [] // VO_5.parentUuid 已跨过 2 指向 1 +deleteVolumeSnapshotAndSyncVolumeSize(comp) + → DeleteVolumeSnapshotOnPrimaryStorageMsg → /localstorage/delete + path = 0cab…cd1a.qcow2 ★ 删的是 5 的原物理文件(已被抽空) + → SyncVolumeSize → /localstorage/volume/getsize + vol.actualSize 更新 +``` + +--- + +## 5. 
操作后实测状态 + +### 5.1 快照 VO 表(实测) + +| name | uuid | parentUuid | distance | latest | installPath 文件名 | +|---|---|---|---|---|---| +| 1 | aa72…e70c | NULL | 1 | 0 | `8dea…4507.qcow2`(不变)| +| 3 | 92e8…bc5c | **aa72…e70c (=1)** | **2** ↓ | 0 | `59897…c841.qcow2`(不变)| +| 4 | 0bac…d49c | **aa72…e70c (=1)** | **2** ↓ | 0 | `596c…cb54.qcow2`(不变)| +| **5** | be26…5452 | **aa72…e70c (=1)** | **2** ↓ | **1** | **`aa72…e70c.qcow2`** ⬅ **变了** | + +VO_2 消失。VolumeSnapshotGroupVO "2" 同步消失。 + +### 5.2 vol.installPath(实测) + +``` +vol.installPath = /vms_ds/.../snapshots/be2680f7…5452.qcow2 +``` + +**未变**(仍是 vol 自己的 uuid 文件)。物理 backing 由原 `0cab…cd1a` 切到 `aa72…e70c`。 + +### 5.3 物理 backing chain(实测 `qemu-img info`) + +``` +imagecache/template/e4e3cca9…e5c.qcow2 + ↑ +8dea…4507.qcow2 [= VO_1 物理文件,未变] + ↑ ↑ ↑ +3.qcow2 4.qcow2 aa72…e70c.qcow2 [= 新 VO_5 物理文件,原 2.qcow2,含 5+2 合并] +(59897…c841) (596c…cb54) ↑ + be26…5452.qcow2 [vol,未变] +``` + +### 5.4 物理 ls(`/vms_ds/rootVolumes/.../snapshots/`) + +| 文件名 | size | 角色 | +|---|---|---| +| `8dea…4507.qcow2` | 18 MiB | VO_1(基础) | +| `aa72…e70c.qcow2` | 6 MiB | **新 VO_5(含合并),原 VO_2 文件被接管** | +| `59897…c841.qcow2` | 6 MiB | VO_3 | +| `596c…cb54.qcow2` | 6 MiB | VO_4 | +| `be26…5452.qcow2` | 18 MiB | vol(当前可写层) | +| ~~`0cabc0f3…cd1a.qcow2`~~ | (已删) | 原 VO_5 物理文件,被轮 4 清除 | +| `92e8…bc5c.qcow2` | 18 MiB | (操作前的 vol 文件?需另查,不影响本场景)| + +--- + +## 6. 
与源码推演(场景 03 - Commit 分支)的差异点回顾 + +| 检查点 | 源码推演 | 实测 | 一致 | +|---|---|---|---| +| 多子节点段强制 pull(忽略 direction) | ✓ | ✓ POST `offlinemerge` 而非 `offlinecommit` | ✅ | +| 多子节点段 `child = children.get(0)` 是 distance/createDate 最小者 | 推测"3" | **实测"4"** | ⚠ 顺序假设错 | +| 单子节点段 `direction=Commit` 显式传入 → resolveDirection 原样返回 Commit | ✓ | ✓ | ✅ | +| `online = false`(Stopped)→ 走 `CommitVolumeSnapshotOnPrimaryStorageMsg` | ✓ | ✓ POST 落到 `/localstorage/snapshot/offlinecommit` | ✅ | +| top=child(5), base=被删者(2), topChildren=[vol] | ✓ | ✓ 完全吻合请求 body | ✅ | +| DB 互换 installPath(VO_5 接管 2 的物理文件) | ✓ | ✓ VO_5.installPath = `aa72…e70c.qcow2` | ✅ | +| VO_2 处理方式 | 推测"互换 path 后保留至轮 4" | **实测直接删除(无保留态)** | ⚠ 互换是单边的 | +| vol.installPath 同步 | 推测"切到 2.qcow2 文件名" | **实测不变**(仍 `be26…5452.qcow2`);切换发生在物理 backing 层 `qcow2_rebase_no_check` | ⚠ DB 层 vol.installPath 是稳定的,"vol 跟随物理文件名"靠 backing 链而非 installPath 字段 | +| 轮 4 物理删 = 旧 5 物理文件(0cab…cd1a) | ✓ | ✓ | ✅ | + +### 关键修订(已影响场景 02 / 03 文档) + +1. **`children.get(0)` 顺序不保证按 distance**:场景 02 / 03 文档中"轮 1 删 3、轮 2 删 4"应改为"具体顺序由底层 collection 决定,3 和 4 中任一先后均合法" +2. **VO_2(dst 被删者)在 DB 中是"删除"而非"互换占位保留"**:场景 02 中关于"VO_2.installPath 互换为 5.qcow2 待轮 4 删"的描述需修正——`updateDatabaseAfterCommit` 直接将 VO_2 DELETE,VO_5 接收新 installPath;轮 4 删的是"VO_5 原文件"而非"VO_2 占位" +3. **`vol.installPath` 不参与互换**:commit 路径下 vol.installPath 字段稳定不变;vol 跟随到合并后文件,是通过**物理 backing 链改写**(`qcow2_rebase_no_check`)+ **VO_5 接管旧 dst 文件**的组合,DB 中 vol VO 的 installPath 字段不动 + +> 这三条修订需要回填到 `02-...running-...md` 和 `03-...stopped-...md`,作为后续修订项记入索引。 + +--- + +## 7. 
关键脆弱点(基于实测路径) + +| 阶段 | 失败 | 后果 | +|---|---|---| +| 轮 1/2 `offlinemerge` | `qcow2_rebase` 失败 / DB 翻转失败 | 某 child 物理 backing 已切但 DB parentUuid 未翻;或反之 | +| 轮 3 `offlinecommit` 第一步 `qcow2_commit(5,2)` 失败 | 2.qcow2 未含合并数据,但代码已发出请求 | DB 未翻转,幂等可重试 | +| 轮 3 `offlinecommit` 中途崩溃(`qcow2_commit` 成功 + `qcow2_rebase_no_check(vol)` 失败) | 2.qcow2 已含合并,vol.backing 仍指 5.qcow2 | DB 未翻转 → 二次删除请求可触发 reconciler 修复 | +| 轮 3 SQLBatch 失败 | 物理已合并 + vol.backing 已切,DB 仍记 vol→VO_5(0cab…) | **VO_2 仍在 DB,VO_5.installPath 仍是 0cab…,但 0cab… 物理文件已被抽空** —— 数据可见性破坏,需 reconciler 介入 | +| 轮 4 `delete` 失败 | 0cab…cd1a 文件残留 | 孤儿文件,无人引用,GC 清扫即可 | + +**Stopped + Commit 路径最严重故障 = 轮 3 物理操作成功 + DB SQLBatch 失败**:物理上 vol 已挂 2.qcow2,但 DB 仍记 vol 挂 5.qcow2(=0cab…cd1a),重启会按 DB 拉起,导致 backing chain 指向**已被抽空但未删除**的 0cab…cd1a 文件,看不到任何已写入 2.qcow2 的数据。 + +加固设计的 reconciler I3b/I4 必须覆盖此场景。 + +--- + +## 8. 一图总结(实测时序) + +``` +16:54:56.076 APIDeleteVolumeSnapshotGroupMsg 进入 + direction=Commit, scope=single, groupUuid=ee59…2b98d + │ +16:54:56.483 POST /getbackingchain (查 2.qcow2 的 backing → 1.qcow2) + │ +16:54:56.642 [轮 1] POST /offlinemerge(srcPath=1, destPath=4) + agent: qcow2_rebase(1.qcow2, 4.qcow2) + DB: VO_4.parentUuid=1, distance-- + │ +16:54:56.949 [轮 2] POST /offlinemerge(srcPath=1, destPath=3) + agent: qcow2_rebase(1.qcow2, 3.qcow2) + DB: VO_3.parentUuid=1, distance-- + │ +16:54:57.236 [轮 3] POST /offlinecommit(top=5, base=2, topChildren=[vol]) + agent: qcow2_commit(5→2) + qcow2_rebase_no_check(2, vol) + [DB SQLBatch] DELETE VO_2; VO_5.installPath=aa72…(原2文件), + VO_5.parentUuid=1, distance-- + │ +16:54:57.589 [轮 4] POST /delete(path=0cab…=旧5物理文件) + │ +16:54:57.898 POST /getsize (vol) → SyncVolumeSize + │ +16:54:58.009 APIDeleteVolumeSnapshotGroupEvent success + results: [{snapshotUuid=59897f45…c841, success=true}] + 总耗时 ≈ 1.93s +``` + +--- + +## 9. 
与场景 02 / 03 / 04 的引用更新建议 + +- `02-...running-...md` 终态表"VO_2.installPath 互换为 5"应修正为"**VO_2 被直接删除**" +- `03-...stopped-...md` 顶部"Stopped + initial=Auto/Pull"小节保留;"Stopped + initial=Commit"分支应**全部引向本文件**而非自行推演 +- `04-deleteSingleFlows-online-offline-decision.md` 末尾"场景 02/03 对应"表添加一行 "场景 05 = Stopped + Commit 实测,最后一轮走 offline commit + DB 互换 + 删 child 旧文件" +- `00-index.md` 添加场景 05 条目 diff --git a/docs/snapshot-single-delete/scenarios/_query_tree.py b/docs/snapshot-single-delete/scenarios/_query_tree.py new file mode 100644 index 00000000000..13bbc53bc7a --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/_query_tree.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +import paramiko +import sys + +HOST = "172.26.53.180" +USER = "root" +PWD = "admin@123" +VM_UUID = "fa51c9637c024d94a556dd474a5cd74e" + +def run(client, cmd): + stdin, stdout, stderr = client.exec_command(cmd) + out = stdout.read().decode("utf-8", errors="replace") + err = stderr.read().decode("utf-8", errors="replace") + return out, err + +def mysql(client, sql): + cmd = "mysql -pzstack.mysql.password zstack -t -e \"" + sql.replace('"', '\\"') + "\"" + out, err = run(client, cmd) + # Filter out mysql password warning + err = "\n".join([l for l in err.splitlines() if "Using a password" not in l and l.strip()]) + return out, err + +def main(): + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect(HOST, username=USER, password=PWD, timeout=10) + + print("=" * 70) + print("1. VM 基本信息") + print("=" * 70) + out, err = mysql(client, + f"SELECT uuid, name, state, rootVolumeUuid, hostUuid FROM VmInstanceVO WHERE uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("2. 
Root Volume 信息") + print("=" * 70) + out, err = mysql(client, + f"SELECT v.uuid, v.name, v.type, v.installPath, v.size, v.primaryStorageUuid, v.rootImageUuid " + f"FROM VolumeVO v JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("3. 快照树 VolumeSnapshotTreeVO") + print("=" * 70) + out, err = mysql(client, + f"SELECT t.uuid AS treeUuid, t.volumeUuid, t.current, t.createDate " + f"FROM VolumeSnapshotTreeVO t JOIN VolumeVO v ON t.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("4. 快照树所有节点 VolumeSnapshotVO") + print("=" * 70) + out, err = mysql(client, + f"SELECT s.uuid, s.name, s.parentUuid, s.treeUuid, s.distance, s.latest, s.size, s.primaryStorageInstallPath " + f"FROM VolumeSnapshotVO s " + f"JOIN VolumeVO v ON s.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid " + f"WHERE vm.uuid='{VM_UUID}' " + f"ORDER BY s.distance, s.createDate\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("5. 
物理 backing chain(在物理机上 qemu-img info)") + print("=" * 70) + # First get rootVolume installPath + out, err = run(client, + f"mysql -pzstack.mysql.password zstack -N -e \"" + f"SELECT v.installPath FROM VolumeVO v JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\"") + root_path = out.strip().splitlines()[-1].strip() if out.strip() else "" + print(f"vol.installPath = {root_path}") + + # Get all snapshot paths + out, err = run(client, + f"mysql -pzstack.mysql.password zstack -N -e \"" + f"SELECT s.name, s.primaryStorageInstallPath FROM VolumeSnapshotVO s " + f"JOIN VolumeVO v ON s.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid " + f"WHERE vm.uuid='{VM_UUID}'\"") + print("快照物理路径列表:") + print(out) + + # Trace backing chain from vol + if root_path: + print(f"\n--- qemu-img info --backing-chain {root_path} ---") + out, err = run(client, f"qemu-img info --backing-chain {root_path} 2>&1") + print(out) + + client.close() + +if __name__ == "__main__": + main() From adfbda76c198ee81a91dcc5d084ef781d4eba894 Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Thu, 14 May 2026 18:22:57 +0800 Subject: [PATCH 4/5] [storage]: symmetric snapshot group disband on chain delete ungroupAfterDeleted now mirrors ungroupAfterDeleteSingleSnapshot: regardless of root/data volume type, a snapshot group VO is only disbanded after ALL of its refs reach snapshotDeleted=true. Previously the root-volume path immediately removed the group VO once the root chain finished, leaving data-volume refs as orphans pointing to a non-existent group. After this change the group lives until every ref is marked deleted, which is the symmetric behavior with the single-snapshot path and the precondition the integrity check (landed in a separate commit) relies on. 
Resolves: ZSV-10538 Change-Id: I77707a78706271697463676a756c617a796f6c61 --- .../snapshot/VolumeSnapshotTreeBase.java | 37 ++++++++++++------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java index 5781bc56308..7aa6adea686 100755 --- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java @@ -2145,26 +2145,37 @@ protected Boolean scripts() { return cleanup; } - // The logic for cleaning up snapshot groups when deleting a snapshot chain + // The logic for cleaning up snapshot groups when deleting a snapshot chain. + // Symmetric with ungroupAfterDeleteSingleSnapshot: regardless of root/data volume type, + // a group is only disbanded after ALL its refs have snapshotDeleted=true. + // This avoids leaving orphan refs (root chain delete used to immediately drop the group VO, + // leaving data-volume refs pointing to a non-existent group). 
private void ungroupAfterDeleted(List snapshots) { List uuids = snapshots.stream().map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); SQL.New(VolumeSnapshotGroupRefVO.class).in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) .set(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).update(); - if (currentRoot.getVolumeType().equals(VolumeType.Root.toString())) { - List groupUuids = new ArrayList<>(); - for (VolumeSnapshotInventory snapshot : snapshots) { - String groupUuid = snapshot.getGroupUuid(); - if (groupUuid != null) { - logger.debug(String.format("root volume snapshot[uuid:%s, name:%s] has been deleted, " + - "ungroup snapshot group[uuid:%s]", snapshot.getUuid(), snapshot.getName(), groupUuid)); - groupUuids.add(groupUuid); - } + Set touchedGroupUuids = snapshots.stream() + .map(VolumeSnapshotInventory::getGroupUuid) + .filter(Objects::nonNull) + .collect(Collectors.toSet()); + + List groupsToDelete = new ArrayList<>(); + for (String groupUuid : touchedGroupUuids) { + long remaining = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, false).count(); + if (remaining == 0) { + logger.debug(String.format("snapshot group[uuid:%s] all volume snapshots have been deleted, " + + "disbanding group", groupUuid)); + groupsToDelete.add(groupUuid); } + } - groupUuids.forEach(groupUuid -> vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid)); - cleanVmHostBackupFilesForGroup(groupUuids); - dbf.removeByPrimaryKeys(groupUuids, VolumeSnapshotGroupVO.class); + if (!groupsToDelete.isEmpty()) { + groupsToDelete.forEach(groupUuid -> vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid)); + cleanVmHostBackupFilesForGroup(groupsToDelete); + dbf.removeByPrimaryKeys(groupsToDelete, VolumeSnapshotGroupVO.class); } } From e02de7afb33560edc67e56c4811f4d4824596605 Mon Sep 17 00:00:00 2001 From: "tao.gan" Date: Thu, 14 May 2026 18:23:09 +0800 Subject: 
[PATCH 5/5] [storage]: VM-scoped snapshot group integrity check + cascade Once symmetric disband is in place, an incomplete snapshot group (some refs deleted, some still alive on the VM) is sticky and must not be silently ignored by subsequent operations on the same VM. Block on the following entries when the VM has any incomplete group (the incomplete group itself is excluded from the check, so users always have a path to clear the debt): - APIDeleteVolumeSnapshotGroupMsg (other groups): blocked unless a new boolean force=true is passed. force is added to the API as the operator bypass; other entries deliberately do NOT expose force. - APICreateVolumeSnapshotGroupMsg: blocked. - APIAttachDataVolumeToVmMsg / APIDetachDataVolumeFromVmMsg: blocked. APIDestroyVmInstanceMsg is intentionally NOT blocked: the new VolumeSnapshotGroupCascadeExtension keys on VmInstanceVO and force-cleans every group VO (refs -> archived metadata -> host backup files -> group VO) when the VM is destroyed, so VM teardown is never wedged by leftover incomplete groups. Single-snapshot APIs are also exempt by design. 
Resolves: ZSV-10538 Change-Id: I717363667067726a796467787568756d6f7a706e --- conf/springConfigXml/volumeSnapshot.xml | 6 + .../APIDeleteVolumeSnapshotGroupMsg.java | 11 ++ .../group/VolumeSnapshotGroupBase.java | 14 ++ .../VolumeSnapshotGroupCascadeExtension.java | 153 ++++++++++++++++++ .../group/VolumeSnapshotGroupChecker.java | 44 +++++ .../storage/volume/VolumeApiInterceptor.java | 30 ++++ 6 files changed, 258 insertions(+) create mode 100644 storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java diff --git a/conf/springConfigXml/volumeSnapshot.xml b/conf/springConfigXml/volumeSnapshot.xml index f2ad0dc93a4..e2afe874754 100755 --- a/conf/springConfigXml/volumeSnapshot.xml +++ b/conf/springConfigXml/volumeSnapshot.xml @@ -41,6 +41,12 @@ + + + + + + diff --git a/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java b/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java index cb1f8dde454..2a534d29203 100644 --- a/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java +++ b/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java @@ -31,6 +31,9 @@ public class APIDeleteVolumeSnapshotGroupMsg extends APIDeleteMessage implements @APIParam(required = false, validValues = {"single", "chain", "auto"}) private String scope = "chain"; + @APIParam(required = false) + private boolean force = false; + @APINoSee private String vmUuid; @@ -58,6 +61,14 @@ public void setScope(String scope) { this.scope = scope; } + public boolean isForce() { + return force; + } + + public void setForce(boolean force) { + this.force = force; + } + public String getVmUuid() { return vmUuid; } diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java index 
a9e47dc2d69..63bbb7e6932 100644 --- a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java @@ -186,6 +186,20 @@ public String getName() { private void handleDelete(APIDeleteVolumeSnapshotGroupMsg msg, NoErrorCompletion completion) { APIDeleteVolumeSnapshotGroupEvent event = new APIDeleteVolumeSnapshotGroupEvent(msg.getId()); + + if (!msg.isForce()) { + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(self.getVmInstanceUuid(), self.getUuid()); + if (!incomplete.isEmpty()) { + event.setError(operr("VM[uuid:%s] has incomplete snapshot group(s) %s, " + + "please clean them up first (or pass force=true) before deleting other snapshot groups", + self.getVmInstanceUuid(), incomplete)); + bus.publish(event); + completion.done(); + return; + } + } + DeleteVolumeSnapshotGroupInnerMsg imsg = new DeleteVolumeSnapshotGroupInnerMsg(); imsg.setUuid(msg.getUuid()); imsg.setDeletionMode(msg.getDeletionMode()); diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java new file mode 100644 index 00000000000..786c4e1330d --- /dev/null +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java @@ -0,0 +1,153 @@ +package org.zstack.storage.snapshot.group; + +import org.springframework.beans.factory.annotation.Autowired; +import org.zstack.core.cascade.AbstractAsyncCascadeExtension; +import org.zstack.core.cascade.CascadeAction; +import org.zstack.core.cascade.CascadeConstant; +import org.zstack.core.db.DatabaseFacade; +import org.zstack.core.db.Q; +import org.zstack.core.db.SQL; +import org.zstack.header.core.Completion; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO; +import 
org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO_; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO_; +import org.zstack.header.vm.VmDeletionStruct; +import org.zstack.header.vm.VmInstanceVO; +import org.zstack.header.vm.additions.VmHostBackupFileVO; +import org.zstack.header.vm.additions.VmHostBackupFileVO_; +import org.zstack.header.vm.additions.VmHostFileManager; +import org.zstack.header.vm.devices.VmInstanceResourceMetadataManager; +import org.zstack.utils.Utils; +import org.zstack.utils.logging.CLogger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Cascade extension keyed on VmInstance for cleaning up VolumeSnapshotGroup VOs + * when a VM is destroyed. + * + * Background: snapshot groups are VM-scoped. When a VM is destroyed, any remaining + * group VOs (whether complete or incomplete due to partial single-snapshot deletions) + * become orphaned. Without this cleanup, those rows would survive beyond the VM + * and pollute downstream queries. + * + * On DELETION_CHECK we do NOT block — VM destroy should proceed even with + * incomplete groups (per product decision); cleanup is automatic. 
+ */ +public class VolumeSnapshotGroupCascadeExtension extends AbstractAsyncCascadeExtension { + private static final CLogger logger = Utils.getLogger(VolumeSnapshotGroupCascadeExtension.class); + + private static final String NAME = VolumeSnapshotGroupVO.class.getSimpleName(); + + @Autowired + private DatabaseFacade dbf; + @Autowired + private VmInstanceResourceMetadataManager vidm; + @Autowired + private VmHostFileManager vmHostFileManager; + + @Override + public void asyncCascade(CascadeAction action, Completion completion) { + if (action.isActionCode(CascadeConstant.DELETION_CLEANUP_CODE)) { + handleDeletionCleanup(action, completion); + } else if (action.isActionCode(CascadeConstant.DELETION_DELETE_CODE, + CascadeConstant.DELETION_FORCE_DELETE_CODE)) { + handleDeletion(action, completion); + } else { + completion.success(); + } + } + + private void handleDeletion(CascadeAction action, Completion completion) { + if (!VmInstanceVO.class.getSimpleName().equals(action.getParentIssuer())) { + completion.success(); + return; + } + + List vmUuids = vmUuidsFromAction(action); + if (vmUuids.isEmpty()) { + completion.success(); + return; + } + + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .in(VolumeSnapshotGroupVO_.vmInstanceUuid, vmUuids) + .listValues(); + if (groupUuids.isEmpty()) { + completion.success(); + return; + } + + logger.debug(String.format("VM destroy cascade: force-removing %d snapshot group(s) %s for vm(s) %s " + + "(includes any incomplete groups from prior single-snapshot deletions)", + groupUuids.size(), groupUuids, vmUuids)); + + // 1. drop all refs first (FK-like constraint via business logic) + SQL.New(VolumeSnapshotGroupRefVO.class) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuids) + .delete(); + + // 2. clean associated metadata + backup files + groupUuids.forEach(vidm::deleteArchiveVmInstanceResourceMetadataGroup); + cleanVmHostBackupFilesForGroup(groupUuids); + + // 3. 
remove group VOs + dbf.removeByPrimaryKeys(groupUuids, VolumeSnapshotGroupVO.class); + + completion.success(); + } + + private void cleanVmHostBackupFilesForGroup(List groupUuids) { + if (groupUuids.isEmpty()) { + return; + } + + List backupUuidList = Q.New(VmHostBackupFileVO.class) + .in(VmHostBackupFileVO_.resourceUuid, groupUuids) + .select(VmHostBackupFileVO_.uuid) + .listValues(); + + backupUuidList.forEach(vmHostFileManager::cleanVmHostBackupFile); + } + + private void handleDeletionCleanup(CascadeAction action, Completion completion) { + try { + dbf.eoCleanup(VolumeSnapshotGroupVO.class); + } catch (Throwable t) { + logger.warn("eoCleanup VolumeSnapshotGroupVO failed: " + t.getMessage()); + } finally { + completion.success(); + } + } + + private List vmUuidsFromAction(CascadeAction action) { + Object ctx = action.getParentIssuerContext(); + if (ctx == null) { + return Collections.emptyList(); + } + List uuids = new ArrayList<>(); + if (ctx instanceof List) { + for (Object o : (List) ctx) { + if (o instanceof VmDeletionStruct) { + uuids.add(((VmDeletionStruct) o).getInventory().getUuid()); + } + } + } + return uuids; + } + + @Override + public List getEdgeNames() { + return Arrays.asList(VmInstanceVO.class.getSimpleName()); + } + + @Override + public String getCascadeResourceName() { + return NAME; + } +} diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java index 9749984de47..3c3f9385953 100644 --- a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java @@ -3,6 +3,7 @@ import org.zstack.core.db.Q; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupAvailability; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO; +import 
org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO_; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO_; import org.zstack.header.vo.ResourceVO; @@ -25,6 +26,49 @@ public static boolean isAvailable(String uuid) { return getAvailability(uuid).isAvailable(); } + /** + * Find all incomplete snapshot groups on a VM. + * An incomplete group is one where part of its refs have snapshotDeleted=true + * but at least one ref is still alive (snapshotDeleted=false). + * Such groups represent a "debt" that pollutes subsequent group/VM operations. + * + * @param vmInstanceUuid the VM to inspect + * @param excludeGroupUuid group uuid to exclude from the result (e.g. when the caller is + * itself trying to delete that group, do not flag it as a blocker); + * pass null to include all groups + * @return list of incomplete group uuids (excluding excludeGroupUuid); empty if none + */ + public static List findIncompleteGroupsOnVm(String vmInstanceUuid, String excludeGroupUuid) { + if (vmInstanceUuid == null) { + return Collections.emptyList(); + } + + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .eq(VolumeSnapshotGroupVO_.vmInstanceUuid, vmInstanceUuid) + .listValues(); + + List incomplete = new ArrayList<>(); + for (Object o : groupUuids) { + String guuid = o.toString(); + if (guuid.equals(excludeGroupUuid)) { + continue; + } + long deletedRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).count(); + if (deletedRefs == 0) { + continue; + } + long totalRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid).count(); + if (deletedRefs < totalRefs) { + incomplete.add(guuid); + } + } + return incomplete; + } + public static List getAvailability(List uuids) { List results = 
new ArrayList<>(); List groups = Q.New(VolumeSnapshotGroupVO.class) diff --git a/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java b/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java index 09e41c229f7..b956b750ce8 100755 --- a/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java +++ b/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java @@ -44,6 +44,7 @@ import org.zstack.header.storage.snapshot.VolumeSnapshotVO; import org.zstack.header.storage.snapshot.VolumeSnapshotVO_; import org.zstack.header.storage.snapshot.group.MemorySnapshotValidatorExtensionPoint; +import org.zstack.storage.snapshot.group.VolumeSnapshotGroupChecker; import org.zstack.header.tag.SystemTagVO; import org.zstack.header.vm.APICreateVmInstanceMsg; import org.zstack.header.vm.DiskAO; @@ -213,6 +214,8 @@ private void validate(APICreateVolumeSnapshotGroupMsg msg) { throw new ApiMessageInterceptionException(argerr("volume[uuid:%s] is not root volume", msg.getRootVolumeUuid())); } + checkIncompleteSnapshotGroupsOnVm(vmvo.getUuid(), "create new snapshot group"); + if (msg.isWithMemory() && !(vmvo.getState().equals(VmInstanceState.Running) || (vmvo.getState().equals(VmInstanceState.Paused)))) { throw new ApiMessageInterceptionException(argerr("Can not take memory snapshot, vm current state[%s], but expect state are [%s, %s]", vmvo.getState().toString(), VmInstanceState.Running.toString(), VmInstanceState.Paused.toString())); @@ -316,9 +319,13 @@ private void validate(APIDetachDataVolumeFromVmMsg msg) { throw new ApiMessageInterceptionException(operr("the volume[uuid:%s, name:%s, type:%s] can't detach it", vol.getUuid(), vol.getName(), vol.getType())); } + + String vmUuid = msg.getVmUuid() != null ? 
msg.getVmUuid() : vol.getVmInstanceUuid(); + checkIncompleteSnapshotGroupsOnVm(vmUuid, "detach data volume"); } private void validate(APIAttachDataVolumeToVmMsg msg) { + checkIncompleteSnapshotGroupsOnVm(msg.getVmInstanceUuid(), "attach data volume"); new SQLBatch() { @Override protected void scripts() { @@ -691,6 +698,29 @@ public boolean start() { return true; } + /** + * Block VM-scoped operations when the VM has any incomplete snapshot group. + * An incomplete group is one whose refs are partially deleted (some snapshotDeleted=true, + * but at least one alive). Such groups must be cleaned up first to avoid pollution + * of subsequent group operations on this VM. + * + * Exempt operations: deleting an incomplete group itself (handled by + * {@code VolumeSnapshotGroupBase#handleDelete} which excludes self), single-snapshot + * deletion, and VM destroy (handled by VolumeSnapshotGroupCascadeExtension cleanup). + */ + private void checkIncompleteSnapshotGroupsOnVm(String vmUuid, String operationDesc) { + if (vmUuid == null) { + return; + } + List incomplete = VolumeSnapshotGroupChecker.findIncompleteGroupsOnVm(vmUuid, null); + if (!incomplete.isEmpty()) { + throw new ApiMessageInterceptionException(operr( + "VM[uuid:%s] has incomplete snapshot group(s) %s, " + + "please clean them up first before %s", + vmUuid, incomplete, operationDesc)); + } + } + @Override public boolean stop() { return true;