diff --git a/conf/springConfigXml/volumeSnapshot.xml b/conf/springConfigXml/volumeSnapshot.xml index f2ad0dc93a4..e2afe874754 100755 --- a/conf/springConfigXml/volumeSnapshot.xml +++ b/conf/springConfigXml/volumeSnapshot.xml @@ -41,6 +41,12 @@ + + + + + + diff --git a/docs/snapshot-single-delete/00-overview.md b/docs/snapshot-single-delete/00-overview.md new file mode 100644 index 00000000000..9dbddf12309 --- /dev/null +++ b/docs/snapshot-single-delete/00-overview.md @@ -0,0 +1,50 @@ +# 单快照节点删除(scope=single)— 总览 + +> 需求:ZSV-5799 "支持删除快照不删除链" +> 关联 MR:zstack#7674 / premium#10776 / zstack-utility#5743 +> 入口 API:`APIDeleteVolumeSnapshotGroupMsg`(含 `direction` + `scope` 字段) + +--- + +## 文档索引 + +| 文档 | 内容 | +|---|---| +| [01-api-and-fields.md](01-api-and-fields.md) | API 入口、字段、枚举定义 | +| [02-call-chain.md](02-call-chain.md) | 处理链路总览(Group → Tree → Storage) | +| [03-direction-resolution.md](03-direction-resolution.md) | `resolveDirection()` 决策表与 fromVOs 构建 | +| [04-scope-and-stepDelete.md](04-scope-and-stepDelete.md) | scope 分支与 stepDelete 递归 | +| [05-commit-db-swap.md](05-commit-db-swap.md) | Commit 路径 DB 翻转(最关键) | +| [06-pull-db-rewrite.md](06-pull-db-rewrite.md) | Pull / pullToVolume DB 改写 | +| [07-group-passthrough.md](07-group-passthrough.md) | Group 透传与并发、失败聚合 | +| [08-hypervisor-online-commit.md](08-hypervisor-online-commit.md) | 在线 libvirt blockCommit + pivot | +| [09-agent-qemu-img.md](09-agent-qemu-img.md) | agent 端 qemu-img 三种命令对比 | +| [10-storage-backend-matrix.md](10-storage-backend-matrix.md) | Local/NFS/SMP/SharedBlock/Ceph 后端差异 | +| [11-sibling-rebase.md](11-sibling-rebase.md) | 分叉链兄弟节点 rebase | +| [12-fullrebase-and-cleanup.md](12-fullrebase-and-cleanup.md) | fullRebase 树根删除与残留清理 | +| [13-premium-and-cdp.md](13-premium-and-cdp.md) | Premium / CDP / 灾备兼容性 | +| [14-limitations-and-todos.md](14-limitations-and-todos.md) | 已知限制 / TODO / FIXME | + +--- + +## 一图概览 + +``` +[祖父] ── [待删节点 X] ── [子 Y] ── ... 
+ │ + ┌──────────┴───────────┐ + │ scope=single │ + │ direction=commit │ 在线VM 且 X≠latest + │ → Y 差量写入 X 文件 │ + │ → DB: 互换 path, Y.parent=X.parent + │ + │ direction=pull │ 离线 或 X=latest + │ → 祖父+X 合并入 Y(rebase) + │ → DB: Y.parent = X.parent +``` + +## 仓库根 + +- `/d/0zw/zw/zstack/` —— 开源主库 +- `/d/0zw/zw/premium/` —— Premium(独立 git) +- `/d/0zw/zw/zstack-utility/` —— Python agent diff --git a/docs/snapshot-single-delete/01-api-and-fields.md b/docs/snapshot-single-delete/01-api-and-fields.md new file mode 100644 index 00000000000..fd7a6f34034 --- /dev/null +++ b/docs/snapshot-single-delete/01-api-and-fields.md @@ -0,0 +1,56 @@ +# 01 — API 入口与字段定义 + +## 1.1 `APIDeleteVolumeSnapshotGroupMsg`(快照组删除) + +**文件**:`header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java:24` + +```java +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; + +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; // 默认保留旧行为 +``` + +REST 路径:`DELETE /volume-snapshots/group/{uuid}` + +## 1.2 `APIDeleteVolumeSnapshotMsg`(单快照删除) + +**文件**:`header/.../APIDeleteVolumeSnapshotMsg.java:49` + +```java +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; + +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; // 默认 chain,向后兼容 +``` + +REST 路径:`DELETE /volume-snapshots/{uuid}` + +## 1.3 枚举类 + +### `DeleteVolumeSnapshotDirection` — `header/.../DeleteVolumeSnapshotDirection.java:3` + +| 值 | 语义 | +|---|---| +| `Pull("pull")` | 下拉方向:父快照内容合入子快照 | +| `Commit("commit")` | 上提方向:子快照内容合入父快照 | +| `Auto("auto")` | 系统自动判断 | + +### `DeleteVolumeSnapshotScope` — `header/.../DeleteVolumeSnapshotScope.java:3` + +| 值 | 语义 | +|---|---| +| `Single("single")` | 只删除当前单节点,保留整条链 | +| `Chain("chain")` | 删除当前节点及所有后代(旧默认) | +| `Auto("auto")` | 系统自动判断(实际等同 single) | + +## 1.4 传递结构体 + 
+`VolumeSnapshotDeletionStructs` — `header/.../VolumeSnapshotDeletionStructs.java:5` +跨层透传 `direction + scope + 快照列表`。 + +## 1.5 兼容性 + +API 默认值 `scope = "chain"` 保持向后兼容;**必须显式传 `scope=single`** 才会触发新功能。 diff --git a/docs/snapshot-single-delete/02-call-chain.md b/docs/snapshot-single-delete/02-call-chain.md new file mode 100644 index 00000000000..54f98cd1c17 --- /dev/null +++ b/docs/snapshot-single-delete/02-call-chain.md @@ -0,0 +1,47 @@ +# 02 — 处理链路总览 + +## 2.1 快照组删除链路 + +``` +APIDeleteVolumeSnapshotGroupMsg + └─ VolumeSnapshotGroupBase.handle() GroupBase.java:163 + └─ handleDelete() GroupBase.java:187 + └─ DeleteVolumeSnapshotGroupInnerMsg (携带 scope/direction) + └─ While 循环每个 VolumeSnapshotVO GroupBase.java:212 + └─ DeleteVolumeSnapshotMsg(scope,direction) + └─ VolumeSnapshotTreeBase + └─ deletion() TreeBase.java:358 + ├─ scope=chain → deleteChainFlows() :487 + └─ scope=single → deleteSingleFlows() :828 + └─ stepDelete() :875 + ├─ 叶节点 → deleteVolumeSnapshotAndSyncVolumeSize + ├─ 单子节点 → resolveDirection → commit() / pull() + └─ 多子节点 → pull() (强制) +``` + +## 2.2 关键透传点 + +`VolumeSnapshotGroupBase.java:221-228`: +```java +DeleteVolumeSnapshotMsg rmsg = new DeleteVolumeSnapshotMsg(); +rmsg.setScope(msg.getScope()); +rmsg.setDirection(msg.getDirection()); +bus.makeTargetServiceIdByResourceUuid(rmsg, VolumeSnapshotConstant.SERVICE_ID, ...); +``` + +## 2.3 关键类索引 + +| 文件 | 作用 | +|---|---| +| `header/.../APIDeleteVolumeSnapshotMsg.java:49` | 单快照 API 入口 | +| `header/.../APIDeleteVolumeSnapshotGroupMsg.java:24` | 快照组 API 入口 | +| `storage/.../group/VolumeSnapshotGroupBase.java:212` | Group → 单快照消息分发 | +| `storage/.../VolumeSnapshotTreeBase.java:473` | scope 分支点 | +| `storage/.../VolumeSnapshotTreeBase.java:875` | stepDelete 递归 | +| `storage/.../VolumeSnapshotTreeBase.java:921` | commit() 流程 | +| `storage/.../VolumeSnapshotTreeBase.java:1097` | pull() 流程 | +| `storage/.../VolumeTree.java:364` | resolveDirection 决策 | +| `storage/.../VolumeTree.java:418/471` | 
updateDatabaseAfter Pull/Commit | +| `plugin/kvm/.../KVMHost.java:1043/1159` | 在线 commit/pull | +| `kvmagent/plugins/vm_plugin.py:3915` | libvirt blockCommit 核心 | +| `zstacklib/utils/linux.py:1389` | qcow2 工具函数 | diff --git a/docs/snapshot-single-delete/03-direction-resolution.md b/docs/snapshot-single-delete/03-direction-resolution.md new file mode 100644 index 00000000000..898a3f99cd4 --- /dev/null +++ b/docs/snapshot-single-delete/03-direction-resolution.md @@ -0,0 +1,101 @@ +# 03 — direction 决策(resolveDirection) + +## 3.1 核心代码 + +**文件**:`storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java:364` + +```java +public DeleteVolumeSnapshotDirection resolveDirection( + String targetSnapshotUuid, // 待删节点(dst, 老节点) + String childSnapshotUuid, // 待删节点的子节点(src, 新节点) + String initialDirection, // 用户传入的 direction + boolean targetSnapshotIsLatest, // 待删节点是否 latest + VmInstanceState vmState) { + + boolean online = + (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) + && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) + && getAliveChainSnapshotUuids().contains(childSnapshotUuid); + + boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + + if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Pull.toString()) + && shouldUseCommitStrategy) { + throw new IllegalArgumentException( + "the snapshot will be deleted by block 'commit', but the direction is 'pull', " + + "change the direction to 'commit' or 'auto'."); + } + + if (initialDirection == null) return DeleteVolumeSnapshotDirection.Commit; + + if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Auto.toString())) { + return shouldUseCommitStrategy + ? 
DeleteVolumeSnapshotDirection.Commit + : DeleteVolumeSnapshotDirection.Pull; + } + + return DeleteVolumeSnapshotDirection.fromString(initialDirection); +} +``` + +## 3.2 决策表 + +| current | targetIsLatest | online | initialDirection | 结果 | +|---|---|---|---|---| +| 任意 | 任意 | 任意 | `null` | **Commit**(兜底) | +| true | false | true | `pull` | **抛 IllegalArgumentException** | +| true | false | true | `auto` | **Commit** | +| 其它组合 | — | — | `auto` | **Pull** | +| 任意 | 任意 | 任意 | `commit` | **Commit** | +| 任意 | 任意 | 任意 | `pull`(合法) | **Pull** | + +## 3.3 关键字段含义 + +| 字段 | 含义 | +|---|---| +| `current` (`VolumeTree.current`,第38行) | 来自 `VolumeSnapshotTreeVO.current`,true 表示快照链尾连着活跃 volume | +| `targetSnapshotIsLatest` | 来自 `VolumeSnapshotVO.latest = 1`,调用方传 `currentRoot.isLatest()` | +| `aliveChain` | volume 沿 backing chain 上溯到根的所有节点,代表"qemu 当前持有的文件链" | + +## 3.4 调用方 + +`VolumeSnapshotTreeBase.java:904`: +```java +DeleteVolumeSnapshotDirection direction = volumeTree.resolveDirection( + currentRoot.getUuid(), // 待删节点 + child.getUuid(), // 子节点 + msg.getDirection(), // 用户传入 + currentRoot.isLatest(), // 来自 DB + vmState); +``` + +## 3.5 `VolumeTree.fromVOs()` 构建过程 + +`VolumeTree.java:260-327`: + +1. 校验:至多一个根(`parentUuid == null`)、至多一个 latest +2. 若 `current && 有 latest`,把 **volume 自身作为虚拟叶节点** 挂到 latest 之后(uuid = volume uuid) +3. HashMap 还原 parent/children +4. 从 volume 虚拟节点向上收集 `aliveChain` + +```java +// 步骤 3:构建树 +Map map = new HashMap<>(); +for (VolumeSnapshotInventory inv : invs) { + VolumeSnapshotLeaf leaf = map.computeIfAbsent(inv.getUuid(), k -> new VolumeSnapshotLeaf()); + leaf.inventory = inv; + if (inv.getParentUuid() != null) { + VolumeSnapshotLeaf parent = map.computeIfAbsent(inv.getParentUuid(), k -> new VolumeSnapshotLeaf()); + parent.children.add(leaf); + leaf.parent = parent; + } else { + tree.root = leaf; + } +} + +// 步骤 4:计算 aliveChain +if (tree.current) { + VolumeSnapshotLeaf leaf = tree.getSnapshotLeaf(volumeInv.getUuid()); + tree.aliveChain = leaf != null ? 
leaf.getAncestors() : new ArrayList<>(); +} +``` diff --git a/docs/snapshot-single-delete/04-scope-and-stepDelete.md b/docs/snapshot-single-delete/04-scope-and-stepDelete.md new file mode 100644 index 00000000000..09fc5722479 --- /dev/null +++ b/docs/snapshot-single-delete/04-scope-and-stepDelete.md @@ -0,0 +1,102 @@ +# 04 — scope 分支与 stepDelete 递归 + +## 4.1 scope 分支点 + +**文件**:`VolumeSnapshotTreeBase.java:473` + +```java +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + deleteChainFlows(); // 旧行为:删当前 + 所有后代 +} else { + deleteSingleFlows(); // single/auto:只删当前节点 +} +``` + +注意:`scope=auto` 也走 `deleteSingleFlows()` 分支;只有显式 `chain` 走级联删除。 + +## 4.2 stepDelete 完整代码 + +**文件**:`VolumeSnapshotTreeBase.java:875-918` + +```java +private void stepDelete(Completion completion) { + // 1) 从 DB 拉取整棵树最新状态 + List vos = Q.New(VolumeSnapshotVO.class) + .eq(VolumeSnapshotVO_.treeUuid, currentRoot.getTreeUuid()).list(); + boolean current = Q.New(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, currentRoot.getTreeUuid()) + .select(VolumeSnapshotTreeVO_.current).findValue(); + + // 2) 重建内存树 + VolumeTree volumeTree = VolumeTree.fromVOs(vos, current, VolumeInventory.valueOf(volume)); + List children = + volumeTree.getSnapshotLeaf(currentRoot.getUuid()).getChildren(); + + // 3) 终止条件:无子节点 + if (children.isEmpty()) { + deleteVolumeSnapshotAndSyncVolumeSize(completion); + return; + } + + // 4) 递归 completion + Completion comp = new Completion(completion) { + @Override public void success() { stepDelete(completion); } + @Override public void fail(ErrorCode e) { completion.fail(e); } + }; + + // 5) 找 online 子节点(vm running/paused 且在 aliveChain) + VolumeSnapshotLeaf onlineChild = children.stream() + .filter(c -> volumeTree.isOnline(current, currentRoot.getUuid(), c.getUuid(), vmState)) + .findFirst().orElse(null); + + VolumeSnapshotLeaf child = children.get(0); + + if (children.size() == 1) { + DeleteVolumeSnapshotDirection direction = 
volumeTree.resolveDirection( + currentRoot.getUuid(), child.getUuid(), + msg.getDirection(), currentRoot.isLatest(), vmState); + boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); + if (direction == Commit) commit(child, volumeTree, online, comp); + else pull(child, volumeTree, online, comp); + } else { + // 多子节点(分叉链) + if (onlineChild != null && child.getUuid().equals(onlineChild.getUuid())) { + child = children.get(1); // 优先处理非 online 子节点 + } + boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); + pull(child, volumeTree, online, comp); // 多子节点统一 pull + } +} +``` + +## 4.3 递归特性 + +| 维度 | 说明 | +|---|---| +| 终止条件 | `children.isEmpty()` | +| 每次递归 | 处理一个子节点;commit/pull 后子节点数 -1 | +| 最坏深度 | 子节点总数(**不是链深度**) | +| 多子节点策略 | 强制 pull;优先非 online 子节点 | +| 失败处理 | `comp.fail()` 直接上抛,**已完成的中间步骤不回滚**,依赖存储幂等 | + +## 4.4 多子节点优先非 online 原因 + +online 子节点的 backing file 正在被 qemu 持有写 I/O,修改它有风险; +先处理非 online 子节点,把它们逐个 pull 掉;最后 online 子节点剩一个,落入"单子节点"分支正常处理。 + +## 4.5 特殊短路 + +`VolumeSnapshotTreeBase.java:836`: +```java +if (VolumeSnapshotConstant.STORAGE_SNAPSHOT_TYPE.toString().equals(currentRoot.getType()) + || Objects.equals(currentRoot.getVolumeType(), VolumeType.Memory.toString())) { + deleteVolumeSnapshotAndSyncVolumeSize(completion); + return; +} +``` + +CDP / 存储快照 / 内存快照绕过 commit/pull,直接调用存储删除。 + +## 4.6 VmState 限制 + +`:854` 仅允许 `Running / Paused / Destroyed / Stopped / Destroying`,其它(如 Migrating / Unknown)直接失败。 diff --git a/docs/snapshot-single-delete/05-commit-db-swap.md b/docs/snapshot-single-delete/05-commit-db-swap.md new file mode 100644 index 00000000000..56240534b1c --- /dev/null +++ b/docs/snapshot-single-delete/05-commit-db-swap.md @@ -0,0 +1,105 @@ +# 05 — Commit DB 翻转(最关键) + +## 5.1 物理时序图 + +``` +Commit 前: + dst.qcow2 ←backing— src.qcow2 ←backing— grandchild.qcow2 + 父 子 孙 + +blockCommit(top=src, base=dst) 完成后: + dst.qcow2 内容 = 原 src 内容(src 的 delta 已 flush 进 dst 文件) + src.qcow2 已被 
DELETE(VIR_DOMAIN_BLOCK_COMMIT_DELETE)或将被回收 + +期望逻辑: + src(保留) ← grandchild ← 但 uuid 不变,所以用 path 互换实现: + +DB 互换: + dst.installPath ← src 旧 path (dst 记录"指"待回收的旧 src 文件) + src.installPath ← dst 旧 path (src 记录"指"已合并的文件) + src.parentUuid ← dst.parentUuid (跨过 dst) + src.distance -= 1 +``` + +## 5.2 为什么互换 path? + +- `blockCommit` 落地的物理文件是 dst 的路径,但数据是 src 的 +- 用户视角"保留的是子节点(src)" +- 互换后:src 这条 DB 记录指向已合并文件(旧 dst 文件路径),dst 这条 DB 记录指向旧 src 文件路径(即将被 `deleteVolumeSnapshotAndSyncVolumeSize` 删除) +- `cleanupAfterDeleteSingleSnapshot` 接下来按 `currentRoot.uuid`(dst 的 uuid)逻辑层删除,但物理文件路径已是旧 src 文件,被回收 + +## 5.3 完整 SQL 操作(`VolumeTree.java:471-545`) + +```java +new SQLBatch() { + @Override + protected void scripts() { + // 1) src 的所有后代 distance -1(不含 src 自身,src 在步骤 5 中更新) + List<String> descendantsUuid = srcLeaf.getDescendants().stream() + .map(...uuid) + .filter(u -> !u.equals(srcLeaf.uuid) && !u.equals(volume.uuid)) + .toList(); + List<VolumeSnapshotVO> vos = Q.New(VolumeSnapshotVO.class) + .in(VolumeSnapshotVO_.uuid, descendantsUuid).list(); + vos.forEach(vo -> vo.setDistance(vo.getDistance() - 1)); + + // 2) dst 是树根 → 新建 VolumeSnapshotTreeVO + VolumeSnapshotTreeVO newTree = null; + if (dstSnapshotInv.getParentUuid() == null) { + newTree = new VolumeSnapshotTreeVO(); + newTree.setUuid(Platform.getUuid()); + newTree.setVolumeUuid(volume.getUuid()); + newTree.setStatus(VolumeSnapshotTreeStatus.Completed); + newTree.setCurrent(descendantsUuid.contains(volume.getUuid())); + if (getAliveChainSnapshotUuids().contains(srcSnapshotInv.getUuid())) { + newTree.setCurrent(true); + } + dbf.persist(newTree); + } + if (!vos.isEmpty() && newTree != null) { + VolumeSnapshotTreeVO finalNewTree = newTree; + vos.forEach(vo -> vo.setTreeUuid(finalNewTree.getUuid())); + } + + // 3) dst 互换 installPath, size + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, dstSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.primaryStorageInstallPath, srcSnapshotInv.getPrimaryStorageInstallPath()) + .set(VolumeSnapshotVO_.size, srcSnapshotInv.getSize()) + .update(); + + // 4) GroupRef 
同步 installPath + if (dstSnapshotInv.getGroupUuid() != null) { + sql(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, dstSnapshotInv.getGroupUuid()) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, dstSnapshotInv.getUuid()) + .set(VolumeSnapshotGroupRefVO_.volumeSnapshotInstallPath, + srcSnapshotInv.getPrimaryStorageInstallPath()) + .update(); + } + + // 5) src 互换 installPath,parentUuid 跨过 dst,distance -1 + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.primaryStorageInstallPath, dstSnapshotInv.getPrimaryStorageInstallPath()) + .set(VolumeSnapshotVO_.size, newInstallPathSize) + .set(VolumeSnapshotVO_.distance, srcSnapshotInv.getDistance() - 1) + .set(VolumeSnapshotVO_.parentUuid, dstSnapshotInv.getParentUuid()) + .set(VolumeSnapshotVO_.treeUuid, + newTree != null ? newTree.getUuid() : srcSnapshotInv.getTreeUuid()) + .update(); + + dbf.updateCollection(vos); + } +}.execute(); // 单事务原子提交 +``` + +## 5.4 commit() 主流程概览 + +`VolumeSnapshotTreeBase.java:921-1094`: + +1. `AllocatePrimaryStorageSpaceMsg` —— 预分配空间 +2. 分支: + - 在线 → `CommitVolumeSnapshotOnHypervisorMsg` → KVMHost → libvirt blockCommit + - 离线 → `CommitVolumeSnapshotOnPrimaryStorageMsg` → 存储后端 → qemu-img commit +3. 透传 `srcChildrenInstallPathInDb`(兄弟节点列表,见 11 节) +4. `updateDatabaseAfterCommit` —— DB 翻转 +5. 
失败 rollback:通过 FlowChain 释放已分配存储空间 diff --git a/docs/snapshot-single-delete/06-pull-db-rewrite.md b/docs/snapshot-single-delete/06-pull-db-rewrite.md new file mode 100644 index 00000000000..027e6f8bd04 --- /dev/null +++ b/docs/snapshot-single-delete/06-pull-db-rewrite.md @@ -0,0 +1,119 @@ +# 06 — Pull / pullToVolume DB 改写 + +## 6.1 Pull 物理语义 + +``` +Pull 前: + grandparent ← src(待删) ← dst(子) ← descendants + +qemu-img rebase(dst → grandparent) 完成后: + dst.qcow2 文件中数据 = 原 dst delta + 原 src delta(合并) + dst 的 backing file = grandparent + src.qcow2 待删除 + +DB 改写: + dst.parentUuid ← src.parentUuid (跨过 src) + dst.distance -= 1 + dst.size = 合并后的实际大小 + 所有后代 distance -1 +``` + +## 6.2 `updateDatabaseAfterPull()` — `VolumeTree.java:418-469` + +```java +public void updateDatabaseAfterPull(VolumeSnapshotInventory srcSnapshotInv, + VolumeSnapshotLeaf dstSnapshotLeaf, long newInstallPathSize) { + + VolumeSnapshotInventory dstSnapshotInv = dstSnapshotLeaf.getInventory(); + + new SQLBatch() { + @Override + protected void scripts() { + // 1) 收集 dst 及所有后代(不含 volume 虚拟节点) + List descendantsUuid = dstSnapshotLeaf.getDescendants().stream() + .map(...uuid) + .filter(u -> !u.equals(volume.uuid)) + .toList(); + List vos = q(VolumeSnapshotVO.class) + .in(VolumeSnapshotVO_.uuid, descendantsUuid).list(); + + // 2) distance -1;dst 节点特殊处理 + vos.forEach(vo -> { + vo.setDistance(vo.getDistance() - 1); + if (vo.getUuid().equals(dstSnapshotInv.getUuid())) { + vo.setParentUuid(srcSnapshotInv.getParentUuid()); + vo.setSize(newInstallPathSize); + } + }); + + // 3) src 是树根 → 新建 VolumeSnapshotTreeVO,后代迁移 + VolumeSnapshotTreeVO newTree = null; + if (srcSnapshotInv.getParentUuid() == null) { + newTree = new VolumeSnapshotTreeVO(); + newTree.setCurrent(descendantsUuid.contains(volume.getUuid())); + newTree.setVolumeUuid(volume.getUuid()); + newTree.setUuid(Platform.getUuid()); + newTree.setStatus(VolumeSnapshotTreeStatus.Completed); + if (getAliveChainSnapshotUuids().contains(dstSnapshotInv.getUuid())) { + 
newTree.setCurrent(true); + } + dbf.persist(newTree); + VolumeSnapshotTreeVO finalNewTree = newTree; + vos.forEach(vo -> vo.setTreeUuid(finalNewTree.getUuid())); + } + + dbf.updateCollection(vos); + + // 4) 新树建好且 dst 就是 volume 自身(pull-to-volume 边界)→ 原树标记非 current + if (newTree != null && dstSnapshotInv.getUuid().equals(volume.getUuid()) + && q(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()).count() == 1) { + sql(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()) + .set(VolumeSnapshotTreeVO_.current, false).update(); + } + } + }.execute(); +} +``` + +## 6.3 `updateDatabaseAfterPullToVolume()` — `VolumeTree.java:396-416` + +特殊场景:dst 是 volume 自身(即 latest 快照被合并进活跃 volume 文件)。 + +```java +public void updateDatabaseAfterPullToVolume(VolumeSnapshotInventory srcSnapshotInv) { + new SQLBatch() { + @Override + protected void scripts() { + // 1) src(latest)标记为非 latest + sql(VolumeSnapshotVO.class).eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getUuid()) + .set(VolumeSnapshotVO_.latest, false).update(); + + // 2) src 的父节点成为新的 latest + if (srcSnapshotInv.getParentUuid() != null) { + sql(VolumeSnapshotVO.class) + .eq(VolumeSnapshotVO_.uuid, srcSnapshotInv.getParentUuid()) + .set(VolumeSnapshotVO_.latest, true).update(); + } + + // 3) src 是树根 → 整棵树 current=false(链空了) + if (srcSnapshotInv.getParentUuid() == null) { + sql(VolumeSnapshotTreeVO.class) + .eq(VolumeSnapshotTreeVO_.uuid, srcSnapshotInv.getTreeUuid()) + .set(VolumeSnapshotTreeVO_.current, false).update(); + } + } + }.execute(); +} +``` + +## 6.4 pull() 主流程概览(`VolumeSnapshotTreeBase.java:1097-1304`) + +1. `GetVolumeBackingChainFromPrimaryStorageMsg` —— 取祖父路径 +2. `AllocatePrimaryStorageSpaceMsg` +3. 分支: + - 在线 → `PullVolumeSnapshotOnHypervisorMsg` → libvirt block stream + - 离线 → `PullVolumeSnapshotOnPrimaryStorageMsg` → `qemu-img rebase` +4. `updateDatabaseAfterPull` / `updateDatabaseAfterPullToVolume` +5. 
失败 rollback:释放分配空间 diff --git a/docs/snapshot-single-delete/07-group-passthrough.md b/docs/snapshot-single-delete/07-group-passthrough.md new file mode 100644 index 00000000000..e052063d211 --- /dev/null +++ b/docs/snapshot-single-delete/07-group-passthrough.md @@ -0,0 +1,98 @@ +# 07 — Group 透传与并发、失败聚合 + +## 7.1 入口排队 + +**`VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg)`** — `:163` + +```java +private void handle(APIDeleteVolumeSnapshotGroupMsg msg) { + thdf.chainSubmit(new ChainTask(msg) { + @Override + public String getSyncSignature() { return id; } // "volumeSnapshotGroup-" + @Override + public void run(SyncTaskChain chain) { + handleDelete(msg, new NoErrorCompletion(chain) { + @Override public void done() { chain.next(); } + }); + } + }); +} +``` + +按 group uuid 串行排队,防止同一 group 并发删除。 + +## 7.2 API → Inner 转发 + +**`handleDelete`** — `:187-210` + +```java +DeleteVolumeSnapshotGroupInnerMsg imsg = new DeleteVolumeSnapshotGroupInnerMsg(); +imsg.setUuid(msg.getUuid()); +imsg.setDeletionMode(msg.getDeletionMode()); +imsg.setScope(msg.getScope()); // ← 透传 +imsg.setDirection(msg.getDirection()); // ← 透传 + +overlaySend(imsg, new CloudBusCallBack(msg) { ... 
}); +// overlaySend:包成 VolumeSnapshotGroupOverlayMsg,路由到 VmInstance mailbox +// 保证"快照组删除"与"VM 状态变更"互斥 +``` + +## 7.3 真正的并行循环 + +**`handle(DeleteVolumeSnapshotGroupInnerMsg)`** — `:212-254` + +```java +SimpleFlowChain.of("delete-volume-snapshot-group") + .then("delete-volume-snapshots", trigger -> + new While<>(snapshots).step((snapshot, compl) -> { + DeleteVolumeSnapshotMsg rmsg = new DeleteVolumeSnapshotMsg(); + rmsg.setSnapshotUuid(snapshot.getUuid()); + rmsg.setVolumeUuid(snapshot.getVolumeUuid()); + rmsg.setTreeUuid(snapshot.getTreeUuid()); + rmsg.setDeletionMode(msg.getDeletionMode()); + rmsg.setScope(msg.getScope()); // ← 逐快照透传 + rmsg.setDirection(msg.getDirection()); // ← 逐快照透传 + + bus.makeTargetServiceIdByResourceUuid(rmsg, VolumeSnapshotConstant.SERVICE_ID, + getResourceIdToRouteMsg(snapshot)); + + bus.send(rmsg, new CloudBusCallBack(compl) { + @Override + public void run(MessageReply r) { + reply.addResult(new DeleteSnapshotGroupResult( + rmsg.getSnapshotUuid(), + rmsg.getVolumeUuid(), + r.getError())); + compl.done(); // 不短路 + } + }); + }, 5) // ← 并发度 5 + .run(new WhileDoneCompletion(msg) { + @Override + public void done(ErrorCodeList errs) { + trigger.next(); // 错误聚合在 reply.results + } + })) + .then("delete-vm-host-backup-files", trigger -> { + vmHostFileManager.cleanVmHostBackupFile(self.getUuid()); + trigger.next(); + }) + .done(() -> bus.reply(msg, reply)) + .error(errorCode -> { + reply.setError(errorCode); + bus.reply(msg, reply); + }) + .start(); +``` + +## 7.4 关键设计点 + +| 维度 | 说明 | +|---|---| +| 按卷分组 | `getEffectiveSnapshots()` 过滤出当前 VM 各卷的快照 | +| 并发度 | **5**(`While.step(..., 5)`) | +| 失败处理 | 每条独立 `compl.done()`,**不短路** | +| 错误聚合 | `reply.addResult(snapshotUuid, volumeUuid, errorCode)` | +| 整体回滚 | **无**;部分成功保留,返回结果列表 | +| 前置检查 | 删除流程**不**检查 `VolumeSnapshotGroupAvailability` | +| 入口唯一性 | `APIDeleteVolumeSnapshotGroupMsg` 与 `DeleteVolumeSnapshotGroupInnerMsg` 都只在此类处理 | diff --git a/docs/snapshot-single-delete/08-hypervisor-online-commit.md 
b/docs/snapshot-single-delete/08-hypervisor-online-commit.md new file mode 100644 index 00000000000..16e1ff0e9ee --- /dev/null +++ b/docs/snapshot-single-delete/08-hypervisor-online-commit.md @@ -0,0 +1,128 @@ +# 08 — Hypervisor 在线 commit(libvirt blockCommit + pivot) + +## 8.1 入口 + +**HTTP**:`POST /vm/volume/blockcommit`(`KVMConstant.KVM_BLOCK_COMMIT_VOLUME_PATH`) + +**Python**:`kvmagent/kvmagent/plugins/vm_plugin.py:9845` + +## 8.2 `do_block_commit()` 完整流程 + +`vm_plugin.py:3915-3983`: + +```python +def do_block_commit(self, task_spec, volume): + def do_block_commit_disk(task_spec, disk_name, top, base, active_commit): + def wait_job(_): + return not self._wait_for_block_job(disk_name, abort_on_error=True) + + def check_overlay_file(path): + if not active_commit: + return True + return self._check_target_disk_existing_by_path(path, True) + + def abort_block_commit_job(_): + flag = libvirt.VIR_DOMAIN_BLOCK_JOB_ABORT_ASYNC + if active_commit: + flag = libvirt.VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT + try: + if not self.domain.blockJobInfo(disk_name, 0): + return True + self.domain.blockJobAbort(disk_name, flag) + return True + except Exception as e: + logger.warn("pivot active layer failed, %s" % e) + return False + + # flags 组合 + if active_commit: + flags = libvirt.VIR_DOMAIN_BLOCK_COMMIT_RELATIVE + flags |= libvirt.VIR_DOMAIN_BLOCK_COMMIT_ACTIVE + else: + flags = libvirt.VIR_DOMAIN_BLOCK_COMMIT_DELETE + + # 发起 blockCommit + self.domain.blockCommit(disk_name, base, top, 0, flags) + touchQmpSocketWhenExists(task_spec.vmUuid) + + # 等数据同步 + if not linux.wait_callback_success(wait_job, timeout=d.get_remaining_timeout(), + ignore_exception_in_callback=True): + if not check_overlay_file(base): + raise kvmagent.KvmError('block commit failed') + + # pivot 或普通结束 + if not linux.wait_callback_success(abort_block_commit_job, d.get_remaining_timeout(), + ignore_exception_in_callback=True): + raise kvmagent.KvmError('block commit abort failed') + + # 确认 overlay(top)消失 + if not 
linux.wait_callback_success(check_overlay_file, base, d.get_remaining_timeout(), + ignore_exception_in_callback=True): + raise kvmagent.KvmError('block commit succeeded, but overlay file is not cleared') + + return base + + target_disk, disk_name = self._get_target_disk(volume) + top = get_volume_actual_installpath(task_spec.top) + base = get_volume_actual_installpath(task_spec.base) + install_path = VmPlugin.get_source_file_by_disk(target_disk) + active_commit = (top == install_path) # ← 关键判定 + + with BlockCommitDaemon(task_spec, self, disk_name, top=top, base=base, + active_commit=active_commit) as d: + return do_block_commit_disk(task_spec, disk_name, task_spec.top, + task_spec.base, active_commit) +``` + +## 8.3 libvirt flags 矩阵 + +| Flag | 作用 | +|---|---| +| `VIR_DOMAIN_BLOCK_COMMIT_DELETE` | 完成后自动删除 top 文件(非 active commit) | +| `VIR_DOMAIN_BLOCK_COMMIT_ACTIVE` | top 是活跃层,两阶段模式(需 pivot) | +| `VIR_DOMAIN_BLOCK_COMMIT_RELATIVE` | backing 用相对路径 | +| `VIR_DOMAIN_BLOCK_COMMIT_SHALLOW` | 只提交一层(**本代码未使用**) | + +## 8.4 Active commit 双阶段 pivot 流程 + +``` +Phase 1(数据同步): + blockCommit() → qemu 把 top delta 写进 base + VM 持续写 top,qemu 增量同步 + 轮询 blockJobInfo 直到 ready + +Phase 2(pivot): + blockJobAbort(VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) + → qemu 原子切换活跃层 top → base + → top 变游离,VM 后续写直接落 base + +最后 check_overlay_file 确认 pivot 成功 +``` + +**为什么需要 pivot**:VM 正在运行,top 文件实时被写;不能直接删 top,必须先让 qemu 把活跃层切到 base。 + +## 8.5 关键辅助函数 + +`_get_snapshot_size()` — `vm_plugin.py:8946`: +```python +@staticmethod +def _get_snapshot_size(install_path): + size = linux.get_local_file_disk_usage(install_path) # du -sb(actual size) + if size is None or size == 0: + if install_path.startswith("/dev/"): + size = int(lvm.get_lv_size(install_path)) # LV 场景 + else: + size = linux.qcow2_virtualsize(install_path) # 兜底 + return size +``` + +返回 **actual size**(实际占用),SharedBlock 走 LV 大小。 + +## 8.6 active_commit 判定 + +```python +active_commit = (top == install_path) +``` + +`install_path` 是 libvirt domain XML 中 
disk 当前的 source file,等于活跃层路径。当 `top` 等于活跃层时即 active commit。 diff --git a/docs/snapshot-single-delete/09-agent-qemu-img.md b/docs/snapshot-single-delete/09-agent-qemu-img.md new file mode 100644 index 00000000000..0d2f4398baf --- /dev/null +++ b/docs/snapshot-single-delete/09-agent-qemu-img.md @@ -0,0 +1,100 @@ +# 09 — agent 端 qemu-img 三种命令对比 + +## 9.1 三个函数定义 + +**文件**:`zstacklib/zstacklib/utils/linux.py:1389-1432` + +```python +# 1389:qcow2_commit +def qcow2_commit(top, base): + shell.call('%s -f qcow2 -b %s %s' % (qemu_img.subcmd('commit'), base, top)) + # qemu-img commit -f qcow2 -b <base> <top> + # 语义:top delta → base,base 内容更新,top 不被自动删 + +# 1395:qcow2_rebase(安全 rebase) +def qcow2_rebase(backing_file, target): + if backing_file: + fmt = get_img_fmt(backing_file) + backing_option = '-F %s -b "%s"' % (fmt, backing_file) + else: + backing_option = '-b "%s"' % backing_file + + # virtual size 一致性自动扩容 + top_virtual_size = int(qcow2_get_virtual_size(target)) + backing_chain = qcow2_get_backing_chain(target) + for idx, bf in enumerate(backing_chain): + if idx == len(backing_chain)-1 and get_img_fmt(bf) != 'qcow2': + break + bf_virtual_size = int(qcow2_get_virtual_size(bf)) + if bf_virtual_size < top_virtual_size: + qemu_img_resize(bf, top_virtual_size) + if bf == backing_file: + break + + with TempAccessible(target): + shell.call('%s -f qcow2 %s %s' % (qemu_img.subcmd('rebase'), backing_option, target)) + # qemu-img rebase -f qcow2 -F <fmt> -b "<backing_file>" <target> + +# 1416:qcow2_rebase_no_check(unsafe rebase) +def qcow2_rebase_no_check(backing_file, target, backing_fmt=None): + fmt = backing_fmt if backing_fmt else get_img_fmt(backing_file) + with TempAccessible(target): + shell.call('%s -F %s -u -f qcow2 -b "%s" %s' % ( + qemu_img.subcmd('rebase'), fmt, backing_file, target)) + # qemu-img rebase -F <fmt> -u -f qcow2 -b "<backing_file>" <target> +``` + +## 9.2 精确差异对比 + +| 函数 | 命令模板 | -u | 读旧 backing | 重写 delta | 用途 | +|---|---|---|---|---|---| +| `qcow2_commit` | `qemu-img commit -f qcow2 -b <base> <top>` | — | 读 top | 否(合并) | top delta 合入 base 
| +| `qcow2_rebase` | `qemu-img rebase -f qcow2 -F <fmt> -b "<backing_file>" <target>` | 无 | **读旧/新 backing** | **是** | 安全换 backing | +| `qcow2_rebase_no_check` | `qemu-img rebase -F <fmt> -u -f qcow2 -b "<backing_file>" <target>` | **有** | 否 | 否 | 只改头部指针 | + +## 9.3 Unsafe rebase 数据语义 + +`-u`(unsafe): +- **不读取**旧 / 新 backing file 数据 +- **直接修改** target 文件 QCOW2 header 中的 `backing_file` 字段 +- 前提:新旧 backing 在 target 引用的块上**数据一致**(否则读出错误数据) + +在 single 删除场景,commit 完成后 base 的内容 = 原 src 内容,所以原本以 src 为 backing 的节点(即 src 的子节点)把 backing 从 src 改到 base 是**安全的**。 + +## 9.4 安全 rebase 的自动扩容 + +`qcow2_rebase` 遍历 backing chain,发现 backing 的 virtual size 比 target 小时,调用 `qemu_img_resize` 自动扩容,防止 rebase 后读越界。 + +## 9.5 SharedBlock LV 扩容(pull 时) + +**文件**:`shared_block_plugin.py:1247-1285` + +```python +total_required_size = self.get_total_required_size(dst_abs_path) +current_size = int(lvm.get_lv_size(dst_abs_path)) +if not cmd.fullRebase: + if current_size < total_required_size: + lvm.extend_lv_from_cmd(dst_abs_path, total_required_size, cmd, + extend_thin_by_specified_size=True) + with lvm.RecursiveOperateLv(src_abs_path, shared=True): + linux.qcow2_rebase(src_abs_path, dst_abs_path) +``` + +```python +# get_total_required_size — shared_block_plugin.py:967 +@staticmethod +def get_total_required_size(abs_path): + virtual_size = linux.qcow2_virtualsize(abs_path) + total_size = -1 + if linux.get_img_fmt(abs_path) == "qcow2": + try: + total_size = linux.qcow2_measure_required_size(abs_path) + # qemu-img measure:预测完整合并后的最小大小 + except Exception as e: + logger.warn(...) 
+ if total_size > virtual_size or total_size == -1: + total_size = virtual_size + return total_size +``` + +**为什么 pull 需要扩 LV**:pull 把 src 数据合并进 dst,dst 物理占用上升;如果当前 LV 容量不够,提前扩容避免写入失败。 diff --git a/docs/snapshot-single-delete/10-storage-backend-matrix.md b/docs/snapshot-single-delete/10-storage-backend-matrix.md new file mode 100644 index 00000000000..a4789d22fa0 --- /dev/null +++ b/docs/snapshot-single-delete/10-storage-backend-matrix.md @@ -0,0 +1,108 @@ +# 10 — 存储后端支持矩阵 + +## 10.1 支持情况汇总 + +| 存储类型 | scope=single | 在线 commit | 离线 commit | pull | 备注 | +|---|---|---|---|---|---| +| **LocalStorage** | ✅ | KVMHost | `/localstorage/snapshot/offlinecommit` | `/localstorage/snapshot/offlinemerge` | qcow2 文件 | +| **NFS** | ✅ | KVMHost | `/nfsprimarystorage/offlinesnapshotcommit` | `/nfsprimarystorage/offlinesnapshotmerge` | qcow2 文件 | +| **SMP** | ✅ | KVMHost | `OFFLINE_COMMIT_SNAPSHOT_PATH` | `OFFLINE_MERGE_SNAPSHOT_PATH` | 共享挂载点 | +| **SharedBlock** | ✅ | KVMHost | 同 + 扩 LV | 同 + 扩 LV | LVM + qcow2 | +| **Ceph (RBD)** | ⚠️ 受限 | ❌ | ❌ | ❌ | RBD snapshot 不支持合并 | + +## 10.2 LocalStorage + +**Java**:`LocalStorageKvmBackend.java:3825/3846` + +```java +// 离线 commit +postRequest("/localstorage/snapshot/offlinecommit", cmd); +// 离线 pull +postRequest("/localstorage/snapshot/offlinemerge", cmd); +``` + +**Python**:`kvmagent/plugins/localstorage.py:835/859` + +```python +# offline_commit_snapshot +if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + +if cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +``` + +```python +# offline_merge_snapshot +src_path = cmd.srcPath if not cmd.fullRebase else "" +if linux.qcow2_get_backing_file(cmd.destPath) == src_path: + return # 幂等 +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath, cmd.destPath) +else: + tmp = .../%s.qcow2 % 
uuid + qcow2.create_template_with_task_daemon(cmd.destPath, tmp, task_spec=cmd) + shell.call("mv %s %s" % (tmp, cmd.destPath)) +``` + +## 10.3 NFS + +**Java**:`NfsPrimaryStorageKVMBackend.java:1996/2031` + +**Python**:`nfs_primarystorage_plugin.py:601/625` + +逻辑与 LocalStorage 几乎一致(同样用 qcow2_commit / qcow2_rebase)。 + +## 10.4 SMP(SharedMountPoint) + +**Java**:`smp/KvmBackend.java:2443/2466` + +**Python**:`shared_mountpoint_plugin.py:483/506` + +逻辑同 NFS。 + +## 10.5 SharedBlock + +**Python**:`shared_block_plugin.py:1247/1285` + +```python +# offline_merge:扩 LV + 激活 LV + rebase +total_required_size = self.get_total_required_size(dst_abs_path) +if current_size < total_required_size: + lvm.extend_lv_from_cmd(dst, total_required_size, cmd, extend_thin_by_specified_size=True) +with lvm.RecursiveOperateLv(src_abs_path, shared=True): + linux.qcow2_rebase(src_abs_path, dst_abs_path) +``` + +```python +# offline_commit:commit 后清理 base 元数据 +with lvm.RecursiveOperateLv(top, shared=True): + if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + if cmd.topChildrenInstallPathInDb: + for c in cmd.topChildrenInstallPathInDb: + with lvm.RecursiveOperateLv(c, shared=True): + if linux.qcow2_get_backing_file(c) != base: + linux.qcow2_rebase_no_check(base, c) +lvm.delete_lv_meta(base) +``` + +## 10.6 Ceph + +`CephPrimaryStorageBase` **未实现** `CommitVolumeSnapshotOnPrimaryStorageMsg` / `PullVolumeSnapshotOnPrimaryStorageMsg`。 + +例外:`CephPrimaryStorageBase.java:2984` 临时快照删除时硬编码 `scope=Single, direction=Commit`,但仅用于撤销临时快照场景。 + +普通 RBD 快照:`cephdriver.py:87` 的 `delete_snapshot` 直接调 `rbd snap rm`,**不支持中间节点合并**。 + +**结论**:Ceph 普通快照不支持 `scope=single`。 + +## 10.7 在线场景统一走 KVMHost + +`plugin/kvm/.../KVMHost.java:1043/1159`: +- `commitVolumeSnapshot` → `POST /vm/volume/blockcommit` +- `pullVolumeSnapshot` → `POST /vm/volume/blockpull` + +所有支持的存储类型在 VM 在线时都走 libvirt blockCommit / blockPull,由 KVMHost 统一处理。 diff --git 
a/docs/snapshot-single-delete/11-sibling-rebase.md b/docs/snapshot-single-delete/11-sibling-rebase.md new file mode 100644 index 00000000000..e3269317bba --- /dev/null +++ b/docs/snapshot-single-delete/11-sibling-rebase.md @@ -0,0 +1,113 @@ +# 11 — 兄弟节点 rebase(分叉链关键) + +## 11.1 问题背景 + +当 commit 完成后,待删节点 X 还可能有除 src 外的其他子节点(兄弟节点),它们的 backing file 仍然指向 X 的旧物理路径,必须重新指向 base(dst)才能继续访问。 + +``` +分叉链示例: + X (待删) + / \ + src sibling1 + | | + descend ... + +commit(src → X) 完成后: + X 物理文件内容已变成 src 数据 + sibling1 的 backing 仍指向 X 旧路径 → 必须 rebase 到 base +``` + +## 11.2 Java 侧收集兄弟节点路径 + +**文件**:`VolumeSnapshotTreeBase.java:1012-1024` + +```java +// commit flow 内部 +List childrenInstallPath = child.getChildren().stream() + .map(c -> c.getInventory().getPrimaryStorageInstallPath()) + .collect(Collectors.toList()); +// child = src 节点 +// child.getChildren() = src 的所有子节点 + +// 在线消息 +CommitVolumeSnapshotOnHypervisorMsg cmsg = new CommitVolumeSnapshotOnHypervisorMsg(); +cmsg.setSrcChildrenInstallPathInDb(childrenInstallPath); + +// 离线消息(1044 行) +cmsg.setSrcChildrenInstallPathInDb(childrenInstallPath); +``` + +**注意**:按上面的代码,`childrenInstallPath` 取自 `child.getChildren()`,即 src(commit 的 top)自己的子节点,并不包含待删节点 X 的其他子节点(src 的兄弟)。Java 消息字段名为 `srcChildrenInstallPathInDb`,经 KVMHost 透传后在 agent 侧叫 `topChildrenInstallPathInDb`,两者是同一份列表:agent 在 commit(top → base) 之后,把列表中 backing 不是 base 的节点逐个 rebase 到 base。因此本节标题所说的"兄弟节点"是否真的被这份列表覆盖,需对照 `VolumeSnapshotTreeBase.java:1012-1024` 的实际代码确认。 + +KVMHost 透传:`KVMHost.java:1052` +```java +cmd.setTopChildrenInstallPathInDb(msg.getSrcChildrenInstallPathInDb()); +``` + +## 11.3 agent 侧循环 unsafe rebase + +### 在线 — `vm_plugin.py:9857` + +```python +vm.do_block_commit(cmd, cmd.volume) +if cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +rsp.size = VmPlugin._get_snapshot_size(cmd.base) +``` + +### 离线 LocalStorage — `localstorage.py:864-869` + +```python +if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + +if 
cmd.topChildrenInstallPathInDb: + for children in cmd.topChildrenInstallPathInDb: + if linux.qcow2_get_backing_file(children) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, children) +``` + +### 离线 SharedBlock — `shared_block_plugin.py:1299-1308` + +```python +with lvm.RecursiveOperateLv(top, shared=True): + if linux.qcow2_get_backing_file(cmd.top) != linux.qcow2_get_backing_file(cmd.base): + linux.qcow2_commit(cmd.top, cmd.base) + if cmd.topChildrenInstallPathInDb: + for c in cmd.topChildrenInstallPathInDb: + with lvm.RecursiveOperateLv(c, shared=True): + if linux.qcow2_get_backing_file(c) != base: + linux.qcow2_rebase_no_check(base, c) +``` + +## 11.4 兄弟节点 parentUuid 何时更新? + +**关键事实**:兄弟节点的 `parentUuid` **不在** `updateDatabaseAfterCommit` 里更新。 + +`updateDatabaseAfterCommit` 只更新: +- dst 的 path(互换) +- src 的 path、size、distance、parentUuid +- src 的所有后代的 distance + +兄弟节点(src 的兄弟,即 X 的其他子节点)的 DB `parentUuid` 仍指向 X。 + +**后续递归处理**: +- 下次 `stepDelete` 重新从 DB 构建 `VolumeTree` +- 此时 X 节点对应的物理文件路径已经是 src 数据(互换后) +- 但 DB 中兄弟节点仍挂在 X 下 → 物理 vs DB 不一致 + +这是 `VolumeTree.java:258` 注释中标记的 TODO: + +```java +// TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// it is necessary to consider the dependency relationships of all snapshot nodes in the +// current snapshot tree within the VolumeSnapshotReferenceVO. 
+``` + +## 11.5 风险 + +- 分叉链中删中间节点时,兄弟节点物理 backing 与 DB parentUuid 暂时不一致 +- 若此时发生异常重启或并发操作,可能导致快照树状态混乱 +- 当前依赖"删除 X 后兄弟节点自然变成 X.parent 的子节点"这一物理事实,DB 修复留待后续操作 diff --git a/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md b/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md new file mode 100644 index 00000000000..5a17dfc955d --- /dev/null +++ b/docs/snapshot-single-delete/12-fullrebase-and-cleanup.md @@ -0,0 +1,96 @@ +# 12 — fullRebase 与残留文件清理 + +## 12.1 fullRebase 触发 + +**触发条件**:src 快照是树根(`srcSnapshotInv.getParentUuid() == null`),即 src 没有 backing file。 +此时 pull 操作不能简单 rebase(没有新 backing 可指),必须把 dst 文件 flatten 成独立 qcow2。 + +Java 侧构造 `OfflineMergeSnapshotCmd` 时设置 `fullRebase = true`。 + +## 12.2 agent 侧实现(`localstorage.py:835-857`) + +```python +src_path = cmd.srcPath if not cmd.fullRebase else "" + +if linux.qcow2_get_backing_file(cmd.destPath) == src_path: + return # 幂等 + +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath, cmd.destPath) +else: + tmp = os.path.join(os.path.dirname(cmd.destPath), + '%s.qcow2' % uuidhelper.uuid()) + qcow2.create_template_with_task_daemon(cmd.destPath, tmp, task_spec=cmd) + shell.call("mv %s %s" % (tmp, cmd.destPath)) +``` + +## 12.3 `create_template_with_task_daemon` + +**文件**:`zstacklib/zstacklib/utils/qcow2.py:10` + +```python +def create_template_with_task_daemon(src, dst, task_spec, dst_format='qcow2', opts=None, **daemonargs): + t_shell = traceable_shell.get_shell(task_spec) + p_file = tempfile.mktemp() + + class ConvertTaskDaemon(plugin.TaskDaemon): + def _cancel(self): + traceable_shell.cancel_job_by_api(self.api_id) + linux.rm_file_force(self.dst_path) + + def _get_percent(self): + p = linux.tail_1(p_file, split=b"\r") + ... 
+ + with ConvertTaskDaemon(dst, task_spec): + linux.create_template(src, dst, dst_format=dst_format, shell=t_shell, + progress_output=p_file, opts=opts) + # qemu-img convert -f qcow2 -O qcow2 -p +``` + +特性: +- 遍历整条 backing chain,输出独立 qcow2 +- 支持进度上报(`-p`) +- 流式转换,无内存限制 +- 通过 `TaskDaemon` 支持取消(取消时删临时文件) + +## 12.4 mv 替换的并发安全 + +- **文件系统场景**:`mv` 同 FS 内是 `rename(2)` 原子操作 +- **LVM 场景**:`lvm.lv_rename` 元数据级原子 +- **读取并发**:rename 前后读到的是旧/新文件,无半态损坏 +- 上层依赖 `chainSubmit` 串行化同一树的操作,避免读到中间状态 + +## 12.5 残留文件清理责任 + +| 场景 | 清理者 | +|---|---| +| 在线非 active commit | `VIR_DOMAIN_BLOCK_COMMIT_DELETE` 自动删 top | +| 在线 active commit | pivot 后 top 游离,由 `deleteVolumeSnapshotAndSyncVolumeSize` 清理 | +| 离线 commit/pull | `deleteVolumeSnapshotAndSyncVolumeSize` 下发 `VolumeSnapshotPrimaryStorageDeletionMsg` | +| SharedBlock commit | `lvm.delete_lv_meta(base)` 删元数据;LV 真删走 `delete_bits` → `lvm.delete_lv` | + +## 12.6 物理删除入口(`VolumeSnapshotTreeBase.java:1307`) + +```java +private void deleteVolumeSnapshotAndSyncVolumeSize(Completion completion) { + VolumeSnapshotPrimaryStorageDeletionMsg pmsg = new VolumeSnapshotPrimaryStorageDeletionMsg(); + pmsg.setUuid(currentRoot.getUuid()); + bus.makeTargetServiceIdByResourceUuid(pmsg, VolumeSnapshotConstant.SERVICE_ID, + currentRoot.getPrimaryStorageUuid()); + bus.send(pmsg, ...); +} +``` + +各存储后端处理 `VolumeSnapshotPrimaryStorageDeletionMsg`,调用各自的 `delete_bits` HTTP 端点。 + +## 12.7 失败补偿 TODO + +`VolumeSnapshotTreeBase.java:1325`: + +```java +//TODO add gc +logger.warn(String.format("failed to delete snapshot[uuid:%s] on primary storage[uuid:%s], ...")); +``` + +物理文件删除失败仅 warn 日志,**无 GC 补偿**,存在文件/LV 泄露风险。 diff --git a/docs/snapshot-single-delete/13-premium-and-cdp.md b/docs/snapshot-single-delete/13-premium-and-cdp.md new file mode 100644 index 00000000000..4bee8388f84 --- /dev/null +++ b/docs/snapshot-single-delete/13-premium-and-cdp.md @@ -0,0 +1,53 @@ +# 13 — Premium / CDP / 灾备兼容性 + +## 13.1 Premium 侧改动 + +搜索 `/d/0zw/zw/premium/` 中与单节点快照删除直接相关的代码: + +| 文件 | 
说明 | +|---|---| +| `mevoco/.../VolumeSnapshotDeletionOverlayVmMsg.java`(第6行) | 6 行 OverlayMessage 壳,**无 scope/direction 业务逻辑** | +| `CreateDataVolumeFromVolumeSnapshotGroupFlow.java` | 创建数据卷流程,与删除无关 | +| `CreateRootTemplateFromVolumeSnapshotFlow.java` | 创建模板流程,与删除无关 | +| 阿里云 Hybrid | `AliyunSnapshotCascadeExtension`,**不走** single 路径 | + +**结论**:Premium **未重写** `VolumeSnapshotTreeBase` / `VolumeTree` / `VolumeSnapshotGroupBase`。single 删除完全由开源主库实现,Premium 无额外扩展。 + +## 13.2 CDP / StorageSnapshot 类型 + +`VolumeSnapshotTreeBase.java:836`: + +```java +if (VolumeSnapshotConstant.STORAGE_SNAPSHOT_TYPE.toString().equals(currentRoot.getType()) + || Objects.equals(currentRoot.getVolumeType(), VolumeType.Memory.toString())) { + deleteVolumeSnapshotAndSyncVolumeSize(new Completion(completion) { ... }); + return; +} +``` + +CDP / StorageSnapshot 类型 / Memory 快照绕过整个 commit/pull 逻辑,**直接调用存储层删除**。 + +原因: +- StorageSnapshot 是存储后端原生快照(如 RBD snapshot),ZStack 不掌握其链结构 +- Memory 快照不是 qcow2 文件链 +- 都不需要 commit/pull 合并 + +## 13.3 Ceph 不兼容 + +`CephPrimaryStorageBase` 未实现: +- `CommitVolumeSnapshotOnPrimaryStorageMsg` +- `PullVolumeSnapshotOnPrimaryStorageMsg` + +普通 RBD 快照在 `cephdriver.py:87` 通过 `rbd snap rm` 删除,**无中间节点合并能力**。 + +例外:`CephPrimaryStorageBase.java:2984` 临时快照场景硬编码 `scope=Single, direction=Commit`,但这只是 ZStack 层面的删除消息标志,实际不走 commit 逻辑。 + +## 13.4 灾备 / 备份 + +经搜索: +- **未发现**灾备/CDP/Backup 调用链直接发 `DeleteVolumeSnapshotGroupInnerMsg` +- **未发现**对 single 模式的额外 cascade / 索引同步逻辑 + +## 13.5 OverlayMsg 串行化 + +`VolumeSnapshotDeletionOverlayVmMsg` 作用:把删除消息包裹后路由到 `VmInstance` 的 mailbox,保证与 VM 状态变更操作互斥。Premium 侧的 OverlayMsg 与开源侧一致,无额外业务。 diff --git a/docs/snapshot-single-delete/14-limitations-and-todos.md b/docs/snapshot-single-delete/14-limitations-and-todos.md new file mode 100644 index 00000000000..dc8e4ddcf57 --- /dev/null +++ b/docs/snapshot-single-delete/14-limitations-and-todos.md @@ -0,0 +1,77 @@ +# 14 — 已知限制 / TODO / FIXME + +## 14.1 代码注释中的 TODO + +### `VolumeTree.java:258` +```java +// 
TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// it is necessary to consider the dependency relationships of all snapshot nodes in the +// current snapshot tree within the VolumeSnapshotReferenceVO. +``` +链克隆 + single 删除同时启用时,`VolumeSnapshotReferenceVO` 依赖关系未处理。 + +### `VolumeTree.java:394` +```java +// TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, +// the following three functions must take into account the dependencies within the snapshot chain. +``` +针对 `updateDatabaseAfterPullToVolume`、`updateDatabaseAfterPull`、`updateDatabaseAfterCommit`。 + +### `VolumeSnapshotTreeBase.java:355` +```java +// TODO: BUG FIX, when deleting a volume the cascade extension will send messages to all snapshots +// of this volume, which the oldest snapshot will delete descendant snapshots and set the volumeUuid +// to NULL for all snapshots, so the after messages are useless +``` +卷删除时级联消息冗余。 + +### `VolumeSnapshotTreeBase.java:1325` +```java +//TODO add gc +``` +物理文件删除失败无 GC 补偿。 + +### `VolumeSnapshotTreeBase.java:1520` +```java +//TODO: remove this +``` + +### `VolumeSnapshotTreeBase.java:2169` +```java +// TODO: refactor this: VolumeSnapshotGroupVO should has its own cascade extensions! 
+``` + +## 14.2 限制汇总 + +| 限制 | 位置 | 影响 | +|---|---|---| +| 链克隆 + single 不兼容 | `VolumeTree.java:258, 394` | `VolumeSnapshotReferenceVO` 依赖未维护,可能误删共享数据 | +| 物理删除无 GC | `VolumeSnapshotTreeBase.java:1325` | 文件/LV 泄露 | +| 卷删除级联消息冗余 | `VolumeSnapshotTreeBase.java:355` | 性能浪费,无功能影响 | +| 兄弟节点 parentUuid 暂不一致 | `updateDatabaseAfterCommit` | DB 与物理短暂不一致,依赖后续递归修复 | +| pull 但需 commit 抛 RuntimeException | `VolumeTree.java:371` | 未封装 ErrorCode,前端体验差 | +| VmState 限制 | `VolumeSnapshotTreeBase.java:854` | Migrating / Unknown 状态直接失败 | +| Ceph RBD 不支持 | `CephPrimaryStorageBase` | 无 commit/pull 实现,普通 RBD 快照无法 single 删除 | +| Group 无 Availability 检查 | `VolumeSnapshotGroupBase.java:212` | 删除前不检查组成员状态 | +| Group 并发度固定 5 | `:243` | 大组删除可能耗时长,但不可调 | +| Group 无整体回滚 | `:212-254` | 部分成功保留,需调用方处理错误列表 | + +## 14.3 设计取舍 + +| 决策 | 理由 | +|---|---| +| 默认 `scope=chain` | 保持向后兼容,避免老 API 调用方行为突变 | +| 多子节点强制 pull | commit 会改 dst 路径,破坏其它兄弟语义 | +| 优先非 online 子节点 | 避开 qemu 持有的活跃 backing 链 | +| commit 用 path 互换 | 避免修改快照 uuid,保持外部引用稳定 | +| 失败不回滚 | 存储操作不可逆,靠幂等性支持重试 | +| 删除前不查 GroupAvailability | 由下层 `isOperationAllowed` 自校验,避免重复 | + +## 14.4 后续改进建议(基于代码) + +1. **Ceph 支持**:考虑用 `rbd snap flatten` + RBD clone 实现单节点删除 +2. **GC 机制**:为 `deleteVolumeSnapshotAndSyncVolumeSize` 失败的物理文件加 GC 任务 +3. **错误码封装**:`resolveDirection` 的 `IllegalArgumentException` 换成 `ErrorCode` +4. **链克隆兼容**:`VolumeSnapshotReferenceVO` 在 commit/pull DB 更新时同步处理 +5. **并发度可配**:Group 删除并发度做成 GlobalConfig +6. 
**VmState 扩展**:评估 Migrating 等状态的支持 diff --git a/docs/snapshot-single-delete/bugs.md b/docs/snapshot-single-delete/bugs.md new file mode 100644 index 00000000000..c2ca518beb5 --- /dev/null +++ b/docs/snapshot-single-delete/bugs.md @@ -0,0 +1,390 @@ +# 单盘快照删除(scope=single) — 当前实现 Bug 清单 + +> 5.5.6 基线,基于场景 02 / 03 / 04 / 05 的源码梳理 + ZSV 真实环境实测整理。 +> 排序:先按"根本性 vs 派生",再按严重度。 +> 加固设计应优先覆盖 🔴 项;🟡 项作为语义修正;🟢 项作为代码质量改进。 + +## ✅ 修复进度(最新) + +| Bug | 状态 | 修复方式 | +|---|---|---| +| Bug 0 | ✅ 已修复 | `VolumeTree.isOnline` 拆为 `isOnAliveChain`(VM 状态无关)+ `isHypervisorOperation`;`stepDelete` 改用 `isOnAliveChain` 选 `aliveChild`,保护对 Running/Stopped 都生效 | +| Bug 1 | ✅ 已修复(顺带) | `resolveDirection` 中 `shouldUseCommitStrategy` 解耦 vmState,Stopped + Auto 现在按结构走 Commit | +| Bug 3 | ✅ 失去影响 | `aliveChild` 显式识别后,`children.get(0)` 顺序不再影响保护 | +| Bug 7 | ✅ 失去影响 | 同上 | +| Bug 5 | 🟢 降级(中→低) | 互换路径变可预测,但仍建议显式记录 "要删的物理路径" | +| Bug 2 | ⚠ 待修复 | `direction=null → Commit` 与"不传 = Auto"惯例不符(1 行可改) | +| Bug 4 | ⚠ 待修复(P0) | 物理推进 + DB 未推进的幽灵态,需 reconciler + 意图日志 | +| Bug 6 | ⚠ 待修复 | 删除期间 VM 状态锁 | +| **Bug 8** | ⚠ 待修复(P0) | API `scope="chain"` 默认与 UI 直觉相反;`auto` 取值文档承诺但未实现;含一段死代码 warn | +| Bug 9 | ⚠ 待修复 | 内部 `DeleteVolumeSnapshotMsg.direction` 无默认 `auto`,cascade 路径退化为 Commit | + +--- + +## Bug 0(根本性 / 🔴 高):`isOnline` 把"alive chain 归属"与"是否走 hypervisor"耦合在同一布尔值 — ✅ **已修复** + +> **修复**:拆 `isOnline` 为 `isOnAliveChain(uuid)` + 静态 `isHypervisorOperation(vmState)`;`stepDelete` 多子节点段改用 `isOnAliveChain` 识别 `aliveChild`,对 Running/Stopped 都生效。原 `isOnline` 签名保留,内部组合两个新方法,行为等价于"既在 alive chain 又走 hypervisor"。`resolveDirection` 中 `shouldUseCommitStrategy` 同步解耦 vmState(顺带修 Bug 1)。 + +### 现状 + +```java +// VolumeTree.java 行 389-392 +public boolean isOnline(boolean current, target, child, VmInstanceState vmState) { + return current + && (vmState == Running || vmState == Paused) // ← 把 vmState 当作 aliveChain 判定 + && aliveChain.contains(target) + && aliveChain.contains(child); +} +``` + +### 问题 + +"alive chain"的真正含义是 **vol 
当前依赖的快照链路**(vol.installPath → parentUuid 反向递归),这条链路在 VM Stopped 时**仍然真实存在**,仅仅是 VM 没在跑而已。重启时 libvirt 会照样按这条链拉起。 + +当前代码把两个语义合并: +- 通道选择("用 libvirt 还是 qemu-img"):**由 vmState 决定** +- 链路归属("哪个 child 是 vol 所在的那条链,应该最后处理"):**由 vol.installPath 链决定,与 vmState 无关** + +把这两件事压在一个 `isOnline` 返回值里 → Stopped 时 `isOnline` 永远返回 false → `stepDelete` 多子节点段的"避开 alive 子节点"保护**完全失效**。 + +### 影响范围 + +1. **直接派生** Bug 3(顺序未定义):Stopped 时 `onlineChild = null`,换位 if 进不去,`child = children.get(0)` 由底层 collection 顺序决定 +2. **放大** Bug 4(幽灵态)的爆炸半径:若 vol 所在链被任意一轮选中,半完成态会直接波及 VM 启动链路 +3. 加固设计 reconciler 失去"vol 链是最后被动"这个不变式 + +### 实测证据 + +场景 05(VM Stopped + Commit)的 children=[3,4,5],实测 `children.get(0)` 返回 **4**,不是 distance 最小的 3,也不是 vol 所在的 5。本次"5 最后处理"是 collection 顺序的运气,不是代码语义保证。 + +### 修复方向 + +```java +// 拆开两个独立判定 +public boolean isOnAliveChain(String snapshotUuid) { + return aliveChain.contains(snapshotUuid); // 与 vmState 无关 +} + +public boolean isHypervisorOperation(VmInstanceState vmState) { + return vmState == Running || vmState == Paused; +} + +// stepDelete 改写 +SnapshotInventory aliveChild = children.firstMatch(c -> volumeTree.isOnAliveChain(c.getUuid())); +SnapshotInventory child = children.get(0); +if (aliveChild != null && child == aliveChild) { + child = children.get(1); // 对 Running / Stopped 都生效 +} +boolean online = volumeTree.isOnAliveChain(child) && volumeTree.isHypervisorOperation(vmState); +``` + +效果:Stopped 时 vol 所在 child(如 5)被识别为 aliveChild → 强制最后处理 → 失败半径只到旁支。 + +--- + +## Bug 1(语义错误 / 🟡 中):`direction=Auto` 在 Stopped 下退化为 Pull — ✅ **已修复(随 Bug 0)** + +> **修复**:`resolveDirection` 中 `shouldUseCommitStrategy = current && !targetSnapshotIsLatest && isOnAliveChain(target) && isOnAliveChain(child)`,不再要求 VM Running/Paused。Stopped + Auto + 待删/child 都在 vol 链上 → 返回 Commit,磁盘占用回归单份合并文件。 + +### 现状 + +`VolumeTree.resolveDirection`(行 364-387): + +```java +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; +if ("Auto".equals(initial)) { + return 
shouldUseCommitStrategy ? Commit : Pull; +} +``` + +VM Stopped → `online=false` → `shouldUseCommitStrategy=false` → Auto 返回 **Pull**。 + +### 问题 + +"Auto"的用户预期是"按最优策略走",但 Stopped 下 Auto = Pull 的代价: + +| 路径 | 物理操作 | 磁盘占用 | +|---|---|---| +| Stopped + Commit | `offline_commit_snapshot` 单次 qcow2_commit | 单份合并文件 | +| **Stopped + Auto/Pull** | N 次 `offline_merge_snapshot`(每 child 一次 qcow2_rebase) | **N 份 (target - parent) 差量副本** | + +N = currentRoot 的 children 数。N=3 时磁盘占用接近 3 倍。 + +### 修复方向 + +`resolveDirection` 里 Auto 在离线场景下也允许返回 Commit。可选规则: +- 简单:`Auto + Stopped + !targetIsLatest` 总是返回 Commit +- 复杂:根据 children 数量 / 差量大小做容量评估 + +### 影响 + +不影响正确性,影响**容量预期**。生产环境如果客户期望"删快照能释放空间",Auto 路径反而把空间放大。 + +--- + +## Bug 2(API 语义不一致 / 🟡 中):`direction=null` 当作 Commit,不是当作 Auto + +### 现状 + +`VolumeTree.resolveDirection` 第一行: + +```java +if (initial == null) { + return VolumeSnapshotDeletionDirection.Commit; +} +``` + +### 问题 + +- 大部分 ZStack API "字段不传 = 默认 = Auto" 是惯例 +- 这里 "字段不传 = 强制 Commit" —— 行为与 `direction=Auto` 显式传入完全不同(参考 Bug 1) + +后果: +- 前端调用方调试时不传 direction,意外触发离线 commit(DB 互换、VO 直接 DELETE) +- 自动化脚本若按 "省略 = 默认" 风格写,行为不可预测 + +### 修复方向 + +任一即可: +- API 入口校验 `direction != null`,否则报错 +- `resolveDirection` 里 `null` 当 Auto 处理(再结合 Bug 1 修复) + +--- + +## Bug 3(派生 / 🟢 低):`children.get(0)` 顺序未定义 — ✅ **失去影响(随 Bug 0)** + +> **修复后**:不管 collection 返回 [3,4,5] / [4,3,5] / [5,3,4],`aliveChild=5` 都会被 `isOnAliveChain` 显式识别并放最后处理。顺序假设不再是行为前提。 + +### 现状 + +`stepDelete` 多子节点段直接 `children.get(0)`,children 来自 `tree.snapshotLeaf(currentRoot).children` 的 Collection。 + +### 实测 + +场景 05 树 [3,4,5] 取出顺序为 [4,3,5],非按 distance 也非按 createDate。 + +### 问题 + +Stopped 时 Bug 0 让换位保护失效 → 任意顺序都可能选中 vol 所在 child。 + +### 与 Bug 0 关系 + +**Bug 0 是因,Bug 3 是果**。修了 Bug 0(按 alive chain 归属避开 vol 链),children 顺序就不重要了 —— 不管返回 [3,4,5] / [4,3,5] / [5,3,4],aliveChild=5 都会被识别并放到最后。 + +如果只想做"小步修复",可单独排序 children(按 distance 或按"是否在 vol 链上"),但根治还是修 Bug 0。 + +--- + +## Bug 4(崩溃半完成态 / 🔴 高):轮 3 `offline_commit` 物理成功 + DB SQLBatch 失败 → 幽灵态 + 
+### 触发 + +Stopped + Commit + scope=single + 待删节点有子节点(即 commit 路径生效): +- agent `qcow2_commit(top=5, base=2)` + `qcow2_rebase_no_check(vol)` 完成(物理已合并、vol backing 已切) +- Java 端 `updateDatabaseAfterCommit` 的 SQLBatch 失败(DB 死锁 / 连接断 / JVM crash) + +### 物理 vs DB 不一致 + +``` +物理: + vol.qcow2 头部 backing = 2.qcow2(aa72…e70c) + 2.qcow2 含 5+2 合并数据 + 5.qcow2 已被抽空但文件未删(轮 4 还没执行) + +DB(仍是互换前状态): + VO_2.installPath = 2.qcow2 → 仍存在 + VO_5.installPath = 5.qcow2 → 指向已被抽空的文件 + vol.installPath = 5.qcow2(DB 字段一直不变) +``` + +### 后果 + +1. **VM 启动**:libvirt 读 vol.qcow2 头部找 backing → 找到 2.qcow2 → 能启动 → 但 DB 视图错乱 +2. **后续删除请求**:若用户再次发起删 VO_2 / VO_5,stepDelete 会按 DB 推演,与物理状态对不上 +3. **reconciler 误判**:看到 VO_5.installPath=5.qcow2 文件被抽空,可能误判为"5 损坏需要修复",触发重建覆盖已合并数据 + +### 修复方向 + +- 物理操作前写"操作意图日志"(CommitVolumeSnapshotIntentVO 或类似),记录 src/dst/topChildren/target DB 状态 +- 重启时按日志做幂等推进(物理已成功 → 补 DB;DB 已成功 → 跳过) +- 物理 + DB 的对应关系通过日志显式追踪,不依赖内存 inventory + +--- + +## Bug 5(隐式状态传递 / 🟢 低 — 修 Bug 0 后从 🟡 降级):轮 4 删除路径依赖未文档化的内存对象状态 + +### 现状 + +轮 3 互换后 VO_2 整条 DELETE,但 `stepDelete` 调用栈仍持有 currentRoot 的内存 inventory。轮 4 进入 `deleteVolumeSnapshotAndSyncVolumeSize`,传给 agent 的物理路径来自这个内存 inventory。 + +实测(场景 05)轮 4 删的是 `0cab…cd1a.qcow2`(原 VO_5 物理文件),不是 `aa72…e70c.qcow2`(原 VO_2 物理文件,已被 VO_5 接管)—— **删对了**。 + +### 问题 + +这个"删对了"靠的是某处把内存 inventory 的 installPath 字段在互换时改写为了"被删者旧的 src 文件路径"(5 的旧文件)—— 但这个状态传递**没有显式记录**,全靠 SQLBatch 旁的内存写。 + +任何重构(比如把互换改成只动 DB 不动内存对象)都可能让轮 4 删错对象: +- **删错为 `aa72…e70c.qcow2`** → 把含合并数据的文件删掉 → vol 启动失败、真实数据丢失 + +### 修复方向 + +互换 + 物理删的对应关系显式记录: +```java +SwapResult result = updateDatabaseAfterCommit(src, dst); +// result.physicalFileToDelete = "5.qcow2 的物理路径" +// 显式传给轮 4,不靠内存 inventory +``` + +--- + +## Bug 8(API 默认值 / 🔴 高):`scope = "chain"` 默认值与 UI 直觉相反;`auto` 取值文档承诺但未实现 + +### 现状 + +```java +// APIDeleteVolumeSnapshotMsg.java 行 70-71 +// APIDeleteVolumeSnapshotGroupMsg.java 行 31-32 +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; +``` + 
```java +// VolumeSnapshotTreeBase.java 行 473-490 +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + if (msg.getScope() == null) { // ← 死代码:上一行已 false + logger.warn("snapshot deletion scope is null, default to Chain scope"); + } + ... + deleteChainFlows(); // 删 currentLeaf 及其所有 descendants +} else { + deleteSingleFlows(); // 仅删该节点 + merge +} +``` + +### 问题 + +1. **默认 `chain` 与"删快照"UI 直觉不符**:用户在快照管理页面点"删除",预期是 single("只删这一个,别动旁支/后代")。默认 chain 会**雪崩删整棵子树**,CLI/SDK 用户漏传 scope 即触发,恢复成本极高。 +2. **快照组(Group)默认 `chain` 风险更大**:一个 group 含多盘,每盘按 chain 默认 → 单次 API 调用可能删几十个 snapshot。 +3. **`auto` 是死字符串**:`Objects.equals(scope, "Chain")` 是硬比较,传 `"auto"` 实际进 else 分支等价于 `single`。文档(validValues)承诺 auto 智能判断,实现完全没有。 +4. **死代码 warn**:`if (msg.getScope() == null) logger.warn(...)` 永远进不来 —— 第一行 `Objects.equals(null, "Chain")` 已返回 false。表明原作者意图"null → Chain"但被 API 层默认值掩盖。 + +### 修复方向 + +- **改默认为 `single`**:单盘 API 默认 single(与 UI 直觉一致);Group 的默认建议同步改 single 或前端强制确认弹窗 +- **实现 `auto` 分支**:如"无 children → single;有 children 且全是叶子 → single;否则按用户场景";或直接从 validValues 移除 `auto` +- **删除死代码 warn**:替换为真正的 null 防御 `if (scope == null) scope = Single;` +- **统一 enum 比较**:在 `DeleteVolumeSnapshotScope` 上提供按取值解析的方法(如 `fromString(scope)`,忽略大小写、对未知值显式报错),解析后再与 `Chain` 做枚举比较,而不是字符串硬比,避免大小写 / 拼写漂移。注意不能直接用 `DeleteVolumeSnapshotScope.valueOf(scope)`:`valueOf` 按常量名(`Chain`)严格匹配,传入 API 取值 `"chain"` 会抛 `IllegalArgumentException` + +### 影响 + +- 误删风险:CLI / 自动化脚本漏传 scope → 整棵子树消失 +- 文档与实现脱节:开放给用户的 `auto` 取值名义存在、行为不存在 +- 加固设计若依赖 scope 语义(如 reconciler 区分单点/链)会被字符串硬比的实现绊倒 + +--- + +## Bug 9(API → 内部 msg 默认值脱钩 / 🟡 中):内部 `DeleteVolumeSnapshotMsg.direction` 没默认 `auto`,cascade 路径退化为 Commit + +### 现状 + +```java +// APIDeleteVolumeSnapshotMsg.java +private String direction = "auto"; // ✅ API 层有默认 + +// DeleteVolumeSnapshotMsg.java +private String direction; // ❌ 内部 msg 无默认 +private String scope; // ❌ 同上 + +// VolumeSnapshotDeletionMsg.java +private String direction; // ❌ 同上 +private String scope; +``` + +### 问题 + +任何**非 API 入口**的调用路径(cascade 删 volume 时联动删 snapshot、snapshot group 内部 split 派发到单盘 msg、定时清理任务等),如果不显式 `setDirection("auto")`,直接传 null 进 
`VolumeTree.resolveDirection`,会落到 Bug 2 路径 → 强制 Commit。 + +后果: +- 用户从 UI 操作 = `direction=auto` 路径 +- 系统级联(删 vm/volume 联动)= `direction=null → Commit` 路径 +- **同样的快照树,两条入口行为完全不同**,对账 / 复现困难 + +### 修复方向 + +- 内部 msg 字段也给 `= "auto"` 默认(一行) +- 或在 `VolumeSnapshotTreeBase.handleDeletionMsg` 入口统一兜底:`if (direction == null) direction = "auto";` +- 与 Bug 2 一并修复("resolveDirection 中 null 当 Auto")即可顺带解决,但更稳妥是 msg 层和处理层双兜底 + +### 与 Bug 2 关系 + +Bug 2 是"resolveDirection 把 null 当 Commit";Bug 9 是"为什么内部 msg 会把 null 传进来"。修 Bug 2 解决症状,修 Bug 9 解决源头。两条都修最稳。 + +--- + +## Bug 6(顶替原 Risk 6 / 🟡 中):删除过程中 vmState 无锁,可能与 VM 启动竞争 + +### 触发 + +`deleteSingleFlows` 行 852-859 一次性查 vmState,整个递归 stepDelete 复用该值。期间若 VM 被并发启动(API / 调度器 / autoStart): +- agent 正在做 `qcow2_commit` / `qcow2_rebase` +- libvirt 同时尝试启动 VM,qemu 探测 backing 链 + +后果难以预测:qemu-img 与 qemu 进程对同一文件加锁冲突、或 qemu 读到半完成的 backing 头部。 + +### 修复方向 + +- 删除操作期间在 VM 上加状态锁(如 `LockVmInstanceMsg`) +- 或每轮重新校验 vmState,发现变动即终止 + +--- + +## Bug 7(次要):`children` 排序行为依赖底层实现 — ✅ **失去影响(随 Bug 0)** + +修复后测试不再受 collection 实现顺序影响,因为 `aliveChild` 选择是基于内容(uuid 是否在 aliveChain 中)而非位置。但**为了测试稳定性**,仍建议未来给 children 加确定排序。 + +--- + +## 严重度汇总表 + +| # | Bug | 严重度 | 类型 | 根因 / 派生 | 修复状态 | +|---|---|---|---|---|---| +| **Bug 0** | `isOnline` 耦合 vmState 与 aliveChain | 🔴 高 | 设计层 | 根因 | ✅ 已修复 | +| Bug 1 | Auto 在 Stopped 退化为 Pull,磁盘放大 N 倍 | 🟡 中 | 语义错误 | 独立 | ✅ 随 Bug 0 修复 | +| Bug 2 | direction=null 当作 Commit 而非 Auto | 🟡 中 | API 语义不一致 | 独立 | ⚠ 待修复 | +| Bug 3 | children.get(0) 顺序未定义 | 🟢 低 | 实现细节 | 派生自 Bug 0 | ✅ 失去影响 | +| **Bug 4** | offline commit 物理成功 + SQLBatch 失败 → 幽灵态 | 🔴 高 | 崩溃原子性 | 独立 | ⚠ 待修复(P0) | +| Bug 5 | 轮 4 删除路径靠内存 inventory 传递 | 🟢 低(修 Bug 0 后降级) | 代码质量 | 重构风险 | ⚠ 待修复(P1) | +| Bug 6 | vmState 无锁,删除与 VM 启动可竞争 | 🟡 中 | 并发 | 独立 | ⚠ 待修复 | +| Bug 7 | children 顺序依赖底层 collection 实现 | 🟢 低 | 测试稳定性 | 派生自 Bug 0 | ✅ 失去影响 | +| **Bug 8** | API `scope="chain"` 默认值 + `auto` 取值未实现 + 死代码 warn | 🔴 高 | API 契约 | 独立 | ⚠ 待修复(P0) | +| Bug 9 | 内部 `DeleteVolumeSnapshotMsg.direction` 无默认 `auto` | 🟡 
中 | 入口一致性 | 与 Bug 2 同源 | ⚠ 待修复 | + +--- + +## 加固设计优先级建议(剩余项) + +| 优先级 | 任务 | 覆盖 Bug | +|---|---|---| +| ~~P0~~ | ~~拆 `isOnline` 为 `isOnAliveChain` + `isHypervisorOperation`~~ | ~~Bug 0、1、3、7(降级 5)~~ ✅ 已完成 | +| **P0** | reconciler 检测"物理推进 + DB 未推进"幽灵态 + 操作意图日志 | Bug 4 | +| **P0** | API `scope` 默认改 `single`、实现 `auto` 分支或下线 `auto` validValue、删死代码 warn | Bug 8 | +| P1 | `direction=null` 当 Auto + 内部 msg 默认值同步为 `auto` | Bug 2、Bug 9 | +| P1 | 互换 + 物理删的对应关系显式化 | Bug 5 | +| P2 | 删除操作期间 VM 状态锁 | Bug 6 | + +--- + +## Bug → 场景对应 + +| Bug | 在哪些场景文档可见 | +|---|---| +| Bug 0 | 03(口径说明)、04(决策矩阵)、05(实测顺序异常) | +| Bug 1 | 03(Auto/Pull 路径写出磁盘 N 份差量)、04(决策矩阵) | +| Bug 2 | 04("initial=null → Commit"决策表) | +| Bug 3 | 05 §6(实测 children.get(0)=4 非 3) | +| Bug 4 | 05 §7(脆弱点表,Stopped + Commit 最严重故障) | +| Bug 5 | 05 §4 轮 4 / §6 与推演的差异 | +| Bug 6 | 04(vmState 一次性读取) | +| Bug 7 | 05 §6(顺序差异) | +| Bug 8 | 04(scope 决策入口;当前文档未覆盖 chain 路径,建议补一段) | +| Bug 9 | 04(direction 入口路径,cascade / group split 未列出) | diff --git a/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md b/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md new file mode 100644 index 00000000000..85fb23dd395 --- /dev/null +++ b/docs/snapshot-single-delete/proposals/group-disband-symmetry-and-integrity-check.md @@ -0,0 +1,295 @@ +# 快照组解散对称化 + VM 级完整性拦截(A+C 组合方案) + +> 范围:`VolumeSnapshotTreeBase.ungroupAfter*`、`VolumeSnapshotGroupBase`、`VolumeSnapshotGroupChecker`、VM 删除 cascade、Attach/Detach 卷 +> 关联 Bug:bugs.md 中 **Bug 11 / Bug 12 / Bug 13**(待登记) +> 基线:5.5.6 +> 状态:提案(未实施) +> 决策点已确认:拦截 = **VM 级**;VM destroy 时 incomplete = **cascade 自动清理**;force = **API 字段** + +--- + +## 1. 
背景 + +`VolumeSnapshotGroupVO` 表示"VM 上多盘一致性快照集",每盘一条 `VolumeSnapshotGroupRefVO`。当前删除快照时存在两条不对称的解散路径: + +| 路径 | 入口 | 触发条件 | 解散行为 | +|---|---|---|---| +| `ungroupAfterDeleteSingleSnapshot`(行 1427-1443) | scope=single 删单快照 | 该快照属于某 group | 仅 `ref.snapshotDeleted=true`;**所有 ref 都 deleted 才删 group VO** | +| `ungroupAfterDeleted`(行 2148-2169) | scope=chain 删子树 | 待删 snapshot 的根 volume 是 **Root** | **立即删除整个 group VO**,data 盘 ref 变孤儿 | + +后果: +- root 盘单删 chain → group VO 消失,data 盘 ref 还指向已不存在的 group → 残留孤儿 +- data 盘单删 chain → group VO 仍在,ref.snapshotDeleted=true → 组 incomplete +- 后续对该 VM 删组 / 建组 / 删 VM / 挂卸盘 → 没有任何拦截,所有操作"看起来正常"实际带病前进 + +本提案双管齐下: +- **A**:解散逻辑统一对称(消除孤儿源头) +- **C**:VM 级完整性拦截(让残留 incomplete 组成为后续操作的硬阻断点) + +--- + +## 2. 方案 A — 解散对称化 + +### 2.1 改动 + +`VolumeSnapshotTreeBase.ungroupAfterDeleted` 行 2148-2169 移除 `Root` 特例: + +```java +private void ungroupAfterDeleted(List snapshots) { + List uuids = snapshots.stream() + .map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); + + SQL.New(VolumeSnapshotGroupRefVO.class) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) + .set(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).update(); + + // 不再区分 root / data,统一查"全 ref deleted 才解散整组" + Set groupUuids = Q.New(VolumeSnapshotGroupRefVO.class) + .select(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) + .listValues().stream().map(Object::toString).collect(Collectors.toSet()); + + for (String groupUuid : groupUuids) { + long remaining = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, false).count(); + if (remaining == 0) { + vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid); + cleanVmHostBackupFilesForGroup(Collections.singletonList(groupUuid)); + dbf.removeByPrimaryKey(groupUuid, VolumeSnapshotGroupVO.class); + } + } +} +``` + +### 2.2 收益 + +- root 盘 chain 删除不再立即删 group VO,与 
data 盘行为对齐 +- 不再产生"group 已不存在 / ref 仍在"的孤儿 +- `ungroupAfterDeleteSingleSnapshot` 与 `ungroupAfterDeleted` 行为合并,可后续重构为同一私有方法 + +### 2.3 兼容性 + +- 旧 root 单删 chain 后立即解散的"快"行为消失:仍要等 data 盘 ref 也清理才解散 +- 实际上历史路径就是 bug —— 旧行为留下孤儿 ref,新行为留下 incomplete 组(被 C 拦截后用户必须清理) + +--- + +## 3. 方案 C — VM 级完整性拦截 + +### 3.1 拦截入口 + +| 入口 API | 拦截条件 | 错误信息 | force 字段 | +|---|---|---|---| +| `APIDeleteVolumeSnapshotGroupMsg`(其他组) | VM 上有 incomplete 组(exclude 自身) | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再删除其他快照组` | ✅ | +| `APICreateVolumeSnapshotGroupMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再创建新快照组` | ❌(不应允许) | +| `APIAttachDataVolumeToVmMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再挂载磁盘` | ❌ | +| `APIDetachDataVolumeFromVmMsg` | VM 上有 incomplete 组 | `VM[uuid=%s] 存在不完整快照组%s,请先清理后再卸载磁盘` | ❌ | +| `APIDestroyVmInstanceMsg` | VM 上有 incomplete 组 | **不拦截**(cascade 自动清理) | ❌ | + +**豁免**: +- 删 incomplete 组**自身** → 放行(exclude 当前 group_uuid) +- 单快照 API(`APIDeleteVolumeSnapshotMsg`) → 放行(清债途径) + +### 3.2 incomplete 检测 + +在 `VolumeSnapshotGroupChecker` 新增静态方法: + +```java +public class VolumeSnapshotGroupChecker { + /** + * 返回 VM 上所有 incomplete 组(部分 ref 已 snapshotDeleted=true 但仍存在未删的 ref)。 + * @param excludeGroupUuid 排除指定 group(如删自身时不算违例),null 表示不排除 + */ + public static List findIncompleteGroupsOnVm(String vmUuid, String excludeGroupUuid) { + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .eq(VolumeSnapshotGroupVO_.vmInstanceUuid, vmUuid) + .listValues(); + + List incomplete = new ArrayList<>(); + for (Object o : groupUuids) { + String guuid = o.toString(); + if (guuid.equals(excludeGroupUuid)) continue; + long deletedRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).count(); + long totalRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid).count(); + if (deletedRefs > 0 
&& deletedRefs < totalRefs) { + incomplete.add(guuid); + } + } + return incomplete; + } +} +``` + +### 3.3 拦截织入示例 + +#### 3.3.1 删除其他组 + +```java +// VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg) +private void handle(APIDeleteVolumeSnapshotGroupMsg msg) { + APIDeleteVolumeSnapshotGroupEvent evt = new APIDeleteVolumeSnapshotGroupEvent(msg.getId()); + String vmUuid = self.getVmInstanceUuid(); + if (!msg.isForce()) { + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(vmUuid, self.getUuid()); + if (!incomplete.isEmpty()) { + evt.setError(operr("VM[uuid=%s] 存在不完整快照组%s,请先清理后再删除其他快照组", + vmUuid, incomplete)); + bus.publish(evt); + return; + } + } + // ... 原逻辑 +} +``` + +#### 3.3.2 创建新组 / 挂卸盘 + +各 API handle 入口: + +```java +List incomplete = VolumeSnapshotGroupChecker.findIncompleteGroupsOnVm(vmUuid, null); +if (!incomplete.isEmpty()) { + bus.replyErrorByMessageType(msg, operr("VM[uuid=%s] 存在不完整快照组%s,请先清理后再 ...", + vmUuid, incomplete)); + return; +} +``` + +#### 3.3.3 VM destroy — cascade 自动清理(不拦截) + +`VolumeSnapshotGroupCascadeExtension`: + +```java +@Override +public void asyncCascade(CascadeAction action, Completion completion) { + if (CascadeConstant.DELETION_CHECK_CODE.equals(action.getActionCode())) { + // VM destroy 不拦截 incomplete 组,由后续 cleanup 阶段处理 + completion.success(); + return; + } + + if (CascadeConstant.DELETION_CLEANUP_CODE.equals(action.getActionCode())) { + String vmUuid = ((VmInstanceInventory) action.getParentIssuer().get(0)).getUuid(); + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(vmUuid, null); + if (!incomplete.isEmpty()) { + // force 删除所有 incomplete 组(包括其残留 ref) + forceDeleteGroups(incomplete, completion); + return; + } + completion.success(); + } +} +``` + +`forceDeleteGroups`:直接 SQLBatch 删 `VolumeSnapshotGroupRefVO` + `VolumeSnapshotGroupVO`,然后调 `vidm.deleteArchiveVmInstanceResourceMetadataGroup` + `cleanVmHostBackupFilesForGroup`。**不再走 chain 删快照** —— VM 销毁时 volume 也会被销毁,对应 
snapshot tree 通过各 PS cascade 清理。 + +### 3.4 force 字段(仅 API 层) + +```java +// APIDeleteVolumeSnapshotGroupMsg.java +@APIParam(required = false, + description = "true = 跳过 VM 完整性检查(运维兜底);默认 false") +private boolean force = false; +``` + +仅 `APIDeleteVolumeSnapshotGroupMsg` 加 `force`。其他 API(建组/挂卸盘)不应允许带病前进,不开 force。 + +--- + +## 4. 用户清债的两条路径 + +| 场景 | 操作 | 结果 | +|---|---|---| +| 整组清理 | `APIDeleteVolumeSnapshotGroupMsg(group_uuid=incomplete)` | 走 chain 删剩余快照 → A 解散逻辑收尾 → group VO 删除 | +| 个体清理 | 对每个残留 ref 对应的 snapshot 调 `APIDeleteVolumeSnapshotMsg` | 同上路径触发 A 解散收尾 | +| 紧急绕过 | `APIDeleteVolumeSnapshotGroupMsg(group_uuid=other, force=true)` | 跳过完整性检查删其他组(incomplete 组留待事后处理) | + +--- + +## 5. 行为矩阵 + +| T0 状态 | T1 操作 | T1 结果 | T2 操作 | T2 结果 | +|---|---|---|---|---| +| 组1 完整(root + data 各一) | 删组1 root 单快照 (single) | 组1 ref 一个 deleted;**组1 VO 保留** | 删组2 | C 拦截 | +| 同上 | 同上 | 同上 | 删组1(自身) | 放行(exclude) | +| 同上 | 同上 | 同上 | 删组1 data ref 对应 snapshot | 放行 → 触发 A 解散 | +| 组1 完整 | 删组1 整组 (chain) | 全 ref deleted → 组1 VO 删 | 删组2 | 放行 | +| 组1 incomplete | 升级 management 重启 | 状态持久 | 删 VM | **放行**(cascade 自动清 incomplete) | +| 组1 incomplete | — | — | 删组2 force=true | 放行(带病删,组1 仍在) | +| 组1 incomplete | — | — | 建新组 / 挂盘 / 卸盘 | C 拦截,无 force 兜底 | + +--- + +## 6. 
改动清单 + +| # | 文件 | 改动 | +|---|---|---| +| 1 | `storage/.../VolumeSnapshotTreeBase.java` 行 2148-2169 | 移除 root 特例,统一"全 ref deleted 才解散" | +| 2 | `storage/.../group/VolumeSnapshotGroupChecker.java` | 新增 `findIncompleteGroupsOnVm(vmUuid, excludeGroupUuid)` | +| 3 | `storage/.../group/VolumeSnapshotGroupBase.java handle(APIDeleteVolumeSnapshotGroupMsg)` | 入口加 incomplete 检查 + force 旁路 | +| 4 | `storage/.../VolumeSnapshotManagerImpl.java handle(APICreateVolumeSnapshotGroupMsg)` | 入口加 incomplete 检查 | +| 5 | VM Attach/Detach DataVolume API handle | 入口加 incomplete 检查 | +| 6 | `storage/.../group/VolumeSnapshotGroupCascadeExtension.java` | DELETION_CLEANUP 阶段 force 清 incomplete 组(VM destroy 路径) | +| 7 | `header/.../group/APIDeleteVolumeSnapshotGroupMsg.java` | 新增 `boolean force = false` | +| 8 | i18n 错误码表 | 新增 `GROUP_INCOMPLETE_BLOCK_*` 系列 | +| 9 | API 文档 / changelog / 升级公告 | 提示历史 incomplete 组将首次拦截,提供清债指引 | + +--- + +## 7. 兼容矩阵 + +| 场景 | 旧行为 | 新行为 | 兼容 | +|---|---|---|---| +| root chain 删除 | 立即删 group VO,留 data ref 孤儿 | 仅 mark deleted;等 data ref 齐删 | ⚠ break(更合理) | +| data chain 删除 | mark deleted,等齐 | 同(不变) | ✅ | +| single 删除 | mark deleted,等齐 | 同(不变) | ✅ | +| 升级前已存在的 incomplete 组 | 后续操作无任何提示 | 首次触发拦截 | ⚠ break(运维需清债 / force) | +| 升级前正常组 | 正常 | 正常 | ✅ | +| VM destroy 时存在 incomplete 组 | 走旧 cascade,行为不确定 | cascade 自动 force 清 | ✅ 改善 | + +--- + +## 8. 测试要点 + +| 场景 | 预期 | +|---|---| +| root chain 单删 → 不立即解散 | group VO 仍在,root ref deleted=true | +| data chain 单删 → 不解散 | 同上 | +| 全部 ref 都删完 → 自动解散 | group VO 消失 + vidm 调用 + backup file 清 | +| 组1 incomplete → 删组2 | argerr/operr,提示组1 incomplete | +| 组1 incomplete → 删组1 自身 | 放行 | +| 组1 incomplete → 删组1 剩余 snapshot(个体 API) | 放行 → A 收尾解散 | +| 组1 incomplete → 建新组 | operr 拦截 | +| 组1 incomplete → attach/detach data volume | operr 拦截 | +| 组1 incomplete → 删 VM | 放行,cascade 自动清 incomplete 组 | +| 删组2 force=true,组1 incomplete | 放行,组1 保留 | +| 升级旧库 → 已存在 incomplete 组 → 任意操作首次触发拦截 | 报错信息可指导清债 | + +--- + +## 9. 
与 bugs.md 的对应 + +| Bug(待登记) | 描述 | 闭环来源 | +|---|---|---| +| Bug 11 | 解散非对称:root 立即删 vs data 等齐 | 方案 A | +| Bug 12 | incomplete 组持续污染后续操作,无任何检测 | 方案 C | +| Bug 13 | `getEffectiveSnapshots` 不过滤 `ref.snapshotDeleted=false` | A 间接缓解 + C 阻断后续触发场景 | + +--- + +## 10. 风险与决策点 + +| 决策点 | 已确认 | 备注 | +|---|---|---| +| 拦截层级 | **VM 级** | 同 VM 上任一 incomplete 组阻断 VM 上其他组操作 | +| VM destroy 时 incomplete 处理 | **cascade 自动清理** | 不拦截,cleanup 阶段 force 删 | +| force 字段位置 | **API 字段** | 仅 `APIDeleteVolumeSnapshotGroupMsg`,建组/挂卸盘不开 force | +| `findIncompleteGroupsOnVm` 性能 | 待 review | 每次 N+1 查询;如 VM 上组数多可改单 SQL JOIN + GROUP BY HAVING | +| 升级公告 | 必须有 | 升级前需提供 SQL 检测脚本(与 `findIncompleteGroupsOnVm` 判定一致:已删 ref 数 > 0 且 < 总 ref 数):`SELECT g.vmInstanceUuid, r.volumeSnapshotGroupUuid FROM VolumeSnapshotGroupRefVO r JOIN VolumeSnapshotGroupVO g ON g.uuid = r.volumeSnapshotGroupUuid GROUP BY g.vmInstanceUuid, r.volumeSnapshotGroupUuid HAVING SUM(r.snapshotDeleted = 1) > 0 AND SUM(r.snapshotDeleted = 1) < COUNT(*)` | diff --git a/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md b/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md new file mode 100644 index 00000000000..1df09b4b0d0 --- /dev/null +++ b/docs/snapshot-single-delete/proposals/scope-direction-api-redesign.md @@ -0,0 +1,283 @@ +# 快照删除 API 参数(`scope` / `direction`)重构提案 + +> 范围:`APIDeleteVolumeSnapshotMsg`、`APIDeleteVolumeSnapshotGroupMsg` 及其内部派生 msg +> 关联 Bug:bugs.md 中 **Bug 2 / Bug 8 / Bug 9** +> 基线:5.5.6 +> 状态:提案(未实施) + +--- + +## 1. 
背景 + +历史上"删除快照"的语义只有一种 —— **删除待删节点 + 所有子孙节点**(子树雪崩删)。后来引入"单点删除"(只删该节点本身,子孙 merge 到 parent),通过 `scope` 入参区分两种行为: + +```java +// APIDeleteVolumeSnapshotMsg.java / APIDeleteVolumeSnapshotGroupMsg.java +@APIParam(required = false, validValues = {"single", "chain", "auto"}) +private String scope = "chain"; + +@APIParam(required = false, validValues = {"pull", "commit", "auto"}) +private String direction = "auto"; +``` + +```java +// VolumeSnapshotTreeBase.handleDeletionMsg 行 473-490 +if (Objects.equals(msg.getScope(), DeleteVolumeSnapshotScope.Chain.toString())) { + if (msg.getScope() == null) { + logger.warn("snapshot deletion scope is null, default to Chain scope"); + } + ... + deleteChainFlows(); +} else { + deleteSingleFlows(); +} +``` + +--- + +## 2. 当前设计的问题 + +### 2.1 `scope` 相关 + +| # | 问题 | 影响 | +|---|---|---| +| S1 | `validValues` 列了 `"auto"`,但代码用 `Objects.equals(scope, "Chain")` 字符串硬比,`auto` 实际等价于 `single` | 文档承诺 ≠ 实现;调用方误判 | +| S2 | `if (msg.getScope() == null) logger.warn(...)` 是死代码 —— 上一行 `Objects.equals(null, "Chain")` 已 false,永远进不来 | warn 永远打不出,作者意图("null → Chain 兜底")未生效 | +| S3 | "凡是非 Chain 字符串都默默走 single" —— 拼错 / 大小写漂移 / 老 `auto` 全部静默走 single | 异常值无法被发现,潜在数据破坏 | +| S4 | `chain` 命名容易被误读为"alive chain"或"整棵 tree" | 文档与实现差异,新人误读 | +| S5 | Group API 默认 `chain` 风险高一个量级(多盘 × 子树) | 一次 API 调用可能删几十个 snapshot | + +### 2.2 `direction` 相关 + +| # | 问题 | 影响 | +|---|---|---| +| D1 | API 层默认 `"auto"`,但内部 `DeleteVolumeSnapshotMsg.direction` / `VolumeSnapshotDeletionMsg.direction` 没有默认值(null) | cascade、group split、定时清理路径若不显式 set 即传 null | +| D2 | `VolumeTree.resolveDirection(null) → Commit`,与"不传 = Auto"惯例相反 | 同棵树两条入口(API vs cascade)行为分叉;Stopped 下意外走 offline_commit | + +### 2.3 语义对清 + +为避免"chain"再被歧义解读,先固化术语: + +| 术语 | 定义 | +|---|---| +| **chain(本提案中)** | 以待删节点为根的子树(`currentLeaf.getDescendants()`),含所有子孙、旁支、分叉。**不是** alive chain,**不是**整棵 tree。 | +| **single** | 仅待删节点本身;子孙保留并 merge 到 parent。 | +| **alive chain**(不在本提案 scope 中) | vol 当前依赖的快照链路(vol.installPath → parentUuid 
反向递归)。仅出现在 `VolumeTree.aliveChain` 内部判定,与 API `scope` 无关。 | + +--- + +## 3. 设计目标 + +1. **保留默认 `chain`** —— 与老 API 行为兼容,避免 5.x.x 升级断老脚本 / cascade 路径 +2. **删除 `auto` 死值** —— 清理 validValues 中无实现的取值 +3. **enum 显式校验** —— 非法字符串抛 argerr,不再"任意非 Chain 都按 single" +4. **修复死代码 warn** —— 真正生效的 null 兜底分支 +5. **内部 msg 默认值与 API 对齐** —— cascade 路径与 API 路径行为一致 +6. **API 描述明确雪崩删语义** —— 让用户一眼看清"chain = 子树删" + +--- + +## 4. 详细方案 + +### 4.1 入参定义改写 + +```java +// APIDeleteVolumeSnapshotMsg.java +@APIParam(required = false, validValues = {"single", "chain"}, + description = "chain (默认) = 删除该节点及其所有子孙节点(子树删);" + + "single = 仅删除该节点本身,子孙节点 merge 到 parent") +private String scope = "chain"; + +@APIParam(required = false, validValues = {"pull", "commit", "auto"}, + description = "auto (默认) = 按 VM 状态与链路结构自适应选择 commit 或 pull") +private String direction = "auto"; +``` + +`APIDeleteVolumeSnapshotGroupMsg.java` 同步改写(参数定义相同)。 + +变化点: +- `scope` validValues 移除 `"auto"` +- `description` 写清 chain 是雪崩删 +- `direction` validValues 不变(`auto` 是真实实现,与 scope 的死值不同) + +### 4.2 后端处理改写 + +```java +// VolumeSnapshotTreeBase.handleDeletionMsg 行 473 附近 +DeleteVolumeSnapshotScope parsedScope; +if (msg.getScope() == null) { + parsedScope = DeleteVolumeSnapshotScope.Chain; + logger.warn(String.format( + "snapshot[uuid=%s] deletion scope is null, default to Chain (subtree delete)", + msg.getSnapshotUuid())); +} else { + try { + parsedScope = DeleteVolumeSnapshotScope.valueOf(StringUtils.capitalize(msg.getScope())); + } catch (IllegalArgumentException e) { + throw new OperationFailureException(argerr( + "invalid scope[%s], expect one of: single, chain", msg.getScope())); + } +} + +if (parsedScope == DeleteVolumeSnapshotScope.Chain) { + long size = 0; + for (VolumeSnapshotInventory inv : currentLeaf.getDescendants()) { + if (inv.isLatest()) ancestorOfLatest = true; + size += inv.getSize(); + } + requiredSize = Math.min(size, volume.getSize()); + deleteChainFlows(); +} else { + deleteSingleFlows(); +} +``` + +修复点: +- **S1 / 
S3**:enum 校验,非法字符串(含老 `auto`、拼错、大小写漂移)被 argerr 拦截 +- **S2**:死代码 warn 挪到真正的 null 兜底分支 +- **隐式分支风险**:`else` 不再"凡是非 chain 都按 single",仅 `Single` enum 值进 single 路径(这里 enum 二选一,等价于显式 switch;如未来加第三种值需改 switch) + +### 4.3 内部 msg 默认值同步 + +```java +// header/.../DeleteVolumeSnapshotMsg.java +private String direction = "auto"; // 修 D1 +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); // 与 API 默认对齐 + +// header/.../VolumeSnapshotDeletionMsg.java +private String direction = "auto"; +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); + +// header/.../group/DeleteVolumeSnapshotGroupInnerMsg.java +private String direction = "auto"; +private String scope = DeleteVolumeSnapshotScope.Chain.toString(); +``` + +效果:cascade、group split、定时清理任何路径若不显式 set,行为与 API 默认一致(chain + auto),不再退化为 Commit(Bug 9 闭环)。 + +### 4.4 `direction=null` 兜底(修 Bug 2) + +`VolumeTree.resolveDirection` 第一行: + +```java +// 修改前 +if (initial == null) { + return VolumeSnapshotDeletionDirection.Commit; +} + +// 修改后 +if (initial == null) { + initial = VolumeSnapshotDeletionDirection.Auto.toString(); // 与 API 默认一致 +} +``` + +修了内部 msg 默认值后这条仍是双保险:万一某条 cascade 路径用旧 builder 不带默认值构造 msg,仍能在 resolveDirection 入口兜住。 + +--- + +## 5. Group API 单独评估 + +`APIDeleteVolumeSnapshotGroupMsg` 的 scope 透传给每盘 single msg: + +```java +// VolumeSnapshotGroupBase.handle(APIDeleteVolumeSnapshotGroupMsg) +imsg.setScope(msg.getScope()); // 行 192 / 227 +imsg.setDirection(msg.getDirection()); +``` + +Group + chain 默认风险:**多盘 × 子树**,单次 API 可删几十个 snapshot,回滚成本极高。 + +| 选项 | 描述 | 推荐度 | 兼容性 | +|---|---|---|---| +| A. Group 保留默认 `chain` | 与单盘一致 + 与老脚本兼容;UI/文档单独警示风险 | ⭐⭐⭐ | ✅ 完全兼容 | +| B. Group 默认改 `single` | 与单盘默认拉开,强调"按盘点删" | ⭐⭐⭐ | ⚠ 老脚本行为变化 | +| C. Group `required = true` | 强制用户显式选择 | ⭐⭐⭐⭐ 最安全 | ⚠ 老脚本断 | + +**推荐 A**(最小改动):UI 层 + 文档显著警示,老脚本不动。如果业务上确认"快照组 = 一致性快照集,几乎无人对它做子树删",再走 C 在下个大版本下线默认值。 + +--- + +## 6. 
兼容矩阵 + +| 调用方传参 | 旧行为 | 新行为 | 兼容 | +|---|---|---|---| +| 不传 `scope` | chain | chain(默认) | ✅ 等价 | +| `scope=chain` | chain | chain | ✅ 等价 | +| `scope=Chain` | chain(大写恰好等于 enum.toString) | chain(normalize) | ✅ 等价 | +| `scope=CHAIN` | 走 single(字符串非精确 "Chain") | chain(normalize) | ⚠ 行为变化但更合理 | +| `scope=single` | single | single | ✅ 等价 | +| `scope=auto` | 走 single(字符串非 "Chain") | argerr 拒绝 | ⚠ **break** | +| `scope=garbage` | 走 single | argerr 拒绝 | ⚠ **break** | +| 不传 `direction` | auto(API 层默认) | auto | ✅ 等价 | +| 内部 msg 不 set `direction` | null → resolveDirection 返回 Commit | auto → 按 vmState/链路 | ⚠ **行为变化**(更合理) | +| 内部 msg 不 set `scope` | null → 走 single 分支("非 Chain") | chain(默认) → 走 chain 分支 | ⚠ **行为变化**(与 API 默认对齐) | + +### break 项处理 + +1. **`scope=auto` break**:当前实际行为是"伪装成智能、其实落 single",调用方若依赖此行为本身就是 bug 用法。可选过渡:保留 `auto` 在 validValues 一个版本,内部 alias 到 chain(或 single,按实际调用方调研结果决定),warn `"scope=auto is deprecated"`。 +2. **`scope=garbage` break**:原本静默 single,新行为 argerr。这是**好的 break** —— 之前的隐藏 bug 暴露出来。 +3. **内部 msg 默认行为变化**:cascade / group split 路径如果原本依赖 "null=Commit / null=single" 隐含语义,会变。需要全量审查内部 msg 的所有调用点: + +```bash +# 搜索 cascade 路径 +rg "new DeleteVolumeSnapshotMsg\(\)" -l +rg "new VolumeSnapshotDeletionMsg\(\)" -l +``` + +凡是不显式 setDirection / setScope 的,确认是否需要保留旧的隐含语义;如需要,应在该路径显式 setDirection("commit") / setScope("single") 而不是依赖默认。 + +--- + +## 7. 
改动清单 + +| # | 文件 | 改动 | +|---|---|---| +| 1 | `header/src/main/java/org/zstack/header/storage/snapshot/APIDeleteVolumeSnapshotMsg.java` | `scope` validValues 移除 `auto`;description 写清子树语义 | +| 2 | `header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java` | 同 1 | +| 3 | `header/src/main/java/org/zstack/header/storage/snapshot/DeleteVolumeSnapshotMsg.java` | `direction = "auto"`、`scope = "Chain"` 默认 | +| 4 | `header/src/main/java/org/zstack/header/storage/snapshot/VolumeSnapshotDeletionMsg.java` | 同 3 | +| 5 | `header/src/main/java/org/zstack/header/storage/snapshot/group/DeleteVolumeSnapshotGroupInnerMsg.java` | 同 3 | +| 6 | `storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java` 行 473 | enum normalize + 死代码 warn 修复 + argerr | +| 7 | `storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java` `resolveDirection` | `null → Auto` 兜底 | +| 8 | API 文档 / changelog | 兼容矩阵公告;UI 建议默认选 single;Group 警示 | +| 9 | 调用点审查 | `rg "new DeleteVolumeSnapshotMsg"` 验证内部 msg 默认变化的影响 | + +--- + +## 8. 测试要点 + +| 测试场景 | 预期 | +|---|---| +| API 不传 scope → 走 chain(兼容老行为) | ✅ | +| API 传 `scope=chain` 删多分支节点 | 子树全删 | +| API 传 `scope=single` 删多分支节点 | 仅该节点删,子孙 merge | +| API 传 `scope=auto` | argerr,不再静默走 single | +| API 传 `scope=GARBAGE` | argerr | +| API 不传 direction → resolveDirection 走 Auto 分支 | ✅ | +| Cascade 路径(删 vm/volume 联动)→ 内部 msg 走 chain + auto | 与 API 一致 | +| Group API 不传 scope → 多盘均走 chain | 兼容老行为 | +| 老脚本传 `scope=Chain`(首字母大写)| 走 chain(兼容) | +| `scope=CHAIN`(全大写)| 走 chain(normalize 后) | + +--- + +## 9. 与 bugs.md 的对应 + +本提案落地后,bugs.md 中的修复进度更新: + +| Bug | 当前状态 | 提案落地后 | +|---|---|---| +| Bug 2 (`direction=null → Commit`) | ⚠ 待修复 | ✅ resolveDirection 兜底为 Auto | +| Bug 8 (`scope` validValues / 死代码 / 默认风险) | ⚠ 待修复(P0) | ✅ enum normalize + argerr + 死代码修复 | +| Bug 9 (内部 msg `direction` 无默认) | ⚠ 待修复 | ✅ 三个内部 msg 同步默认 `auto` + `chain` | + +--- + +## 10. 
风险与决策点 + +| 决策点 | 选项 | 备注 | +|---|---|---| +| `scope=auto` 是否保留过渡期 | 直接 break / 一版本 deprecated | 取决于调用方调研:有无脚本真传 auto | +| Group 默认是否改 `single` | A 保持兼容 / B 改 single / C required | 推荐 A,激进可走 C | +| 内部 msg 默认值变化是否需要 cascade 路径全审 | 是 | 必须全量 grep `new DeleteVolumeSnapshotMsg()` 确认无依赖 null 行为 | +| `chain` 是否重命名 `subtree` | 保留 / 重命名 + alias | 已决定保留:term 历史包袱 + 重命名收益小 | diff --git a/docs/snapshot-single-delete/scenarios/00-index.md b/docs/snapshot-single-delete/scenarios/00-index.md new file mode 100644 index 00000000000..bf9c2aaf1f3 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/00-index.md @@ -0,0 +1,23 @@ +# 单盘快照删除 — 场景梳理索引 + +本目录收录"现状代码逻辑梳理"性质的场景文档(与加固设计 spec 隔离)。 +每个文件聚焦一种具体的(存储类型 × VM 状态 × 树结构)组合,按 stepDelete 轮次推演当前实现行为。 + +| 文件 | 存储 | VM 状态 | 树结构 / 待删节点 | +|---|---|---|---| +| `01-multi-children-stepDelete.md` | 通用 | 通用 | 抽象骨架:X→A→{B,C,D},待删 A,多子节点 stepDelete 决策算法 | +| `02-local-running-delete-mid-with-3-children.md` | LocalStorage | Running | 1→2→{3,4,5→vol},待删快照2,含在线 commit + vol.installPath 同步 | +| `03-local-stopped-delete-mid-with-3-children.md` | LocalStorage | Stopped | 同上树结构,全程离线 pull → `offline_merge_snapshot` → `qcow2_rebase`,差量散到每个 child,无 libvirt,无 path 互换 | +| `04-deleteSingleFlows-online-offline-decision.md` | 通用 | 通用 | `deleteSingleFlows` / `stepDelete` / `resolveDirection` / `isOnline` / `commit` / `pull` 中 online 与 direction 的判定时序、四象限到 agent 入口映射 | +| `05-local-stopped-direction-commit-actual.md` | LocalStorage | Stopped | 1→2→{3,4,5→vol},待删快照2,**实测**记录(ZSV 真实环境抓 API uuid 全程 agent POST),direction=Commit + scope=single;轮 1/2 `offlinemerge`,轮 3 `offlinecommit`,轮 4 `delete`;修正源码推演 3 处偏差(child 顺序、VO_2 直接删、vol.installPath 不互换)| + +> 当前实现 Bug 清单已独立成档:`../bugs.md`(位于 `docs/snapshot-single-delete/bugs.md`)。**P0 修复已落地**(拆 `isOnline` 为 `isOnAliveChain` + `isHypervisorOperation`,`resolveDirection` 解耦 vmState),覆盖 Bug 0/1/3/7;剩余 P0/P1(Bug 2/4/5/6)见 bugs.md。 + +> API 参数(`scope` / `direction`)重构提案:`../proposals/scope-direction-api-redesign.md`,覆盖 Bug 2 
/ Bug 8 / Bug 9。 + +> 待补场景候选(按需追加): +> - NFS / SMP / SharedBlock + 在线 / 离线 各组合 +> - 删根节点(dst 是树根,触发 newTree 创建) +> - 分叉链 + 在线 active commit 链上有多级 snapshot +> - fullRebase 路径(pull 大文件) +> - 快照组(多卷并发) diff --git a/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md b/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md new file mode 100644 index 00000000000..dc3b371eeb7 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/01-multi-children-stepDelete.md @@ -0,0 +1,128 @@ +# 15. 多子节点 stepDelete 处理逻辑(现状梳理) + +> 本文档属于"当前实现梳理",与 `04-scope-and-stepDelete.md` 互补:04 讲 scope/递归框架,本文聚焦 **currentRoot 有多个直接子节点时** 的具体决策与执行顺序。 +> 源码:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java` `deleteSingleFlows()` / `stepDelete()`,行号 828–919(5.5.6 基线)。 + +--- + +## 15.1 调用入口 + +``` +deleteSingleFlows() (行 828) + └─ flow "delete-single-volume-snapshot" + ├─ 类型分流:StorageSnapshot / Memory → 直接 deleteVolumeSnapshotAndSyncVolumeSize + ├─ vmState 校验(Running / Paused / Destroyed / Stopped / Destroying) + └─ stepDelete() ◄── 递归核心 +``` + +`currentRoot` = 待删快照(不是它的兄弟)。后续讨论的 children 都是 **currentRoot 的直接子节点**。 + +--- + +## 15.2 stepDelete 单轮决策表(行 875–919) + +每轮重读 DB 重建 `VolumeTree`,处理一个子节点后递归再调一次 stepDelete。 + +``` +1. vos = Q(VolumeSnapshotVO).eq(treeUuid).list() +2. volumeTree = VolumeTree.fromVOs(vos, current, volumeInv) +3. 
children = volumeTree.getSnapshotLeaf(currentRoot.uuid).getChildren() + +┌──────────────────────────┬────────────────────────────────────────────────────┐ +│ children.size() │ 行为 │ +├──────────────────────────┼────────────────────────────────────────────────────┤ +│ 0 │ deleteVolumeSnapshotAndSyncVolumeSize(终态,删自身) │ +│ 1 │ resolveDirection → commit 或 pull │ +│ ≥ 2 (多子节点) │ 选一个非 alive chain 上的 child → 离线 pull │ +└──────────────────────────┴────────────────────────────────────────────────────┘ +``` + +### 多子节点选择算法(行 912–918) + +```java +onlineChild = children.stream() + .filter(c -> volumeTree.isOnline(current, currentRoot.uuid, c.uuid, vmState)) + .findFirst().orElse(null); + +child = children.get(0); +if (onlineChild != null && Objects.equals(child.uuid, onlineChild.uuid)) { + child = children.get(1); // 避开 alive 子节点,挑下一个 +} +boolean online = volumeTree.isOnline(current, currentRoot.uuid, child.uuid, vmState); +pull(child, volumeTree, online, comp); +``` + +要点: +- **永远先离线 pull 非 alive 的子节点**:alive chain 上的子节点最后一轮才处理(届时 children.size() 已收敛到 1) +- **只挑 children.get(0) 或 children.get(1)**:每轮处理一个,下一轮再选 +- direction 强制为 pull:多子节点路径不调 resolveDirection,直接 `pull(...)` + +--- + +## 15.3 pull 对一个子节点的物理 + DB 影响 + +设 currentRoot=X,要 pull 的子节点=Y: + +| 层 | 变化 | +|---|---| +| 物理 qcow2 | `qcow2_commit(X → Y)`:X 的差量被合进 Y;Y 的 backing 从 X 翻到 X.parent | +| DB(updateDatabaseAfterPull)| Y.parentUuid = X.parentUuid;Y.distance--;其它 X 的子节点不动 | + +效果:Y 不再依赖 X,从 currentRoot 的 children 列表中"脱离";下一轮 stepDelete 重读 DB 时 Y 已不在 children 里。 + +--- + +## 15.4 完整执行轨迹示例 + +快照树: + +``` + X + └─ A (待删, currentRoot) + ├─ B + ├─ C + └─ D ── vol ← alive chain 末端 +``` + +待删的是 **A**(currentRoot=A,children=[B, C, D])。 + +| 轮 | children 重读 | onlineChild | 选中 child | 决策 | 行为 | +|---|---|---|---|---|---| +| 1 | [B, C, D] | D | B(首个非 alive)| 离线 pull | qcow2_commit(A→B), B.parentUuid=X | +| 2 | [C, D] | D | C | 离线 pull | qcow2_commit(A→C), C.parentUuid=X | +| 3 | [D] (size=1) | D | D | resolveDirection → Commit 
(latest+online) | 在线 blockCommit(A→D) + pivot | +| 4 | [] | — | — | terminal | deleteVolumeSnapshotAndSyncVolumeSize(A) | + +最终: +- 物理:A 的 qcow2 文件被删;B/C 的 backing 直接指 X;D 通过 in-place commit 把 A 的数据吃掉(D.installPath 不变,但内容含 A) +- DB:A 的 VO 删除;B/C/D 的 parentUuid 全部跳过 A 直接指向 X + +--- + +## 15.5 关键不变量与代码对应 + +| 不变量 | 代码位置 | 作用 | +|---|---|---| +| 每轮重读 DB | 行 876 `Q.New(VolumeSnapshotVO).eq(treeUuid).list()` | 上一轮 DB 翻转后下一轮决策基于最新状态,避免基于陈旧子节点列表做错决定 | +| 多子节点先 pull 非 alive | 行 913–915 `if (child == onlineChild) child = children.get(1)` | 保证 alive chain 上的活跃文件不被离线操作打断 | +| alive 子节点最后处理 | 多轮 pull 后 children.size() 收敛到 1,进入 commit 分支 | 在线 commit 走 libvirt blockCommit,与 alive VM 协同 | +| 同步递归(comp.success → stepDelete)| 行 891–895 | 全程在 chainSubmit 锁内,无并发;reconciler 可同步介入每轮 | +| 终态收敛 | children.isEmpty() → deleteVolumeSnapshotAndSyncVolumeSize | 数据已全部搬走,自身物理 + DB 真删 | + +--- + +## 15.6 对 children 顺序的依赖 + +代码使用 `children.get(0)` / `children.get(1)`,依赖 `VolumeTree.fromVOs` 返回 children 的顺序。该顺序由 DB 查询顺序决定(无显式 ORDER BY),实践上通常稳定,但不应假定该顺序带有任何语义。`onlineChild` 通过 `isOnline` 判定,与 children 顺序无关——这保证了"避开 alive"逻辑不会因 DB 顺序波动而失效。 + +--- + +## 15.7 与 commit 单子节点路径的差别 + +| 场景 | direction | 物理操作 | DB 翻转 | +|---|---|---|---| +| 单子节点 + commit | child(src) → currentRoot(dst) | qcow2_commit(src→dst) + 兄弟 rebase 到 dst | dst 移入 src 位置(详见 05-commit-db-swap)| +| 单子节点 + pull | currentRoot(src) → child(dst) | qcow2_commit(src→dst) | dst.parentUuid = src.parentUuid(详见 06-pull-db-rewrite)| +| 多子节点 | 强制 pull | 选一个非 alive child 做 pull | 该 child.parentUuid = currentRoot.parentUuid,其余 children 不动 | + +多子节点本质上是**对 N 个 child 顺序应用 pull**,把多分叉树逐步收敛为单分支,最后回归到"单子节点 commit"路径完成 alive 合并。 diff --git a/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md b/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md new file mode 100644 index 00000000000..bd63251cd11 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/02-local-running-delete-mid-with-3-children.md 
@@ -0,0 +1,259 @@ +# 场景 02:local + 在线 VM + 删除中间节点(快照2,3 个子节点其中 1 个 alive) + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 源码:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java`、`VolumeTree.java`、`kvmagent/.../vm_plugin.py`、`kvmagent/.../localstorage_plugin.py`。 + +--- + +## 前提 + +- 主存储类型:**LocalStorage** +- VM 状态:**Running**(active commit 路径) +- 待删快照:**快照2**(中间节点,3 个直接子节点,其中 1 个在 alive chain 上) + +## 快照树 + +``` + 快照1 + └─ 快照2 ◄── 待删 currentRoot + ├─ 快照3 + ├─ 快照4 + └─ 快照5 ── vol ← alive chain(VM 当前盘) +``` + +## 物理 backing chain(alive 这条线) + +``` +1.qcow2 ← 2.qcow2 ← 5.qcow2 ← vol +``` + +兄弟分支: + +``` +2.qcow2 ← 3.qcow2 +2.qcow2 ← 4.qcow2 +``` + +--- + +## 总轮次(4 轮 stepDelete) + +| 轮 | currentRoot=2 的 children | 选中 | online? | direction | 物理操作 | DB 关键变更 | +|---|---|---|---|---|---|---| +| 1 | [3, 4, 5] | 3 | false | **强制 pull** | `offline_merge_snapshot` → `qcow2_rebase(1.qcow2, 3.qcow2)`(差量进 3) | 3.parentUuid=1, 3.distance-- | +| 2 | [4, 5] | 4 | false | **强制 pull** | `offline_merge_snapshot` → `qcow2_rebase(1.qcow2, 4.qcow2)`(差量进 4) | 4.parentUuid=1, 4.distance-- | +| 3 | [5] | 5 | **true** | resolveDirection → Commit | libvirt blockCommit(top=5, base=2) + pivot | DB 互换 path | +| 4 | [] | — | — | terminal | 删 VO_2 + 物理(5.qcow2 文件已 libvirt 删)| VO_2 删除 | + +--- + +## 轮 1:离线 pull 快照3 + +代码 `VolumeSnapshotTreeBase.java:912-918`,进入 `children.size() ≥ 2` 分支: + +```java +aliveChild = 5 // 唯一 vol 链上(isOnAliveChain 命中;修复后术语) +child = children.get(0) = 3 // 3 != aliveChild → 不替换 +online = isOnline(2, 3, Running) = false +pull(3, ..., online=false) +``` + +**消息**:`PullVolumeSnapshotOnPrimaryStorageMsg`(local 走主存储路径,不经 hypervisor) + +**后端 → agent 映射**:`LocalStorageKvmBackend.handle(PullVolumeSnapshotOnPrimaryStorageMsg)` 行 3845-3865 → `OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=3.qcow2, fullRebase=false}` → **`OFFLINE_MERGE_PATH = "/localstorage/snapshot/offlinemerge"`** + +**agent 物理动作**(`localstorage.py` `offline_merge_snapshot` 行 834-856): + +``` 
+linux.qcow2_rebase(srcPath=1.qcow2, destPath=3.qcow2) +# qemu-img rebase 默认(非 -u): +# 把 3.qcow2 旧 backing(2.qcow2) 与新 backing(1.qcow2) 之间的差异 +# 写入 3.qcow2 数据区,然后改写头部 backing 字段为 1.qcow2 +``` + +**DB 翻转**(`updateDatabaseAfterPull`,详见 `../06-pull-db-rewrite.md`): + +``` +VO_3.parentUuid = 1 +VO_3.distance -= 1 +VO_3.installPath 不变 +其它 VO 不动 +``` + +VM 状态:完全无感(3 不在 alive chain)。 + +--- + +## 轮 2:离线 pull 快照4 + +与轮 1 完全对称。 + +**结果**: + +``` +VO_4.parentUuid = 1 +VO_4.distance -= 1 +4.qcow2 物理 backing → 1.qcow2 +``` + +此时 currentRoot=2 在 DB 中的 children 只剩 [5]。 + +--- + +## 轮 3:在线 commit 快照5 → 快照2(最复杂的一轮) + +```java +direction = volumeTree.resolveDirection(2, 5, msg.direction, currentRoot.isLatest, Running) + → Commit (5 在 alive chain + Running) +online = isOnline(2, 5, Running) = true +commit(5, volumeTree, online=true, comp) +``` + +### 3.1 控制面 flow(`commit()` 行 921-1094) + +``` +flow chain: + 1. (条件) SyncVolumeSizeOnPrimaryStorage 仅当 srcSnapshot.uuid == volume.uuid;本例 src=5 ≠ vol → 跳过 + 2. AllocatePrimaryStorageSpaceMsg 预占 size + 3. CommitVolumeSnapshotOnHypervisorMsg online → 走 hypervisor + ├─ srcSnapshot = 5 inventory + ├─ dstSnapshot = 2 inventory + └─ srcChildrenInstallPathInDb = [vol.installPath] # 5 的子节点是 vol leaf + 4. updateDatabaseAfterCommit DB 互换(SQLBatch 单事务) +``` + +### 3.2 数据面(`vm_plugin.py do_block_commit`) + +``` +top = src = 5.qcow2(VM 当前活跃盘) +base = dst = 2.qcow2 + +步骤: + 1. virDomainBlockCommit(disk, base=2.qcow2, top=5.qcow2, + flags=VIR_DOMAIN_BLOCK_COMMIT_ACTIVE | SHALLOW) + → libvirt 把 5 中尚未在 2 的数据 flush 到 2.qcow2 + → 进入 READY 态(active commit 特征) + 2. _wait_for_block_job → READY + 3. virDomainBlockJobAbort(disk, flags=VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) + → VM disk source 从 5.qcow2 → 切到 2.qcow2 + 4. 
for child in srcChildrenInstallPathInDb=[vol.installPath]: + if qcow2_get_backing_file(child) != base: + qcow2_rebase_no_check(base, child) + → 本例 vol 即 5.qcow2 自身,pivot 后 VM 已切到 2.qcow2,通常 noop +``` + +完成后物理: + +``` +2.qcow2 内容:原 5 的全部数据已合并进来 +5.qcow2 物理文件:libvirt 在 pivot 时删除(VIR_DOMAIN_BLOCK_COMMIT_DELETE) +VM 活跃盘 source:2.qcow2 +``` + +### 3.3 DB 翻转(参考 `../05-commit-db-swap.md` §5.3) + +``` +src=5, dst=2 + +互换前: + VO_5.installPath = 5.qcow2 parentUuid = 2 distance = N + VO_2.installPath = 2.qcow2 parentUuid = 1 distance = N-1 + +互换后(**实测修订** —— 见场景 05 §6): + VO_2 **整条 DB 记录被删除**(不是"互换后保留至轮 4") + VO_5.installPath = 2.qcow2 ← 接管旧 dst 文件(含合并数据) + VO_5.parentUuid = 1 ← 跨过 2 + VO_5.distance -= 1 + VO_5.treeUuid = 不变(dst=2 不是树根;若 dst 是根则迁到新 tree) + +GroupRef 同步:被删者(2) 的 GroupRef 一并删除(VO_2 被 DELETE) + +distance 递减:src=5 的所有后代 distance -= 1(本例无更深 snapshot,只有 vol leaf) +``` + +### 3.4 vol.installPath 的同步 + +**实测结论**(场景 05 §5.2 / §6):commit 路径下 `vol.installPath` 字段在 DB 中**不变**。vol 之前挂 5.qcow2(VO_5 旧 installPath),commit + pivot 后物理上 vol 实际挂 2.qcow2(含合并数据的文件),但这个切换通过两个步骤实现: +- **物理层**:libvirt blockCommit pivot 后 vm domain 已经在用 2.qcow2 作为 backing;同步路径里 sibling 的 `qcow2_rebase_no_check(base=2.qcow2, child)` 把 vol 的 backing 链改写到 2.qcow2 +- **DB 层**:vol VO 的 installPath 字段保留原值,但 VO_5 的 installPath 字段被改为 2.qcow2(VO_5 接管 dst 物理文件),vol → VO_5 的 backing 关系仍然指向同一物理文件 + +因此"vol 跟着合并数据走"不是靠 `UPDATE VolumeVO SET installPath=...`,而是靠物理 backing 链 + VO 文件接管的组合。这是 alive 末端 commit 的关键行为(与中间节点 commit 不同:中间节点 commit 没有 vol 需要跟踪)。 + +### 3.5 互换后链状态 + +``` +DB 视角: + vol.installPath = 5.qcow2(**不变**,但物理 backing 已切到 2.qcow2) + VO_5.installPath = 2.qcow2 parentUuid = 1 ← 接管原 dst 文件 + VO_2 已删除 + VO_3.installPath = 3.qcow2 parentUuid = 1 + VO_4.installPath = 4.qcow2 parentUuid = 1 + +物理 backing chain: + vol → 2.qcow2(含合并数据)→ 1.qcow2 + 3.qcow2 → 1.qcow2 + 4.qcow2 → 1.qcow2 + 5.qcow2:libvirt 已删 +``` + +--- + +## 轮 4:物理清扫 5.qcow2 + +> ⚠ **实测修订**:VO_2 在轮 3 的 SQLBatch 中已被直接删除(不是"互换 path 保留至轮 4")。轮 4 的 
`children=[]` 是因为 VO_5.parentUuid 已跨过 2 指向 1,VO_2 在树中已不可见。 + +```java +children = [] +deleteVolumeSnapshotAndSyncVolumeSize(comp) +``` + +**消息**:`DeleteVolumeSnapshotOnPrimaryStorageMsg` + +**agent 物理动作**(实测):删 **5.qcow2 物理文件**(即原 VO_5 的旧 installPath;libvirt 在 pivot 时已逻辑解除引用,此处 agent 真正删盘)。注意:传给 agent 的 path 来自 stepDelete 调用栈记住的 currentRoot 物理路径,而非已删除的 VO_2 VO。 + +**DB**:syncVolumeSize 更新 vol 的 size。VO_2 已在轮 3 删除,本轮 DB 无 VO 删除。 + +--- + +## 终态 + +``` +快照树(DB): + 快照1 + ├─ 快照3 installPath=3.qcow2 backing=1.qcow2 + ├─ 快照4 installPath=4.qcow2 backing=1.qcow2 + └─ 快照5 ── vol VO_5.installPath=2.qcow2(接管旧 dst 文件) + vol.installPath=5.qcow2(DB 字段不变,但物理 backing 已切到 2.qcow2) + +VO_2 已删除(轮 3 SQLBatch 中 DELETE) + +物理: + 1.qcow2 ← 2.qcow2 ← vol(VM 活跃,物理上 vol.backing = 2.qcow2,2.qcow2 含原 5+2 合并数据) + 1.qcow2 ← 3.qcow2 + 1.qcow2 ← 4.qcow2 + 5.qcow2 文件已删(轮 4) +``` + +> "VO_5 接管 2.qcow2 / VO_2 直接删 / vol.installPath 不变"这三条对应实测验证记录详见场景 05 §5、§6。 + +--- + +## 全程关键脆弱点(仅梳理,不含加固) + +| 轮 | 失败类型 | 当前后果 | +|---|---|---| +| 1 / 2 | `qcow2_rebase` 失败(agent crash 或 IO 错) | 3 / 4 backing 可能部分改写但未完成;DB 翻转尚未发生,幂等可重试 | +| 1 / 2 | `qcow2_rebase` 成功 + DB 翻转 SQL 失败 | 物理 child.backing=1,DB child.parentUuid=2 → 不一致 | +| 3 | blockCommit 卡住 / pivot 前 agent 死 | VM 可能仍指 5.qcow2,DB 未翻转 | +| 3 | blockCommit 成功但 reply 丢失 / SQLBatch 失败 | 物理已切到 2.qcow2,DB 仍旧态(VO_2 未删 / VO_5.installPath 仍 5.qcow2),重启会按 DB 读 5.qcow2 而 libvirt 已删它 | +| 3 | DB 翻转成功,但 vol 物理 backing 改写失败 | vol.qcow2 头部 backing 仍指 5.qcow2(已删)→ VM 重启失败 | +| 4 | 删 5.qcow2 失败 | 孤儿文件残留 | + +--- + +## 与其它场景对照 + +| 场景 | 轮数 | 核心特征 | +|---|---|---| +| `01-multi-children-stepDelete.md` | 4 | 通用多子节点骨架;以 X→A→{B,C,D} 抽象演示 | +| **本场景 02**(local + Running + 删快照2) | 4 | 落到具体存储 + 在线 + alive 子节点是 vol 直接父;最后一轮在线 commit + vol.installPath 同步是关键差异 | diff --git a/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md b/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md new file mode 100644 index 
00000000000..0186fd84512 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/03-local-stopped-delete-mid-with-3-children.md @@ -0,0 +1,304 @@ +# 场景 03:local + 关机 VM + 删除中间节点(快照2,3 个子节点其中 1 个 alive) + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 与 `02-local-running-delete-mid-with-3-children.md` 对照阅读:树结构相同,唯一差别是 VM 状态从 Running 变为 Stopped。 +> 源码: +> - `VolumeSnapshotTreeBase.java` 行 828-919(stepDelete)/ 1097-1290(pull) +> - `VolumeTree.java` 行 364-392(resolveDirection / isOnline) +> - `LocalStorageKvmBackend.java` 行 3845-3865(PullVolumeSnapshotOnPrimaryStorageMsg → OFFLINE_MERGE_PATH) +> - `localstorage.py` 行 834-856(`offline_merge_snapshot`) + +> ⚠ **本场景按 `initial.direction ∈ {Auto, Pull}` 的口径推演**(即最后一轮也走离线 pull)。 +> 如果 API 入参 `direction=Commit`(或前端不传 → resolveDirection 默认返回 Commit),最后一轮的行为完全不同:会走 `offline_commit_snapshot`(数据 5→2,DB 互换 path,VO_2 直接 DELETE),**请参考实测记录 `05-local-stopped-direction-commit-actual.md`**。 +> 决策矩阵见 `04-deleteSingleFlows-online-offline-decision.md` §"与场景 02 / 03 的对应"。 +> +> ⚠ **Bug 0 修复后**(参考 `../bugs.md`):`direction=Auto` 在 Stopped 下不再退化为 Pull —— `shouldUseCommitStrategy` 解耦 vmState 后,Auto + 待删/child 都在 vol 链上时返回 **Commit**,行为等价于场景 05。本场景 03 现在仅适用于 `direction=Pull` 显式入参,且需满足 `shouldCommit=false`(即不在 vol 链上)。 + +--- + +## 前提 + +- 主存储类型:**LocalStorage** +- VM 状态:**Stopped**(合法状态之一,校验在 `deleteSingleFlows()` 行 854-858) +- 待删快照:**快照2** + +## 快照树(与场景 02 完全相同) + +``` + 快照1 + └─ 快照2 ◄── 待删 currentRoot + ├─ 快照3 + ├─ 快照4 + └─ 快照5 ── vol ← alive chain +``` + +## 物理 backing chain + +``` +1.qcow2 ← 2.qcow2 ← 5.qcow2 ← vol(VM 关机,文件无人持有) +2.qcow2 ← 3.qcow2 +2.qcow2 ← 4.qcow2 +``` + +--- + +## 关键差异:所有轮都走"离线 pull" + +`VolumeTree.resolveDirection`(行 364-387): + +```java +boolean online = (vmState == Running || vmState == Paused) && alive(target) && alive(child); +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; +``` + +VM=Stopped → `online=false` → `shouldUseCommitStrategy=false` → Auto / null / Pull 全部解析为 **Pull**;`isOnline`(行 
389-392)同样要求 Running/Paused。 + +后果:哪怕快照5 在 alive chain 上,VM 关机时它也走**离线 pull**,agent 不调 libvirt blockCommit,全部 qemu-img 离线操作。 + +**离线 pull 的真实控制面 / 数据面**: + +| 层 | 实现 | +|---|---| +| Java 控制面 | `pull()` 行 1250-1268 → 构造 `PullVolumeSnapshotOnPrimaryStorageMsg`,参数 `srcSnapshotParentPath`(= 快照1.qcow2)、`srcSnapshot`(= 被删的 currentRoot=快照2)、`dstSnapshot`(= 选中 child) | +| 后端转发 | `LocalStorageKvmBackend.handle(PullVolumeSnapshotOnPrimaryStorageMsg)`(行 3845-3865)→ 构造 `OfflineMergeSnapshotCmd { srcPath = srcSnapshotParentPath, destPath = dst.installPath, fullRebase = (srcPath == null) }` → 走 **`OFFLINE_MERGE_PATH = "/localstorage/snapshot/offlinemerge"`** | +| Agent 数据面 | `offline_merge_snapshot`(`localstorage.py` 行 834-856):核心一行 `linux.qcow2_rebase(cmd.srcPath, cmd.destPath)`(fullRebase 时改走 `qcow2.create_template` 扁平化) | + +**关键澄清**:场景 03 的 pull 走的是 `offline_merge_snapshot`,**不是** `offline_commit_snapshot`(后者由 commit 离线分支 `CommitVolumeSnapshotOnPrimaryStorageMsg` 调用)。`qcow2_rebase(backing=快照1, file=child)` 的语义是把 child 的 backing 从原快照2 改成快照1,并**把"快照2 与 快照1 之间的差量数据"复制进 child 文件**(因为快照1 作为基线只读不可写,只能往 child 写)。 + +--- + +## 总轮次(4 轮 stepDelete,与场景 02 同结构但全离线) + +| 轮 | currentRoot=2 的 children | 选中 | online? 
| direction | 物理操作 | DB 关键变更 | +|---|---|---|---|---|---|---| +| 1 | [3, 4, 5] | 3 | false | 强制 pull | `qcow2_rebase(1.qcow2, 3.qcow2)`(差量进 3.qcow2) | 3.parentUuid=1, distance-- | +| 2 | [4, 5] | 4 | false | 强制 pull | `qcow2_rebase(1.qcow2, 4.qcow2)`(差量进 4.qcow2) | 4.parentUuid=1, distance-- | +| 3 | [5] | 5 | **false** | resolveDirection → **Pull**(不再是 Commit) | `qcow2_rebase(1.qcow2, 5.qcow2)`(差量进 5.qcow2) | **5.parentUuid=1, distance--,不互换 path** | +| 4 | [] | — | — | terminal | 删 VO_2 + 物理 2.qcow2 | VO_2 删除 | + +**全程数据落地**:每一轮把"快照2 相对于快照1 的增量"**复制进当前选中的 child**(3 / 4 / 5 各拿一份独立副本)。快照1.qcow2 内容**不变**,快照2.qcow2 内容也**不变**,直到轮 4 整文件删除。 + +--- + +## 轮 1 / 轮 2:与场景 02 完全相同 + +`stepDelete` 多子节点分支(行 912-918)不依赖 vmState,只依赖 children.size 与 onlineChild 选择算法。Stopped 时 `isOnline` 全部返回 false,`onlineChild = null`,`child = children.get(0)`,不需要"避开 alive 子节点"的替换。 + +```java +onlineChild = null // VM Stopped,没有 alive child +child = children.get(0) = 3 +// if 块未触发 +online = isOnline(2, 3, Stopped) = false +pull(3, ..., online=false) +``` + +控制面 → 后端 → agent: + +``` +PullVolumeSnapshotOnPrimaryStorageMsg{ + srcSnapshotParentPath = "1.qcow2", + srcSnapshot = VO_2, + dstSnapshot = VO_3 +} + → LocalStorageKvmBackend.handle → OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=3.qcow2, fullRebase=false} + → offline_merge_snapshot: + linux.qcow2_rebase(srcPath=1.qcow2, destPath=3.qcow2) + # 物理:3.qcow2 backing 改写为 1.qcow2,差量数据合并入 3.qcow2 +DB: VO_3.parentUuid=1, distance-- +``` + +轮 2 同理对快照4。 + +--- + +## 轮 3:离线 pull 快照5(与场景 02 的根本差别) + +`children.size() == 1` 分支(行 903-911): + +```java +direction = volumeTree.resolveDirection(2, 5, msg.direction, currentRoot.isLatest, Stopped) + → online=false → shouldUseCommitStrategy=false → 解析为 Pull +online = isOnline(2, 5, Stopped) = false +pull(5, volumeTree, online=false, comp) // 离线 pull,不进 commit 分支 +``` + +**关键差异**:场景 02 在轮 3 走在线 commit(libvirt blockCommit + pivot + DB 互换 path);场景 03 走离线 pull,**不互换 path**,DB 修改路径完全不同。 + +### 3.1 控制面 flow(`pull()` 行 
1097-1290) + +``` +flow chain: + 1. get-snapshot-backing-chain 获取 srcSnapshotParentPath(= 1.qcow2) + 2. allocate-primary-storage-capacity 预占 size + 3. (条件) get-volume-current-size 仅 dst.uuid == volume.uuid 时;本例 dst=5 ≠ vol → 跳过 + 4. pull-volume-snapshot-on-primary-storage online=false → PullVolumeSnapshotOnPrimaryStorageMsg + online=true 才走 PullVolumeSnapshotOnHypervisorMsg + 5. updateDatabaseAfterPull +``` + +`PullVolumeSnapshotOnHypervisorMsg` 在本场景**完全不会被构造**,因为 `online=false`。所以 hypervisor 端 vm_plugin 的 do_block_commit 路径在场景 03 整个删除过程中**一次都不调用**。 + +### 3.2 数据面(`offline_merge_snapshot`) + +``` +src=快照2, dst=快照5 +OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=5.qcow2, fullRebase=false} + +if linux.qcow2_get_backing_file(destPath=5.qcow2) == srcPath=1.qcow2: + return(已经挂在 1.qcow2,幂等 noop) + +if not cmd.fullRebase: + linux.qcow2_rebase(cmd.srcPath=1.qcow2, cmd.destPath=5.qcow2) + # qemu-img rebase 默认(非 -u): + # 把 5.qcow2 旧 backing(2.qcow2) 与新 backing(1.qcow2) 之间的差异 + # 写入 5.qcow2 的数据区,然后改写 5.qcow2 头部 backing 字段为 1.qcow2 +else: + # fullRebase 路径:扁平化(srcPath 为 null 时触发,本例不触发) + qcow2.create_template(cmd.destPath, tmp) → mv tmp cmd.destPath +``` + +**与之对比的 `offline_commit_snapshot`(commit 离线分支用)**: + +``` +top=child, base=parent # 由 LocalStorageKvmBackend.java:3827-3829 注入 +linux.qcow2_commit(top=child, base=parent) # 把 child flush 进 parent +for c in topChildrenInstallPathInDb: + linux.qcow2_rebase_no_check(base=parent, c) # child 的 children 重挂 parent +``` + +**两者方向相反**: +- `offline_merge_snapshot`(pull 用):数据从 dropped 节点 **流入 child**(每个 child 独立拷一份),dropped 文件不动 +- `offline_commit_snapshot`(commit 用):数据从 src(child) **流入 dst(被删 currentRoot)**,DB 后续会互换 installPath + +场景 03 全程使用前者。 + +### 3.3 DB 翻转(`updateDatabaseAfterPull`,对照 `../06-pull-db-rewrite.md`) + +``` +src=2, dst=5 + +更新前: + VO_5.installPath = 5.qcow2 parentUuid = 2 distance = N + VO_2.installPath = 2.qcow2 parentUuid = 1 distance = N-1 + +更新后: + VO_5.parentUuid = 1 ← 跨过 2 + VO_5.distance -= 1 + 
VO_5.installPath 不变(仍 5.qcow2,物理上含合并入的 2-vs-1 差量) + VO_5.size = newInstallPathSize(agent 返回,因合并入差量略增) + VO_2 不变(待轮 4 真删) +``` + +**与场景 02 的对照**: + +| 维度 | 场景 02(Running,commit) | 场景 03(Stopped,pull) | +|---|---|---| +| Agent 路径 | `CommitVolumeSnapshotOnHypervisorMsg` → libvirt blockCommit | `PullVolumeSnapshotOnPrimaryStorageMsg` → `offline_merge_snapshot` → `qcow2_rebase` | +| 物理操作位置 | child 数据进入被删者 | 被删者数据复制进 child(每个 child 各一份) | +| dst.installPath | **互换**:VO_2 ↔ VO_5 path 互换 | **不变**:VO_5 path 仍 5.qcow2 | +| vol.installPath | 同步切到 2.qcow2(关键脆弱点) | 不变(仍指 5.qcow2,VM 关机也不影响)| +| treeUuid 迁移 | dst=2 不是根 → 不迁移;若是根则新建 newTree | pull 路径不涉及 treeUuid 迁移 | +| GroupRef installPath | 同步互换 | 不变 | +| libvirt 调用 | blockCommit + pivot + sibling rebase | 完全不调 | +| 被删快照文件何时清 | libvirt pivot 自动删(VIR_DOMAIN_BLOCK_COMMIT_DELETE,文件名是 5.qcow2) | 轮 4 显式删(文件名是 2.qcow2) | + +### 3.4 翻转后链状态 + +``` +DB 视角: + vol.installPath = 5.qcow2(不变,VM 关机重启时按此 backing chain 启动) + VO_5.installPath = 5.qcow2 parentUuid = 1 ← 含合并入的 2-vs-1 差量 + VO_2.installPath = 2.qcow2 parentUuid = 1 ← 待删 + VO_3.installPath = 3.qcow2 parentUuid = 1 + VO_4.installPath = 4.qcow2 parentUuid = 1 + +物理 backing chain: + vol → 5.qcow2 → 1.qcow2 + 3.qcow2 → 1.qcow2 + 4.qcow2 → 1.qcow2 + 2.qcow2:仍存在但已无人引用(待轮 4 删) +``` + +--- + +## 轮 4:删 VO_2 自身 + +```java +children = [] // VO_5.parentUuid 已跨过 2 指向 1 +deleteVolumeSnapshotAndSyncVolumeSize(comp) +``` + +**消息**:`DeleteVolumeSnapshotOnPrimaryStorageMsg` + +**agent 物理动作**:删 VO_2.installPath = **2.qcow2**(场景 02 删的是 5.qcow2,是因为互换后 VO_2 指向 5;本场景未互换,VO_2 仍指 2.qcow2)。 + +**DB**:VO_2 删除,syncVolumeSize 更新 vol 的 size。 + +--- + +## 终态 + +``` +快照树: + 快照1 + ├─ 快照3 installPath=3.qcow2 backing=1.qcow2 含 (2-1) 差量 + ├─ 快照4 installPath=4.qcow2 backing=1.qcow2 含 (2-1) 差量 + └─ 快照5 ── vol installPath=5.qcow2 backing=1.qcow2 含 (2-1) 差量 + +物理: + 1.qcow2 ← 5.qcow2 ← vol + 1.qcow2 ← 3.qcow2 + 1.qcow2 ← 4.qcow2 + 2.qcow2 已删 +``` + +**注意"差量被复制 3 份"**:场景 03 由于走 pull,被删快照(2)与父(1)之间的差量数据会被分别复制到 3、4、5 
三个文件中,磁盘占用相比场景 02 偏高(场景 02 只有一份合并文件)。这是 commit-vs-pull 的固有差异,与是否在线无关。 + +与场景 02 终态对比: + +| 维度 | 场景 02 终态 | 场景 03 终态 | +|---|---|---| +| 含合并数据的物理文件 | 单个 2.qcow2(VO_5 占用,含 5+2 全合并) | 3.qcow2 / 4.qcow2 / 5.qcow2 各含一份 (2-1) 差量 | +| vol.installPath 指向 | 2.qcow2 | 5.qcow2 | +| 删除掉的物理文件 | 5.qcow2(libvirt 在 pivot 时删)+ 2.qcow2 实际名(互换后归 VO_2,轮 4 走 delete)| 2.qcow2 | +| 总磁盘占用 | 较低(差量只一份) | 较高(差量 N 份,N=child 数) | + +**功能等价**:vol 拉起的 backing chain 长度都是 2 层(`vol → child → 1.qcow2`),用户视角"快照2 已删,3/4/5 仍在"完全一致。 + +--- + +## 全程关键脆弱点(仅梳理,不含加固) + +| 轮 | 失败类型 | 当前后果 | +|---|---|---| +| 1 / 2 | qcow2_rebase 失败(agent crash 或 IO 错) | 该 child 的 backing 可能已部分改写但数据未完成;DB 翻转尚未发生 → 物理仍指 2 / DB 仍指 2,幂等可重试 | +| 1 / 2 | qcow2_rebase 成功 + DB 翻转 SQL 失败 | 物理 child.backing=1,DB child.parentUuid=2 → 不一致 | +| 3 | 同上(对快照5) | 同上 | +| 3 | DB 翻转 SQL 失败 | 物理 5.qcow2 已挂 1,DB 仍记 parentUuid=2 | +| 4 | 删 2.qcow2 失败 | 孤儿文件残留 | + +注意:场景 03 没有 active commit pivot 的状态机问题,也没有 vol.installPath 必须同步切的脆弱点;最大风险只剩"qcow2_rebase 与 DB 翻转两步非原子"。 + +--- + +## 与场景 02 的核心结论 + +1. **agent 入口完全不同**:Stopped → `OFFLINE_MERGE_PATH` (`offline_merge_snapshot` → `qcow2_rebase`);Running → `CommitVolumeSnapshotOnHypervisorMsg` (libvirt blockCommit) 或 `OFFLINE_COMMIT_PATH` (`offline_commit_snapshot` → `qcow2_commit`) +2. **物理数据落地的文件不同**:场景 02 落到 dst(被删者的 path,单一文件);场景 03 落到每个 child(多份副本) +3. **DB 是否互换 path 不同**:commit 互换、pull 不互换;这直接决定加固设计 reconciler I4(installPath 不一致)的检测要在两条路径上分别考虑 +4. **vol.installPath 同步要求不同**:场景 02 必须切(脆弱点),场景 03 不动(天然安全) +5. 
**失败模式不同**:场景 03 没有 active commit pivot 状态机问题,但 "qcow2_rebase + DB 翻转" 仍是两步非原子操作;且失败会以"3/4/5 中某些已 rebase、某些未 rebase"的部分推进态出现 + +--- + +## 附:用户直觉表述与代码事实的对应 + +用户口头描述:"把快照2 的内容合并到快照1,删除快照2,快照 3/4/5 重新指定父节点为 1"。 + +按代码事实拆解: + +| 用户语 | 代码事实 | +|---|---| +| "3/4/5 重新指定父节点为 1" | ✅ `qcow2_rebase(1.qcow2, child.qcow2)` 改写 child 头部 backing 字段;DB `VO_child.parentUuid=1` | +| "把快照2 的内容合并到快照1" | ⚠ 严格意义上 1.qcow2 不被写(只读基线)。等效效果:(快照2 - 快照1) 的差量数据被分别**复制进每个 child**,使每个 child 在新的 1.qcow2 backing 下行为等价于原先在 2.qcow2 backing 下 | +| "删除快照2" | ✅ 轮 4 真正物理删 2.qcow2 | diff --git a/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md b/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md new file mode 100644 index 00000000000..e8117418d49 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/04-deleteSingleFlows-online-offline-decision.md @@ -0,0 +1,349 @@ +# 场景 04:`deleteSingleFlows()` 中 online / offline 分支的判定时序 + +> 当前代码逻辑梳理(5.5.6 基线),不含加固设计。 +> 源码: +> - `VolumeSnapshotTreeBase.java` 行 828-1290(`deleteSingleFlows` / `stepDelete` / `commit` / `pull`) +> - `VolumeTree.java` 行 364-392(`resolveDirection` / `isOnline`) + +--- + +## 总览:online / direction 在哪两步被决定 + +### 极简决策图 + +``` + ┌────────────────────────┐ + │ deleteSingleFlows() │ + │ 查 vmState (一次) │ + └───────────┬────────────┘ + │ + ▼ + ┌────────────────────────┐ + │ stepDelete() (每轮) │ + │ children = ? │ + └───────────┬────────────┘ + │ + ┌─────────────────┼──────────────────┐ + │ │ │ + ▼ ▼ ▼ + children=0 children≥2 children=1 + │ │ │ + │ │ ┌─────┴────────┐ + │ │ │ resolveDir │ + │ │ │ +isOnline │ + │ │ └─────┬────────┘ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌─────────┐ + │ deleteVO │ │ pull │ │ commit │ or pull + │ (终结) │ │ (强制) │ │ │ + └──────────┘ └────┬─────┘ └────┬────┘ + │ │ + └────────┬─────────┘ + ▼ + ┌────────────────┐ + │ online? 
│ + └───┬────────┬───┘ + true │ │ false + ▼ ▼ + Hypervisor PrimaryStorage + Msg Msg +``` + +### 四象限:(direction × online) → agent 入口(一图速查) + +``` + ┌─────────────────────┬─────────────────────┐ + │ online = true │ online = false │ + │ (Running/Paused │ (Stopped/Destroy │ + │ + alive chain) │ 或非 alive) │ + ┌─────────────┼─────────────────────┼─────────────────────┤ + │ Commit │ libvirt blockCommit │ qemu-img commit │ + │ │ + pivot (active) │ child→parent + 子节 │ + │ (默认 / null)│ vm_plugin │ 点 rebase │ + │ │ do_block_commit │ offline_commit_ │ + │ │ │ snapshot │ + ├─────────────┼─────────────────────┼─────────────────────┤ + │ Pull │ block-stream / pull │ qemu-img rebase │ + │ │ on hypervisor │ (parent, child) │ + │ (Auto 在线) │ vm_plugin │ offline_merge_ │ + │ │ do_pull │ snapshot │ + └─────────────┴─────────────────────┴─────────────────────┘ + ↑ ↑ + 场景 02 最后一轮 场景 02 轮 1/2 + 场景 03 全程 +``` + +### "请求 → 多轮 stepDelete → agent 入口"时间线 + +``` +APIDeleteVolumeSnapshotMsg (direction=null/Auto/Pull/Commit) + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ deleteSingleFlows() │ +│ vmState = query (一次, 整请求复用) │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮1 stepDelete children=[3,4,5] │ +│ 多子节点段 → 强制 pull → child=3 → online? 
│ +│ Running+3∈alive → 在线 pull (但本例 3 非 alive → offline)│ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮2 stepDelete children=[4,5] │ +│ 多子节点段 → 强制 pull → child=4 → offline │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮3 stepDelete children=[5] │ +│ 单子节点段 → resolveDirection → Commit/Pull │ +│ → isOnline → true/false │ +│ → commit() or pull() → hypervisor / PS msg │ +└──────────────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────┐ +│ 轮4 stepDelete children=[] │ +│ 终结 → deleteVolumeSnapshotAndSyncVolumeSize │ +└──────────────────────────────────────────────────────────────┘ +``` + +--- + +## 详细判定表(保留供查阅) + +整个删除请求只关心**三个布尔 / 枚举判定**(`vmState` 是输入,`direction` / `online` 由它派生): + +| 判定 | 取值 | 决定时机 | 决定位置 | 决定输入 | +|---|---|---|---|---| +| `vmState` | Running / Paused / Stopped / Destroyed / Destroying | `deleteSingleFlows()` flow 开头 | 行 852-859 | `VmInstanceVO.state`(如果 volume 没挂 vm,`vmState=null`) | +| `direction` | Commit / Pull / Auto / null | `stepDelete()` 仅在 **children.size()==1** 时计算 | 行 904-905 | `msg.getDirection()`(API 入参)+ `currentRoot.isLatest()` + `vmState` | +| `online` | true / false | `stepDelete()` 每一轮选完 child 后立即算 | 行 906 / 行 916 | `tree.current` + `vmState` + `target/child ∈ aliveChain` | + +`commit()` / `pull()` 内部再用一次 `online`(参数透传)决定走 hypervisor 消息还是 primary storage 消息。 + +--- + +## 第一步:`vmState` 校验(行 852-859) + +```java +if (volume.getVmInstanceUuid() != null) { + vmState = Q.New(VmInstanceVO.class)...select(state).findValue(); + if (vmState != Running && vmState != Paused + && vmState != Destroyed && vmState != Stopped && vmState != Destroying) { + trigger.fail("vm is not Running/Paused/Destroyed/Stopped/Destroying"); + return; + } +} +``` + +要点: +- volume 未挂 VM → `vmState = null`,后续所有 `online` 计算返回 false(Pull 全走离线) +- 
合法的 vmState:5 种,其中 **Running / Paused 才有可能 online**;Stopped / Destroying / Destroyed 一定 offline +- `vmState` 仅查一次,整个删除请求过程中**复用同一快照值**(不在 stepDelete 每轮重查) + +--- + +## 第二步(每轮):`stepDelete()` 选 child 并判定 online / direction + +行 875-919 的伪流程: + +``` +stepDelete(): + children = tree.getSnapshotLeaf(currentRoot.uuid).getChildren() + + if children.isEmpty(): + deleteVolumeSnapshotAndSyncVolumeSize() # 终结分支,无 online/direction 判定 + return + + onlineChild = children.firstMatch(c -> isOnline(currentRoot, c, vmState)) # ⚠ Bug 0 已修复:改为 isOnAliveChain(c),命名也改为 aliveChild + + if children.size() == 1: + child = children.get(0) + direction = tree.resolveDirection(currentRoot, child, msg.direction, currentRoot.isLatest, vmState) + online = tree.isOnline(current, currentRoot, child, vmState) + if direction == Commit: + commit(child, tree, online, comp) + else: + pull(child, tree, online, comp) + else: + # 多子节点:避开 alive child(让它最后一轮单独跑 commit) + if onlineChild != null && children.get(0) == onlineChild: + child = children.get(1) + else: + child = children.get(0) + online = tree.isOnline(current, currentRoot, child, vmState) + pull(child, tree, online, comp) # 多子节点段恒走 pull,不判定 direction +``` + +### 2.1 `direction` 判定的"作用域" + +`direction` **只在 children.size()==1 时计算并使用**。多子节点段恒走 pull(行 917 `pull(...)`,不调 `resolveDirection`)。也就是说: + +- 多子节点段:`msg.getDirection()` 即使是 Commit,也**被忽略**,强制 pull +- 多子节点段最终把所有非 alive 子节点都推下去后,剩 1 个子节点(通常是 alive child)→ 才进入"判 direction"分支 + +### 2.2 `resolveDirection`(`VolumeTree.java` 行 364-387) + +```java +boolean online = (vmState == Running || Paused) + && aliveChain.contains(target) && aliveChain.contains(child); +boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + +if (initialDirection == "Pull" && shouldUseCommitStrategy) + throw "the snapshot will be deleted by block 'commit', but the direction is 'pull'"; + +if (initialDirection == null) return Commit; // 默认 Commit +if (initialDirection == "Auto") return 
shouldUseCommitStrategy ? Commit : Pull; +return DeleteVolumeSnapshotDirection.fromString(initialDirection); // 显式 Commit / Pull +``` + +输入到决策的真值表(current 树 + child=alive child 的常见情形): + +| `vmState` | `targetIsLatest` | online | shouldCommit | initial=Auto | initial=null | initial=Pull | initial=Commit | +|---|---|---|---|---|---|---|---| +| Running | false | true | **true** | **Commit** | Commit | ❌ throw | Commit | +| Running | true | true | false | Pull | Commit | Pull | Commit | +| Stopped | * | false | false | Pull | Commit | Pull | Commit | +| Paused | false | true | true | Commit | Commit | ❌ throw | Commit | + +> ⚠ **Bug 0 修复后**(参考 `../bugs.md`):`shouldUseCommitStrategy` 已解耦 vmState。新规则只看 "target/child 是否都在 aliveChain"。修复后 `Stopped + target/child∈aliveChain` 行:`shouldCommit=true`、`Auto → Commit`、`Pull → ❌ throw`。Stopped + Auto + 待删/child 都在 vol 链上 → 走 offline commit(与场景 05 路径一致),不再写出 N 份差量。 + +注意三个反直觉点: +1. `initial=null` 总是返回 Commit(不看 online) —— Commit 路径在离线下会落到 `CommitVolumeSnapshotOnPrimaryStorageMsg → offline_commit_snapshot` +2. `initial=Pull` 在 shouldCommit 时直接 throw —— API 拒绝 +3. 
`initial=Auto` 才会真正按 online 切换;这是 `APIDeleteVolumeSnapshotMsg` 默认值(前端通常不显式指定 → 走 Auto) + +### 2.3 `isOnline`(`VolumeTree.java` 行 389-392) + +```java +return treeIsCurrent + && (vmState == Running || Paused) + && aliveChain.contains(target) && aliveChain.contains(child); +``` + +四个条件全 true 才返回 true: +- `treeIsCurrent`:该 snapshot 树当前挂在 volume 上(VolumeSnapshotTreeVO.current=true) +- `vmState ∈ {Running, Paused}` +- `target`(被删者)在 aliveChain 上 +- `child`(被选中合并方)在 aliveChain 上 + +**关键观察**:`shouldUseCommitStrategy` 的 online 子句**与 `isOnline` 对 `target/child` 的判定本质相同**(除 `treeIsCurrent` 外)。所以 `direction == Commit` 几乎一定意味着 `online == true`(仅"非 current 树"是反例 —— 但非 current 树通常也不在 aliveChain)。 + +--- + +## 第三步:`commit()` / `pull()` 用 `online` 选 hypervisor 还是 primary storage 消息 + +### 3.1 `commit()` 行 1006-1080 + +```java +if (online) { + String hostUuid = ...VmInstanceVO.hostUuid; + CommitVolumeSnapshotOnHypervisorMsg cmsg = new CommitVolumeSnapshotOnHypervisorMsg(); + ... + bus.send(cmsg); // → KVMHost → vm_plugin do_block_commit (libvirt blockCommit + pivot) +} else { + CommitVolumeSnapshotOnPrimaryStorageMsg cmsg = new CommitVolumeSnapshotOnPrimaryStorageMsg(); + ... + bus.send(cmsg); // → LocalStorageKvmBackend.handle → OFFLINE_COMMIT_PATH → offline_commit_snapshot +} +``` + +### 3.2 `pull()` 行 1227-1268 + +```java +if (online) { + PullVolumeSnapshotOnHypervisorMsg pmsg = new PullVolumeSnapshotOnHypervisorMsg(); + ... + bus.send(pmsg); // → KVMHost → vm_plugin do_block_stream / do_block_commit (取决于 hypervisor 实现) +} else { + PullVolumeSnapshotOnPrimaryStorageMsg pmsg = new PullVolumeSnapshotOnPrimaryStorageMsg(); + ... 
+ bus.send(pmsg); // → LocalStorageKvmBackend.handle → OFFLINE_MERGE_PATH → offline_merge_snapshot +} +``` + +### 3.3 (direction × online) 四象限到 agent 入口 + +| direction | online | Java 消息 | Agent 入口 | 物理操作 | +|---|---|---|---|---| +| Commit | true | `CommitVolumeSnapshotOnHypervisorMsg` | KVM `vm_plugin` `do_block_commit` | libvirt blockCommit (active) + pivot | +| Commit | false | `CommitVolumeSnapshotOnPrimaryStorageMsg` | local `offline_commit_snapshot` | `qcow2_commit(child→parent)` + 给 child 的 children 重 rebase 到 parent | +| Pull | true | `PullVolumeSnapshotOnHypervisorMsg` | KVM `vm_plugin`(pull-on-hypervisor 路径,存储具体逻辑因 backend 而异) | online block-stream / commit 子型 | +| Pull | false | `PullVolumeSnapshotOnPrimaryStorageMsg` | local `offline_merge_snapshot` | `qcow2_rebase(parent, child)`(差量进 child) | + +注意第 2 行(Commit + offline)几乎只在 `initial=null`(前端不传 direction)+ Stopped 下被走到。多子节点段被强制 pull 不会落到这里。 + +--- + +## 第四步:判定时序时间线(一次 stepDelete 调用) + +``` +[控制面入口] + deleteSingleFlows() flow start + │ + ├─ Storage / Memory 类型短路 → deleteVolumeSnapshotAndSyncVolumeSize → end + │ + ├─ vmState = query VmInstanceVO.state # 仅一次 + │ 不在 5 种合法状态 → fail + │ + └─ stepDelete() # 递归入口 + │ + ├─ children = tree.snapshotLeaf(currentRoot).children + ├─ if empty → deleteVolumeSnapshotAndSyncVolumeSize → comp.success → 收敛 + │ + ├─ onlineChild = children.firstMatch(isOnline) # 选 alive child + │ + ├─ if size == 1: + │ direction = resolveDirection(target, child, msg.dir, isLatest, vmState) # ★direction 判定★ + │ online = isOnline(current, target, child, vmState) # ★online 判定★ + │ if Commit → commit(child, tree, online, comp) + │ └─ commit 内: if online → CommitOnHypervisor; else → CommitOnPS + │ else → pull(child, tree, online, comp) + │ └─ pull 内: if online → PullOnHypervisor; else → PullOnPS + │ + └─ if size >= 2: + if onlineChild != null && children.get(0) == onlineChild: + child = children.get(1) # 避开 alive,让它最后做 + online = isOnline(current, target, child, vmState) # ★online 判定★(无 
direction 判定) + pull(child, tree, online, comp) + └─ pull 内: if online → PullOnHypervisor; else → PullOnPS + +[每轮 child 处理完成后] + comp.success() → stepDelete(comp) # 重新拉一轮 children,递归直至 empty +``` + +每轮 stepDelete 至多产生一条 commit 或 pull 消息;vmState 在整个递归中复用,online 每轮单独算(树结构在变,但 vmState 不变 → online 实际由 "child 是否仍在 aliveChain" 决定)。 + +--- + +## 关键结论速查 + +1. **online / offline 不是请求级开关,是"每轮 × 该轮选中 child"级开关** +2. **direction 仅在最后一轮(children.size==1)才参与决策**;多子节点段恒走 pull +3. **vmState ∈ {Stopped, Destroyed, Destroying} → 整个请求所有轮全 offline**(无论 child 是否在 aliveChain) +4. **vmState ∈ {Running, Paused} 但被删快照不在 aliveChain → 仍 offline**(典型如:删的是分叉的 sibling 而非主链) +5. **`initial=null`(前端不传) → direction 一定 Commit**:在 Stopped 时会把 commit 路径打到 `offline_commit_snapshot`;前端如果想 Auto 行为必须显式传 `direction=Auto` +6. **`initial=Pull` 但 shouldCommit → API 直接 throw**:这是个白名单校验,避免在线 alive chain 被强制走 pull 导致 VM 被踢出 + +--- + +## 与场景 02 / 03 / 05 的对应 + +| 场景 | vmState | initial.direction | 多子节点段(轮 1-2) | 最后一轮(children.size=1) | 类型 | +|---|---|---|---|---|---| +| 02 (Running, 删快照2) | Running | Commit / Auto | online=false → 离线 pull (`offlinemerge`) | direction=Commit + online=true → **在线** commit (libvirt blockCommit + pivot) | 源码推演 | +| 03 (Stopped, 删快照2) | Stopped | Auto / Pull | online=false → 离线 pull (`offlinemerge`) | direction=Pull + online=false → 离线 pull (`offlinemerge`,差量进 5.qcow2,DB 不互换) | 源码推演 | +| **05** (Stopped, 删快照2) | Stopped | **Commit** | online=false → 离线 pull (`offlinemerge`) | direction=Commit + online=false → **离线 commit (`offlinecommit`,数据 5→2,DB 互换 + VO_2 DELETE)** | **实测** | + +注:场景 03 的"最后一轮"实际行为取决于 API 入参的 `direction`: +- `direction=null`(无入参)→ resolveDirection 返回 Commit → **同场景 05 路径** +- `direction=Auto` → 因 `online=false` 返回 Pull → `PullVolumeSnapshotOnPrimaryStorageMsg` → `offline_merge_snapshot`(数据 1→5 差量,DB 不互换) +- `direction=Pull` → 不 throw(因 `shouldCommit=false`)→ 同 Auto +- `direction=Commit` → 落到场景 05 实测路径 + +**`03-...stopped-...md` 按 `initial=Auto/Pull` 
口径写**;**`05-...actual.md` 按 `initial=Commit` 实测**。两者覆盖 Stopped 路径的两种 direction 分支。加固设计的"入参矩阵"必须分别覆盖。 diff --git a/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md b/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md new file mode 100644 index 00000000000..78ef39ae929 --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/05-local-stopped-direction-commit-actual.md @@ -0,0 +1,295 @@ +# 场景 05:local + 关机 VM + `direction=Commit` + 删 group 2(**实测**) + +> 实测于 5.5.6 基线 ZSV 环境(管理节点 172.26.53.180)。 +> 与 `02-...running-...md` / `03-...stopped-...md` 对照阅读:本文件是**实测真值**,前两个是源码推演。 +> 实测时间:2026-05-13 16:54:56 ~ 16:54:58(总耗时 ~2s)。 + +--- + +## 1. 环境与入参 + +| 项 | 值 | +|---|---| +| VM uuid | `fa51c9637c024d94a556dd474a5cd74e` | +| VM 状态(操作时) | **Stopped** | +| Host | `69a7844559844d7193c42e78095911e2` | +| 主存储 | LocalStorage `a9222f7b445e4d2ebd1f1f958dec2f7c`(`/vms_ds`) | +| Root volume | `8dea4b2bb57b402e90beb510c8784507` | +| 快照树 | `08ab32b181644617bb4f8cd32804a6dd`(current=1) | +| API | `APIDeleteVolumeSnapshotGroupMsg` | +| API uuid | `e56623d94e294f9bbabd7a1a9eaf31f2` | +| Group uuid | `ee59701943554014a95d2badb0b2b98d`(snap-group "2") | +| 入参 direction | **`Commit`** | +| 入参 scope | `single` | +| 结果 | `success=true`,操作 1 个 snapshot:`59897f45b2d841e98ec588da025dc841`(即"快照2") | + +## 2. 
操作前树结构 + +### 2.1 快照 VO 表 + +| 显示名 | snapshot.uuid | parentUuid | distance | latest | installPath 文件名 | +|---|---|---|---|---|---| +| 1 | `aa7290b5…e70c` | NULL | 1 | 0 | `8dea4b2b…4507.qcow2` | +| 2 | `59897f45…c841` | `aa72…e70c` (=1) | 2 | 0 | `aa7290b5…e70c.qcow2` | +| 3 | `92e8b9bc…bc5c` | `59897…c841` (=2) | 3 | 0 | `59897f45…c841.qcow2` | +| 4 | `0baccfe6…d49c` | `59897…c841` (=2) | 3 | 0 | `596c7400…cb54.qcow2` | +| 5 | `be2680f7…5452` | `59897…c841` (=2) | 3 | **1** | `0cabc0f3…cd1a.qcow2` | +| (vol) | volume = `8dea4b2b…4507` | — | — | — | `be2680f7…5452.qcow2` | + +> **命名错位提醒**:ZStack 实现"做快照"为"冻结当前 + 新建当前",所以 snapshot.installPath 的物理**文件名**通常是它**父辈被冻结时的旧文件名**,与该 snapshot 自身的 uuid 不一致。下面用"X.qcow2"代指 VO_X 的物理文件,文件名用括号注明。 + +### 2.2 物理 backing chain(操作前) + +``` +imagecache/template/e4e3cca9…e5c.qcow2 (镜像基线,只读) + ↑ +1.qcow2 (文件: 8dea…4507.qcow2) + ↑ +2.qcow2 (文件: aa72…e70c.qcow2) + ↑ ↑ ↑ +3.qcow2 4.qcow2 5.qcow2 +(59897…c841) (596c…cb54) (0cab…cd1a) + ↑ + vol.qcow2 (be26…5452.qcow2) +``` + +--- + +## 3. 实测 Agent HTTP POST 序列(6 次) + +抓取自 `management-server.log`(`grep 'api=e56623d94e294f9bbabd7a1a9eaf31f2'`)。 + +| # | 时间 | path | 关键参数 | 含义 | +|---|---|---|---|---| +| 1 | 16:54:56.483 | `/localstorage/volume/getbackingchain` | installPath=2.qcow2 | 查"被删者(2)的 backing"→ 得 `srcSnapshotParentPath = 1.qcow2` | +| 2 | 16:54:56.642 | **`/localstorage/snapshot/offlinemerge`** | srcPath=**1.qcow2**
destPath=**4.qcow2** (`596c…cb54`) | **轮 1:离线 pull 4 → 1**(`qcow2_rebase(1, 4)`)| +| 3 | 16:54:56.949 | **`/localstorage/snapshot/offlinemerge`** | srcPath=**1.qcow2**
destPath=**3.qcow2** (`59897…c841`) | **轮 2:离线 pull 3 → 1**(`qcow2_rebase(1, 3)`)| +| 4 | 16:54:57.236 | **`/localstorage/snapshot/offlinecommit`** | top=**5.qcow2** (`0cab…cd1a`)
base=**2.qcow2** (`aa72…e70c`)
topChildrenInstallPathInDb=[vol] | **轮 3:离线 commit 5 → 2**(`qcow2_commit(5, 2)` + 给 5 的子节点 rebase 到 2)| +| 5 | 16:54:57.589 | `/localstorage/delete` | path=**5.qcow2** (`0cab…cd1a`) | **轮 4:删除"5 物理文件"**(commit 后被抽空的 top) | +| 6 | 16:54:57.898 | `/localstorage/volume/getsize` | installPath=vol | syncVolumeSize 收尾 | + +> 全程**无** `/kvm/vm/*`(即未调 libvirt blockCommit)—— 关机路径不经 hypervisor。 + +--- + +## 4. 4 轮 stepDelete 对应 + +`VolumeSnapshotTreeBase.stepDelete()` 行 875-919 的执行展开: + +### 轮 1:children = [3, 4, 5],多子节点段(强制 pull,忽略 `direction=Commit`) + +``` +onlineChild = null (Stopped → isOnline 全 false) +child = children.get(0) = 4 ★ 实测选 4,不是 3 +online = false +pull(4, tree, online=false, comp) + → PullVolumeSnapshotOnPrimaryStorageMsg + → LocalStorageKvmBackend.handle → OFFLINE_MERGE_PATH + → OfflineMergeSnapshotCmd{srcPath=1.qcow2, destPath=4.qcow2, fullRebase=false} + → agent: linux.qcow2_rebase(1.qcow2, 4.qcow2) + # 4.qcow2 backing: 2.qcow2 → 1.qcow2,(2-1) 差量写入 4.qcow2 +DB: VO_4.parentUuid = 1, distance-- +``` + +⚠️ **修订源码推演**:之前 `02 / 03` 文档假设 `children.get(0) = 3`(按 distance/createDate 升序),实测**选到 4**。说明 `VolumeTree.SnapshotLeaf.getChildren()` 返回顺序**不保证按 distance/createDate**,由底层 collection 实现决定。对最终行为无影响(3、4 均非 alive,谁先谁后等价),但加固设计若依赖"3 一定先于 4"应避免此假设。 + +### 轮 2:children = [3, 5],多子节点段 + +``` +child = children.get(0) = 3 +online = false +pull(3, tree, online=false, comp) + → qcow2_rebase(1.qcow2, 3.qcow2) +DB: VO_3.parentUuid = 1, distance-- +``` + +### 轮 3:children = [5],单子节点段(`direction=Commit` 终于生效) + +``` +direction = resolveDirection(2, 5, "Commit", isLatest=false, Stopped) + → return fromString("Commit") = Commit + (initial=Commit 非 Pull、非 null、非 Auto,原样返回; + shouldUseCommitStrategy=false 仅影响 Pull 是否被拒,不拒 Commit) +online = isOnline(current=true, 2, 5, Stopped) = false + (Stopped → 第二个条件失败) +commit(5, tree, online=false, comp) + → online=false 分支 → CommitVolumeSnapshotOnPrimaryStorageMsg + → LocalStorageKvmBackend.handle → OFFLINE_COMMIT_PATH + → 
OfflineCommitSnapshotCmd{ + top = srcSnapshot(=5).installPath = 5.qcow2 (0cab…cd1a), + base= dstSnapshot(=2).installPath = 2.qcow2 (aa72…e70c), + topChildrenInstallPathInDb = [vol.installPath = be26…5452.qcow2] + } + → agent (offline_commit_snapshot): + if qcow2_get_backing_file(5.qcow2) != qcow2_get_backing_file(2.qcow2): + # 5 backing=2, 2 backing=1,两者不同 → 进合并 + linux.qcow2_commit(top=5.qcow2, base=2.qcow2) + # 把 5 的差量 flush 进 2;2 仍 backing=1 + for child in [vol = be26…5452.qcow2]: + if qcow2_get_backing_file(vol) != 2.qcow2: + # vol.backing 当前是 5.qcow2(0cab…cd1a)→ 不等 + linux.qcow2_rebase_no_check(base=2.qcow2, vol) + # vol.backing: 5.qcow2 → 2.qcow2 +``` + +物理结束态: +- `2.qcow2` (aa72…) 内含原 2 + 5 的合并数据,backing 仍是 1.qcow2 +- `5.qcow2` (0cab…) 已被抽空(数据已合并入 2),但**文件还在** +- `vol.qcow2` (be26…) backing 改写为 `2.qcow2` + +### 轮 3 DB 互换(SQLBatch 单事务,与场景 02 同结构) + +``` +src=5 (be26…5452), dst=2 (59897…c841) + +互换前: + VO_5.installPath = 0cab…cd1a.qcow2 parentUuid = 2 distance = 3 + VO_2.installPath = aa72…e70c.qcow2 parentUuid = 1 distance = 2 + vol.installPath = be26…5452.qcow2 + +互换后: + VO_2 整条 DB 记录删除(commit 路径"dst 即被删者",DB 不再保留旧 path) + VO_5.installPath = aa72…e70c.qcow2 parentUuid = 1 distance = 2 ← 接管 2 的物理文件 + vol.installPath = be26…5452.qcow2 (不变,但物理 backing 已切到 aa72…e70c) +``` + +⚠️ **与之前推演的差异**:源码注释推断"VO_2.installPath 互换为 5 的旧文件名",实测**直接删 VO_2**(连同 Group "2"),VO_2 没有保留任何 path 记录。互换发生在 VO_5 这一侧(VO_5 接管原 2 的文件),同时 VO_2 整条删除。 + +### 轮 4:children=[],物理清扫 + +``` +children = [] // VO_5.parentUuid 已跨过 2 指向 1 +deleteVolumeSnapshotAndSyncVolumeSize(comp) + → DeleteVolumeSnapshotOnPrimaryStorageMsg → /localstorage/delete + path = 0cab…cd1a.qcow2 ★ 删的是 5 的原物理文件(已被抽空) + → SyncVolumeSize → /localstorage/volume/getsize + vol.actualSize 更新 +``` + +--- + +## 5. 
操作后实测状态 + +### 5.1 快照 VO 表(实测) + +| name | uuid | parentUuid | distance | latest | installPath 文件名 | +|---|---|---|---|---|---| +| 1 | aa72…e70c | NULL | 1 | 0 | `8dea…4507.qcow2`(不变)| +| 3 | 92e8…bc5c | **aa72…e70c (=1)** | **2** ↓ | 0 | `59897…c841.qcow2`(不变)| +| 4 | 0bac…d49c | **aa72…e70c (=1)** | **2** ↓ | 0 | `596c…cb54.qcow2`(不变)| +| **5** | be26…5452 | **aa72…e70c (=1)** | **2** ↓ | **1** | **`aa72…e70c.qcow2`** ⬅ **变了** | + +VO_2 消失。VolumeSnapshotGroupVO "2" 同步消失。 + +### 5.2 vol.installPath(实测) + +``` +vol.installPath = /vms_ds/.../snapshots/be2680f7…5452.qcow2 +``` + +**未变**(仍是 vol 自己的 uuid 文件)。物理 backing 由原 `0cab…cd1a` 切到 `aa72…e70c`。 + +### 5.3 物理 backing chain(实测 `qemu-img info`) + +``` +imagecache/template/e4e3cca9…e5c.qcow2 + ↑ +8dea…4507.qcow2 [= VO_1 物理文件,未变] + ↑ ↑ ↑ +3.qcow2 4.qcow2 aa72…e70c.qcow2 [= 新 VO_5 物理文件,原 2.qcow2,含 5+2 合并] +(59897…c841) (596c…cb54) ↑ + be26…5452.qcow2 [vol,未变] +``` + +### 5.4 物理 ls(`/vms_ds/rootVolumes/.../snapshots/`) + +| 文件名 | size | 角色 | +|---|---|---| +| `8dea…4507.qcow2` | 18 MiB | VO_1(基础) | +| `aa72…e70c.qcow2` | 6 MiB | **新 VO_5(含合并),原 VO_2 文件被接管** | +| `59897…c841.qcow2` | 6 MiB | VO_3 | +| `596c…cb54.qcow2` | 6 MiB | VO_4 | +| `be26…5452.qcow2` | 18 MiB | vol(当前可写层) | +| ~~`0cabc0f3…cd1a.qcow2`~~ | (已删) | 原 VO_5 物理文件,被轮 4 清除 | +| `92e8…bc5c.qcow2` | 18 MiB | (操作前的 vol 文件?需另查,不影响本场景)| + +--- + +## 6. 
与源码推演(场景 03 - Commit 分支)的差异点回顾 + +| 检查点 | 源码推演 | 实测 | 一致 | +|---|---|---|---| +| 多子节点段强制 pull(忽略 direction) | ✓ | ✓ POST `offlinemerge` 而非 `offlinecommit` | ✅ | +| 多子节点段 `child = children.get(0)` 是 distance/createDate 最小者 | 推测"3" | **实测"4"** | ⚠ 顺序假设错 | +| 单子节点段 `direction=Commit` 显式传入 → resolveDirection 原样返回 Commit | ✓ | ✓ | ✅ | +| `online = false`(Stopped)→ 走 `CommitVolumeSnapshotOnPrimaryStorageMsg` | ✓ | ✓ POST 落到 `/localstorage/snapshot/offlinecommit` | ✅ | +| top=child(5), base=被删者(2), topChildren=[vol] | ✓ | ✓ 完全吻合请求 body | ✅ | +| DB 互换 installPath(VO_5 接管 2 的物理文件) | ✓ | ✓ VO_5.installPath = `aa72…e70c.qcow2` | ✅ | +| VO_2 处理方式 | 推测"互换 path 后保留至轮 4" | **实测直接删除(无保留态)** | ⚠ 互换是单边的 | +| vol.installPath 同步 | 推测"切到 2.qcow2 文件名" | **实测不变**(仍 `be26…5452.qcow2`);切换发生在物理 backing 层 `qcow2_rebase_no_check` | ⚠ DB 层 vol.installPath 是稳定的,"vol 跟随物理文件名"靠 backing 链而非 installPath 字段 | +| 轮 4 物理删 = 旧 5 物理文件(0cab…cd1a) | ✓ | ✓ | ✅ | + +### 关键修订(已影响场景 02 / 03 文档) + +1. **`children.get(0)` 顺序不保证按 distance**:场景 02 / 03 文档中"轮 1 删 3、轮 2 删 4"应改为"具体顺序由底层 collection 决定,3 和 4 中任一先后均合法" +2. **VO_2(dst 被删者)在 DB 中是"删除"而非"互换占位保留"**:场景 02 中关于"VO_2.installPath 互换为 5.qcow2 待轮 4 删"的描述需修正——`updateDatabaseAfterCommit` 直接将 VO_2 DELETE,VO_5 接收新 installPath;轮 4 删的是"VO_5 原文件"而非"VO_2 占位" +3. **`vol.installPath` 不参与互换**:commit 路径下 vol.installPath 字段稳定不变;vol 跟随到合并后文件,是通过**物理 backing 链改写**(`qcow2_rebase_no_check`)+ **VO_5 接管旧 dst 文件**的组合,DB 中 vol VO 的 installPath 字段不动 + +> 这三条修订需要回填到 `02-...running-...md` 和 `03-...stopped-...md`,作为后续修订项记入索引。 + +--- + +## 7. 
关键脆弱点(基于实测路径) + +| 阶段 | 失败 | 后果 | +|---|---|---| +| 轮 1/2 `offlinemerge` | `qcow2_rebase` 失败 / DB 翻转失败 | 某 child 物理 backing 已切但 DB parentUuid 未翻;或反之 | +| 轮 3 `offlinecommit` 第一步 `qcow2_commit(5,2)` 失败 | 2.qcow2 未含合并数据,但代码已发出请求 | DB 未翻转,幂等可重试 | +| 轮 3 `offlinecommit` 中途崩溃(`qcow2_commit` 成功 + `qcow2_rebase_no_check(vol)` 失败) | 2.qcow2 已含合并,vol.backing 仍指 5.qcow2 | DB 未翻转 → 二次删除请求可触发 reconciler 修复 | +| 轮 3 SQLBatch 失败 | 物理已合并 + vol.backing 已切,DB 仍记 vol→VO_5(0cab…) | **VO_2 仍在 DB,VO_5.installPath 仍是 0cab…,但 0cab… 物理文件已被抽空** —— 数据可见性破坏,需 reconciler 介入 | +| 轮 4 `delete` 失败 | 0cab…cd1a 文件残留 | 孤儿文件,无人引用,GC 清扫即可 | + +**Stopped + Commit 路径最严重故障 = 轮 3 物理操作成功 + DB SQLBatch 失败**:物理上 vol 已挂 2.qcow2,但 DB 仍记 vol 挂 5.qcow2(=0cab…cd1a),重启会按 DB 拉起,导致 backing chain 指向**已被抽空但未删除**的 0cab…cd1a 文件,看不到任何已写入 2.qcow2 的数据。 + +加固设计的 reconciler I3b/I4 必须覆盖此场景。 + +--- + +## 8. 一图总结(实测时序) + +``` +16:54:56.076 APIDeleteVolumeSnapshotGroupMsg 进入 + direction=Commit, scope=single, groupUuid=ee59…2b98d + │ +16:54:56.483 POST /getbackingchain (查 2.qcow2 的 backing → 1.qcow2) + │ +16:54:56.642 [轮 1] POST /offlinemerge(srcPath=1, destPath=4) + agent: qcow2_rebase(1.qcow2, 4.qcow2) + DB: VO_4.parentUuid=1, distance-- + │ +16:54:56.949 [轮 2] POST /offlinemerge(srcPath=1, destPath=3) + agent: qcow2_rebase(1.qcow2, 3.qcow2) + DB: VO_3.parentUuid=1, distance-- + │ +16:54:57.236 [轮 3] POST /offlinecommit(top=5, base=2, topChildren=[vol]) + agent: qcow2_commit(5→2) + qcow2_rebase_no_check(2, vol) + [DB SQLBatch] DELETE VO_2; VO_5.installPath=aa72…(原2文件), + VO_5.parentUuid=1, distance-- + │ +16:54:57.589 [轮 4] POST /delete(path=0cab…=旧5物理文件) + │ +16:54:57.898 POST /getsize (vol) → SyncVolumeSize + │ +16:54:58.009 APIDeleteVolumeSnapshotGroupEvent success + results: [{snapshotUuid=59897f45…c841, success=true}] + 总耗时 ≈ 1.93s +``` + +--- + +## 9. 
与场景 02 / 03 / 04 的引用更新建议 + +- `02-...running-...md` 终态表"VO_2.installPath 互换为 5"应修正为"**VO_2 被直接删除**" +- `03-...stopped-...md` 顶部"Stopped + initial=Auto/Pull"小节保留;"Stopped + initial=Commit"分支应**全部引向本文件**而非自行推演 +- `04-deleteSingleFlows-online-offline-decision.md` 末尾"场景 02/03 对应"表添加一行 "场景 05 = Stopped + Commit 实测,最后一轮走 offline commit + DB 互换 + 删 child 旧文件" +- `00-index.md` 添加场景 05 条目 diff --git a/docs/snapshot-single-delete/scenarios/_query_tree.py b/docs/snapshot-single-delete/scenarios/_query_tree.py new file mode 100644 index 00000000000..13bbc53bc7a --- /dev/null +++ b/docs/snapshot-single-delete/scenarios/_query_tree.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +import paramiko +import sys + +HOST = "172.26.53.180" +USER = "root" +PWD = "admin@123" +VM_UUID = "fa51c9637c024d94a556dd474a5cd74e" + +def run(client, cmd): + stdin, stdout, stderr = client.exec_command(cmd) + out = stdout.read().decode("utf-8", errors="replace") + err = stderr.read().decode("utf-8", errors="replace") + return out, err + +def mysql(client, sql): + cmd = "mysql -pzstack.mysql.password zstack -t -e \"" + sql.replace('"', '\\"') + "\"" + out, err = run(client, cmd) + # Filter out mysql password warning + err = "\n".join([l for l in err.splitlines() if "Using a password" not in l and l.strip()]) + return out, err + +def main(): + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + client.connect(HOST, username=USER, password=PWD, timeout=10) + + print("=" * 70) + print("1. VM 基本信息") + print("=" * 70) + out, err = mysql(client, + f"SELECT uuid, name, state, rootVolumeUuid, hostUuid FROM VmInstanceVO WHERE uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("2. 
Root Volume 信息") + print("=" * 70) + out, err = mysql(client, + f"SELECT v.uuid, v.name, v.type, v.installPath, v.size, v.primaryStorageUuid, v.rootImageUuid " + f"FROM VolumeVO v JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("3. 快照树 VolumeSnapshotTreeVO") + print("=" * 70) + out, err = mysql(client, + f"SELECT t.uuid AS treeUuid, t.volumeUuid, t.current, t.createDate " + f"FROM VolumeSnapshotTreeVO t JOIN VolumeVO v ON t.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("4. 快照树所有节点 VolumeSnapshotVO") + print("=" * 70) + out, err = mysql(client, + f"SELECT s.uuid, s.name, s.parentUuid, s.treeUuid, s.distance, s.latest, s.size, s.primaryStorageInstallPath " + f"FROM VolumeSnapshotVO s " + f"JOIN VolumeVO v ON s.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid " + f"WHERE vm.uuid='{VM_UUID}' " + f"ORDER BY s.distance, s.createDate\\G") + print(out); print(err) if err else None + + print("=" * 70) + print("5. 
物理 backing chain(在物理机上 qemu-img info)") + print("=" * 70) + # First get rootVolume installPath + out, err = run(client, + f"mysql -pzstack.mysql.password zstack -N -e \"" + f"SELECT v.installPath FROM VolumeVO v JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid WHERE vm.uuid='{VM_UUID}'\"") + root_path = out.strip().splitlines()[-1].strip() if out.strip() else "" + print(f"vol.installPath = {root_path}") + + # Get all snapshot paths + out, err = run(client, + f"mysql -pzstack.mysql.password zstack -N -e \"" + f"SELECT s.name, s.primaryStorageInstallPath FROM VolumeSnapshotVO s " + f"JOIN VolumeVO v ON s.volumeUuid=v.uuid " + f"JOIN VmInstanceVO vm ON v.uuid=vm.rootVolumeUuid " + f"WHERE vm.uuid='{VM_UUID}'\"") + print("快照物理路径列表:") + print(out) + + # Trace backing chain from vol + if root_path: + print(f"\n--- qemu-img info --backing-chain {root_path} ---") + out, err = run(client, f"qemu-img info --backing-chain {root_path} 2>&1") + print(out) + + client.close() + +if __name__ == "__main__": + main() diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md new file mode 100644 index 00000000000..2b8178990d4 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/00-overview.md @@ -0,0 +1,93 @@ +# 单盘快照删除一致性加固设计 - 总览 + +- 状态:Draft +- 日期:2026-05-13 +- 关联:ZSV-5799;MR zstack#7674 / premium#10776 / utility#5743 +- 调研基线:`docs/snapshot-single-delete/00-overview.md` + +## 文档拆分 + +| 文件 | 内容 | +|---|---| +| `00-overview.md` | 背景 / 目标 / 约束 / 整体架构(本文) | +| `01-control-plane-reconciler.md` | 控制面 VolumeSnapshotTreeReconciler 设计 | +| `02-data-plane-validation.md` | 数据面 4 层 L1-L4 校验 | +| `03-flowchain-recovery.md` | FlowChain 混合恢复策略与异常场景 | +| `04-testing-strategy.md` | 测试金字塔与用例清单 | +| `05-rollout-plan.md` | 灰度 / 监控 / 回滚 / 风险登记 | +| `06-invariants-and-scope.md` | 不变量护栏总结 / 范围之外 | + +--- + +## 1. 
背景 + +ZSV-5799 引入了 `scope=single` 单节点快照删除(commit/pull 路径)。现有实现的关键不足(详见 `docs/snapshot-single-delete/14-limitations-and-todos.md`): + +- **物理文件泄漏**:commit/pull/delete 物理失败后只 warn,文件/LV 残留 +- **DB 不一致**:DB 翻转后失败留下错位 path、悬空 parentUuid、兄弟节点 backing 与 DB parentUuid 不一致 +- **重试不幂等**:失败后中间状态可能让重试失败 +- **节点孤立**:分叉链兄弟节点物理 rebase 完成、DB 未更新 +- **在线 VM**:active commit pivot 状态机不严谨 + +## 2. 目标 + +加固现有删除单盘快照逻辑,确保: + +1. **不变量 1**:操作结束后 DB `(uuid, installPath, parentUuid, distance, treeUuid)` 与物理 qcow2 backing chain 必须一致 +2. **不变量 2**:失败重试可从任意中间状态推进到目标态,**不依赖任何额外状态字段** +3. **不变量 3**:物理删除失败不破坏不变量 1(VO 删,孤儿文件由 warn 记录) + +## 3. 约束与决策 + +| 维度 | 决策 | +|---|---| +| 一致性范围 | 物理泄漏 + DB 一致 + 重试幂等 + 在线 VM 安全,全部覆盖 | +| 状态机 | **不加新表 / 不加新字段**,靠扫描 + qcow2 物理状态推断 | +| GC 触发 | **只在操作完成 / 失败后** 跑当前快照树的局部对账 | +| 控制面预检 | 不做;首次执行走轻量路径 | +| 数据面校验 | L1 dump + L2 verify + L3 check + L4 blockJob 状态机加固,全开 | +| 物理删除失败 | 维持现状(VO 删 + warn) | +| 失败恢复 | 混合策略:可逆 flow rollback;不可逆 flow 由 reconciler 前进式补全 | + +## 4. 整体架构 + +``` + ┌─────────────────────────────────────────────┐ + 用户 / API │ 控制面(zstack management) │ + APIDeleteVolumeSnapshotMsg │ + │ │ ┌─────────────────────────┐ │ + ▼ │ │ VolumeSnapshotTreeBase │ │ + VolumeSnapshotTreeBase │ │ deletion() │ │ + │ │ │ stepDelete() │ │ + │ commit/pull/del │ └────────┬─────────────────┘ │ + ▼ │ │ success/fail │ + FlowChain │ ▼ │ + │ │ ┌─────────────────────────┐ │ + │ each step ends │ │ VolumeSnapshotTreeReconciler (新) │ + └────────────────►│ │ reconcile(treeUuid) │ + │ │ 1) 拉物理 backing chain │ + │ │ 2) 与 DB 比对 │ + │ │ 3) 输出 fix actions(受限动作集) │ + │ │ 4) 顺序执行;记 remaining │ + │ └────────┬─────────────────┘ │ + │ │ │ + └───────────┼────────────────────────────────────┘ + │ GetVolumeBackingChainFromPrimaryStorageMsg + ▼ + ┌─────────────────────────────────────────────┐ + │ 数据面(kvm agent) │ + │ │ + │ vm_plugin.py / *_plugin.py │ + │ ├─ L1 操作前 dump chain → recovery file │ + │ ├─ qemu-img commit/rebase 主操作 │ + │ ├─ L2 操作后 verify_backing_chain │ + │ ├─ L3 异常路径 qemu-img 
check │ + │ └─ L4 _wait_for_block_job 状态机加固 │ + └─────────────────────────────────────────────┘ +``` + +**核心组件三处**: + +1. **控制面**:抽出 `VolumeSnapshotTreeReconciler`(新类,不是新服务),负责 DB ↔ 物理对账 +2. **数据面**:4 层校验工具集中在 `kvmagent/zstacklib/utils/snapshot_recovery.py`(新建),所有存储后端共享 +3. **FlowChain**:success / fail 回调都先调 reconciler diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md new file mode 100644 index 00000000000..0742ba329b9 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/01-control-plane-reconciler.md @@ -0,0 +1,157 @@ +# 控制面:VolumeSnapshotTreeReconciler + +## 5.1 类设计 + +**位置**:`storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeReconciler.java` + +```java +public class VolumeSnapshotTreeReconciler { + @Autowired private CloudBus bus; + @Autowired private DatabaseFacade dbf; + + public ReconcileResult reconcile(String treeUuid, String volumeUuid, + ReconcileTrigger trigger); +} + +public class ReconcileResult { + boolean consistent; + List appliedActions; + List remaining; +} + +public enum ReconcileTrigger { + AfterCommitSuccess, AfterCommitFail, + AfterPullSuccess, AfterPullFail, + AfterDeleteSuccess, AfterDeleteFail, +} +``` + +## 5.2 工作流程 + +``` +reconcile(treeUuid, volumeUuid, trigger): + 1. 读 DB:Q.New(VolumeSnapshotVO).eq(treeUuid).list() + 若结果为空(dst 是树根的 commit 已完成切换到新 treeUuid 场景) + → 通过 volumeUuid 查 latest VO,反推真实 treeUuid 重新加载 + 2. 读物理:对每个 alive 叶节点发 GetVolumeBackingChainFromPrimaryStorageMsg + (分叉链时多发,合并去重得到全树物理 chains) + + GetSnapshotInstalledPathExistenceMsg + 3. 比对 → InconsistencyReport[] + 4. 翻译为 FixAction(受限动作集) + 5. 
顺序执行;失败的进 remaining +``` + +**注**:step 2 对分叉链需遍历所有 alive 叶节点,而不是仅当前 volume.installPath 这条线性 chain, +否则 I4(installPath 错位到非当前叶所在分支)会漏检。 + +## 5.3 不一致检测(5 类) + +| ID | 名称 | 检测 | 修复 | +|---|---|---|---| +| **I1** | 物理已不存在 / DB 仍有 | `physical.exists=false && dbVO != null` | DELETE_DB_VO + 重算 distance/parent | +| **I2** | DB 已删 / 物理仍在 | `physical.exists=true && dbVO=null` | SCHEDULE_GC_ORPHAN_FILE | +| **I3** | parentUuid 不一致 | `db.parent != null && physical.backing != db.parent.installPath`(必须先排除悬空 → I3b 优先评估)| UPDATE_DB_PARENT_UUID + distance | +| **I3b** | 悬空 parentUuid | `db.parentUuid != null && Q(VolumeSnapshotVO).eq(uuid, parentUuid) == null`(兄弟 rebase 完成后 parent VO 已被删,自身 parentUuid 仍指向已删 UUID)| 三种子情形:(a) `physical.backing` 能反查到树内某 alive VO → UPDATE_DB_PARENT_UUID = 该 VO.uuid;(b) `physical.backing == null`(已 rebase 到卷 base)→ UPDATE_DB_PARENT_UUID(null);(c) `physical.backing` 存在但反查不到任何 alive VO(指向已被 stepDelete 的 VO 物理路径,物理 rebase 尚未发生)→ 不动 DB,记 remaining 由下次重试推动物理 rebase 后再修 | +| **I4** | installPath 不一致 | DB.installPath 物理不存在但能在树内任一 alive 叶 backing chain 中找到该 uuid 对应物理位置 | UPDATE_DB_INSTALL_PATH + size | +| **I5** | latest 标志错位 | aliveChain 末端 latest=false 或非末端 latest=true | UPDATE_DB_LATEST_FLAG | + +## 5.4 受限动作集 + +```java +public enum FixActionType { + DELETE_DB_VO, + UPDATE_DB_PARENT_UUID, + UPDATE_DB_INSTALL_PATH, + UPDATE_DB_LATEST_FLAG, + SCHEDULE_GC_ORPHAN_FILE +} +``` + +**显式禁止**:reconciler 不发 Commit/Pull/Delete*Msg、不调 agent rebase。修物理的责任全部在 agent 层。 + +**评估顺序**(强制): +1. I1(自身物理不存在) +2. I3b(parent 悬空)— 必须先于 I3,避免 `db.parent` 为 null 时 I3 NPE +3. I3(parent 存在但 installPath 不一致)— 仅在 `db.parent != null` 时评估 +4. I4(自身 installPath 错位) +5. I5(latest flag 错位) +6. 
I2(孤儿物理文件)— 最后处理,避免误删与 I1/I4 修复相关文件 + +## 5.5 调用点 + +`VolumeSnapshotTreeBase.java` 修改: + +```java +private void commit(VolumeSnapshotLeaf child, VolumeTree tree, boolean online, Completion comp) { + final String treeUuid = currentRoot.getTreeUuid(); + final String volumeUuid = volume.getUuid(); + final boolean dstIsRoot = (dstSnapshotInv.getParentUuid() == null); + + FlowChain chain = ... .done(new FlowDoneHandler(comp) { + public void handle(Map data) { + logReconcile(reconciler.reconcile(treeUuid, volumeUuid, AfterCommitSuccess)); + // dst 是根节点:updateDatabaseAfterCommit 会创建新 treeUuid 并迁移 VO + // 此时旧 treeUuid 下已无 VO,需对账新 treeUuid(reconciler 内部通过 volumeUuid 反查) + // 此处显式再调一次以护栏 + if (dstIsRoot) { + logReconcile(reconciler.reconcile(null, volumeUuid, AfterCommitSuccess)); + } + comp.success(); + } + }).error(new FlowErrorHandler(comp) { + public void handle(ErrorCode err, Map data) { + try { logReconcile(reconciler.reconcile(treeUuid, volumeUuid, AfterCommitFail)); } + catch (Throwable t) { logger.warn("reconcile failed", t); } + comp.fail(err); + } + }); + chain.start(); +} +``` + +`pull()` 与 `deleteVolumeSnapshotAndSyncVolumeSize()` 同结构改造。 + +**dst-is-root 双树对账**:commit 根节点时 SQLBatch 会 `persist(newTree)` 并把 src 子树迁到新 treeUuid(详见 `docs/snapshot-single-delete/05-commit-db-swap.md` §5.3)。 +若调用方持有的是旧 treeUuid,reconciler step 1 会扫到空集合 → 通过 volumeUuid 反查 latest VO 即可拿到新 treeUuid, +所以传 `null` treeUuid 是合法签名,由 reconciler 自动解析。 + +**成功路径触发策略**: +- Phase 1-2(灰度观察期):`done` 和 `error` 都触发,验证 reconciler 检测准确率 +- Phase 4(默认开启后):保留双触发。理由:L2 失败抛 `PostOpVerifyError` 已走 `error` 分支; + 但 SQLBatch 成功 + agent reply 路径也可能由于"agent 实际成功 reply 误标 fail"(场景 1 镜像)使 DB 与物理静默漂移, + 成功路径对账可在低概率下捕获这种漏报。每次成功操作的对账代价由 ISSUE 1 的锁外异步采样化解。 + +## 5.6 设计不变量 + +- **幂等收敛**:多次调用结果相同;不会把已一致状态修坏 +- **不抛异常给调用方**:reconciler 失败不让 commit/pull 的成功变失败 +- **同步运行在 chainSubmit 锁内**:reconciler 在 commit/pull 的 done/error 回调内同步执行,期间持 chainSubmit 锁;不引入额外锁、不做 CAS。串行性由外层 vm 队列 + chainSubmit 双重保证(见 §5.6.1) +- **SQLBatch 
单事务**:所有 DB 修补原子 + +### 5.6.1 串行性来源(不需要额外锁的依据) + +`APIDeleteVolumeSnapshotGroupMsg` → `VolumeSnapshotGroupBase.handleDelete` 通过 `overlaySend(DeleteVolumeSnapshotGroupInnerMsg)` 把请求排到 vm 队列;`completion.done()` 在 `overlaySend` 回调内调用,回调返回前下一个排队请求无法进入。叠加同一棵快照树的 `chainSubmit` 串行: + +``` +vm 队列 ──► chainSubmit ──► commit/pull flow ──► done/error ──► reconciler ──► comp.success/fail ──► chainSubmit 释放 ──► vm 队列释放 +``` + +因此 reconciler 跑完前不会有任何同卷 / 同组的新请求观察到中间状态。原计划的"段 2 释放 chainSubmit + CAS"为冗余设计,已废弃。reconciler 内部仍然按 §5.2 顺序"读 DB → 拉物理 → SQLBatch 修补"线性执行,全程持锁。 + +## 5.7 熔断与降级 + +| GlobalConfig | 默认 | 含义 | +|---|---|---| +| `volumeSnapshot.reconciler.enabled` | true | 总开关 | +| `volumeSnapshot.reconciler.timeout.sec` | 30 | 拉物理 chain 超时 | +| `volumeSnapshot.reconciler.maxFixActions` | 50 | 单次最多修补数(熔断)| + +## 5.8 可观测性 + +``` +[VolumeSnapshotTreeReconciler] tree={treeUuid} trigger=AfterCommitSuccess + inconsistencies: I3(snap-a parentUuid mismatch), I2(orphan-file /xxx.qcow2) + applied: UPDATE_DB_PARENT_UUID(snap-a), SCHEDULE_GC_ORPHAN_FILE(/xxx.qcow2) + remaining: [] + duration_ms: 152 +``` diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md new file mode 100644 index 00000000000..43e471a9eed --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/02-data-plane-validation.md @@ -0,0 +1,260 @@ +# 数据面:四层校验 + +## 6.1 共享工具模块 + +**新建** `zstacklib/zstacklib/utils/snapshot_recovery.py`: + +```python +class ChainSnapshot: + path: str + backing_file: str | None + virtual_size: int + actual_size: int + file_format: str + md5_header: str + +class ChainSnapshotSet: + operation: str + timestamp: float + snapshots: dict[str, ChainSnapshot] + def dump_to_file(self, path): ... + @staticmethod + def load_from_file(path) -> "ChainSnapshotSet": ... + +def take_chain_snapshot(paths: list[str]) -> ChainSnapshotSet: ... 
+def verify_post_op(before: ChainSnapshotSet, expected: dict[str, str]) -> VerifyResult: ... +``` + +**扩展** `linux.py`: + +```python +def qcow2_get_backing_chain_strict(path) -> list[str]: + """读 qcow2 backing chain,遇错抛 QcowReadError""" + +def qemu_img_check(path, repair=None) -> CheckResult: + """qemu-img check,结构化结果""" +``` + +## 6.2 L1 — 操作前 chain 快照 + +**目的**:进程崩溃 / 宿主机断电后,重启能根据 dump 判断"上次进度" + +**dump 路径**:`/var/lib/zstack/snapshot-recovery/{volumeUuid}-{uuid}.json` + +接入示例(`vm_plugin.py` block_commit handler): + +```python +@kvmagent.replyerror +def block_commit(self, req): + cmd = jsonobject.loads(req[http.REQUEST_BODY]) + + # L1:dump pre-op chain + paths = [cmd.top, cmd.base] + (cmd.topChildrenInstallPathInDb or []) + pre_snap = take_chain_snapshot(paths) + pre_snap.operation = 'commit' + recovery_file = "/var/lib/zstack/snapshot-recovery/%s-%s.json" % ( + cmd.volumeUuid, uuidhelper.uuid()) + pre_snap.dump_to_file(recovery_file) + + try: + vm = get_vm_by_uuid(cmd.vmUuid) + vm.do_block_commit(cmd, cmd.volume) + for child in (cmd.topChildrenInstallPathInDb or []): + if linux.qcow2_get_backing_file(child) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, child) + + # L2:post-op verify + verify_post_commit(pre_snap, cmd.base) + + linux.rm_file_force(recovery_file) + return jsonobject.dumps(rsp) + except Exception: + raise # 失败保留 recovery 文件 +``` + +**生命周期**: +- 成功 → 删除 +- 失败 → 保留供下次操作 / 启动恢复消费 +- 超 24h → kvmagent 启动时清理 + +### 6.2.1 其它路径 L1 接入模板 + +**离线 commit**(`localstorage.py:859` / `nfs:.625` / `smp:.506` / `sb:.1285`): + +```python +# paths = top + base + 兄弟节点(commit 后兄弟需 rebase 到 base) +paths = [cmd.top, cmd.base] + (cmd.topChildrenInstallPathInDb or []) +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'offline-commit' +recovery_file = ".../%s-%s.json" % (cmd.volumeUuid, uuidhelper.uuid()) +pre_snap.dump_to_file(recovery_file) +try: + linux.qcow2_commit(cmd.top, cmd.base) + for child in (cmd.topChildrenInstallPathInDb or []): + if 
linux.qcow2_get_backing_file(child) != cmd.base: + linux.qcow2_rebase_no_check(cmd.base, child) + verify_post_commit(pre_snap, cmd.base) + linux.rm_file_force(recovery_file) +except Exception: + raise +``` + +**离线 pull**(`localstorage.py:835` 等): + +```python +# paths = src + dst + dst.children(pull 后 dst.children 需 rebase 到 src) +paths = [cmd.srcPath, cmd.dstPath] + (cmd.dstChildrenInstallPathInDb or []) +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'offline-pull' +... +linux.qcow2_commit(cmd.dstPath, cmd.srcPath) # pull = reverse commit +verify_post_pull(cmd.srcPath, expected_backing=pre_snap.snapshots[cmd.srcPath].backing_file, + full_rebase=False) +``` + +**fullRebase**(`create_template_with_task_daemon` + mv,详见 `docs/snapshot-single-delete/12-fullrebase-and-cleanup.md`): + +```python +# paths = dst + dst 整条 backing chain(fullRebase 会全部展平进 tmp) +chain = linux.qcow2_get_backing_chain_strict(cmd.destPath) +paths = [cmd.destPath] + chain +pre_snap = take_chain_snapshot(paths) +pre_snap.operation = 'fullRebase' +pre_snap.metadata['tmp_path'] = cmd.destPath + '.tmp' # 登记临时文件路径 +recovery_file = ... +pre_snap.dump_to_file(recovery_file) +try: + create_template_with_task_daemon(cmd.destPath, cmd.destPath + '.tmp') + linux.mv(cmd.destPath + '.tmp', cmd.destPath) + verify_post_pull(cmd.destPath, expected_backing=None, full_rebase=True) + linux.rm_file_force(recovery_file) +except Exception: + # 若 tmp 残留,启动恢复扫到 metadata.tmp_path 即可清理 + raise +``` + +**SharedBlock**:`paths` 用 LV 设备路径(`/dev/{vgUuid}/{lvUuid}`),`take_chain_snapshot` 内部对 LV 路径做 `qemu-img info` 即可,无需特殊分支。 + +## 6.3 L2 — 操作后自检 + +```python +def verify_post_commit(pre, base): + actual_backing = linux.qcow2_get_backing_chain_strict(base)[0:1] + expected_backing = pre.snapshots[base].backing_file + if actual_backing and actual_backing[0] != expected_backing: + raise PostOpVerifyError(...) 
+ # size 检查降级为 warn:commit src 可能是零差量、qcow2 压缩、稀疏文件,不能强制断言增大 + # 阈值与容差由 [snapshot_recovery] size_check_threshold_bytes / size_check_tolerance_ratio 配置(见 6.7) + if pre.snapshots[base].actual_size > config.size_check_threshold_bytes: + new_size = linux.get_local_file_disk_usage(base) + if new_size < pre.snapshots[base].actual_size * config.size_check_tolerance_ratio: + logger.warn("base %s disk usage shrank from %d to %d after commit, " + "possibly compression/sparse, verify backing OK" % + (base, pre.snapshots[base].actual_size, new_size)) + +def verify_post_rebase(target, expected_backing): + actual = linux.qcow2_get_backing_file(target) + if actual != expected_backing: + raise PostOpVerifyError(...) + +def verify_post_pull(dst, expected_backing, full_rebase): + actual = linux.qcow2_get_backing_file(dst) + if full_rebase and actual: + raise PostOpVerifyError(...) + if not full_rebase and actual != expected_backing: + raise PostOpVerifyError(...) +``` + +接入点: + +| 操作 | 文件位置 | 验证 | +|---|---|---| +| 在线 blockCommit | `vm_plugin.py:9845` 主操作完成 | verify_post_commit | +| 在线兄弟 rebase | `vm_plugin.py:9857` 循环内 | verify_post_rebase | +| 离线 commit | `localstorage.py:859` / `nfs:.625` / `smp:.506` / `sb:.1285` | commit + rebase | +| 离线 pull | `localstorage.py:835` 等 | verify_post_pull | +| fullRebase mv 后 | 同上 | verify_post_pull(full_rebase=True) | + +失败抛 `PostOpVerifyError`(继承 `kvmagent.KvmError`)→ HTTP 500 → 控制面 FlowChain error → reconciler 介入 + +## 6.4 L3 — qemu-img check(异常路径) + +```python +def qemu_img_check(path, repair=None): + args = ['check', '-f', 'qcow2'] + if repair: args += ['-r', repair] + args.append(path) + out = shell.call(qemu_img.cmd(args)) + return parse_check_output(out) +``` + +触发: +1. L2 失败前先跑一次区分"qemu-img 静默错误" vs "文件已损坏" +2. 启动恢复诊断时 +3. 
控制面 `CheckSnapshotIntegrityMsg` 显式触发 + +**仅检测,不自动修复**。`-r` 修复仅在控制面 API 显式批准时使用。 + +## 6.5 L4 — blockJob 状态机加固 + +```python +class BlockJobState(enum.Enum): + NOT_STARTED, RUNNING, READY, COMPLETED, PIVOTED, CANCELLED, FAILED + +class BlockJobMonitor: + def __init__(self, domain, disk_name, active_commit, timeout_sec): ... + def poll(self) -> BlockJobState: ... + def wait_until(self, target_states: set, timeout: int) -> BlockJobState: ... +``` + +**active commit 状态机**: + +``` +NOT_STARTED → RUNNING ──(timeout)──► FAILED → raise + │ ready event + ▼ + READY ──(timeout)──► FAILED → blockJobAbort(no pivot) → CANCELLED → raise + │ blockJobAbort(PIVOT) + ▼ + PIVOTED ──verify domain XML source==base──► COMPLETED + │ no + ▼ FAILED → raise +``` + +**改造点**: +1. 用 `wait_until({READY})` 替换"轮询 job 不在" +2. pivot 前必须确认 READY +3. 任何超时显式 CANCELLED +4. 终态通过读 domain XML disk source 二次确认 + +## 6.6 启动恢复 + +```python +def on_kvmagent_startup(): + for f in glob('/var/lib/zstack/snapshot-recovery/*.json'): + snap = ChainSnapshotSet.load_from_file(f) + if time.time() - snap.timestamp > 86400: + linux.rm_file_force(f); continue + for path in snap.snapshots: + if not os.path.exists(path): continue + result = qemu_img_check(path) + if result.image_corrupted: + logger.error("recovery: corrupted file %s" % path) + write_diagnostic_report(snap, f.replace('.json', '.report.json')) +``` + +**只诊断不改文件**。控制面通过 `GET /snapshot-recovery/report` 端点读取诊断报告。 + +## 6.7 配置 + +```ini +[snapshot_recovery] +enable_l1_dump = true +enable_l2_verify = true +enable_l3_check_on_error = true +recovery_dir = /var/lib/zstack/snapshot-recovery +recovery_max_age_hours = 24 +blockjob_timeout_sec = 3600 +size_check_threshold_bytes = 104857600 # 100 MiB;base.actual_size 大于此值才做 size warn +size_check_tolerance_ratio = 0.9 # 允许新尺寸不低于旧尺寸 * 此比值,否则记 warn +``` diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md 
b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md new file mode 100644 index 00000000000..c4e159fb14f --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/03-flowchain-recovery.md @@ -0,0 +1,76 @@ +# FlowChain 改造(混合恢复策略) + +## 7.1 可逆性分类 + +| Flow | 可逆性 | 失败策略 | +|---|---|---| +| AllocatePrimaryStorageSpaceMsg | ✅ 可逆 | FlowChain 自带 rollback | +| CommitOnHypervisor/PrimaryStorage | ❌ 不可逆 | 不回滚,reconciler 前进式补全 | +| updateDatabaseAfterCommit | ✅ 事务回滚 | SQLBatch 自然回滚 + reconciler 二次对账 | +| 兄弟节点 rebase | ❌ 不可逆 | agent L2 抛错 → reconciler 修 DB parentUuid | + +## 7.2 改造模板 + +见 `01-control-plane-reconciler.md` 5.5 节代码示例:error 回调先 reconcile 再 fail,原错误向上抛但 DB 已尽力收敛。 + +## 7.3 子流程失败处理 + +- **doCommitOnHypervisorOrPrimaryStorageFlow**:agent 抛错 → flow fail → reconciler 反查物理实际状态 → 修 DB;agent 实际成功但回复丢失的场景,reconciler 把 DB 推到"成功后状态",但**仍返回原错误**给用户 +- **updateDatabaseAfterCommitFlow**:SQLBatch 失败 → 物理已变 DB 未变 → reconciler 反推应有 DB 状态 → 重新 SQL;二次失败进 remaining +- **兄弟节点 rebase**:agent 单个 child 失败立即抛 → reconciler 比对每个 child backing 与 DB parentUuid 逐个修 + +## 7.4 异常场景验证(手算) + +**场景 1:在线 active commit pivot 后 agent 进程死** + +1. agent L1 dump 已写盘 +2. 控制面 commit flow 超时 → error 回调 +3. reconciler.reconcile(AfterCommitFail) + - 拉物理 chain:base 已合并完成、top 已删 + - 检 I4(installPath 不一致)→ UPDATE_DB_INSTALL_PATH + - 检 I3(src.parentUuid 仍指 dst)→ UPDATE_DB_PARENT_UUID +4. 用户收到原错误(commit timeout) +5. 重试删除 → DB 已收敛 → 走快速路径直接 deleteVolumeSnapshotAndSyncVolumeSize + +✅ 闭环 + +**场景 2:DB 翻转 SQL 失败** + +1. 物理已 commit 完成,updateDatabaseAfterCommitFlow 失败 +2. reconciler 反推修 DB + +✅ 闭环 + +**场景 3:兄弟节点 rebase 中途失败(5 个兄弟 rebase 完 2 个失败)** + +1. agent L2 在第 3 个兄弟报错 → flow fail +2. 
reconciler 读所有兄弟 backing: + - 已 rebase 的 2 个:`physical.backing` 已变(指向 base),DB `parentUuid` 仍指 dst(被删 VO) + → I3b 子情形 (a) 触发:physical.backing 反查到 base.uuid → UPDATE_DB_PARENT_UUID = base.uuid + - 未 rebase 的 3 个:`physical.backing` 仍指 dst.installPath(dst VO 已删,反查不到 alive VO) + → I3b 子情形 (c) 触发:不动 DB,记 remaining,等下次重试推动物理 rebase + - dst 自身:物理仍存在 + DB VO 已被 stepDelete 删 → I2 触发,SCHEDULE_GC_ORPHAN_FILE + - 注:因 I2 评估顺序最末(见 `01` §5.4),不会误删尚被未 rebase 兄弟引用的 dst.installPath; + SCHEDULE_GC 内部会再检物理是否仍被引用,若是则放弃删除 +3. 重试删除请求 → reconciler 第二轮:未 rebase 的 3 个仍是 I3b(c),agent 重做 rebase 后变 (a);最后 dst 失去引用,GC 才真清 + +✅ 闭环(依赖 I3b 三子情形 + I2 末位评估,详见 `01-control-plane-reconciler.md` §5.3 / §5.4) + +**场景 4:reconciler 自身 SQL 失败** + +1. remaining[] 记录 + warn 日志 +2. 下次任何对该树操作再次触发对账 +3. 持续不一致 → 运维介入 + +✅ 至少不越修越坏 + +## 7.5 并发与锁 + +- reconciler 在 chainSubmit 锁内同步执行(commit/pull 的 done/error 仍持锁,期间不释放) +- **不引入额外锁、不做 CAS**:串行性由外层双重保护—— + - vm 队列:`APIDeleteVolumeSnapshotGroupMsg` 通过 `overlaySend` 排到 vm 队列,`completion.done()` 在 reconciler 跑完后才执行,下一个请求才能出队(见 `01` §5.6.1) + - chainSubmit:同一棵快照树的所有 commit/pull 已串行 +- 跨树并发:reconciler 只动当前 treeUuid VO,无冲突 +- GC 异步框架自身去重,与新业务并发无影响 + +**代价权衡**:reconciler 期间持 chainSubmit + vm 队列锁,意味着同卷 / 同组下一个请求最多等待一次 reconcile(含 `GetVolumeBackingChainFromPrimaryStorageMsg` 网络往返,超时由 `volumeSnapshot.reconciler.timeout.sec=30` 兜底)。但用户调用本来就是串行排队,等待落在原本要排队的请求上,没有放大延迟。 diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md new file mode 100644 index 00000000000..1d8c70ca69d --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/04-testing-strategy.md @@ -0,0 +1,63 @@ +# 测试策略 + +## 8.1 测试金字塔 + +``` + ┌──────────────────────┐ + │ E2E (~5 cases) │ + ├──────────────────────┤ + │ Integration (~30) │ + ├──────────────────────┤ + │ Unit (~100) │ + └──────────────────────┘ +``` + +## 8.2 单元测试(控制面) + 
+`storage/src/test/.../VolumeSnapshotTreeReconcilerTest.java`: + +- test_I1_physical_missing_db_present +- test_I2_orphan_file +- test_I3_parent_uuid_mismatch +- test_I4_install_path_swap +- test_I5_latest_flag_wrong +- test_idempotent_double_call +- test_max_fix_actions_circuit_breaker +- test_physical_unreachable +- test_sql_batch_fail +- **test_no_business_action_dispatched**(不变量护栏:spy CloudBus 验证从未发 Commit/Pull/Delete*Msg) + +## 8.3 单元测试(数据面) + +`kvmagent/test/test_snapshot_recovery.py`: + +- test_chain_snapshot_dump_load +- test_take_chain_snapshot_with_missing_file +- test_verify_post_commit_backing_unchanged +- test_verify_post_commit_size_shrank +- test_verify_post_rebase_mismatch +- test_verify_post_pull_full_rebase +- test_qemu_img_check_corrupted +- test_blockjob_state_machine_pivot_path +- test_blockjob_timeout_cancellation +- test_recovery_file_lifecycle + +## 8.4 集成测试(ZSTACK_SIMULATOR) + +- TestSingleSnapshotDeleteCommitSuccess +- TestSingleSnapshotDeleteCommitFailReconcile +- TestSingleSnapshotDeletePullForkChain +- TestSingleSnapshotDeleteSqlBatchFail +- TestSingleSnapshotDeleteRetryIdempotent +- TestSingleSnapshotDeleteOrphanGc +- TestSingleSnapshotDeleteSiblingDbCorrection + +## 8.5 E2E 测试 + +| 编号 | 步骤 | +|---|---| +| E1 | 5 层链 → 删中间快照(在线 commit)→ 验证文件链与 DB | +| E2 | 同 E1 + 中途 `kill -9 kvmagent` → 重启 → 验证 reconcile + 重试 | +| E3 | 分叉链(2 子节点)→ 删根节点 → 验证两子各自 backing 与 DB | +| E4 | 离线 pull 大文件(10 GB qcow2 fullRebase)→ 中途断电 → 启动恢复诊断 | +| E5 | 快照组(3 卷),其中 1 卷 reconcile 失败 → 验证其它卷不受影响 | diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md new file mode 100644 index 00000000000..11388f13b74 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/05-rollout-plan.md @@ -0,0 +1,55 @@ +# 上线计划 + +## 9.1 灰度 + +``` +Phase 1 (周 1):默认 false 上线 + - 仅日志旁路:reconcile 跑但不执行 FixAction + - 验证检测准确率 + +Phase 2 
(周 2):测试环境开启 + - 全测试集群 enabled=true,跑 E2E + 压力 + +Phase 3 (周 3-4):开发/UAT 集群灰度 + - 一台真实业务集群打开,观察一周 + +Phase 4 (周 5+):默认开启 + - release notes,保留 GlobalConfig 关闭通道 +``` + +## 9.2 监控告警 + +| 日志 grep | 阈值 | +|---|---| +| `[VolumeSnapshotTreeReconciler] applied:` | > 10/h | +| `[VolumeSnapshotTreeReconciler] remaining:` | > 0 | +| `[VolumeSnapshotTreeReconciler] circuit-breaker triggered` | 立即 | +| `PostOpVerifyError` | > 5/h | +| `recovery: corrupted file` | 立即 | + +## 9.3 文档 + +| 产出 | 位置 | +|---|---| +| 设计 spec | 本目录 | +| 运维手册 | `docs/snapshot-single-delete/15-operation-runbook.md` | +| Reconciler 排错指南 | 同上附录 | +| GlobalConfig | release notes | + +## 9.4 回滚预案 + +1. **快速止血**:`updateGlobalConfig volumeSnapshot reconciler.enabled false` +2. **代码回滚**:reconciler 调用全 try-catch,关闭等价于现状 +3. **数据修复**:reconciler 只动 DB 不动物理,最坏 SQL 反向恢复 + +agent 侧 L1/L2/L4 经 `kvmagent.conf` 开关独立回滚。 + +## 9.5 风险登记 + +| 风险 | 等级 | 缓解 | +|---|---|---| +| reconciler 误判物理状态错改 DB | 高 | 单元测试 + 灰度日志旁路 + circuit-breaker | +| L1 dump 文件累积撑爆磁盘 | 中 | 24h 自动清理 + 磁盘监控 | +| L4 状态机改造引入回归 | 中 | 单元测试 + fallback 开关 | +| 对账 SQL 与并发新建快照冲突 | 低 | chainSubmit 已串行 | +| GCJob 入队过多 | 低 | 现有框架去重 | diff --git a/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md new file mode 100644 index 00000000000..1b79fe9a045 --- /dev/null +++ b/docs/superpowers/specs/2026-05-13-snapshot-single-delete-hardening/06-invariants-and-scope.md @@ -0,0 +1,23 @@ +# 不变量护栏总结 / 范围之外 + +## 10. 不变量护栏总结 + +设计的核心护栏(任意一项被破坏即视为设计失败): + +1. reconciler 永远不发 Commit/Pull/Delete*Msg(单元测试强制) +2. reconciler 不抛异常给调用方 +3. reconciler 多次调用结果一致(幂等收敛) +4. agent L2 失败必抛 PostOpVerifyError,不静默 +5. L1 dump 文件成功必删,失败必留 +6. FlowChain error 路径必先 reconcile 后 fail +7. maxFixActions 熔断保护(默认 50) +8. 所有 GlobalConfig / kvmagent.conf 开关可独立关闭 + +## 11. 
范围之外 + +- Ceph RBD:本设计不涉及(普通 RBD 快照不支持 commit/pull,超出 single 删除范围) +- **StorageSnapshot / Memory 快照 / CDP**:在 `VolumeSnapshotTreeBase.java:836` 提前 return,绕过 commit/pull 路径,由 `deleteVolumeSnapshotAndSyncVolumeSize` 直接处理,无需加固(详见 `docs/snapshot-single-delete/13-premium-and-cdp.md` §13.2) +- 链克隆 + single 删除并存(VolumeSnapshotReferenceVO TODO):独立议题 +- 全量定时 GC:本设计不引入;只做"操作后局部对账" +- VmState 扩展(如 Migrating):独立议题 +- 快照组并发度可配:独立议题 diff --git a/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java b/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java index cb1f8dde454..2a534d29203 100644 --- a/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java +++ b/header/src/main/java/org/zstack/header/storage/snapshot/group/APIDeleteVolumeSnapshotGroupMsg.java @@ -31,6 +31,9 @@ public class APIDeleteVolumeSnapshotGroupMsg extends APIDeleteMessage implements @APIParam(required = false, validValues = {"single", "chain", "auto"}) private String scope = "chain"; + @APIParam(required = false) + private boolean force = false; + @APINoSee private String vmUuid; @@ -58,6 +61,14 @@ public void setScope(String scope) { this.scope = scope; } + public boolean isForce() { + return force; + } + + public void setForce(boolean force) { + this.force = force; + } + public String getVmUuid() { return vmUuid; } diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java index beb8b044d1a..7aa6adea686 100755 --- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeSnapshotTreeBase.java @@ -884,8 +884,8 @@ private void stepDelete(Completion completion) { return; } - VolumeTree.VolumeSnapshotLeaf onlineChild = children.stream() - .filter(child -> volumeTree.isOnline(current, 
currentRoot.getUuid(), child.getUuid(), vmState)) + VolumeTree.VolumeSnapshotLeaf aliveChild = children.stream() + .filter(child -> volumeTree.isOnAliveChain(child.getUuid())) .findFirst().orElse(null); Completion comp = new Completion(completion) { @@ -910,7 +910,10 @@ public void fail(ErrorCode errorCode) { pull(child, volumeTree, online, comp); } } else { - if (onlineChild != null && Objects.equals(child.getUuid(), onlineChild.getUuid())) { + // Multi-children: defer the alive-chain child to the final round so that any in-flight failure + // on a non-alive sibling does not corrupt the volume's live backing chain. This guard now + // applies to Stopped VMs as well, because isOnAliveChain is vmState-independent. + if (aliveChild != null && Objects.equals(child.getUuid(), aliveChild.getUuid())) { child = children.get(1); } boolean online = volumeTree.isOnline(current, currentRoot.getUuid(), child.getUuid(), vmState); @@ -2142,26 +2145,37 @@ protected Boolean scripts() { return cleanup; } - // The logic for cleaning up snapshot groups when deleting a snapshot chain + // The logic for cleaning up snapshot groups when deleting a snapshot chain. + // Symmetric with ungroupAfterDeleteSingleSnapshot: regardless of root/data volume type, + // a group is only disbanded after ALL its refs have snapshotDeleted=true. + // This avoids leaving orphan refs (root chain delete used to immediately drop the group VO, + // leaving data-volume refs pointing to a non-existent group). 
private void ungroupAfterDeleted(List snapshots) { List uuids = snapshots.stream().map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); SQL.New(VolumeSnapshotGroupRefVO.class).in(VolumeSnapshotGroupRefVO_.volumeSnapshotUuid, uuids) .set(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).update(); - if (currentRoot.getVolumeType().equals(VolumeType.Root.toString())) { - List groupUuids = new ArrayList<>(); - for (VolumeSnapshotInventory snapshot : snapshots) { - String groupUuid = snapshot.getGroupUuid(); - if (groupUuid != null) { - logger.debug(String.format("root volume snapshot[uuid:%s, name:%s] has been deleted, " + - "ungroup snapshot group[uuid:%s]", snapshot.getUuid(), snapshot.getName(), groupUuid)); - groupUuids.add(groupUuid); - } + Set touchedGroupUuids = snapshots.stream() + .map(VolumeSnapshotInventory::getGroupUuid) + .filter(Objects::nonNull) + .collect(Collectors.toSet()); + + List groupsToDelete = new ArrayList<>(); + for (String groupUuid : touchedGroupUuids) { + long remaining = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, false).count(); + if (remaining == 0) { + logger.debug(String.format("snapshot group[uuid:%s] all volume snapshots have been deleted, " + + "disbanding group", groupUuid)); + groupsToDelete.add(groupUuid); } + } - groupUuids.forEach(groupUuid -> vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid)); - cleanVmHostBackupFilesForGroup(groupUuids); - dbf.removeByPrimaryKeys(groupUuids, VolumeSnapshotGroupVO.class); + if (!groupsToDelete.isEmpty()) { + groupsToDelete.forEach(groupUuid -> vidm.deleteArchiveVmInstanceResourceMetadataGroup(groupUuid)); + cleanVmHostBackupFilesForGroup(groupsToDelete); + dbf.removeByPrimaryKeys(groupsToDelete, VolumeSnapshotGroupVO.class); } } diff --git a/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java 
b/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java index ea669fd6202..3f23b637408 100644 --- a/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/VolumeTree.java @@ -361,12 +361,38 @@ public List getAliveChainSnapshotUuids() { return aliveChain.stream().map(VolumeSnapshotInventory::getUuid).collect(Collectors.toList()); } + /** + * Pure alive-chain membership query, VM-state-independent. + * A snapshot is "on the alive chain" iff this tree is the current tree of the volume + * AND the snapshot is one of the ancestors of the live volume node. + *

+ * This is intentionally decoupled from {@link #isHypervisorOperation(VmInstanceState)}; + * the two were previously conflated in {@link #isOnline(boolean, String, String, VmInstanceState)}, + * causing the multi-children "avoid alive child" protection in + * {@code VolumeSnapshotTreeBase.stepDelete} to silently fail when the VM is Stopped. + */ + public boolean isOnAliveChain(String snapshotUuid) { + return current && getAliveChainSnapshotUuids().contains(snapshotUuid); + } + + /** + * Whether physical snapshot operations should be routed through the hypervisor (libvirt blockCommit/blockPull) + * instead of the primary storage agent (qemu-img). Purely a function of VM run-state. + */ + public static boolean isHypervisorOperation(VmInstanceState vmState) { + return vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused; + } + public DeleteVolumeSnapshotDirection resolveDirection(String targetSnapshotUuid, String childSnapshotUuid, String initialDirection, boolean targetSnapshotIsLatest, VmInstanceState vmState) { - boolean online = (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) - && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) && getAliveChainSnapshotUuids().contains(childSnapshotUuid); - - boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && online; + // shouldUseCommitStrategy reflects "would the commit path move data along the live chain", which is purely + // a property of the tree structure (vol's ancestor chain) and should not depend on whether the VM is currently + // running. Previously this was conjoined with vmState ∈ {Running, Paused}, which caused Stopped + Auto to + // silently degrade to Pull (writing N copies of (target - parent) delta to each child file) even when the + // commit path would have produced a single merged file. 
+ boolean targetOnAliveChain = isOnAliveChain(targetSnapshotUuid); + boolean childOnAliveChain = isOnAliveChain(childSnapshotUuid); + boolean shouldUseCommitStrategy = current && !targetSnapshotIsLatest && targetOnAliveChain && childOnAliveChain; if (Objects.equals(initialDirection, DeleteVolumeSnapshotDirection.Pull.toString()) && shouldUseCommitStrategy) { throw new IllegalArgumentException("the snapshot will be deleted by block 'commit', but the direction is 'pull', " + @@ -386,9 +412,19 @@ public DeleteVolumeSnapshotDirection resolveDirection(String targetSnapshotUuid, return DeleteVolumeSnapshotDirection.fromString(initialDirection); } + /** + * Compound predicate: target and child are both on the alive chain AND the VM state is Running or Paused. + * Used to decide whether to route through the hypervisor (libvirt) path vs the primary storage agent path. + *

+ * Requires all of: {@code treeIsCurrent}, {@code isHypervisorOperation(vmState)} (Running or Paused), + * and both snapshots present in {@code getAliveChainSnapshotUuids()}. Note that this checks the + * caller-supplied {@code treeIsCurrent} flag and does not call {@code isOnAliveChain}. + */ public boolean isOnline(boolean treeIsCurrent, String targetSnapshotUuid, String childSnapshotUuid, VmInstanceState vmState) { - return treeIsCurrent && (vmState == VmInstanceState.Running || vmState == VmInstanceState.Paused) - && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) && getAliveChainSnapshotUuids().contains(childSnapshotUuid); + return treeIsCurrent + && isHypervisorOperation(vmState) + && getAliveChainSnapshotUuids().contains(targetSnapshotUuid) + && getAliveChainSnapshotUuids().contains(childSnapshotUuid); } // TODO(clone) : When both chain cloning and single-node snapshot deletion are enabled, diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java index a9e47dc2d69..63bbb7e6932 100644 --- a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupBase.java @@ -186,6 +186,20 @@ public String getName() { private void handleDelete(APIDeleteVolumeSnapshotGroupMsg msg, NoErrorCompletion completion) { APIDeleteVolumeSnapshotGroupEvent event = new APIDeleteVolumeSnapshotGroupEvent(msg.getId()); + + if (!msg.isForce()) { + List incomplete = VolumeSnapshotGroupChecker + .findIncompleteGroupsOnVm(self.getVmInstanceUuid(), self.getUuid()); + if (!incomplete.isEmpty()) { + event.setError(operr("VM[uuid:%s] has incomplete snapshot group(s) %s, " + + "please clean them up first (or pass force=true) before deleting other snapshot groups", + self.getVmInstanceUuid(), incomplete)); + bus.publish(event); + completion.done(); + return; + } + } + DeleteVolumeSnapshotGroupInnerMsg imsg = new
DeleteVolumeSnapshotGroupInnerMsg(); imsg.setUuid(msg.getUuid()); imsg.setDeletionMode(msg.getDeletionMode()); diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java new file mode 100644 index 00000000000..786c4e1330d --- /dev/null +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupCascadeExtension.java @@ -0,0 +1,153 @@ +package org.zstack.storage.snapshot.group; + +import org.springframework.beans.factory.annotation.Autowired; +import org.zstack.core.cascade.AbstractAsyncCascadeExtension; +import org.zstack.core.cascade.CascadeAction; +import org.zstack.core.cascade.CascadeConstant; +import org.zstack.core.db.DatabaseFacade; +import org.zstack.core.db.Q; +import org.zstack.core.db.SQL; +import org.zstack.header.core.Completion; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO_; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO_; +import org.zstack.header.vm.VmDeletionStruct; +import org.zstack.header.vm.VmInstanceVO; +import org.zstack.header.vm.additions.VmHostBackupFileVO; +import org.zstack.header.vm.additions.VmHostBackupFileVO_; +import org.zstack.header.vm.additions.VmHostFileManager; +import org.zstack.header.vm.devices.VmInstanceResourceMetadataManager; +import org.zstack.utils.Utils; +import org.zstack.utils.logging.CLogger; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * Cascade extension keyed on VmInstance for cleaning up VolumeSnapshotGroup VOs + * when a VM is destroyed. + * + * Background: snapshot groups are VM-scoped. 
When a VM is destroyed, any remaining + * group VOs (whether complete or incomplete due to partial single-snapshot deletions) + * become orphaned. Without this cleanup, those rows would survive beyond the VM + * and pollute downstream queries. + * + * On DELETION_CHECK we do NOT block — VM destroy should proceed even with + * incomplete groups (per product decision); cleanup is automatic. + */ +public class VolumeSnapshotGroupCascadeExtension extends AbstractAsyncCascadeExtension { + private static final CLogger logger = Utils.getLogger(VolumeSnapshotGroupCascadeExtension.class); + + private static final String NAME = VolumeSnapshotGroupVO.class.getSimpleName(); + + @Autowired + private DatabaseFacade dbf; + @Autowired + private VmInstanceResourceMetadataManager vidm; + @Autowired + private VmHostFileManager vmHostFileManager; + + @Override + public void asyncCascade(CascadeAction action, Completion completion) { + if (action.isActionCode(CascadeConstant.DELETION_CLEANUP_CODE)) { + handleDeletionCleanup(action, completion); + } else if (action.isActionCode(CascadeConstant.DELETION_DELETE_CODE, + CascadeConstant.DELETION_FORCE_DELETE_CODE)) { + handleDeletion(action, completion); + } else { + completion.success(); + } + } + + private void handleDeletion(CascadeAction action, Completion completion) { + if (!VmInstanceVO.class.getSimpleName().equals(action.getParentIssuer())) { + completion.success(); + return; + } + + List vmUuids = vmUuidsFromAction(action); + if (vmUuids.isEmpty()) { + completion.success(); + return; + } + + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .in(VolumeSnapshotGroupVO_.vmInstanceUuid, vmUuids) + .listValues(); + if (groupUuids.isEmpty()) { + completion.success(); + return; + } + + logger.debug(String.format("VM destroy cascade: force-removing %d snapshot group(s) %s for vm(s) %s " + + "(includes any incomplete groups from prior single-snapshot deletions)", + groupUuids.size(), groupUuids, 
vmUuids)); + + // 1. drop all refs first (FK-like constraint via business logic) + SQL.New(VolumeSnapshotGroupRefVO.class) + .in(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, groupUuids) + .delete(); + + // 2. clean associated metadata + backup files + groupUuids.forEach(vidm::deleteArchiveVmInstanceResourceMetadataGroup); + cleanVmHostBackupFilesForGroup(groupUuids); + + // 3. remove group VOs + dbf.removeByPrimaryKeys(groupUuids, VolumeSnapshotGroupVO.class); + + completion.success(); + } + + private void cleanVmHostBackupFilesForGroup(List groupUuids) { + if (groupUuids.isEmpty()) { + return; + } + + List backupUuidList = Q.New(VmHostBackupFileVO.class) + .in(VmHostBackupFileVO_.resourceUuid, groupUuids) + .select(VmHostBackupFileVO_.uuid) + .listValues(); + + backupUuidList.forEach(vmHostFileManager::cleanVmHostBackupFile); + } + + private void handleDeletionCleanup(CascadeAction action, Completion completion) { + try { + dbf.eoCleanup(VolumeSnapshotGroupVO.class); + } catch (Throwable t) { + logger.warn("eoCleanup VolumeSnapshotGroupVO failed: " + t.getMessage()); + } finally { + completion.success(); + } + } + + private List vmUuidsFromAction(CascadeAction action) { + Object ctx = action.getParentIssuerContext(); + if (ctx == null) { + return Collections.emptyList(); + } + List uuids = new ArrayList<>(); + if (ctx instanceof List) { + for (Object o : (List) ctx) { + if (o instanceof VmDeletionStruct) { + uuids.add(((VmDeletionStruct) o).getInventory().getUuid()); + } + } + } + return uuids; + } + + @Override + public List getEdgeNames() { + return Arrays.asList(VmInstanceVO.class.getSimpleName()); + } + + @Override + public String getCascadeResourceName() { + return NAME; + } +} diff --git a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java index 9749984de47..3c3f9385953 100644 --- 
a/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java +++ b/storage/src/main/java/org/zstack/storage/snapshot/group/VolumeSnapshotGroupChecker.java @@ -3,6 +3,7 @@ import org.zstack.core.db.Q; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupAvailability; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO; +import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupRefVO_; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO; import org.zstack.header.storage.snapshot.group.VolumeSnapshotGroupVO_; import org.zstack.header.vo.ResourceVO; @@ -25,6 +26,49 @@ public static boolean isAvailable(String uuid) { return getAvailability(uuid).isAvailable(); } + /** + * Find all incomplete snapshot groups on a VM. + * An incomplete group is one where part of its refs have snapshotDeleted=true + * but at least one ref is still alive (snapshotDeleted=false). + * Such groups represent a "debt" that pollutes subsequent group/VM operations. + * + * @param vmInstanceUuid the VM to inspect + * @param excludeGroupUuid group uuid to exclude from the result (e.g. 
when the caller is + * itself trying to delete that group, do not flag it as a blocker); + * pass null to include all groups + * @return list of incomplete group uuids (excluding excludeGroupUuid); empty if none + */ + public static List findIncompleteGroupsOnVm(String vmInstanceUuid, String excludeGroupUuid) { + if (vmInstanceUuid == null) { + return Collections.emptyList(); + } + + List groupUuids = Q.New(VolumeSnapshotGroupVO.class) + .select(VolumeSnapshotGroupVO_.uuid) + .eq(VolumeSnapshotGroupVO_.vmInstanceUuid, vmInstanceUuid) + .listValues(); + + List incomplete = new ArrayList<>(); + for (Object o : groupUuids) { + String guuid = o.toString(); + if (guuid.equals(excludeGroupUuid)) { + continue; + } + long deletedRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid) + .eq(VolumeSnapshotGroupRefVO_.snapshotDeleted, true).count(); + if (deletedRefs == 0) { + continue; + } + long totalRefs = Q.New(VolumeSnapshotGroupRefVO.class) + .eq(VolumeSnapshotGroupRefVO_.volumeSnapshotGroupUuid, guuid).count(); + if (deletedRefs < totalRefs) { + incomplete.add(guuid); + } + } + return incomplete; + } + public static List getAvailability(List uuids) { List results = new ArrayList<>(); List groups = Q.New(VolumeSnapshotGroupVO.class) diff --git a/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java b/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java index 09e41c229f7..b956b750ce8 100755 --- a/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java +++ b/storage/src/main/java/org/zstack/storage/volume/VolumeApiInterceptor.java @@ -44,6 +44,7 @@ import org.zstack.header.storage.snapshot.VolumeSnapshotVO; import org.zstack.header.storage.snapshot.VolumeSnapshotVO_; import org.zstack.header.storage.snapshot.group.MemorySnapshotValidatorExtensionPoint; +import org.zstack.storage.snapshot.group.VolumeSnapshotGroupChecker; import org.zstack.header.tag.SystemTagVO; 
import org.zstack.header.vm.APICreateVmInstanceMsg; import org.zstack.header.vm.DiskAO; @@ -213,6 +214,8 @@ private void validate(APICreateVolumeSnapshotGroupMsg msg) { throw new ApiMessageInterceptionException(argerr("volume[uuid:%s] is not root volume", msg.getRootVolumeUuid())); } + checkIncompleteSnapshotGroupsOnVm(vmvo.getUuid(), "create new snapshot group"); + if (msg.isWithMemory() && !(vmvo.getState().equals(VmInstanceState.Running) || (vmvo.getState().equals(VmInstanceState.Paused)))) { throw new ApiMessageInterceptionException(argerr("Can not take memory snapshot, vm current state[%s], but expect state are [%s, %s]", vmvo.getState().toString(), VmInstanceState.Running.toString(), VmInstanceState.Paused.toString())); @@ -316,9 +319,13 @@ private void validate(APIDetachDataVolumeFromVmMsg msg) { throw new ApiMessageInterceptionException(operr("the volume[uuid:%s, name:%s, type:%s] can't detach it", vol.getUuid(), vol.getName(), vol.getType())); } + + String vmUuid = msg.getVmUuid() != null ? msg.getVmUuid() : vol.getVmInstanceUuid(); + checkIncompleteSnapshotGroupsOnVm(vmUuid, "detach data volume"); } private void validate(APIAttachDataVolumeToVmMsg msg) { + checkIncompleteSnapshotGroupsOnVm(msg.getVmInstanceUuid(), "attach data volume"); new SQLBatch() { @Override protected void scripts() { @@ -691,6 +698,29 @@ public boolean start() { return true; } + /** + * Block VM-scoped operations when the VM has any incomplete snapshot group. + * An incomplete group is one whose refs are partially deleted (some snapshotDeleted=true, + * but at least one alive). Such groups must be cleaned up first to avoid pollution + * of subsequent group operations on this VM. + * + * Exempt operations: deleting an incomplete group itself (handled by + * {@code VolumeSnapshotGroupBase#handleDelete} which excludes self), single-snapshot + * deletion, and VM destroy (handled by VolumeSnapshotGroupCascadeExtension cleanup). 
+ */ + private void checkIncompleteSnapshotGroupsOnVm(String vmUuid, String operationDesc) { + if (vmUuid == null) { + return; + } + List incomplete = VolumeSnapshotGroupChecker.findIncompleteGroupsOnVm(vmUuid, null); + if (!incomplete.isEmpty()) { + throw new ApiMessageInterceptionException(operr( + "VM[uuid:%s] has incomplete snapshot group(s) %s, " + + "please clean them up first before %s", + vmUuid, incomplete, operationDesc)); + } + } + @Override public boolean stop() { return true;