diff --git a/.github/workflows/release-chart.yml b/.github/workflows/release-chart.yml
index afff80a9..eddfb417 100644
--- a/.github/workflows/release-chart.yml
+++ b/.github/workflows/release-chart.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - master
+      - 'release/[0-9]+.[0-9]+.*'

 jobs:
   release:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 28bcef43..969b5d16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 ## unreleased
+
+## v3.5.7-rc2 - 2025.12.23
 * Minimum supported Kubernetes version is now 1.28.
+* Add volume locks to avoid concurrent operations on the same volume
+* Add detection for device symlinks that do not resolve to the correct volume
+* Increase logging verbosity once more
+
+## v3.5.7-rc1 - 2025.12.17
+* Adjust udevadm calls & reorder them, and put them behind a mutex to avoid a potential race condition
+* Improve logging by adding debug logs
+* Add `--log-level=(trace|debug|info|warn|error)` flag to customize log level

 ## v3.5.6 - 2024.04.18
 * ~~Add support for Kubernetes 1.30~~
diff --git a/Makefile b/Makefile
index f209580b..b69bb154 100644
--- a/Makefile
+++ b/Makefile
@@ -74,10 +74,10 @@ build: compile

 .PHONY: push
 push:
-ifeq ($(DOCKER_REPO),cloudscalech/cloudscale-csi-plugin)
-	ifneq ($(BRANCH),master)
+ifeq ($(DOCKER_REPO),quay.io/cloudscalech/cloudscale-csi-plugin)
+	ifeq ($(filter master,$(BRANCH))$(filter release/%,$(BRANCH)),)
 		ifneq ($(VERSION),dev)
-			$(error "Only the `dev` tag can be published from non-master branches")
+			$(error "Only the `dev` tag can be published from non-master/non-release branches")
 		endif
 	endif
 endif
diff --git a/README.md b/README.md
index 1ab57dcd..168e63b6 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ secret `my-pvc-luks-key`.
 ## Releases

 The cloudscale.ch CSI plugin follows [semantic versioning](https://semver.org/).
-The current version is: **`v3.5.6`**.
+The current version is: **`v3.5.7-rc2`**.

 * Bug fixes will be released as a `PATCH` update.
 * New features (such as CSI spec bumps) will be released as a `MINOR` update.
@@ -85,13 +85,13 @@ We recommend using the latest cloudscale.ch CSI driver compatible with your Kube
 | 1.21 | v2.0.0 | v3.5.2 |
 | 1.22 | v3.1.0 | v3.5.2 |
 | 1.23 | v3.1.0 | v3.5.2 |
-| 1.24 | v3.1.0 | v3.5.6 |
-| 1.25 | v3.3.0 | v3.5.6 |
-| 1.26 | v3.3.0 | v3.5.6 |
-| 1.27 | v3.3.0 | v3.5.6 |
-| 1.28 | v3.3.0 | v3.5.6 |
-| 1.29 | v3.3.0 | v3.5.6 |
-| 1.30 | v3.3.0 | v3.5.6 |
+| 1.24 | v3.1.0 | v3.5.7-rc2 |
+| 1.25 | v3.3.0 | v3.5.7-rc2 |
+| 1.26 | v3.3.0 | v3.5.7-rc2 |
+| 1.27 | v3.3.0 | v3.5.7-rc2 |
+| 1.28 | v3.3.0 | v3.5.7-rc2 |
+| 1.29 | v3.3.0 | v3.5.7-rc2 |
+| 1.30 | v3.3.0 | v3.5.7-rc2 |
 | 1.31 | v3.3.0 | v3.5.6 |
 | 1.32 | v3.3.0 | v3.5.6 |
 | 1.33 | v3.3.0 | v3.5.6 |
@@ -198,10 +198,10 @@ $ helm install -g -n kube-system --set controller.image.tag=dev --set node.image
 Before you continue, be sure to check out a [tagged release](https://github.com/cloudscale-ch/csi-cloudscale/releases).
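For example, to switch your working copy to a specific release tag (the tag below is only an illustration; use whichever tagged release you intend to deploy):

```
$ git fetch --tags
$ git checkout v3.5.7-rc2
```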
Always use the [latest stable version](https://github.com/cloudscale-ch/csi-cloudscale/releases/latest) -For example, to use the latest stable version (`v3.5.6`) you can execute the following command: +For example, to use the latest stable version (`v3.5.7-rc2`) you can execute the following command: ``` -$ kubectl apply -f https://raw.githubusercontent.com/cloudscale-ch/csi-cloudscale/master/deploy/kubernetes/releases/csi-cloudscale-v3.5.6.yaml +$ kubectl apply -f https://raw.githubusercontent.com/cloudscale-ch/csi-cloudscale/master/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc2.yaml ``` The storage classes `cloudscale-volume-ssd` and `cloudscale-volume-bulk` will be created. The @@ -421,15 +421,23 @@ $ git push origin After it's merged to master, [create a new Github release](https://github.com/cloudscale-ch/csi-cloudscale/releases/new) from -master with the version `v3.5.6` and then publish a new docker build: +master with the version `v3.5.7-rc2` and then publish a new docker build: ``` $ git checkout master $ make publish ``` -This will create a binary with version `v3.5.6` and docker image pushed to -`cloudscalech/cloudscale-csi-plugin:v3.5.6` +This will create a binary with version `v3.5.7-rc2` and docker image pushed to +`cloudscalech/cloudscale-csi-plugin:v3.5.7-rc2` + +### Release a pre-release version + +To release a new pre-release (or a patch version based on an existing tag) version, follow the following steps: + +1. create a new branch called `release/x.y` (or with the full version) +2. push it to GitHub +3. Follow the flow for creating a new release, making sure to set the appropriate versions. ## Contributing diff --git a/VERSION b/VERSION index 6864723d..1031c077 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -v3.5.6 +v3.5.7-rc2 diff --git a/charts/csi-cloudscale/Chart.yaml b/charts/csi-cloudscale/Chart.yaml index fd0f8c58..e20e8932 100644 --- a/charts/csi-cloudscale/Chart.yaml +++ b/charts/csi-cloudscale/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: csi-cloudscale description: A Container Storage Interface Driver for cloudscale.ch volumes. type: application -version: 1.3.6 -appVersion: "3.5.6" +version: 1.3.7-rc2 +appVersion: "3.5.7-rc2" home: https://github.com/cloudscale-ch/csi-cloudscale sources: - https://github.com/cloudscale-ch/csi-cloudscale.git diff --git a/charts/csi-cloudscale/templates/daemonset.yaml b/charts/csi-cloudscale/templates/daemonset.yaml index 3695ef82..18a46a48 100644 --- a/charts/csi-cloudscale/templates/daemonset.yaml +++ b/charts/csi-cloudscale/templates/daemonset.yaml @@ -53,6 +53,7 @@ spec: args : - "--endpoint=$(CSI_ENDPOINT)" - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level={{ .Values.node.logLevel }}" {{- with .Values.node.resources }} resources: {{ toYaml . | indent 12 }} diff --git a/charts/csi-cloudscale/templates/statefulset.yaml b/charts/csi-cloudscale/templates/statefulset.yaml index 36733885..fd095b03 100644 --- a/charts/csi-cloudscale/templates/statefulset.yaml +++ b/charts/csi-cloudscale/templates/statefulset.yaml @@ -77,6 +77,7 @@ spec: args : - "--endpoint=$(CSI_ENDPOINT)" - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level={{ .Values.controller.logLevel }}" {{- with .Values.controller.resources }} resources: {{ toYaml . 
| indent 12 }} diff --git a/charts/csi-cloudscale/values.yaml b/charts/csi-cloudscale/values.yaml index 2ea7ba73..9f5a54ad 100644 --- a/charts/csi-cloudscale/values.yaml +++ b/charts/csi-cloudscale/values.yaml @@ -84,9 +84,10 @@ controller: image: registry: quay.io repository: cloudscalech/cloudscale-csi-plugin - tag: v3.5.6 + tag: v3.5.7-rc2 pullPolicy: IfNotPresent serviceAccountName: + logLevel: info resources: {} # limits: # cpu: 100m @@ -99,11 +100,12 @@ node: image: registry: quay.io repository: cloudscalech/cloudscale-csi-plugin - tag: v3.5.6 + tag: v3.5.7-rc2 pullPolicy: IfNotPresent nodeSelector: {} tolerations: [] serviceAccountName: + logLevel: info resources: {} # limits: # cpu: 100m diff --git a/cmd/cloudscale-csi-plugin/main.go b/cmd/cloudscale-csi-plugin/main.go index c869356f..b4a40758 100644 --- a/cmd/cloudscale-csi-plugin/main.go +++ b/cmd/cloudscale-csi-plugin/main.go @@ -24,6 +24,7 @@ import ( "os" "github.com/cloudscale-ch/csi-cloudscale/driver" + "github.com/sirupsen/logrus" ) func main() { @@ -32,6 +33,7 @@ func main() { token = flag.String("token", "", "cloudscale.ch access token") url = flag.String("url", "https://api.cloudscale.ch/", "cloudscale.ch API URL") version = flag.Bool("version", false, "Print the version and exit.") + logLevel = flag.String("log-level", "info", "Log level (trace, debug, info, warn, error, fatal, panic)") ) flag.Parse() @@ -44,7 +46,12 @@ func main() { os.Exit(0) } - drv, err := driver.NewDriver(*endpoint, *token, *url) + level, err := logrus.ParseLevel(*logLevel) + if err != nil { + log.Fatalf("invalid log level %q: %v", *logLevel, err) + } + + drv, err := driver.NewDriver(*endpoint, *token, *url, level) if err != nil { log.Fatalln(err) } diff --git a/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc1.yaml b/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc1.yaml new file mode 100644 index 00000000..4ed9656f --- /dev/null +++ b/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc1.yaml @@ -0,0 +1,416 @@ +--- +# Source: csi-cloudscale/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-cloudscale-controller-sa + namespace: kube-system +--- +# Source: csi-cloudscale/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-cloudscale-node-sa + namespace: kube-system +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-ssd + namespace: kube-system + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: ssd +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-ssd-luks + namespace: kube-system +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: ssd + csi.cloudscale.ch/luks-encrypted: "true" + csi.cloudscale.ch/luks-cipher: "aes-xts-plain64" + csi.cloudscale.ch/luks-key-size: "512" + csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace} + csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}-luks-key +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-bulk + namespace: kube-system +provisioner: 
csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: bulk +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-bulk-luks + namespace: kube-system +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: bulk + csi.cloudscale.ch/luks-encrypted: "true" + csi.cloudscale.ch/luks-cipher: "aes-xts-plain64" + csi.cloudscale.ch/luks-key-size: "512" + csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace} + csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}-luks-key +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-provisioner-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["list", "watch", "create", "update", "patch"] + - apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshots"] + verbs: ["get", "list"] + - apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshotcontents"] + verbs: ["get", "list"] + - apiGroups: [ "storage.k8s.io" ] + resources: [ "csinodes" ] + verbs: [ "get", "list", "watch" ] + - apiGroups: [ "" ] + resources: [ "nodes" ] + verbs: [ "get", "list", "watch" ] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-attacher-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments/status"] + verbs: ["patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-resizer-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["update", "patch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["list", "watch", "create", "update", "patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-node-driver-registrar-role + namespace: kube-system +rules: + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch", "create", "update", "patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: 
csi-cloudscale-provisioner-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-provisioner-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-resizer-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-resizer-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-attacher-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-attacher-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-node-driver-registrar-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-node-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-node-driver-registrar-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/daemonset.yaml +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-cloudscale-node + namespace: kube-system +spec: + selector: + matchLabels: + app: csi-cloudscale-node + template: + metadata: + labels: + app: csi-cloudscale-node + role: csi-cloudscale + spec: + priorityClassName: system-node-critical + serviceAccount: csi-cloudscale-node-sa + hostNetwork: true + containers: + - name: csi-node-driver-registrar + image: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.5.1" + imagePullPolicy: IfNotPresent + args: + - "--v=5" + - "--csi-address=$(ADDRESS)" + - "--kubelet-registration-path=$(DRIVER_REG_SOCK_PATH)" + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "rm -rf /registration/csi.cloudscale.ch /registration/csi.cloudscale.ch-reg.sock"] + env: + - name: ADDRESS + value: /csi/csi.sock + - name: DRIVER_REG_SOCK_PATH + value: /var/lib/kubelet/plugins/csi.cloudscale.ch/csi.sock + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: plugin-dir + mountPath: /csi/ + - name: registration-dir + mountPath: /registration/ + - name: csi-cloudscale-plugin + image: "quay.io/cloudscalech/cloudscale-csi-plugin:v3.5.7-rc1" + imagePullPolicy: IfNotPresent + args : + - "--endpoint=$(CSI_ENDPOINT)" + - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level=info" + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: CLOUDSCALE_API_URL + value: https://api.cloudscale.ch/ + - name: CLOUDSCALE_MAX_CSI_VOLUMES_PER_NODE + value: "125" + - name: CLOUDSCALE_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: cloudscale + key: access-token + securityContext: + privileged: true + capabilities: + add: ["SYS_ADMIN"] + allowPrivilegeEscalation: true + volumeMounts: + - name: plugin-dir + mountPath: /csi + - name: pods-mount-dir + mountPath: /var/lib/kubelet + # needed so that any mounts setup inside this container are + # propagated back to the host machine. 
+ mountPropagation: "Bidirectional" + - name: device-dir + mountPath: /dev + - name: tmpfs + mountPath: /tmp + volumes: + - name: registration-dir + hostPath: + path: /var/lib/kubelet/plugins_registry/ + type: DirectoryOrCreate + - name: plugin-dir + hostPath: + path: /var/lib/kubelet/plugins/csi.cloudscale.ch + type: DirectoryOrCreate + - name: pods-mount-dir + hostPath: + path: /var/lib/kubelet + type: Directory + - name: device-dir + hostPath: + path: /dev + # to make sure temporary stored luks keys never touch a disk + - name: tmpfs + emptyDir: + medium: Memory +--- +# Source: csi-cloudscale/templates/statefulset.yaml +kind: StatefulSet +apiVersion: apps/v1 +metadata: + name: csi-cloudscale-controller + namespace: kube-system +spec: + serviceName: "csi-cloudscale" + selector: + matchLabels: + app: csi-cloudscale-controller + replicas: 1 + template: + metadata: + labels: + app: csi-cloudscale-controller + role: csi-cloudscale + spec: + hostNetwork: true + priorityClassName: system-cluster-critical + serviceAccount: csi-cloudscale-controller-sa + containers: + - name: csi-provisioner + image: "registry.k8s.io/sig-storage/csi-provisioner:v3.2.1" + imagePullPolicy: IfNotPresent + args: + - "--csi-address=$(ADDRESS)" + - "--default-fstype=ext4" + - "--v=5" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-attacher + image: "registry.k8s.io/sig-storage/csi-attacher:v4.0.0" + imagePullPolicy: IfNotPresent + args: + - "--csi-address=$(ADDRESS)" + - "--v=5" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-resizer + image: "registry.k8s.io/sig-storage/csi-resizer:v1.6.0" + args: + - "--csi-address=$(ADDRESS)" + - "--timeout=30s" + - "--v=5" + - "--handle-volume-inuse-error=false" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + imagePullPolicy: IfNotPresent + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-cloudscale-plugin + image: "quay.io/cloudscalech/cloudscale-csi-plugin:v3.5.7-rc1" + args : + - "--endpoint=$(CSI_ENDPOINT)" + - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level=info" + env: + - name: CSI_ENDPOINT + value: unix:///var/lib/csi/sockets/pluginproxy/csi.sock + - name: CLOUDSCALE_API_URL + value: https://api.cloudscale.ch/ + - name: CLOUDSCALE_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: cloudscale + key: access-token + imagePullPolicy: IfNotPresent + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + volumes: + - name: socket-dir + emptyDir: {} +--- +# Source: csi-cloudscale/templates/csi_driver.yaml +apiVersion: storage.k8s.io/v1 +kind: CSIDriver +metadata: + name: csi.cloudscale.ch +spec: + attachRequired: true + podInfoOnMount: true diff --git a/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc2.yaml b/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc2.yaml new file mode 100644 index 00000000..308cdd4a --- /dev/null +++ b/deploy/kubernetes/releases/csi-cloudscale-v3.5.7-rc2.yaml @@ -0,0 +1,417 @@ +--- +# Source: csi-cloudscale/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-cloudscale-controller-sa + namespace: kube-system +--- +# Source: csi-cloudscale/templates/serviceaccount.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: csi-cloudscale-node-sa + namespace: 
kube-system +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-ssd + namespace: kube-system + annotations: + storageclass.kubernetes.io/is-default-class: "true" +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: ssd +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-ssd-luks + namespace: kube-system +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: ssd + csi.cloudscale.ch/luks-encrypted: "true" + csi.cloudscale.ch/luks-cipher: "aes-xts-plain64" + csi.cloudscale.ch/luks-key-size: "512" + csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace} + csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}-luks-key +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-bulk + namespace: kube-system +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: bulk +--- +# Source: csi-cloudscale/templates/storageclass.yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: cloudscale-volume-bulk-luks + namespace: kube-system +provisioner: csi.cloudscale.ch +allowVolumeExpansion: true +reclaimPolicy: Delete +volumeBindingMode: Immediate +parameters: + csi.cloudscale.ch/volume-type: bulk + csi.cloudscale.ch/luks-encrypted: "true" + csi.cloudscale.ch/luks-cipher: "aes-xts-plain64" + csi.cloudscale.ch/luks-key-size: "512" + csi.storage.k8s.io/node-stage-secret-namespace: ${pvc.namespace} + csi.storage.k8s.io/node-stage-secret-name: ${pvc.name}-luks-key +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-provisioner-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch", "update"] + - apiGroups: ["storage.k8s.io"] + resources: ["storageclasses"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["list", "watch", "create", "update", "patch"] + - apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshots"] + verbs: ["get", "list"] + - apiGroups: ["snapshot.storage.k8s.io"] + resources: ["volumesnapshotcontents"] + verbs: ["get", "list"] + - apiGroups: [ "storage.k8s.io" ] + resources: [ "csinodes" ] + verbs: [ "get", "list", "watch" ] + - apiGroups: [ "" ] + resources: [ "nodes" ] + verbs: [ "get", "list", "watch" ] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments"] + verbs: ["get", "list", "watch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-attacher-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: ["csinodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["storage.k8s.io"] + 
resources: ["volumeattachments"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: ["storage.k8s.io"] + resources: ["volumeattachments/status"] + verbs: ["patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-resizer-role +rules: + - apiGroups: [""] + resources: ["persistentvolumes"] + verbs: ["get", "list", "watch", "update", "patch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["persistentvolumeclaims/status"] + verbs: ["update", "patch"] + - apiGroups: [""] + resources: ["events"] + verbs: ["list", "watch", "create", "update", "patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-node-driver-registrar-role + namespace: kube-system +rules: + - apiGroups: [""] + resources: ["events"] + verbs: ["get", "list", "watch", "create", "update", "patch"] +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-provisioner-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-provisioner-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-resizer-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-resizer-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-attacher-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-controller-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-attacher-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: csi-cloudscale-node-driver-registrar-binding +subjects: + - kind: ServiceAccount + name: csi-cloudscale-node-sa + namespace: kube-system +roleRef: + kind: ClusterRole + name: csi-cloudscale-node-driver-registrar-role + apiGroup: rbac.authorization.k8s.io +--- +# Source: csi-cloudscale/templates/daemonset.yaml +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: csi-cloudscale-node + namespace: kube-system +spec: + selector: + matchLabels: + app: csi-cloudscale-node + template: + metadata: + labels: + app: csi-cloudscale-node + role: csi-cloudscale + spec: + priorityClassName: system-node-critical + serviceAccount: csi-cloudscale-node-sa + hostNetwork: true + containers: + - name: csi-node-driver-registrar + image: "registry.k8s.io/sig-storage/csi-node-driver-registrar:v2.15.0" + imagePullPolicy: IfNotPresent + args: + - "--v=5" + - "--csi-address=$(ADDRESS)" + - "--kubelet-registration-path=$(DRIVER_REG_SOCK_PATH)" + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "rm -rf /registration/csi.cloudscale.ch /registration/csi.cloudscale.ch-reg.sock"] + env: + - name: ADDRESS + value: /csi/csi.sock + - name: DRIVER_REG_SOCK_PATH + value: /var/lib/kubelet/plugins/csi.cloudscale.ch/csi.sock + - name: 
KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + volumeMounts: + - name: plugin-dir + mountPath: /csi/ + - name: registration-dir + mountPath: /registration/ + - name: csi-cloudscale-plugin + image: "quay.io/cloudscalech/cloudscale-csi-plugin:v3.5.7-rc2" + imagePullPolicy: IfNotPresent + args : + - "--endpoint=$(CSI_ENDPOINT)" + - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level=info" + env: + - name: CSI_ENDPOINT + value: unix:///csi/csi.sock + - name: CLOUDSCALE_API_URL + value: https://api.cloudscale.ch/ + - name: CLOUDSCALE_MAX_CSI_VOLUMES_PER_NODE + value: "125" + - name: CLOUDSCALE_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: cloudscale + key: access-token + securityContext: + privileged: true + capabilities: + add: ["SYS_ADMIN"] + allowPrivilegeEscalation: true + volumeMounts: + - name: plugin-dir + mountPath: /csi + - name: pods-mount-dir + mountPath: /var/lib/kubelet + # needed so that any mounts setup inside this container are + # propagated back to the host machine. + mountPropagation: "Bidirectional" + - name: device-dir + mountPath: /dev + - name: tmpfs + mountPath: /tmp + volumes: + - name: registration-dir + hostPath: + path: /var/lib/kubelet/plugins_registry/ + type: DirectoryOrCreate + - name: plugin-dir + hostPath: + path: /var/lib/kubelet/plugins/csi.cloudscale.ch + type: DirectoryOrCreate + - name: pods-mount-dir + hostPath: + path: /var/lib/kubelet + type: Directory + - name: device-dir + hostPath: + path: /dev + # to make sure temporary stored luks keys never touch a disk + - name: tmpfs + emptyDir: + medium: Memory +--- +# Source: csi-cloudscale/templates/statefulset.yaml +kind: StatefulSet +apiVersion: apps/v1 +metadata: + name: csi-cloudscale-controller + namespace: kube-system +spec: + serviceName: "csi-cloudscale" + selector: + matchLabels: + app: csi-cloudscale-controller + replicas: 1 + template: + metadata: + labels: + app: csi-cloudscale-controller + role: csi-cloudscale + spec: + hostNetwork: true + priorityClassName: system-cluster-critical + serviceAccount: csi-cloudscale-controller-sa + containers: + - name: csi-provisioner + image: "registry.k8s.io/sig-storage/csi-provisioner:v5.3.0" + imagePullPolicy: IfNotPresent + args: + - "--csi-address=$(ADDRESS)" + - "--default-fstype=ext4" + - "--v=5" + - "--feature-gates=Topology=false" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-attacher + image: "registry.k8s.io/sig-storage/csi-attacher:v4.10.0" + imagePullPolicy: IfNotPresent + args: + - "--csi-address=$(ADDRESS)" + - "--v=5" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-resizer + image: "registry.k8s.io/sig-storage/csi-resizer:v2.0.0" + args: + - "--csi-address=$(ADDRESS)" + - "--timeout=30s" + - "--v=5" + - "--handle-volume-inuse-error=false" + env: + - name: ADDRESS + value: /var/lib/csi/sockets/pluginproxy/csi.sock + imagePullPolicy: IfNotPresent + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + - name: csi-cloudscale-plugin + image: "quay.io/cloudscalech/cloudscale-csi-plugin:v3.5.7-rc2" + args : + - "--endpoint=$(CSI_ENDPOINT)" + - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level=info" + env: + - name: CSI_ENDPOINT + value: unix:///var/lib/csi/sockets/pluginproxy/csi.sock + - name: CLOUDSCALE_API_URL + value: https://api.cloudscale.ch/ + - 
name: CLOUDSCALE_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: cloudscale + key: access-token + imagePullPolicy: IfNotPresent + volumeMounts: + - name: socket-dir + mountPath: /var/lib/csi/sockets/pluginproxy/ + volumes: + - name: socket-dir + emptyDir: {} +--- +# Source: csi-cloudscale/templates/csi_driver.yaml +apiVersion: storage.k8s.io/v1 +kind: CSIDriver +metadata: + name: csi.cloudscale.ch +spec: + attachRequired: true + podInfoOnMount: true diff --git a/driver/driver.go b/driver/driver.go index 276c3a72..e9f4dde3 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -65,6 +65,10 @@ type Driver struct { mounter Mounter log *logrus.Entry + // A map storing all volumes with ongoing operations so that additional operations + // for that same volume (as defined by VolumeID) return an Aborted error + volumeLocks *VolumeLocks + // ready defines whether the driver is ready to function. This value will // be used by the `Identity` service via the `Probe()` method. readyMu sync.Mutex // protects ready @@ -74,7 +78,7 @@ type Driver struct { // NewDriver returns a CSI plugin that contains the necessary gRPC // interfaces to interact with Kubernetes over unix domain sockets for // managaing cloudscale.ch Volumes -func NewDriver(ep, token, urlstr string) (*Driver, error) { +func NewDriver(ep, token, urlstr string, logLevel logrus.Level) (*Driver, error) { tokenSource := oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: token, }) @@ -98,7 +102,9 @@ func NewDriver(ep, token, urlstr string) (*Driver, error) { } cloudscaleClient.BaseURL = baseURL - log := logrus.New().WithFields(logrus.Fields{ + logger := logrus.New() + logger.SetLevel(logLevel) + log := logger.WithFields(logrus.Fields{ "zone": zone, "node_id": serverId, "version": version, @@ -111,6 +117,7 @@ func NewDriver(ep, token, urlstr string) (*Driver, error) { cloudscaleClient: cloudscaleClient, mounter: newMounter(log), log: log, + volumeLocks: NewVolumeLocks(), }, nil } diff --git a/driver/driver_test.go b/driver/driver_test.go index e0cd7034..164452d7 100644 --- a/driver/driver_test.go +++ b/driver/driver_test.go @@ -29,12 +29,11 @@ import ( "time" "github.com/cenkalti/backoff/v5" - "github.com/google/uuid" - "k8s.io/mount-utils" - "github.com/cloudscale-ch/cloudscale-go-sdk/v6" + "github.com/google/uuid" "github.com/kubernetes-csi/csi-test/v5/pkg/sanity" "github.com/sirupsen/logrus" + "k8s.io/mount-utils" ) func init() { @@ -71,6 +70,7 @@ func TestDriverSuite(t *testing.T) { cloudscaleClient: cloudscaleClient, mounter: fm, log: logrus.New().WithField("test_enabed", true), + volumeLocks: NewVolumeLocks(), } defer driver.Stop() @@ -174,9 +174,9 @@ func (f *fakeMounter) HasRequiredSize(log *logrus.Entry, path string, requiredSi return true, nil } -func (f *fakeMounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, target string) (*string, error) { +func (f *fakeMounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, target string) (string, error) { path := "SomePath" - return &path, nil + return path, nil } type FakeVolumeServiceOperations struct { diff --git a/driver/driver_volume_type_test.go b/driver/driver_volume_type_test.go index 32dbc403..90ebc689 100644 --- a/driver/driver_volume_type_test.go +++ b/driver/driver_volume_type_test.go @@ -90,7 +90,7 @@ func TestCreateVolumeInvalidType(t *testing.T) { ) assert.Error(t, err) - //assert.Error(t, err, "invalid volume capabilities requested for LUKS xx.") + // assert.Error(t, err, "invalid volume capabilities requested for LUKS xx.") } func 
TestCreateVolumeInvalidLUKSAndRaw(t *testing.T) { @@ -183,5 +183,6 @@ func createDriverForTest(t *testing.T) *Driver { mounter: &fakeMounter{}, log: logrus.New().WithField("test_enabled", true), cloudscaleClient: cloudscaleClient, + volumeLocks: NewVolumeLocks(), } } diff --git a/driver/luks_util.go b/driver/luks_util.go index 6b75d2c7..d0f2595d 100644 --- a/driver/luks_util.go +++ b/driver/luks_util.go @@ -20,7 +20,6 @@ package driver import ( "errors" "fmt" - "io/ioutil" "os" "os/exec" "strings" @@ -204,9 +203,9 @@ func luksPrepareMount(source string, ctx LuksContext, log *logrus.Entry) (string return "", err } defer func() { - e := os.Remove(filename) - if e != nil { - log.Errorf("cannot delete temporary file %s: %s", filename, e.Error()) + err := os.Remove(filename) + if err != nil { + log.Errorf("cannot delete temporary file %s: %s", filename, err.Error()) } }() @@ -378,10 +377,14 @@ func writeLuksKey(key string, log *logrus.Entry) (string, error) { if !checkTmpFs("/tmp") { return "", errors.New("temporary directory /tmp is not a tmpfs volume; refusing to write luks key to a volume backed by a disk") } - tmpFile, err := ioutil.TempFile("/tmp", "luks-") + tmpFile, err := os.CreateTemp("/tmp", "luks-") if err != nil { return "", err } + defer func() { + _ = tmpFile.Close() + }() + _, err = tmpFile.WriteString(key) if err != nil { log.WithField("tmp_file", tmpFile.Name()).Warnf("Unable to write luks key file: %s", err.Error()) diff --git a/driver/mounter.go b/driver/mounter.go index cf0b4ddc..bb4317f3 100644 --- a/driver/mounter.go +++ b/driver/mounter.go @@ -18,23 +18,24 @@ limitations under the License. package driver import ( + "context" "encoding/json" "errors" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" + "regexp" "strconv" "strings" + "sync" "syscall" "time" - "k8s.io/mount-utils" - kexec "k8s.io/utils/exec" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + "k8s.io/mount-utils" + kexec "k8s.io/utils/exec" ) const ( @@ -87,7 +88,7 @@ type Mounter interface { // Used to find a path in /dev/disk/by-id with a serial that we have from // the cloudscale API. - FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, VolumeId string) (*string, error) + FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, VolumeId string) (string, error) // GetStatistics returns capacity-related volume statistics for the given // volume path. 
@@ -201,7 +202,9 @@ func (m *mounter) Mount(source, target, fsType string, luksContext LuksContext, if err != nil { return fmt.Errorf("failed to create target file for raw block bind mount: %v", err) } - file.Close() + if err := file.Close(); err != nil { + m.log.WithFields(logrus.Fields{"target": target}).Error("failed to close file handle") + } } else { // create target, os.Mkdirall is noop if directory exists err := os.MkdirAll(target, 0750) @@ -219,10 +222,33 @@ func (m *mounter) Mount(source, target, fsType string, luksContext LuksContext, }).Error("failed to prepare luks volume for mounting") return err } + // source is /dev/mapper/ now source = luksSource } + // Resolve source symlink for debug logging + resolvedSource, resolveErr := filepath.EvalSymlinks(source) + if resolveErr != nil { + m.log.WithFields(logrus.Fields{ + "source": source, + "target": target, + "fs_type": fsType, + "options": options, + "resolve_error": resolveErr, + }).Debug("Mount: failed to resolve source symlink") + } else { + m.log.WithFields(logrus.Fields{ + "source": source, + "resolved_source": resolvedSource, + "target": target, + "fs_type": fsType, + "options": options, + }).Debug("Mount: resolved source device") + } + m.log.WithFields(logrus.Fields{ + "source": source, + "target": target, "options": options, }).Info("executing mount command") err := m.kMounter.Mount(source, target, fsType, options) @@ -242,6 +268,9 @@ func (m *mounter) Unmount(target string, luksContext LuksContext) error { // a luks volume needs to be closed after unmounting; get the source // of the mount to check if that is a luks volume mountSources, err := getMountSources(target) + if err != nil { + return fmt.Errorf("failed to get mount sources for target %q: %v", target, err) + } err = mount.CleanupMountPoint(target, m.kMounter, true) if err != nil { @@ -419,7 +448,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -func guessDiskIDPathByVolumeID(volumeID string) *string { +func guessDiskIDPathByVolumeID(volumeID string, logger *logrus.Entry) string { // Get the first part of the UUID. 
// The linux kernel limits volume serials to 20 bytes: // include/uapi/linux/virtio_blk.h:#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ @@ -427,64 +456,158 @@ func guessDiskIDPathByVolumeID(volumeID string) *string { globExpr := diskIDPath + "/*" + linuxSerial + "*" matches, _ := filepath.Glob(globExpr) + + logger.WithFields(logrus.Fields{ + "volumeID": volumeID, + "linuxSerial": linuxSerial, + "matches": matches, + }).Debug("guessDiskIDPathByVolumeID") + if len(matches) > 0 { - return &matches[0] + return matches[0] } - return nil + return "" } -func (m *mounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, volumeID string) (*string, error) { +func (m *mounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, volumeID string) (string, error) { numTries := 0 for { - probeAttachedVolume(logger) - - diskIDPath := guessDiskIDPathByVolumeID(volumeID) - if diskIDPath != nil { - return diskIDPath, nil + diskIDPath := guessDiskIDPathByVolumeID(volumeID, logger) + if diskIDPath != "" { + // Resolve and log the actual device for debugging + resolved, err := filepath.EvalSymlinks(diskIDPath) + if err != nil { + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "error": err, + }).Error("FinalizeVolumeAttachmentAndFindPath: found path but failed to resolve symlink") + return "", fmt.Errorf("FinalizeVolumeAttachmentAndFindPath: found path %s but failed to resolve symlink: %w", diskIDPath, err) + } + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "resolved_device": resolved, + "num_tries": numTries, + }).Debug("FinalizeVolumeAttachmentAndFindPath: found device path") + + devFsSerial, innerErr := getScsiSerial(resolved) + if innerErr != nil { + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "resolved_device": resolved, + "num_tries": numTries, + }).Error("FinalizeVolumeAttachmentAndFindPath: unable to get device serial") + return "", fmt.Errorf("FinalizeVolumeAttachmentAndFindPath: unable to get serial number for disk %s at path %s: %w", diskIDPath, resolved, innerErr) + } + // success: found a path in /dev/disk/by-id/* which resolved to a symlink in /dev/* and that returned the right serial. + if devFsSerial != "" && devFsSerial == volumeID { + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "resolved_device": resolved, + "serial": devFsSerial, + "num_tries": numTries, + }).Debug("FinalizeVolumeAttachmentAndFindPath: found device and resolved serial") + return diskIDPath, nil + } + // A /dev/* path exists, but it's not matching the right serial. Attempt to repair by triggering udevadm. } + logger.WithFields(logrus.Fields{ + "num_tries": numTries, + }).Debug("FinalizeVolumeAttachmentAndFindPath: device not found, probing") + + probeAttachedVolume(logger) + numTries++ - if numTries == 10 { + if numTries == 30 { break } time.Sleep(time.Second) } - return nil, errors.New("Could not attach disk: Timeout after 10s") + return "", errors.New("FinalizeVolumeAttachmentAndFindPath: Timeout after 30s") } -func probeAttachedVolume(logger *logrus.Entry) error { - // rescan scsi bus - scsiHostRescan() +// getScsiSerial assumes that scsiIdPath exists and will error if it +// doesnt. It is the callers responsibility to verify the existence of this +// tool. Calls scsi_id on the given devicePath to get the serial number reported +// by that device. 
+func getScsiSerial(devicePath string) (string, error) { + out, err := exec.Command( + "/usr/lib/udev/scsi_id", + "--page=0x83", + "--whitelisted", + fmt.Sprintf("--device=%v", devicePath)).CombinedOutput() + if err != nil { + return "", fmt.Errorf("scsi_id failed for device %q with output %s: %w", devicePath, string(out), err) + } + + return parseScsiSerial(string(out)) +} + +var ( + // scsi_id output should be in the form of: + // 0QEMU QEMU HARDDISK + scsiPattern = `^0QEMU\s+QEMU\sHARDDISK\s+([\S]+)\s*$` + // regex to parse scsi_id output and extract the serial + scsiRegex = regexp.MustCompile(scsiPattern) +) - // udevadm settle waits for udevd to process the device creation - // events for all hardware devices, thus ensuring that any device - // nodes have been created successfully before proceeding. - argsSettle := []string{"settle"} - cmdSettle := exec.Command("udevadm", argsSettle...) - _, errSettle := cmdSettle.CombinedOutput() - if errSettle != nil { - logger.Errorf("error running udevadm settle %v\n", errSettle) +// Parse the output returned by scsi_id and extract the serial number +func parseScsiSerial(output string) (string, error) { + substrings := scsiRegex.FindStringSubmatch(output) + if substrings == nil { + return "", fmt.Errorf("scsi_id output cannot be parsed: %q", output) } - args := []string{"trigger"} - cmd := exec.Command("udevadm", args...) - _, err := cmd.CombinedOutput() + return substrings[1], nil +} + +func runCmdWithTimeout(name string, args []string, logger *logrus.Entry, timeout time.Duration) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + out, err := exec.CommandContext(ctx, name, args...).CombinedOutput() if err != nil { - logger.Errorf("error running udevadm trigger %v\n", err) - return err + logger.WithError(err). + WithFields(logrus.Fields{"out": out, "name": name, "args": args}). + Warn("unable to run cmd " + name) } - logger.Debugf("Successfully probed all attachments") - return nil } -func scsiHostRescan() { - scsiPath := "/sys/class/scsi_host/" - if dirs, err := ioutil.ReadDir(scsiPath); err == nil { - for _, f := range dirs { - name := scsiPath + f.Name() + "/scan" - data := []byte("- - -") - ioutil.WriteFile(name, data, 0666) - } +var probeLock sync.Mutex + +func probeAttachedVolume(logger *logrus.Entry) { + const triggerTimeout = 15 * time.Second + + // host rescan and udevadm are global actions and if run concurrently, may run into issues with + // symlinking and partial updates. 
+ probeLock.Lock() + defer probeLock.Unlock() + + // rescan scsi bus + logger.Debug("probeAttachedVolume: rescanning SCSI hosts") + scsiHostRescan(logger) + + logger.Debug("probeAttachedVolume: running udevadm trigger") + runCmdWithTimeout("udevadm", []string{"trigger"}, logger, triggerTimeout) + + logger.Debug("probeAttachedVolume: running udevadm settle") + runCmdWithTimeout("udevadm", []string{"settle"}, logger, triggerTimeout) + + logger.Debugf("probeAttachedVolume: done") +} + +func scsiHostRescan(logger *logrus.Entry) { + const scsiPath = "/sys/class/scsi_host/" + dirs, err := os.ReadDir(scsiPath) + if err != nil { + logger.WithError(err).Warn("scsiHostRescan: cannot read scsi_host directory") + return + } + + for _, f := range dirs { + name := scsiPath + f.Name() + "/scan" + data := []byte("- - -") + _ = os.WriteFile(name, data, 0666) } } @@ -495,20 +618,20 @@ func (m *mounter) GetDeviceName(mounter mount.Interface, mountPath string) (stri // FindAbsoluteDeviceByIDPath follows the /dev/disk/by-id symlink to find the absolute path of a device func (m *mounter) FindAbsoluteDeviceByIDPath(volumeName string) (string, error) { - path := guessDiskIDPathByVolumeID(volumeName) - if path == nil { + path := guessDiskIDPathByVolumeID(volumeName, m.log) + if path == "" { return "", fmt.Errorf("could not find device-path for volume: %s", volumeName) } // EvalSymlinks returns relative link if the file is not a symlink // so we do not have to check if it is symlink prior to evaluation - resolved, err := filepath.EvalSymlinks(*path) + resolved, err := filepath.EvalSymlinks(path) if err != nil { - return "", fmt.Errorf("could not resolve symlink %q: %v", *path, err) + return "", fmt.Errorf("could not resolve symlink %q: %v", path, err) } if !strings.HasPrefix(resolved, "/dev") { - return "", fmt.Errorf("resolved symlink %q for %q was unexpected", resolved, *path) + return "", fmt.Errorf("resolved symlink %q for %q was unexpected", resolved, path) } return resolved, nil diff --git a/driver/node.go b/driver/node.go index 873ea69d..b575a4f0 100644 --- a/driver/node.go +++ b/driver/node.go @@ -15,19 +15,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Code generated by protoc-gen-go. DO NOT EDIT. - -// NOTE: THIS IS NOT GENERATED. We have to add the line above to prevent golint -// checking this file. This is needed because some methods end with xxxId, but -// golint wants them to be xxxID. But we're not able to change it as the -// official CSI spec is that way and we have to implement the interface -// exactly. - package driver import ( "context" "os" + "path/filepath" "strconv" "github.com/container-storage-interface/spec/lib/go/csi" @@ -39,8 +32,6 @@ import ( ) const ( - diskDOPrefix = "scsi-0DO_Volume_" - // Current technical limit is 128 // - 1 for root // - 1 for /var/lib/docker @@ -71,14 +62,39 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe return nil, status.Error(codes.InvalidArgument, "NodeStageVolume Volume Capability must be provided") } + if acquired := d.volumeLocks.TryAcquire(req.VolumeId); !acquired { + return nil, status.Errorf(codes.Aborted, "an operation with the given Volume ID %s already exists", req.VolumeId) + } + defer d.volumeLocks.Release(req.VolumeId) + // Apparently sometimes we need to call udevadm trigger to get the volume // properly registered in /dev/disk. 
More information can be found here: // https://github.com/cloudscale-ch/csi-cloudscale/issues/9 - sourcePtr, err := d.mounter.FinalizeVolumeAttachmentAndFindPath(d.log.WithFields(logrus.Fields{"volume_id": req.VolumeId}), req.VolumeId) + source, err := d.mounter.FinalizeVolumeAttachmentAndFindPath(d.log.WithFields(logrus.Fields{"volume_id": req.VolumeId}), req.VolumeId) if err != nil { return nil, err } - source := *sourcePtr + + d.log.WithFields(logrus.Fields{ + "volume_id": req.VolumeId, + "device_path": source, + }).Info("successfully found attached volume_id at device_path") + + // Debug logging to help diagnose potential race conditions with concurrent volume mounts + resolvedSource, resolveErr := filepath.EvalSymlinks(source) + if resolveErr != nil { + d.log.WithFields(logrus.Fields{ + "volume_id": req.VolumeId, + "source": source, + "resolve_error": resolveErr, + }).Debug("failed to resolve source symlink") + } else { + d.log.WithFields(logrus.Fields{ + "volume_id": req.VolumeId, + "source_symlink": source, + "resolved_device": resolvedSource, + }).Debug("resolved source device path") + } publishContext := req.GetPublishContext() if publishContext == nil { @@ -99,7 +115,7 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe return &csi.NodeStageVolumeResponse{}, nil } - target := req.StagingTargetPath + stagingTargetPath := req.StagingTargetPath mnt := req.VolumeCapability.GetMount() options := mnt.MountFlags @@ -137,19 +153,21 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe ll.Info("source device is already formatted") } - ll.Info("mounting the volume for staging") + ll.Info("checking if stagingTargetPath is already mounted") - mounted, err := d.mounter.IsMounted(target) + mounted, err := d.mounter.IsMounted(stagingTargetPath) if err != nil { + ll.WithError(err).Error("unable to check if already mounted") return nil, err } if !mounted { - if err := d.mounter.Mount(source, target, fsType, luksContext, options...); err != nil { + ll.Info("not mounted yet, mounting the volume for staging") + if err := d.mounter.Mount(source, stagingTargetPath, fsType, luksContext, options...); err != nil { return nil, status.Error(codes.Internal, err.Error()) } } else { - ll.Info("source device is already mounted to the target path") + ll.Info("source device is already mounted to the stagingTargetPath path") } ll.Info("formatting and mounting stage volume is finished") @@ -166,6 +184,11 @@ func (d *Driver) NodeUnstageVolume(ctx context.Context, req *csi.NodeUnstageVolu return nil, status.Error(codes.InvalidArgument, "NodeUnstageVolume Staging Target Path must be provided") } + if acquired := d.volumeLocks.TryAcquire(req.VolumeId); !acquired { + return nil, status.Errorf(codes.Aborted, "an operation with the given Volume ID %s already exists", req.VolumeId) + } + defer d.volumeLocks.Release(req.VolumeId) + luksContext := LuksContext{VolumeLifecycle: VolumeLifecycleNodeUnstageVolume} ll := d.log.WithFields(logrus.Fields{ @@ -213,6 +236,11 @@ func (d *Driver) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolu return nil, status.Error(codes.InvalidArgument, "NodePublishVolume Volume Capability must be provided") } + if acquired := d.volumeLocks.TryAcquire(req.VolumeId); !acquired { + return nil, status.Errorf(codes.Aborted, "an operation with the given Volume ID %s already exists", req.VolumeId) + } + defer d.volumeLocks.Release(req.VolumeId) + publishContext := req.GetPublishContext() if publishContext == nil { return nil, 
status.Error(codes.InvalidArgument, "PublishContext must be provided") @@ -260,6 +288,11 @@ func (d *Driver) NodeUnpublishVolume(ctx context.Context, req *csi.NodeUnpublish return nil, status.Error(codes.InvalidArgument, "NodeUnpublishVolume Target Path must be provided") } + if acquired := d.volumeLocks.TryAcquire(req.VolumeId); !acquired { + return nil, status.Errorf(codes.Aborted, "an operation with the given Volume ID %s already exists", req.VolumeId) + } + defer d.volumeLocks.Release(req.VolumeId) + luksContext := LuksContext{VolumeLifecycle: VolumeLifecycleNodeUnpublishVolume} ll := d.log.WithFields(logrus.Fields{ diff --git a/driver/volumelocks.go b/driver/volumelocks.go new file mode 100644 index 00000000..401ed6a1 --- /dev/null +++ b/driver/volumelocks.go @@ -0,0 +1,54 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package driver + +import ( + "sync" + + "k8s.io/apimachinery/pkg/util/sets" +) + +// VolumeLocks implements a map with atomic operations. It stores a set of all volume IDs +// with an ongoing operation. +type VolumeLocks struct { + locks sets.String + mux sync.Mutex +} + +func NewVolumeLocks() *VolumeLocks { + return &VolumeLocks{ + locks: sets.NewString(), + } +} + +// TryAcquire tries to acquire the lock for operating on volumeID and returns true if successful. +// If another operation is already using volumeID, returns false. +func (vl *VolumeLocks) TryAcquire(volumeID string) bool { + vl.mux.Lock() + defer vl.mux.Unlock() + if vl.locks.Has(volumeID) { + return false + } + vl.locks.Insert(volumeID) + return true +} + +func (vl *VolumeLocks) Release(volumeID string) { + vl.mux.Lock() + defer vl.mux.Unlock() + vl.locks.Delete(volumeID) +}
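For reference, here is a minimal, illustrative sketch (not part of the diff; the test name is made up) of the TryAcquire/Release semantics that the node service RPCs above now rely on when they return `codes.Aborted` for concurrent operations on the same volume:

```go
package driver

import "testing"

// TestVolumeLocksSketch illustrates the intended behaviour of VolumeLocks:
// only one in-flight operation per volume ID, with independent locks per volume.
func TestVolumeLocksSketch(t *testing.T) {
	locks := NewVolumeLocks()

	// The first operation on a volume acquires the lock.
	if !locks.TryAcquire("vol-1") {
		t.Fatal("expected first TryAcquire to succeed")
	}
	// A concurrent operation on the same volume is rejected
	// (the node RPCs translate this into codes.Aborted).
	if locks.TryAcquire("vol-1") {
		t.Fatal("expected second TryAcquire on the same volume to fail")
	}
	// Operations on other volumes are unaffected.
	if !locks.TryAcquire("vol-2") {
		t.Fatal("expected TryAcquire on a different volume to succeed")
	}
	// Releasing the lock lets the next operation on that volume proceed.
	locks.Release("vol-1")
	if !locks.TryAcquire("vol-1") {
		t.Fatal("expected TryAcquire to succeed again after Release")
	}
}
```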