From 9cdc143b0a6700c213e6324df68224ad919a31a8 Mon Sep 17 00:00:00 2001 From: Michael Weibel Date: Fri, 12 Dec 2025 10:44:40 +0100 Subject: [PATCH] adjust udevadm calls, increase logging amount udevadm when called concurrently may potentially redo symlinks. Adding a lock, so if multiple pods attach at the same time, we don't call udevadm concurrently. Adjusts the order so we first check if the device is available, before doing udevadm calls. Adjust order of udevadm calls itself: first trigger, then settle. Reason: - settle waits for new events to finish - trigger triggers new events but doesn't wait for them Improve logging and increase the amount of logs (in debug level) to potentially debug issues with more data. --- .../csi-cloudscale/templates/daemonset.yaml | 1 + .../csi-cloudscale/templates/statefulset.yaml | 1 + charts/csi-cloudscale/values.yaml | 2 + cmd/cloudscale-csi-plugin/main.go | 9 +- driver/driver.go | 17 +- driver/driver_test.go | 11 +- driver/mounter.go | 154 ++++++++++++------ driver/node.go | 25 ++- 8 files changed, 157 insertions(+), 63 deletions(-) diff --git a/charts/csi-cloudscale/templates/daemonset.yaml b/charts/csi-cloudscale/templates/daemonset.yaml index 3695ef82..18a46a48 100644 --- a/charts/csi-cloudscale/templates/daemonset.yaml +++ b/charts/csi-cloudscale/templates/daemonset.yaml @@ -53,6 +53,7 @@ spec: args : - "--endpoint=$(CSI_ENDPOINT)" - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level={{ .Values.node.logLevel }}" {{- with .Values.node.resources }} resources: {{ toYaml . | indent 12 }} diff --git a/charts/csi-cloudscale/templates/statefulset.yaml b/charts/csi-cloudscale/templates/statefulset.yaml index 187c619f..293aad5d 100644 --- a/charts/csi-cloudscale/templates/statefulset.yaml +++ b/charts/csi-cloudscale/templates/statefulset.yaml @@ -76,6 +76,7 @@ spec: args : - "--endpoint=$(CSI_ENDPOINT)" - "--url=$(CLOUDSCALE_API_URL)" + - "--log-level={{ .Values.controller.logLevel }}" {{- with .Values.controller.resources }} resources: {{ toYaml . | indent 12 }} diff --git a/charts/csi-cloudscale/values.yaml b/charts/csi-cloudscale/values.yaml index 8bec898b..be06c75e 100644 --- a/charts/csi-cloudscale/values.yaml +++ b/charts/csi-cloudscale/values.yaml @@ -87,6 +87,7 @@ controller: tag: v3.5.6 pullPolicy: IfNotPresent serviceAccountName: + logLevel: info resources: {} # limits: # cpu: 100m @@ -104,6 +105,7 @@ node: nodeSelector: {} tolerations: [] serviceAccountName: + logLevel: info resources: {} # limits: # cpu: 100m diff --git a/cmd/cloudscale-csi-plugin/main.go b/cmd/cloudscale-csi-plugin/main.go index c869356f..b4a40758 100644 --- a/cmd/cloudscale-csi-plugin/main.go +++ b/cmd/cloudscale-csi-plugin/main.go @@ -24,6 +24,7 @@ import ( "os" "github.com/cloudscale-ch/csi-cloudscale/driver" + "github.com/sirupsen/logrus" ) func main() { @@ -32,6 +33,7 @@ func main() { token = flag.String("token", "", "cloudscale.ch access token") url = flag.String("url", "https://api.cloudscale.ch/", "cloudscale.ch API URL") version = flag.Bool("version", false, "Print the version and exit.") + logLevel = flag.String("log-level", "info", "Log level (trace, debug, info, warn, error, fatal, panic)") ) flag.Parse() @@ -44,7 +46,12 @@ func main() { os.Exit(0) } - drv, err := driver.NewDriver(*endpoint, *token, *url) + level, err := logrus.ParseLevel(*logLevel) + if err != nil { + log.Fatalf("invalid log level %q: %v", *logLevel, err) + } + + drv, err := driver.NewDriver(*endpoint, *token, *url, level) if err != nil { log.Fatalln(err) } diff --git a/driver/driver.go b/driver/driver.go index 4faaf7f9..37b7cfce 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -48,10 +48,9 @@ var ( // Driver implements the following CSI interfaces: // -// csi.IdentityServer -// csi.ControllerServer -// csi.NodeServer -// +// csi.IdentityServer +// csi.ControllerServer +// csi.NodeServer type Driver struct { endpoint string serverId string @@ -71,7 +70,7 @@ type Driver struct { // NewDriver returns a CSI plugin that contains the necessary gRPC // interfaces to interact with Kubernetes over unix domain sockets for // managaing cloudscale.ch Volumes -func NewDriver(ep, token, urlstr string) (*Driver, error) { +func NewDriver(ep, token, urlstr string, logLevel logrus.Level) (*Driver, error) { tokenSource := oauth2.StaticTokenSource(&oauth2.Token{ AccessToken: token, }) @@ -95,7 +94,9 @@ func NewDriver(ep, token, urlstr string) (*Driver, error) { } cloudscaleClient.BaseURL = baseURL - log := logrus.New().WithFields(logrus.Fields{ + logger := logrus.New() + logger.SetLevel(logLevel) + log := logger.WithFields(logrus.Fields{ "zone": zone, "node_id": serverId, "version": version, @@ -172,7 +173,9 @@ func (d *Driver) Stop() { // When building any packages that import version, pass the build/install cmd // ldflags like so: -// go build -ldflags "-X github.com/cloudscale-ch/csi-cloudscale/driver.version=0.0.1" +// +// go build -ldflags "-X github.com/cloudscale-ch/csi-cloudscale/driver.version=0.0.1" +// // GetVersion returns the current release version, as inserted at build time. func GetVersion() string { return version diff --git a/driver/driver_test.go b/driver/driver_test.go index d798c422..02a62f6b 100644 --- a/driver/driver_test.go +++ b/driver/driver_test.go @@ -20,8 +20,6 @@ package driver import ( "context" "errors" - "github.com/google/uuid" - "k8s.io/mount-utils" "math/rand" "net/http" "net/url" @@ -30,9 +28,12 @@ import ( "testing" "time" - "github.com/cloudscale-ch/cloudscale-go-sdk/v4" + "github.com/google/uuid" "github.com/kubernetes-csi/csi-test/v5/pkg/sanity" "github.com/sirupsen/logrus" + "k8s.io/mount-utils" + + "github.com/cloudscale-ch/cloudscale-go-sdk/v4" ) func init() { @@ -172,9 +173,9 @@ func (f *fakeMounter) HasRequiredSize(log *logrus.Entry, path string, requiredSi return true, nil } -func (f *fakeMounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, target string) (*string, error) { +func (f *fakeMounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, target string) (string, error) { path := "SomePath" - return &path, nil + return path, nil } type FakeVolumeServiceOperations struct { diff --git a/driver/mounter.go b/driver/mounter.go index 4505e0b6..921a608f 100644 --- a/driver/mounter.go +++ b/driver/mounter.go @@ -18,22 +18,23 @@ limitations under the License. package driver import ( + "context" "encoding/json" "errors" "fmt" - "io/ioutil" - "k8s.io/mount-utils" - kexec "k8s.io/utils/exec" "os" "os/exec" "path/filepath" "strconv" "strings" + "sync" "syscall" "time" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + "k8s.io/mount-utils" + kexec "k8s.io/utils/exec" ) const ( @@ -86,7 +87,7 @@ type Mounter interface { // Used to find a path in /dev/disk/by-id with a serial that we have from // the cloudscale API. - FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, VolumeId string) (*string, error) + FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, VolumeId string) (string, error) // GetStatistics returns capacity-related volume statistics for the given // volume path. @@ -221,7 +222,29 @@ func (m *mounter) Mount(source, target, fsType string, luksContext LuksContext, source = luksSource } + // Resolve source symlink for debug logging + resolvedSource, resolveErr := filepath.EvalSymlinks(source) + if resolveErr != nil { + m.log.WithFields(logrus.Fields{ + "source": source, + "target": target, + "fs_type": fsType, + "options": options, + "resolve_error": resolveErr, + }).Debug("Mount: failed to resolve source symlink") + } else { + m.log.WithFields(logrus.Fields{ + "source": source, + "resolved_source": resolvedSource, + "target": target, + "fs_type": fsType, + "options": options, + }).Debug("Mount: resolved source device") + } + m.log.WithFields(logrus.Fields{ + "source": source, + "target": target, "options": options, }).Info("executing mount command") err := m.kMounter.Mount(source, target, fsType, options) @@ -418,7 +441,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -func guessDiskIDPathByVolumeID(volumeID string) *string { +func guessDiskIDPathByVolumeID(volumeID string, logger *logrus.Entry) string { // Get the first part of the UUID. // The linux kernel limits volume serials to 20 bytes: // include/uapi/linux/virtio_blk.h:#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ @@ -426,64 +449,103 @@ func guessDiskIDPathByVolumeID(volumeID string) *string { globExpr := diskIDPath + "/*" + linuxSerial + "*" matches, _ := filepath.Glob(globExpr) + + logger.WithFields(logrus.Fields{ + "volumeID": volumeID, + "linuxSerial": linuxSerial, + "matches": matches, + }).Debug("guessDiskIDPathByVolumeID") + if len(matches) > 0 { - return &matches[0] + return matches[0] } - return nil + return "" } -func (m *mounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, volumeID string) (*string, error) { +func (m *mounter) FinalizeVolumeAttachmentAndFindPath(logger *logrus.Entry, volumeID string) (string, error) { numTries := 0 for { - probeAttachedVolume(logger) - - diskIDPath := guessDiskIDPathByVolumeID(volumeID) - if diskIDPath != nil { + diskIDPath := guessDiskIDPathByVolumeID(volumeID, logger) + if diskIDPath != "" { + // Resolve and log the actual device for debugging + resolved, err := filepath.EvalSymlinks(diskIDPath) + if err != nil { + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "error": err, + }).Debug("FinalizeVolumeAttachmentAndFindPath: found path but failed to resolve symlink") + } else { + logger.WithFields(logrus.Fields{ + "disk_id_path": diskIDPath, + "resolved_device": resolved, + "num_tries": numTries, + }).Debug("FinalizeVolumeAttachmentAndFindPath: found device path") + } return diskIDPath, nil } + logger.WithFields(logrus.Fields{ + "num_tries": numTries, + }).Debug("FinalizeVolumeAttachmentAndFindPath: device not found, probing") + + probeAttachedVolume(logger) + numTries++ - if numTries == 10 { + if numTries == 30 { break } time.Sleep(time.Second) } - return nil, errors.New("Could not attach disk: Timeout after 10s") + return "", errors.New("FinalizeVolumeAttachmentAndFindPath: Timeout after 30s") } -func probeAttachedVolume(logger *logrus.Entry) error { - // rescan scsi bus - scsiHostRescan() +func runCmdWithTimeout(name string, args []string, logger *logrus.Entry, timeout time.Duration) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() - // udevadm settle waits for udevd to process the device creation - // events for all hardware devices, thus ensuring that any device - // nodes have been created successfully before proceeding. - argsSettle := []string{"settle"} - cmdSettle := exec.Command("udevadm", argsSettle...) - _, errSettle := cmdSettle.CombinedOutput() - if errSettle != nil { - logger.Errorf("error running udevadm settle %v\n", errSettle) + out, err := exec.CommandContext(ctx, name, args...).CombinedOutput() + if err != nil { + logger.WithError(err). + WithFields(logrus.Fields{"out": out, "name": name, "args": args}). + Warn("unable to run cmd " + name) } +} + +var probeLock sync.Mutex + +func probeAttachedVolume(logger *logrus.Entry) { + const triggerTimeout = 15 * time.Second + + // host rescan and udevadm are global actions and if run concurrently, may run into issues with + // symlinking and partial updates. + probeLock.Lock() + defer probeLock.Unlock() + + // rescan scsi bus + logger.Debug("probeAttachedVolume: rescanning SCSI hosts") + scsiHostRescan(logger) - args := []string{"trigger"} - cmd := exec.Command("udevadm", args...) - _, err := cmd.CombinedOutput() + logger.Debug("probeAttachedVolume: running udevadm trigger") + runCmdWithTimeout("udevadm", []string{"trigger"}, logger, triggerTimeout) + + logger.Debug("probeAttachedVolume: running udevadm settle") + runCmdWithTimeout("udevadm", []string{"settle"}, logger, triggerTimeout) + + logger.Debugf("probeAttachedVolume: done") +} + +func scsiHostRescan(logger *logrus.Entry) { + const scsiPath = "/sys/class/scsi_host/" + dirs, err := os.ReadDir(scsiPath) if err != nil { - logger.Errorf("error running udevadm trigger %v\n", err) - return err + logger.WithError(err).Warn("scsiHostRescan: cannot read scsi_host directory") + return } - logger.Debugf("Successfully probed all attachments") - return nil -} -func scsiHostRescan() { - scsiPath := "/sys/class/scsi_host/" - if dirs, err := ioutil.ReadDir(scsiPath); err == nil { - for _, f := range dirs { - name := scsiPath + f.Name() + "/scan" - data := []byte("- - -") - ioutil.WriteFile(name, data, 0666) - } + for _, f := range dirs { + name := scsiPath + f.Name() + "/scan" + data := []byte("- - -") + _ = os.WriteFile(name, data, 0666) } } @@ -494,20 +556,20 @@ func (m *mounter) GetDeviceName(mounter mount.Interface, mountPath string) (stri // FindAbsoluteDeviceByIDPath follows the /dev/disk/by-id symlink to find the absolute path of a device func (m *mounter) FindAbsoluteDeviceByIDPath(volumeName string) (string, error) { - path := guessDiskIDPathByVolumeID(volumeName) - if path == nil { + path := guessDiskIDPathByVolumeID(volumeName, m.log) + if path == "" { return "", fmt.Errorf("could not find device-path for volume: %s", volumeName) } // EvalSymlinks returns relative link if the file is not a symlink // so we do not have to check if it is symlink prior to evaluation - resolved, err := filepath.EvalSymlinks(*path) + resolved, err := filepath.EvalSymlinks(path) if err != nil { - return "", fmt.Errorf("could not resolve symlink %q: %v", *path, err) + return "", fmt.Errorf("could not resolve symlink %q: %v", path, err) } if !strings.HasPrefix(resolved, "/dev") { - return "", fmt.Errorf("resolved symlink %q for %q was unexpected", resolved, *path) + return "", fmt.Errorf("resolved symlink %q for %q was unexpected", resolved, path) } return resolved, nil diff --git a/driver/node.go b/driver/node.go index d0d46c60..9d2fc69b 100644 --- a/driver/node.go +++ b/driver/node.go @@ -27,14 +27,16 @@ package driver import ( "context" + "os" + "path/filepath" + "strconv" + "github.com/container-storage-interface/spec/lib/go/csi" "github.com/sirupsen/logrus" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "k8s.io/mount-utils" utilexec "k8s.io/utils/exec" - "os" - "strconv" ) const ( @@ -71,11 +73,26 @@ func (d *Driver) NodeStageVolume(ctx context.Context, req *csi.NodeStageVolumeRe // Apparently sometimes we need to call udevadm trigger to get the volume // properly registered in /dev/disk. More information can be found here: // https://github.com/cloudscale-ch/csi-cloudscale/issues/9 - sourcePtr, err := d.mounter.FinalizeVolumeAttachmentAndFindPath(d.log.WithFields(logrus.Fields{"volume_id": req.VolumeId}), req.VolumeId) + source, err := d.mounter.FinalizeVolumeAttachmentAndFindPath(d.log.WithFields(logrus.Fields{"volume_id": req.VolumeId}), req.VolumeId) if err != nil { return nil, err } - source := *sourcePtr + + // Debug logging to help diagnose potential race conditions with concurrent volume mounts + resolvedSource, resolveErr := filepath.EvalSymlinks(source) + if resolveErr != nil { + d.log.WithFields(logrus.Fields{ + "volume_id": req.VolumeId, + "source": source, + "resolve_error": resolveErr, + }).Debug("failed to resolve source symlink") + } else { + d.log.WithFields(logrus.Fields{ + "volume_id": req.VolumeId, + "source_symlink": source, + "resolved_device": resolvedSource, + }).Debug("resolved source device path") + } publishContext := req.GetPublishContext() if publishContext == nil {