Skip to content

Commit

Permalink
Fix race of csi controller calls
Browse files Browse the repository at this point in the history
When a CreateVolume requests takes longer then 10 seconds the request
runs into a client-side timeout and gets retied by kubelet.
While processing the new request, the old request is still being canceled
on the server-side and the created ZFSVolume CR gets deleted.

1|2
CreateVolume|
ZFSVolume CR created|
waiting for volume creation|
client timeout + context cancel|
|CreateVolume
|ZFSVolume CR already exists
ZFSVolume CR deleted|
Return ERR|Return OK without creating a ZFSVolume

```
I0916 15:01:49.227721       1 grpc.go:72] GRPC call: /csi.v1.Controller/CreateVolume requests {"accessibility_requirements":{"preferred":[{"segments":{"beta.kubernetes.io/arch":"amd64","beta.kubernetes.io/os":"linux","kubernetes.io/arch":"amd64","kubernetes.io/hostname":"i04-1-b-node-i041b-0-0","kubernetes.io/os":"linux","m3.services/cluster":"i04-1-b","m3.services/worker-group":"i041b-0","node.systems.mittwald.cloud/placement-group":"i041b-0","openebs.io/nodeid":"i04-1-b-node-i041b-0-0","openebs.io/nodename":"i04-1-b-node-i041b-0-0"}}],"requisite":[{"segments":{"beta.kubernetes.io/arch":"amd64","beta.kubernetes.io/os":"linux","kubernetes.io/arch":"amd64","kubernetes.io/hostname":"i04-1-b-node-i041b-0-0","kubernetes.io/os":"linux","m3.services/cluster":"i04-1-b","m3.services/worker-group":"i041b-0","node.systems.mittwald.cloud/placement-group":"i041b-0","openebs.io/nodeid":"i04-1-b-node-i041b-0-0","openebs.io/nodename":"i04-1-b-node-i041b-0-0"}}]},"capacity_range":{"required_bytes":5368709120},"name":"pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457","parameters":{"compression":"off","csi.storage.k8s.io/pv/name":"pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457","csi.storage.k8s.io/pvc/name":"test-pvc0","csi.storage.k8s.io/pvc/namespace":"default","dedup":"off","fstype":"zfs","poolname":"kluster-pool/i04-1-b-128k","recordsize":"128k","shared":"yes","thinprovision":"yes"},"volume_capabilities":[{"AccessType":{"Mount":{"fs_type":"zfs"}},"access_mode":{"mode":1}}]}
I0916 15:01:58.853180       1 controller.go:289] zfs: trying volume creation kluster-pool/i04-1-b-128k/pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457 on node [i04-1-b-node-i041b-0-0]
I0916 15:01:59.104207       1 volume.go:139] zfs: waiting for volume kluster-pool/i04-1-b-128k/pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457 to be created on nodeid i04-1-b-node-i041b-0-0
I0916 15:02:00.105273       1 volume.go:170] zfs: volume kluster-pool/i04-1-b-128k/pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457 provisioning failed on nodeid i04-1-b-node-i041b-0-0 err: zfs: context deadline reached
I0916 15:02:00.235113       1 grpc.go:72] GRPC call: /csi.v1.Controller/CreateVolume requests {"accessibility_requirements":{"preferred":[{"segments":{"beta.kubernetes.io/arch":"amd64","beta.kubernetes.io/os":"linux","kubernetes.io/arch":"amd64","kubernetes.io/hostname":"i04-1-b-node-i041b-0-0","kubernetes.io/os":"linux","m3.services/cluster":"i04-1-b","m3.services/worker-group":"i041b-0","node.systems.mittwald.cloud/placement-group":"i041b-0","openebs.io/nodeid":"i04-1-b-node-i041b-0-0","openebs.io/nodename":"i04-1-b-node-i041b-0-0"}}],"requisite":[{"segments":{"beta.kubernetes.io/arch":"amd64","beta.kubernetes.io/os":"linux","kubernetes.io/arch":"amd64","kubernetes.io/hostname":"i04-1-b-node-i041b-0-0","kubernetes.io/os":"linux","m3.services/cluster":"i04-1-b","m3.services/worker-group":"i041b-0","node.systems.mittwald.cloud/placement-group":"i041b-0","openebs.io/nodeid":"i04-1-b-node-i041b-0-0","openebs.io/nodename":"i04-1-b-node-i041b-0-0"}}]},"capacity_range":{"required_bytes":5368709120},"name":"pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457","parameters":{"compression":"off","csi.storage.k8s.io/pv/name":"pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457","csi.storage.k8s.io/pvc/name":"test-pvc0","csi.storage.k8s.io/pvc/namespace":"default","dedup":"off","fstype":"zfs","poolname":"kluster-pool/i04-1-b-128k","recordsize":"128k","shared":"yes","thinprovision":"yes"},"volume_capabilities":[{"AccessType":{"Mount":{"fs_type":"zfs"}},"access_mode":{"mode":1}}]}
I0916 15:02:00.449153       1 volume.go:221] zfs: deleted the volume pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457
I0916 15:02:00.449299       1 controller.go:466] created the volume kluster-pool/i04-1-b-128k/pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457 on node i04-1-b-node-i041b-0-0
I0916 15:02:00.553778       1 grpc.go:81] GRPC response: {"volume":{"accessible_topology":[{"segments":{"openebs.io/nodeid":"i04-1-b-node-i041b-0-0"}}],"capacity_bytes":5368709120,"volume_context":{"openebs.io/cas-type":"localpv-zfs","openebs.io/poolname":"kluster-pool/i04-1-b-128k"},"volume_id":"pvc-eb11c446-f26c-4333-b3da-4ecb9c5b2457"}}
```

Signed-off-by: Luca Berneking <[email protected]>
  • Loading branch information
Lucaber authored and Abhinandan-Purkait committed Nov 4, 2024
1 parent 76617c0 commit 424e35b
Showing 1 changed file with 25 additions and 0 deletions.
25 changes: 25 additions & 0 deletions pkg/driver/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"strconv"
"strings"
"sync"
"time"

"github.com/container-storage-interface/spec/lib/go/csi"
Expand Down Expand Up @@ -63,6 +64,15 @@ type controller struct {

k8sNodeInformer cache.SharedIndexInformer
zfsNodeInformer cache.SharedIndexInformer

volMutexes sync.Map
}

func (cs *controller) LockVolume(volume string) func() {
value, _ := cs.volMutexes.LoadOrStore(volume, &sync.Mutex{})
mtx := value.(*sync.Mutex)
mtx.Lock()
return func() { mtx.Unlock() }
}

// NewController returns a new instance
Expand Down Expand Up @@ -450,6 +460,9 @@ func (cs *controller) CreateVolume(
contentSource := req.GetVolumeContentSource()
pvcName := helpers.GetInsensitiveParameter(&parameters, "csi.storage.k8s.io/pvc/name")

unlock := cs.LockVolume(volName)
defer unlock()

if contentSource != nil && contentSource.GetSnapshot() != nil {
snapshotID := contentSource.GetSnapshot().GetSnapshotId()

Expand Down Expand Up @@ -493,6 +506,8 @@ func (cs *controller) DeleteVolume(
}

volumeID := strings.ToLower(req.GetVolumeId())
unlock := cs.LockVolume(volumeID)
defer unlock()

// verify if the volume has already been deleted
vol, err := zfs.GetVolume(volumeID)
Expand Down Expand Up @@ -611,6 +626,8 @@ func (cs *controller) ControllerExpandVolume(
"ControllerExpandVolume: no volumeID provided",
)
}
unlock := cs.LockVolume(volumeID)
defer unlock()

/* round off the new size */
updatedSize := getRoundedCapacity(req.GetCapacityRange().GetRequiredBytes())
Expand Down Expand Up @@ -707,6 +724,10 @@ func (cs *controller) CreateSnapshot(
if err != nil {
return nil, err
}
unlockVol := cs.LockVolume(volumeID)
defer unlockVol()
unlockSnap := cs.LockVolume(snapName)
defer unlockSnap()

snapTimeStamp := time.Now().Unix()
var state string
Expand Down Expand Up @@ -803,6 +824,10 @@ func (cs *controller) DeleteSnapshot(
// should succeed when an invalid snapshot id is used
return &csi.DeleteSnapshotResponse{}, nil
}
unlockVol := cs.LockVolume(snapshotID[0])
defer unlockVol()
unlockSnap := cs.LockVolume(snapshotID[1])
defer unlockSnap()
if err := zfs.DeleteSnapshot(snapshotID[1]); err != nil {
return nil, status.Errorf(
codes.Internal,
Expand Down

0 comments on commit 424e35b

Please sign in to comment.