@@ -20,13 +20,15 @@ import (
20
20
"fmt"
21
21
"github.com/container-storage-interface/spec/lib/go/csi/v0"
22
22
"github.com/golang/glog"
23
+ "hash/fnv"
23
24
"k8s.io/api/core/v1"
24
25
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25
26
"k8s.io/client-go/kubernetes"
26
27
csiv1alpha1 "k8s.io/csi-api/pkg/apis/csi/v1alpha1"
27
28
csiclientset "k8s.io/csi-api/pkg/client/clientset/versioned"
28
29
"math/rand"
29
30
"sort"
31
+ "strconv"
30
32
"strings"
31
33
)
32
34
@@ -68,6 +70,7 @@ func GenerateAccessibilityRequirements(
68
70
kubeClient kubernetes.Interface ,
69
71
csiAPIClient csiclientset.Interface ,
70
72
driverName string ,
73
+ pvcName string ,
71
74
allowedTopologies []v1.TopologySelectorTerm ,
72
75
selectedNode * v1.Node ) (* csi.TopologyRequirement , error ) {
73
76
requirement := & csi.TopologyRequirement {}
@@ -96,7 +99,15 @@ func GenerateAccessibilityRequirements(
96
99
requirement .Requisite = toCSITopology (requisiteTerms )
97
100
98
101
/* Preferred */
99
- if selectedNode != nil {
102
+ var preferredTerms []topologyTerm
103
+ if selectedNode == nil {
104
+ // no node selected therefore ensure even spreading of StatefulSet volumes by sorting
105
+ // requisiteTerms and shifting the sorted terms based on hash of pvcName and replica index suffix
106
+ hash , index := getPVCNameHashAndIndexOffset (pvcName )
107
+ i := (hash + index ) % uint32 (len (requisiteTerms ))
108
+ preferredTerms = sortAndShift (requisiteTerms , nil , i )
109
+ } else {
110
+ // selectedNode is set so use topology from that node to populate preferredTerms
100
111
// TODO (verult) reuse selected node info from aggregateTopologies
101
112
// TODO (verult) retry
102
113
nodeInfo , err := csiAPIClient .CsiV1alpha1 ().CSINodeInfos ().Get (selectedNode .Name , metav1.GetOptions {})
@@ -110,7 +121,7 @@ func GenerateAccessibilityRequirements(
110
121
return nil , fmt .Errorf ("topology labels from selected node %v does not match topology keys from CSINodeInfo %v" , selectedNode .Labels , topologyKeys )
111
122
}
112
123
113
- preferredTerms : = sortAndShift (requisiteTerms , selectedTopology )
124
+ preferredTerms = sortAndShift (requisiteTerms , selectedTopology , 0 )
114
125
if preferredTerms == nil {
115
126
// Topology from selected node is not in requisite. This case should never be hit:
116
127
// - If AllowedTopologies is specified, the scheduler should choose a node satisfying the
@@ -119,10 +130,8 @@ func GenerateAccessibilityRequirements(
119
130
// selected node.
120
131
return nil , fmt .Errorf ("topology %v from selected node %q is not in requisite" , selectedTopology , selectedNode .Name )
121
132
}
122
-
123
- requirement .Preferred = toCSITopology (preferredTerms )
124
133
}
125
-
134
+ requirement . Preferred = toCSITopology ( preferredTerms )
126
135
return requirement , nil
127
136
}
128
137
@@ -267,17 +276,21 @@ func deduplicate(terms []topologyTerm) []topologyTerm {
267
276
}
268
277
269
278
// Sort the given terms in place,
270
- // then return a new list of terms equivalent to the sorted terms, but shifted so that the primary
271
- // term is the first in the list.
272
- func sortAndShift (terms []topologyTerm , primary topologyTerm ) []topologyTerm {
279
+ // then return a new list of terms equivalent to the sorted terms, but shifted so that
280
+ // either the primary term (if specified) or term at shiftIndex is the first in the list.
281
+ func sortAndShift (terms []topologyTerm , primary topologyTerm , shiftIndex uint32 ) []topologyTerm {
273
282
var preferredTerms []topologyTerm
274
283
sort .Slice (terms , func (i , j int ) bool {
275
284
return terms [i ].less (terms [j ])
276
285
})
277
- for i , t := range terms {
278
- if t .equal (primary ) {
279
- preferredTerms = append (terms [i :], terms [:i ]... )
280
- break
286
+ if primary == nil {
287
+ preferredTerms = append (terms [shiftIndex :], terms [:shiftIndex ]... )
288
+ } else {
289
+ for i , t := range terms {
290
+ if t .equal (primary ) {
291
+ preferredTerms = append (terms [i :], terms [:i ]... )
292
+ break
293
+ }
281
294
}
282
295
}
283
296
return preferredTerms
@@ -367,3 +380,53 @@ func toCSITopology(terms []topologyTerm) []*csi.Topology {
367
380
}
368
381
return out
369
382
}
383
+
384
// getPVCNameHashAndIndexOffset splits a PVC name into a hash of its base name
// and a numeric index offset, so that callers can add the two together and
// round-robin related volumes across the available topology terms.
//
// The logic is identical to getPVCNameHashAndIndexOffset in
// pkg/volume/util/util.go in-tree
// [https://github.com/kubernetes/kubernetes/blob/master/pkg/volume/util/util.go].
//
// Name handling:
//   - "" (not expected from callers): a random hash and index 0.
//   - "Name-Id", where Id parses as an unsigned 32-bit decimal: index is Id and
//     the hash covers only the part before the final dash. This keeps
//     round-robin behaviour for users building StatefulSet-like workloads by hand.
//   - "ClaimName-StatefulSetName-Id": index is Id and the hash covers only the
//     segment after the last dash of "ClaimName-StatefulSetName". Different
//     claims of the same StatefulSet member therefore hash identically and land
//     in the same zone; otherwise the pod could not mount all of its volumes.
//     Names that themselves contain dashes only lead to less optimal spreading.
//   - anything else: index 0 and the hash covers the whole name.
func getPVCNameHashAndIndexOffset(pvcName string) (hash uint32, index uint32) {
	if pvcName == "" {
		// We should always be called with a name; this shouldn't happen.
		return rand.Uint32(), 0
	}

	// By default the whole name is hashed.
	base := pvcName

	if sep := strings.LastIndexByte(pvcName, '-'); sep >= 0 {
		if id, err := strconv.ParseUint(pvcName[sep+1:], 10, 32); err == nil {
			// A numeric suffix becomes the offset, so consecutive replicas
			// round-robin across zones.
			index = uint32(id)

			// Only the prefix is hashed ...
			base = pvcName[:sep]

			// ... and, when the prefix still contains a dash, only its last
			// segment (the presumed StatefulSet name).
			if inner := strings.LastIndexByte(base, '-'); inner >= 0 {
				base = base[inner+1:]
			}
		}
	}

	// Hash the (base) volume name so that we don't bias towards the first N zones.
	hasher := fnv.New32()
	hasher.Write([]byte(base))
	return hasher.Sum32(), index
}
0 commit comments