17
17
package lm
18
18
19
19
import (
20
+ "bufio"
20
21
"errors"
21
22
"fmt"
23
+ "math/rand"
24
+ "net"
25
+ "os"
26
+ "sort"
22
27
"strconv"
23
28
"strings"
24
29
@@ -28,6 +33,7 @@ import (
28
33
29
34
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
30
35
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
36
+ "github.com/google/uuid"
31
37
)
32
38
33
39
var errMPSSharingNotSupported = errors .New ("MPS sharing is not supported" )
@@ -80,13 +86,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
80
86
return nil , fmt .Errorf ("error creating resource labeler: %v" , err )
81
87
}
82
88
89
+ imexLabeler , err := newImexDomainLabeler (* config .Flags .GFD .ImexNodesConfig , devices )
90
+ if err != nil {
91
+ return nil , fmt .Errorf ("error creating imex domain labeler: %v" , err )
92
+ }
93
+
83
94
l := Merge (
84
95
machineTypeLabeler ,
85
96
versionLabeler ,
86
97
migCapabilityLabeler ,
87
98
sharingLabeler ,
88
99
resourceLabeler ,
89
100
gpuModeLabeler ,
101
+ imexLabeler ,
90
102
)
91
103
92
104
return l , nil
@@ -218,6 +230,78 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
218
230
return labels , nil
219
231
}
220
232
233
+ func newImexDomainLabeler (configFile string , device []resource.Device ) (Labeler , error ) {
234
+ if configFile == "" {
235
+ return nil , nil
236
+ }
237
+
238
+ // read file and parse it
239
+ imexConfig , err := os .Open (configFile )
240
+ if err != nil {
241
+ return nil , fmt .Errorf ("failed to read imex config file: %v" , err )
242
+ }
243
+ defer imexConfig .Close ()
244
+
245
+ // Read the file line by line
246
+ var ips []string
247
+ scanner := bufio .NewScanner (imexConfig )
248
+ for scanner .Scan () {
249
+ ip := strings .TrimSpace (scanner .Text ())
250
+ if net .ParseIP (ip ) == nil {
251
+ return nil , fmt .Errorf ("invalid IP address in imex config file: %s" , ip )
252
+ }
253
+ ips = append (ips , ip )
254
+ }
255
+
256
+ if err := scanner .Err (); err != nil {
257
+ return nil , fmt .Errorf ("failed to read imex config file: %v" , err )
258
+ }
259
+
260
+ // Sort the IP addresses
261
+ sort .Strings (ips )
262
+
263
+ // Join the sorted IPs into a single string
264
+ sortedIPs := strings .Join (ips , "\n " )
265
+
266
+ hashedconfig := generateUUIDs (sortedIPs )[0 ]
267
+
268
+ var commonClusterUUID string
269
+ var commonCliqueID string
270
+ for _ , d := range device {
271
+ clusterUUID , err := d .GetClusterUUID ()
272
+ if err != nil {
273
+ return nil , fmt .Errorf ("error getting cluster UUID: %v" , err )
274
+ }
275
+ if commonClusterUUID == "" {
276
+ commonClusterUUID = clusterUUID
277
+ }
278
+ if commonClusterUUID != clusterUUID {
279
+ klog .Warningf ("Cluster UUIDs are different: %s != %s" , commonClusterUUID , clusterUUID )
280
+ return nil , nil
281
+ }
282
+
283
+ cliqueID , err := d .GetCliqueIP ()
284
+ if err != nil {
285
+ return nil , fmt .Errorf ("error getting clique ID: %v" , err )
286
+ }
287
+ if commonCliqueID == "" {
288
+ commonCliqueID = cliqueID
289
+ }
290
+ if commonCliqueID != cliqueID {
291
+ klog .Warningf ("Clique IDs are different: %s != %s" , commonCliqueID , cliqueID )
292
+ return nil , nil
293
+ }
294
+ }
295
+
296
+ labels := Labels {
297
+ "nvidia.com/gpu.clusteruuid" : commonClusterUUID ,
298
+ "nvidia.com/gpu.cliqueid" : commonCliqueID ,
299
+ "nvidia.com/gpu.imex-domain" : hashedconfig + "-" + commonCliqueID ,
300
+ }
301
+
302
+ return labels , nil
303
+ }
304
+
221
305
func getModeForClasses (classes []uint32 ) string {
222
306
if len (classes ) == 0 {
223
307
return "unknown"
@@ -254,3 +338,23 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
254
338
}
255
339
return classes , nil
256
340
}
341
+
342
+ func generateUUIDs (seed string ) []string {
343
+ rand := rand .New (rand .NewSource (hash (seed )))
344
+
345
+ uuids := make ([]string , 1 )
346
+ charset := make ([]byte , 16 )
347
+ rand .Read (charset )
348
+ uuid , _ := uuid .FromBytes (charset )
349
+ uuids [0 ] = uuid .String ()
350
+
351
+ return uuids
352
+ }
353
+
354
// hash folds the runes of s into an int64 using a base-31 polynomial:
// each step multiplies the running value by 31 and adds the rune's code point.
// The empty string hashes to 0. Arithmetic wraps on overflow.
func hash(s string) int64 {
	const multiplier = 31

	var acc int64
	for _, r := range s {
		acc = acc*multiplier + int64(r)
	}
	return acc
}
0 commit comments