Skip to content

Commit fe63d86

Browse files
Enable labels for ClusterUUID and CliqueId
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 71c1fa7 commit fe63d86

File tree

5 files changed

+91
-0
lines changed

5 files changed

+91
-0
lines changed

api/config/v1/flags.go

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
107107
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
108108
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
109109
OutputFile *string `json:"outputFile" yaml:"outputFile"`
110+
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
110111
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
111112
}
112113

@@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
162163
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
163164
case "output-file":
164165
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
166+
case "imex-nodes-config":
167+
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
165168
case "sleep-interval":
166169
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
167170
case "no-timestamp":

cmd/gpu-feature-discovery/main.go

+7
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ func main() {
8686
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
8787
EnvVars: []string{"GFD_OUTPUT_FILE"},
8888
},
89+
&cli.StringFlag{
90+
Name: "imex-nopdes-config",
91+
Aliases: []string{"imex-nodes-config"},
92+
Usage: "the path to nvidia-imex nodes config file",
93+
Value: "/etc/nvidia-imex/nodes_config.cfg",
94+
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
95+
},
8996
&cli.StringFlag{
9097
Name: "machine-type-file",
9198
Value: "/sys/class/dmi/id/product_name",

deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml

+9
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,10 @@ spec:
182182
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
183183
- name: host-sys
184184
mountPath: "/sys"
185+
{{- if .Values.imexEnabled }}
186+
- name: imex-nodes-config
187+
mountPath: "/etc/nvidia-imex/nodes_config.cfg"
188+
{{- end }}
185189
{{- if $options.hasConfigMap }}
186190
- name: available-configs
187191
mountPath: /available-configs
@@ -199,6 +203,11 @@ spec:
199203
- name: host-sys
200204
hostPath:
201205
path: "/sys"
206+
{{- if .Values.imexEnabled }}
207+
- name: imex-nodes-config
208+
hostPath:
209+
path: "/etc/nvidia-imex/nodes_config.cfg"
210+
{{- end }}
202211
{{- if $options.hasConfigMap }}
203212
- name: available-configs
204213
configMap:

deployments/helm/nvidia-device-plugin/values.yaml

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ deviceIDStrategy: null
3535
nvidiaDriverRoot: null
3636
gdsEnabled: null
3737
mofedEnabled: null
38+
imexEnabled: false
3839
deviceDiscoveryStrategy: null
3940

4041
nameOverride: ""

internal/lm/nvml.go

+71
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,12 @@
1717
package lm
1818

1919
import (
20+
"bufio"
2021
"errors"
2122
"fmt"
23+
"math/rand"
24+
"os"
25+
"sort"
2226
"strconv"
2327
"strings"
2428

@@ -28,6 +32,7 @@ import (
2832

2933
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
3034
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
35+
"github.com/google/uuid"
3136
)
3237

3338
var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
@@ -80,13 +85,22 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
8085
return nil, fmt.Errorf("error creating resource labeler: %v", err)
8186
}
8287

88+
var imexLabeler Labeler
89+
if *config.Flags.GFD.ImexNodesConfig != "" {
90+
imexLabeler, err = newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig)
91+
if err != nil {
92+
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
93+
}
94+
}
95+
8396
l := Merge(
8497
machineTypeLabeler,
8598
versionLabeler,
8699
migCapabilityLabeler,
87100
sharingLabeler,
88101
resourceLabeler,
89102
gpuModeLabeler,
103+
imexLabeler,
90104
)
91105

92106
return l, nil
@@ -218,6 +232,41 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
218232
return labels, nil
219233
}
220234

235+
func newImexDomainLabeler(configFile string) (Labeler, error) {
236+
// read file and parse it
237+
imexConfig, err := os.Open(configFile)
238+
if err != nil {
239+
return nil, fmt.Errorf("failed to read imex config file: %v", err)
240+
}
241+
defer imexConfig.Close()
242+
243+
// Read the file line by line
244+
var ips []string
245+
scanner := bufio.NewScanner(imexConfig)
246+
for scanner.Scan() {
247+
line := scanner.Text()
248+
ips = append(ips, line)
249+
}
250+
251+
if err := scanner.Err(); err != nil {
252+
return nil, fmt.Errorf("failed to read imex config file: %v", err)
253+
}
254+
255+
// Sort the IP addresses
256+
sort.Strings(ips)
257+
258+
// Join the sorted IPs into a single string
259+
sortedIPs := strings.Join(ips, "\n")
260+
261+
hashedconfig := generateUUIDs(sortedIPs, 1)[0]
262+
263+
labels := Labels{
264+
"nvidia.com/gpu.imex-domain": hashedconfig,
265+
}
266+
267+
return labels, nil
268+
}
269+
221270
func getModeForClasses(classes []uint32) string {
222271
if len(classes) == 0 {
223272
return "unknown"
@@ -254,3 +303,25 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
254303
}
255304
return classes, nil
256305
}
306+
307+
func generateUUIDs(seed string, count int) []string {
308+
rand := rand.New(rand.NewSource(hash(seed)))
309+
310+
uuids := make([]string, count)
311+
for i := 0; i < count; i++ {
312+
charset := make([]byte, 16)
313+
rand.Read(charset)
314+
uuid, _ := uuid.FromBytes(charset)
315+
uuids[i] = uuid.String()
316+
}
317+
318+
return uuids
319+
}
320+
321+
// hash folds the runes of s into an int64 using the polynomial h = 31*h + r,
// starting from zero.
//
// Iteration is over runes, not bytes, so a multi-byte UTF-8 sequence
// contributes its code point once. The arithmetic wraps on int64 overflow,
// which keeps the result deterministic for inputs of any length; the value may
// be negative. This is not a cryptographic hash.
func hash(s string) int64 {
	var h int64
	for _, r := range s {
		h = 31*h + int64(r)
	}
	return h
}

0 commit comments

Comments
 (0)