Skip to content

Commit 51a2a70

Browse files
Enable labels for ClusterUUID and CliqueId
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent a6a7ce1 commit 51a2a70

File tree

19 files changed

+370
-11
lines changed

19 files changed

+370
-11
lines changed

api/config/v1/flags.go

+8-5
Original file line numberDiff line numberDiff line change
@@ -103,11 +103,12 @@ func (f *deviceListStrategyFlag) UnmarshalJSON(b []byte) error {
103103

104104
// GFDCommandLineFlags holds the list of command line flags specific to GFD.
105105
type GFDCommandLineFlags struct {
106-
Oneshot *bool `json:"oneshot" yaml:"oneshot"`
107-
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
108-
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
109-
OutputFile *string `json:"outputFile" yaml:"outputFile"`
110-
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
106+
Oneshot *bool `json:"oneshot" yaml:"oneshot"`
107+
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
108+
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
109+
OutputFile *string `json:"outputFile" yaml:"outputFile"`
110+
ImexNodesConfigFile *string `json:"imexNodesConfigFile" yaml:"imexNodesConfigFile"`
111+
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
111112
}
112113

113114
// UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set.
@@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
162163
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
163164
case "output-file":
164165
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
166+
case "imex-nodes-config-file":
167+
updateFromCLIFlag(&f.GFD.ImexNodesConfigFile, c, n)
165168
case "sleep-interval":
166169
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
167170
case "no-timestamp":

api/config/v1/flags_test.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ func TestMarshalFlags(t *testing.T) {
186186
"noTimestamp": null,
187187
"outputFile": null,
188188
"sleepInterval": "0s",
189-
"machineTypeFile": null
189+
"machineTypeFile": null,
190+
"imexNodesConfigFile": null
190191
}
191192
}`,
192193
},
@@ -210,7 +211,8 @@ func TestMarshalFlags(t *testing.T) {
210211
"noTimestamp": null,
211212
"outputFile": null,
212213
"sleepInterval": "5ns",
213-
"machineTypeFile": null
214+
"machineTypeFile": null,
215+
"imexNodesConfigFile": null
214216
}
215217
}`,
216218
},

cmd/gpu-feature-discovery/main.go

+6
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ func main() {
8686
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
8787
EnvVars: []string{"GFD_OUTPUT_FILE"},
8888
},
89+
&cli.StringFlag{
90+
Name: "imex-nodes-config-file",
91+
Usage: "Path to the IMEX ",
92+
Value: "/etc/nvidia-imex/nodes_config.cfg",
93+
EnvVars: []string{"GFD_IMEX_NODES_CONFIG_FILE"},
94+
},
8995
&cli.StringFlag{
9096
Name: "machine-type-file",
9197
Value: "/sys/class/dmi/id/product_name",

deployments/helm/nvidia-device-plugin/templates/_helpers.tpl

+18
Original file line numberDiff line numberDiff line change
@@ -269,3 +269,21 @@ We convert this to JSON so that it can be included and converted to an object us
269269
{{- $_ := set $options "addMigMonitorDevices" ( ne ( (include "nvidia-device-plugin.allPossibleMigStrategiesAreNone" . ) | trim ) "true" ) -}}
270270
{{- mustToJson $options -}}
271271
{{- end -}}
272+
273+
{{/*
Join a list of path elements (passed as the template context) into one path.
Only elements that compare unequal to both "" and nil are used. The element at
index 0 is taken verbatim; every later element has one leading "/" trimmed and
is appended after a "/" separator. A trailing "/" is trimmed from the joined
value, which is then passed through `clean` and emitted via `quote`.
*/}}
{{- define "nvidia-device-plugin.filepathJoin" -}}
274+
{{- $separator := "/" -}}
275+
{{- $path := "" -}}
276+
{{- range $index, $element := . -}}
277+
{{- if and (ne $element "") (ne $element nil) -}}
278+
{{- if $index -}}
279+
{{- $cleanElement := trimPrefix "/" $element -}}
280+
{{- $path = printf "%s%s%s" $path $separator $cleanElement -}}
281+
{{- else -}}
282+
{{- $path = $element -}}
283+
{{- end -}}
284+
{{- end -}}
285+
{{- end -}}
286+
{{- $resultRaw := $path | trimSuffix "/" -}}
287+
{{- $result := $resultRaw | clean }}
288+
{{- $result | quote -}}
289+
{{- end -}}

deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml

+6
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,8 @@ spec:
182182
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
183183
- name: host-sys
184184
mountPath: "/sys"
185+
- name: nvidia-imex-dir
186+
mountPath: "/etc/nvidia-imex"
185187
{{- if $options.hasConfigMap }}
186188
- name: available-configs
187189
mountPath: /available-configs
@@ -199,6 +201,10 @@ spec:
199201
- name: host-sys
200202
hostPath:
201203
path: "/sys"
204+
- name: nvidia-imex-dir
205+
type: DirectoryOrCreate
206+
hostPath:
207+
path: {{ include "nvidia-device-plugin.filepathJoin" (list "/" .Values.nvidiaDriverRoot "/etc/nvidia-imex") }}
202208
{{- if $options.hasConfigMap }}
203209
- name: available-configs
204210
configMap:

docs/gpu-feature-discovery/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,8 @@ their meaning:
210210
| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 |
211211
| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 |
212212
| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU. Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |
213+
| nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 |
214+
| nvidia.com/gpu.imex-domain | String | Hash of the IMEX domain IP list + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 |
213215

214216
Depending on the MIG strategy used, the following set of labels may also be
215217
available (or override the default values for some of the labels listed above):

go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ go 1.22.2
44

55
require (
66
github.com/NVIDIA/go-gpuallocator v0.5.0
7-
github.com/NVIDIA/go-nvlib v0.6.1
7+
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b
88
github.com/NVIDIA/go-nvml v0.12.4-0
99
github.com/NVIDIA/nvidia-container-toolkit v1.16.2
1010
github.com/fsnotify/fsnotify v1.7.0

go.sum

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
github.com/NVIDIA/go-gpuallocator v0.5.0 h1:166ICvPv2dU9oZ2J3kJ4y3XdbGCi6LhXgFZJtrqeu3A=
22
github.com/NVIDIA/go-gpuallocator v0.5.0/go.mod h1:zos5bTIN01hpQioOyu9oRKglrznImMQvm0bZllMmckw=
3-
github.com/NVIDIA/go-nvlib v0.6.1 h1:0/5FvaKvDJoJeJ+LFlh+NDQMxMlVw9wOXrOVrGXttfE=
4-
github.com/NVIDIA/go-nvlib v0.6.1/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
3+
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b h1:k5ptZB9RGUaR5RcK0R8Cfa4mtTHrSZZ73BFyD3c6KvM=
4+
github.com/NVIDIA/go-nvlib v0.6.2-0.20240928162840-41955a08425b/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
55
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
66
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
77
github.com/NVIDIA/nvidia-container-toolkit v1.16.2 h1:udrrtB8JrAs2KkKQ4njgSb/anUOC1b9tP5LjUtbjE+k=

internal/lm/fabric.go

+156
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/**
2+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
**/
16+
17+
package lm
18+
19+
import (
20+
"bufio"
21+
"fmt"
22+
"io"
23+
"math/rand" // nolint:gosec
24+
"net"
25+
"os"
26+
"sort"
27+
"strings"
28+
29+
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
30+
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
31+
32+
"github.com/google/uuid"
33+
"k8s.io/klog/v2"
34+
)
35+
36+
func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
37+
if config.Flags.GFD.ImexNodesConfigFile == nil {
38+
// No imex config file, return empty labels
39+
return empty{}, nil
40+
}
41+
42+
imexConfigFile, err := os.Open(*config.Flags.GFD.ImexNodesConfigFile)
43+
if os.IsNotExist(err) {
44+
// No imex config file, return empty labels
45+
return empty{}, nil
46+
} else if err != nil {
47+
return nil, fmt.Errorf("failed to open imex config file: %v", err)
48+
}
49+
defer imexConfigFile.Close()
50+
51+
clusterUUID, cliqueID, err := getFabricIDs(devices)
52+
if err != nil {
53+
return nil, err
54+
}
55+
if clusterUUID == "" || cliqueID == "" {
56+
return empty{}, nil
57+
}
58+
59+
imexDomainID, err := getImexDomainID(imexConfigFile)
60+
if err != nil {
61+
return nil, err
62+
}
63+
64+
labels := Labels{
65+
"nvidia.com/gpu.clique": strings.Join([]string{clusterUUID, cliqueID}, "."),
66+
"nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."),
67+
}
68+
69+
return labels, nil
70+
}
71+
72+
func getFabricIDs(devices []resource.Device) (string, string, error) {
73+
uniqueClusterUUIDs := make(map[string][]int)
74+
uniqueCliqueIDs := make(map[string][]int)
75+
for i, device := range devices {
76+
isFabricAttached, err := device.IsFabricAttached()
77+
if err != nil {
78+
return "", "", fmt.Errorf("error checking imex capability: %v", err)
79+
}
80+
if !isFabricAttached {
81+
continue
82+
}
83+
84+
clusterUUID, cliqueID, err := device.GetFabricIDs()
85+
if err != nil {
86+
87+
return "", "", fmt.Errorf("error getting fabric IDs: %w", err)
88+
}
89+
90+
uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i)
91+
uniqueCliqueIDs[cliqueID] = append(uniqueCliqueIDs[cliqueID], i)
92+
}
93+
94+
if len(uniqueClusterUUIDs) > 1 {
95+
klog.Warningf("Cluster UUIDs are non-unique: %v", uniqueClusterUUIDs)
96+
return "", "", nil
97+
}
98+
99+
if len(uniqueCliqueIDs) > 1 {
100+
klog.Warningf("Clique IDs are non-unique: %v", uniqueCliqueIDs)
101+
return "", "", nil
102+
}
103+
104+
for clusterUUID := range uniqueClusterUUIDs {
105+
for cliqueID := range uniqueCliqueIDs {
106+
return clusterUUID, cliqueID, nil
107+
}
108+
}
109+
return "", "", nil
110+
}
111+
112+
// getImexDomainID reads the imex config file and returns a unique identifier
113+
// based on the sorted list of IP addresses in the file.
114+
func getImexDomainID(r io.Reader) (string, error) {
115+
// Read the file line by line
116+
var ips []string
117+
scanner := bufio.NewScanner(r)
118+
for scanner.Scan() {
119+
ip := strings.TrimSpace(scanner.Text())
120+
if net.ParseIP(ip) == nil {
121+
return "", fmt.Errorf("invalid IP address in imex config file: %s", ip)
122+
}
123+
ips = append(ips, ip)
124+
}
125+
126+
if err := scanner.Err(); err != nil {
127+
return "", fmt.Errorf("failed to read imex config file: %v", err)
128+
}
129+
130+
if len(ips) == 0 {
131+
// No IPs in the file, return empty labels
132+
return "", nil
133+
}
134+
135+
sort.Strings(ips)
136+
137+
return generateContentUUID(strings.Join(ips, "\n")), nil
138+
139+
}
140+
141+
func generateContentUUID(seed string) string {
142+
// nolint:gosec
143+
rand := rand.New(rand.NewSource(hash(seed)))
144+
charset := make([]byte, 16)
145+
rand.Read(charset)
146+
uuid, _ := uuid.FromBytes(charset)
147+
return uuid.String()
148+
}
149+
150+
// hash folds the runes of s into an int64 using the polynomial
// acc = 31*acc + rune, starting from zero. The arithmetic is plain int64
// arithmetic, so long inputs wrap around on overflow.
func hash(s string) int64 {
	var acc int64
	for _, r := range s {
		acc = acc*31 + int64(r)
	}
	return acc
}

internal/lm/nvml.go

+6
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
8080
return nil, fmt.Errorf("error creating resource labeler: %v", err)
8181
}
8282

83+
imexLabeler, err := newImexLabeler(config, devices)
84+
if err != nil {
85+
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
86+
}
87+
8388
l := Merge(
8489
machineTypeLabeler,
8590
versionLabeler,
8691
migCapabilityLabeler,
8792
sharingLabeler,
8893
resourceLabeler,
8994
gpuModeLabeler,
95+
imexLabeler,
9096
)
9197

9298
return l, nil

internal/resource/cuda-device.go

+8
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
100100
func (d *cudaDevice) GetPCIClass() (uint32, error) {
101101
return 0, nil
102102
}
103+
104+
func (d *cudaDevice) IsFabricAttached() (bool, error) {
105+
return false, nil
106+
}
107+
108+
func (d *cudaDevice) GetFabricIDs() (string, string, error) {
109+
return "", "", fmt.Errorf("GetFabricIDs is not supported for CUDA devices")
110+
}

0 commit comments

Comments
 (0)