Skip to content

Commit f0e1d1b

Browse files
Enable labels for ClusterUUID and CliqueId
Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
1 parent 71c1fa7 commit f0e1d1b

File tree

15 files changed

+263
-5
lines changed

15 files changed

+263
-5
lines changed

api/config/v1/flags.go

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ type GFDCommandLineFlags struct {
107107
NoTimestamp *bool `json:"noTimestamp" yaml:"noTimestamp"`
108108
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
109109
OutputFile *string `json:"outputFile" yaml:"outputFile"`
110+
ImexNodesConfig *string `json:"imexNodesConfig" yaml:"imexNodesConfig"`
110111
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
111112
}
112113

@@ -162,6 +163,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
162163
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
163164
case "output-file":
164165
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
166+
case "imex-nodes-config":
167+
updateFromCLIFlag(&f.GFD.ImexNodesConfig, c, n)
165168
case "sleep-interval":
166169
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
167170
case "no-timestamp":

cmd/gpu-feature-discovery/main.go

+6
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,12 @@ func main() {
8686
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
8787
EnvVars: []string{"GFD_OUTPUT_FILE"},
8888
},
89+
&cli.StringFlag{
90+
Name: "imex-nodes-config",
91+
Usage: "the path to nvidia-imex nodes config file",
92+
Value: "/etc/nvidia-imex/nodes_config.cfg",
93+
EnvVars: []string{"GFD_IMEX_NODES_CONFIG"},
94+
},
8995
&cli.StringFlag{
9096
Name: "machine-type-file",
9197
Value: "/sys/class/dmi/id/product_name",

deployments/container/Dockerfile

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
ARG GOLANG_VERSION=1.22.6
15+
ARG GOLANG_VERSION=1.23.1
1616
FROM nvcr.io/nvidia/cuda:12.6.0-base-ubi9 AS build
1717

1818
RUN yum install -y \
@@ -30,7 +30,7 @@ RUN set -eux; \
3030
aarch64) ARCH='arm64' ;; \
3131
*) echo "unsupported architecture" ; exit 1 ;; \
3232
esac; \
33-
wget -nv -O - https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${ARCH}.tar.gz \
33+
wget -nv -O - https://go.dev/dl/go1.23.1.linux-arm64.tar.gz \
3434
| tar -C /usr/local -xz
3535

3636
ENV GOPATH /go

deployments/container/native-only.mk

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414

1515
PUSH_ON_BUILD ?= false
16-
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/amd64
16+
DOCKER_BUILD_PLATFORM_OPTIONS = --platform=linux/aarch64
1717

1818
ifeq ($(PUSH_ON_BUILD),true)
1919
$(BUILD_TARGETS): build-%: image-%

deployments/devel/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
# This Dockerfile is also used to define the golang version used in this project
1616
# This allows dependabot to manage this version in addition to other images.
17-
FROM golang:1.22.6
17+
FROM golang:1.23
1818

1919
WORKDIR /work
2020
COPY * .

deployments/devel/go.mod

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
module github.com/NVIDIA/k8s-device-plugin/deployments/devel
22

3-
go 1.22
3+
go 1.23
4+
45
toolchain go1.23.0
56

67
require github.com/matryer/moq v0.5.0

deployments/helm/nvidia-device-plugin/templates/daemonset-gfd.yml

+13
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ spec:
163163
- name: GFD_USE_NODE_FEATURE_API
164164
value: {{ .Values.nfd.enableNodeFeatureApi | quote }}
165165
{{- end }}
166+
{{- if typeIs "string" .Values.imexNodesConfigFile }}
167+
- name: GFD_IMEX_NODES_CONFIG
168+
value: {{ .Values.imexNodesConfigFile | quote }}
169+
{{- end }}
166170
{{- if $options.hasConfigMap }}
167171
- name: CONFIG_FILE
168172
value: /config/config.yaml
@@ -182,6 +186,10 @@ spec:
182186
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
183187
- name: host-sys
184188
mountPath: "/sys"
189+
{{- if typeIs "string" .Values.imexNodesConfigFile }}
190+
- name: imex-nodes-config
191+
mountPath: {{ .Values.imexNodesConfigFile | quote }}
192+
{{- end }}
185193
{{- if $options.hasConfigMap }}
186194
- name: available-configs
187195
mountPath: /available-configs
@@ -199,6 +207,11 @@ spec:
199207
- name: host-sys
200208
hostPath:
201209
path: "/sys"
210+
{{- if typeIs "string" .Values.imexNodesConfigFile }}
211+
- name: imex-nodes-config
212+
hostPath:
213+
path: {{ .Values.imexNodesConfigFile | quote }}
214+
{{- end }}
202215
{{- if $options.hasConfigMap }}
203216
- name: available-configs
204217
configMap:

deployments/helm/nvidia-device-plugin/values.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ deviceIDStrategy: null
3535
nvidiaDriverRoot: null
3636
gdsEnabled: null
3737
mofedEnabled: null
38+
# Default value is "/etc/nvidia-imex/nodes_config.cfg"
39+
imexNodesConfigFile: null
3840
deviceDiscoveryStrategy: null
3941

4042
nameOverride: ""

internal/lm/nvml.go

+104
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,13 @@
1717
package lm
1818

1919
import (
20+
"bufio"
2021
"errors"
2122
"fmt"
23+
"math/rand"
24+
"net"
25+
"os"
26+
"sort"
2227
"strconv"
2328
"strings"
2429

@@ -28,6 +33,7 @@ import (
2833

2934
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
3035
"github.com/NVIDIA/k8s-device-plugin/internal/resource"
36+
"github.com/google/uuid"
3137
)
3238

3339
var errMPSSharingNotSupported = errors.New("MPS sharing is not supported")
@@ -80,13 +86,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
8086
return nil, fmt.Errorf("error creating resource labeler: %v", err)
8187
}
8288

89+
imexLabeler, err := newImexDomainLabeler(*config.Flags.GFD.ImexNodesConfig, devices)
90+
if err != nil {
91+
return nil, fmt.Errorf("error creating imex domain labeler: %v", err)
92+
}
93+
8394
l := Merge(
8495
machineTypeLabeler,
8596
versionLabeler,
8697
migCapabilityLabeler,
8798
sharingLabeler,
8899
resourceLabeler,
89100
gpuModeLabeler,
101+
imexLabeler,
90102
)
91103

92104
return l, nil
@@ -218,6 +230,78 @@ func newGPUModeLabeler(devices []resource.Device) (Labeler, error) {
218230
return labels, nil
219231
}
220232

233+
func newImexDomainLabeler(configFile string, device []resource.Device) (Labeler, error) {
234+
if configFile == "" {
235+
return nil, nil
236+
}
237+
238+
// read file and parse it
239+
imexConfig, err := os.Open(configFile)
240+
if err != nil {
241+
return nil, fmt.Errorf("failed to read imex config file: %v", err)
242+
}
243+
defer imexConfig.Close()
244+
245+
// Read the file line by line
246+
var ips []string
247+
scanner := bufio.NewScanner(imexConfig)
248+
for scanner.Scan() {
249+
ip := strings.TrimSpace(scanner.Text())
250+
if net.ParseIP(ip) == nil {
251+
return nil, fmt.Errorf("invalid IP address in imex config file: %s", ip)
252+
}
253+
ips = append(ips, ip)
254+
}
255+
256+
if err := scanner.Err(); err != nil {
257+
return nil, fmt.Errorf("failed to read imex config file: %v", err)
258+
}
259+
260+
// Sort the IP addresses
261+
sort.Strings(ips)
262+
263+
// Join the sorted IPs into a single string
264+
sortedIPs := strings.Join(ips, "\n")
265+
266+
hashedconfig := generateUUIDs(sortedIPs)[0]
267+
268+
var commonClusterUUID string
269+
var commonCliqueID string
270+
for _, d := range device {
271+
clusterUUID, err := d.GetClusterUUID()
272+
if err != nil {
273+
return nil, fmt.Errorf("error getting cluster UUID: %v", err)
274+
}
275+
if commonClusterUUID == "" {
276+
commonClusterUUID = clusterUUID
277+
}
278+
if commonClusterUUID != clusterUUID {
279+
klog.Warningf("Cluster UUIDs are different: %s != %s", commonClusterUUID, clusterUUID)
280+
return nil, nil
281+
}
282+
283+
cliqueID, err := d.GetCliqueIP()
284+
if err != nil {
285+
return nil, fmt.Errorf("error getting clique ID: %v", err)
286+
}
287+
if commonCliqueID == "" {
288+
commonCliqueID = cliqueID
289+
}
290+
if commonCliqueID != cliqueID {
291+
klog.Warningf("Clique IDs are different: %s != %s", commonCliqueID, cliqueID)
292+
return nil, nil
293+
}
294+
}
295+
296+
labels := Labels{
297+
"nvidia.com/gpu.clusteruuid": commonClusterUUID,
298+
"nvidia.com/gpu.cliqueid": commonCliqueID,
299+
"nvidia.com/gpu.imex-domain": hashedconfig + "-" + commonCliqueID,
300+
}
301+
302+
return labels, nil
303+
}
304+
221305
func getModeForClasses(classes []uint32) string {
222306
if len(classes) == 0 {
223307
return "unknown"
@@ -254,3 +338,23 @@ func getDeviceClasses(devices []resource.Device) ([]uint32, error) {
254338
}
255339
return classes, nil
256340
}
341+
342+
func generateUUIDs(seed string) []string {
343+
rand := rand.New(rand.NewSource(hash(seed)))
344+
345+
uuids := make([]string, 1)
346+
charset := make([]byte, 16)
347+
rand.Read(charset)
348+
uuid, _ := uuid.FromBytes(charset)
349+
uuids[0] = uuid.String()
350+
351+
return uuids
352+
}
353+
354+
// hash folds the runes of s into an int64 using the polynomial
// acc = 31*acc + rune, starting from zero. The empty string hashes to 0.
// Arithmetic wraps around on overflow.
func hash(s string) int64 {
	var acc int64
	for _, r := range []rune(s) {
		acc = acc*31 + int64(r)
	}
	return acc
}

internal/resource/cuda-device.go

+8
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
100100
func (d *cudaDevice) GetPCIClass() (uint32, error) {
101101
return 0, nil
102102
}
103+
104+
func (d *cudaDevice) GetClusterUUID() (string, error) {
105+
return "", nil
106+
}
107+
108+
func (d *cudaDevice) GetCliqueIP() (string, error) {
109+
return "", nil
110+
}

internal/resource/device_mock.go

+74
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)