Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable labels for IMEX Domain and Clique #965

Merged
merged 1 commit into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/config/v1/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,10 @@ type GFDCommandLineFlags struct {
SleepInterval *Duration `json:"sleepInterval" yaml:"sleepInterval"`
OutputFile *string `json:"outputFile" yaml:"outputFile"`
MachineTypeFile *string `json:"machineTypeFile" yaml:"machineTypeFile"`
// ImexNodesConfigFile is the path to a file containing the IP addresses of nodes
// that are part of the IMEX domain.
// Note that this is the absolute path to the file in the device plugin container.
ImexNodesConfigFile *string `json:"imexNodesConfigFile" yaml:"imexNodesConfigFile"`
}

// UpdateFromCLIFlags updates Flags from settings in the cli Flags if they are set.
Expand Down Expand Up @@ -162,6 +166,8 @@ func (f *Flags) UpdateFromCLIFlags(c *cli.Context, flags []cli.Flag) {
updateFromCLIFlag(&f.GFD.Oneshot, c, n)
case "output-file":
updateFromCLIFlag(&f.GFD.OutputFile, c, n)
case "imex-nodes-config-file":
updateFromCLIFlag(&f.GFD.ImexNodesConfigFile, c, n)
case "sleep-interval":
updateFromCLIFlag(&f.GFD.SleepInterval, c, n)
case "no-timestamp":
Expand Down
6 changes: 4 additions & 2 deletions api/config/v1/flags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,8 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "0s",
"machineTypeFile": null
"machineTypeFile": null,
"imexNodesConfigFile": null
}
}`,
},
Expand All @@ -210,7 +211,8 @@ func TestMarshalFlags(t *testing.T) {
"noTimestamp": null,
"outputFile": null,
"sleepInterval": "5ns",
"machineTypeFile": null
"machineTypeFile": null,
"imexNodesConfigFile": null
}
}`,
},
Expand Down
6 changes: 6 additions & 0 deletions cmd/gpu-feature-discovery/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ func main() {
Value: "/etc/kubernetes/node-feature-discovery/features.d/gfd",
EnvVars: []string{"GFD_OUTPUT_FILE"},
},
&cli.StringFlag{
Name: "imex-nodes-config-file",
Usage: "Path to the IMEX nodes config file. This file contains a list of IP addresses of the nodes in the IMEX domain.",
Value: "/etc/nvidia-imex/nodes_config.cfg",
EnvVars: []string{"GFD_IMEX_NODES_CONFIG_FILE"},
},
&cli.StringFlag{
Name: "machine-type-file",
Value: "/sys/class/dmi/id/product_name",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ spec:
mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
- name: host-sys
mountPath: "/sys"
- name: nvidia-imex-dir
mountPath: "/etc/nvidia-imex"
{{- if $options.hasConfigMap }}
- name: available-configs
mountPath: /available-configs
Expand All @@ -199,6 +201,10 @@ spec:
- name: host-sys
hostPath:
path: "/sys"
- name: nvidia-imex-dir
  hostPath:
    path: {{ clean ( join "/" ( list "/" .Values.nvidiaDriverRoot "/etc/nvidia-imex" ) ) | quote }}
    # `type` is a field of the hostPath volume source, not of the volume itself.
    type: DirectoryOrCreate
{{- if $options.hasConfigMap }}
- name: available-configs
configMap:
Expand Down
2 changes: 2 additions & 0 deletions docs/gpu-feature-discovery/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ their meaning:
| nvidia.com/gpu.memory | Integer | Memory of the GPU in Mb | 2048 |
| nvidia.com/gpu.product | String | Model of the GPU | GeForce-GT-710 |
| nvidia.com/gpu.mode | String | Display or Compute Mode of the GPU. Details of the GPU modes can be found [here](https://docs.nvidia.com/grid/13.0/grid-gpumodeswitch-user-guide/index.html#compute-and-graphics-mode) | compute |
| nvidia.com/gpu.clique | String | GPUFabric ClusterUUID + CliqueID | 7b968a6d-c8aa-45e1-9e07-e1e51be99c31.1 |
| nvidia.com/gpu.imex-domain | String | Hash of the IMEX domain IP list + CliqueID | 79b326e7-d566-3483-c2a3-9b38fa5cb1c8.1 |

Depending on the MIG strategy used, the following set of labels may also be
available (or override the default values for some of the labels listed above):
Expand Down
159 changes: 159 additions & 0 deletions internal/lm/fabric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package lm

import (
"bufio"
"fmt"
"io"
"math/rand" // nolint:gosec
"net"
"os"
"sort"
"strings"

spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
"github.com/NVIDIA/k8s-device-plugin/internal/resource"

"github.com/google/uuid"
"k8s.io/klog/v2"
)

// newImexLabeler builds the labeler that emits the "nvidia.com/gpu.clique" and
// "nvidia.com/gpu.imex-domain" labels for the given devices.
//
// An empty labeler (and a nil error) is returned when:
//   - no IMEX nodes config file is configured, or the configured file does not exist;
//   - getFabricIDs yields an empty cluster UUID or clique ID;
//   - getImexDomainID yields an empty domain ID.
//
// An error is returned when the config file exists but cannot be opened, or
// when getFabricIDs or getImexDomainID fail.
func newImexLabeler(config *spec.Config, devices []resource.Device) (Labeler, error) {
	path := config.Flags.GFD.ImexNodesConfigFile
	if path == nil || *path == "" {
		// No imex config file configured, return empty labels.
		return empty{}, nil
	}

	imexConfigFile, err := os.Open(*path)
	if os.IsNotExist(err) {
		// A missing config file is not an error, return empty labels.
		return empty{}, nil
	}
	if err != nil {
		// Wrap with %w so callers can still inspect the underlying error.
		return nil, fmt.Errorf("failed to open imex config file: %w", err)
	}
	defer imexConfigFile.Close()

	clusterUUID, cliqueID, err := getFabricIDs(devices)
	if err != nil {
		return nil, err
	}
	if clusterUUID == "" || cliqueID == "" {
		return empty{}, nil
	}

	imexDomainID, err := getImexDomainID(imexConfigFile)
	if err != nil {
		return nil, err
	}
	if imexDomainID == "" {
		return empty{}, nil
	}

	// Both label values carry the clique ID as a "."-separated suffix.
	labels := Labels{
		"nvidia.com/gpu.clique":      strings.Join([]string{clusterUUID, cliqueID}, "."),
		"nvidia.com/gpu.imex-domain": strings.Join([]string{imexDomainID, cliqueID}, "."),
	}

	return labels, nil
}

// getFabricIDs returns the cluster UUID and clique ID reported by the
// fabric-attached devices in devices.
//
// Devices for which IsFabricAttached reports false are skipped. If the
// remaining devices report more than one distinct cluster UUID, or more than
// one distinct clique ID, a warning is logged and two empty strings are
// returned with a nil error. Two empty strings are also returned when no
// device is fabric-attached.
//
// An error is returned if querying any device fails.
func getFabricIDs(devices []resource.Device) (string, string, error) {
	// Map each distinct ID to the indices of the devices reporting it, so that
	// the warnings below show which devices disagree.
	uniqueClusterUUIDs := make(map[string][]int)
	uniqueCliqueIDs := make(map[string][]int)
	for i, device := range devices {
		isFabricAttached, err := device.IsFabricAttached()
		if err != nil {
			return "", "", fmt.Errorf("error checking imex capability: %w", err)
		}
		if !isFabricAttached {
			continue
		}

		clusterUUID, cliqueID, err := device.GetFabricIDs()
		if err != nil {
			return "", "", fmt.Errorf("error getting fabric IDs: %w", err)
		}

		uniqueClusterUUIDs[clusterUUID] = append(uniqueClusterUUIDs[clusterUUID], i)
		uniqueCliqueIDs[cliqueID] = append(uniqueCliqueIDs[cliqueID], i)
	}

	if len(uniqueClusterUUIDs) > 1 {
		klog.Warningf("Cluster UUIDs are non-unique: %v", uniqueClusterUUIDs)
		return "", "", nil
	}

	if len(uniqueCliqueIDs) > 1 {
		klog.Warningf("Clique IDs are non-unique: %v", uniqueCliqueIDs)
		return "", "", nil
	}

	// Each map now holds at most one key; return the single pair if present.
	for clusterUUID := range uniqueClusterUUIDs {
		for cliqueID := range uniqueCliqueIDs {
			return clusterUUID, cliqueID, nil
		}
	}
	return "", "", nil
}

// getImexDomainID reads the imex config from r, one IP address per line, and
// returns an identifier derived from the sorted list of addresses, so the
// result does not depend on the order of the lines.
//
// Leading and trailing whitespace on each line is ignored, and blank lines are
// skipped. An empty string (with a nil error) is returned when r contains no
// addresses. An error is returned if a non-blank line is not a valid IP
// address or if reading from r fails.
func getImexDomainID(r io.Reader) (string, error) {
	// Read the file line by line
	var ips []string
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		ip := strings.TrimSpace(scanner.Text())
		if ip == "" {
			// Blank lines (e.g. a trailing empty line) carry no address.
			continue
		}
		if net.ParseIP(ip) == nil {
			return "", fmt.Errorf("invalid IP address in imex config file: %s", ip)
		}
		ips = append(ips, ip)
	}

	if err := scanner.Err(); err != nil {
		return "", fmt.Errorf("failed to read imex config file: %w", err)
	}

	if len(ips) == 0 {
		// No IPs in the file, return empty labels
		return "", nil
	}

	// Sort so that the same set of addresses always produces the same ID.
	sort.Strings(ips)

	return generateContentUUID(strings.Join(ips, "\n")), nil
}

// generateContentUUID derives a UUID-formatted string from seed.
//
// The seed is reduced to an int64 by hash and used to seed a math/rand
// generator; the first 16 bytes produced by that generator become the UUID
// bytes. The result therefore depends only on the content of seed.
func generateContentUUID(seed string) string {
	// nolint:gosec
	rng := rand.New(rand.NewSource(hash(seed)))

	var raw [16]byte
	rng.Read(raw[:])

	// The error is discarded: the input is always exactly 16 bytes long.
	id, _ := uuid.FromBytes(raw[:])
	return id.String()
}

// hash folds the runes of s into an int64 using the polynomial
// h = 31*h + rune, starting from zero. Overflow wraps around.
func hash(s string) int64 {
	runes := []rune(s)

	var acc int64
	for i := 0; i < len(runes); i++ {
		acc = acc*31 + int64(runes[i])
	}
	return acc
}
6 changes: 6 additions & 0 deletions internal/lm/nvml.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,19 @@ func NewDeviceLabeler(manager resource.Manager, config *spec.Config) (Labeler, e
return nil, fmt.Errorf("error creating resource labeler: %v", err)
}

imexLabeler, err := newImexLabeler(config, devices)
if err != nil {
return nil, fmt.Errorf("error creating IMEX labeler: %v", err)
}

l := Merge(
machineTypeLabeler,
versionLabeler,
migCapabilityLabeler,
sharingLabeler,
resourceLabeler,
gpuModeLabeler,
imexLabeler,
)

return l, nil
Expand Down
8 changes: 8 additions & 0 deletions internal/resource/cuda-device.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,11 @@ func (d *cudaDevice) IsMigEnabled() (bool, error) {
// GetPCIClass always returns 0 and a nil error for CUDA devices.
func (d *cudaDevice) GetPCIClass() (uint32, error) {
return 0, nil
}

// IsFabricAttached always reports false with a nil error for CUDA devices.
func (d *cudaDevice) IsFabricAttached() (bool, error) {
return false, nil
}

// GetFabricIDs is not supported for CUDA devices: it always returns two empty
// strings and a non-nil error.
func (d *cudaDevice) GetFabricIDs() (string, string, error) {
return "", "", fmt.Errorf("GetFabricIDs is not supported for CUDA devices")
}
Loading