From ce7a0d4ce22548e43a821a769d0aeb572ad2403b Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Wed, 11 Sep 2024 22:58:15 +0200 Subject: [PATCH 1/4] pkg/nvidia: Avoid creating an info.Interface with the nvcdi.Interface NVIDIA Container Toolkit 0.16.0 added a new API to avoid creating a new info.Interface when creating a nvcdi.Interface, if an info.Interface already exists [1]. Commit 649d02f8a6e1d353 already bumped the required NVIDIA Container Toolkit version to 0.16.0, so take advantage of that. [1] NVIDIA Container Toolkit commit 8fc4b9c742f894ef https://github.com/NVIDIA/nvidia-container-toolkit/commit/8fc4b9c742f894ef https://github.com/NVIDIA/nvidia-container-toolkit/pull/516 https://github.com/containers/toolbox/pull/1541 --- src/pkg/nvidia/nvidia.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index a707ff776..1866bec4d 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -74,7 +74,7 @@ func GenerateCDISpec() (*specs.Spec, error) { logger = logrus.StandardLogger() } - cdi, err := nvcdi.New(nvcdi.WithLogger(logger)) + cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger)) if err != nil { logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) return nil, errors.New("failed to create Container Device Interface library for NVIDIA") From 09dcdb492a78a06ea247eb729c8762d2cf36acb7 Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Wed, 11 Sep 2024 23:08:00 +0200 Subject: [PATCH 2/4] pkg/nvidia: Share the Logrus logger with the info.Interface A new API was added to github.com/NVIDIA/go-nvlib 0.4.0 to specify a logger to be used by a info.Interface [1]. Commit 649d02f8a6e1d353 already bumped the required go-nvlib version to 0.6.0, so take advantage of that. [1] github.com/NVIDIA/go-nvlib commit 21c8f035ca66b29d https://github.com/NVIDIA/go-nvlib/commit/21c8f035ca66b29d https://github.com/NVIDIA/go-nvlib/pull/28 https://github.com/containers/toolbox/pull/1541 --- src/pkg/nvidia/nvidia.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index 1866bec4d..9b6ba755f 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -45,7 +45,14 @@ func createNullLogger() *logrus.Logger { func GenerateCDISpec() (*specs.Spec, error) { logrus.Debugf("Generating Container Device Interface for NVIDIA") - info := info.New() + var logger *logrus.Logger + if logLevel < logrus.DebugLevel { + logger = createNullLogger() + } else { + logger = logrus.StandardLogger() + } + + info := info.New(info.WithLogger(logger)) if ok, reason := info.HasDXCore(); ok { logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason) @@ -67,13 +74,6 @@ func GenerateCDISpec() (*specs.Spec, error) { return nil, ErrPlatformUnsupported } - var logger *logrus.Logger - if logLevel < logrus.DebugLevel { - logger = createNullLogger() - } else { - logger = logrus.StandardLogger() - } - cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger)) if err != nil { logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) From 977c3d98a435a47c5ea55bd7dbb81e1e40c43611 Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Fri, 13 Sep 2024 15:33:25 +0200 Subject: [PATCH 3/4] pkg/nvidia: Tweak a debug log to avoid an abbreviation It's better to avoid abbreviations when the length of the string and the depth of the indentation are favourable. https://github.com/containers/toolbox/pull/1541 --- src/pkg/nvidia/nvidia.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index 9b6ba755f..16daf2fb5 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -61,7 +61,8 @@ func GenerateCDISpec() (*specs.Spec, error) { hasNvml, reason := info.HasNvml() if !hasNvml { - logrus.Debugf("Generating Container Device Interface for NVIDIA: NVML not found: %s", reason) + logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s", + reason) } isTegra, reason := info.IsTegraSystem() From 8dd2f8e80aad1b76659b44f20c346f75e647d65d Mon Sep 17 00:00:00 2001 From: Debarshi Ray Date: Fri, 30 Aug 2024 18:24:16 +0200 Subject: [PATCH 4/4] cmd/run, pkg/nvidia: Detect mismatched NVIDIA kernel & user space driver The proprietary NVIDIA driver has a kernel space part and a user space part, and they must always have the same matching version. Sometimes, the host operating system might end up with mismatched parts. One reason could be that the different third-party repositories used to distribute the driver might be incompatible with each other. eg., in the case of Fedora it could be RPM Fusion and NVIDIA's own repository. This shows up in the systemd journal as: $ journalctl --dmesg ... kernel: NVRM: API mismatch: the client has the version 555.58.02, but NVRM: this kernel module has the version 560.35.03. Please NVRM: make sure that this kernel module and all NVIDIA driver NVRM: components have the same version. ... Without any special handling of this scenario, users would be presented with a very misleading error: $ toolbox enter Error: failed to get Container Device Interface containerEdits for NVIDIA Instead, improve the error message to be more self-documenting: $ toolbox enter Error: the proprietary NVIDIA driver's kernel and user space don't match Check the host operating system and systemd journal. https://github.com/containers/toolbox/pull/1541 --- src/cmd/run.go | 9 ++++++++- src/go.mod | 2 +- src/pkg/nvidia/nvidia.go | 22 ++++++++++++++++++---- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/src/cmd/run.go b/src/cmd/run.go index 39ac8f07a..719c0d6ab 100644 --- a/src/cmd/run.go +++ b/src/cmd/run.go @@ -269,7 +269,14 @@ func runCommand(container string, cdiSpecForNvidia, err := nvidia.GenerateCDISpec() if err != nil { - if !errors.Is(err, nvidia.ErrPlatformUnsupported) { + if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) { + var builder strings.Builder + fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n") + fmt.Fprintf(&builder, "Check the host operating system and systemd journal.") + + errMsg := builder.String() + return errors.New(errMsg) + } else if !errors.Is(err, nvidia.ErrPlatformUnsupported) { return err } } else { diff --git a/src/go.mod b/src/go.mod index 36e8d5017..d6c6055d8 100644 --- a/src/go.mod +++ b/src/go.mod @@ -5,6 +5,7 @@ go 1.20 require ( github.com/HarryMichal/go-version v1.0.1 github.com/NVIDIA/go-nvlib v0.6.1 + github.com/NVIDIA/go-nvml v0.12.4-0 github.com/NVIDIA/nvidia-container-toolkit v1.16.1 github.com/acobaugh/osrelease v0.1.0 github.com/briandowns/spinner v1.18.0 @@ -23,7 +24,6 @@ require ( ) require ( - github.com/NVIDIA/go-nvml v0.12.4-0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fatih/color v1.13.0 // indirect github.com/google/uuid v1.6.0 // indirect diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index 16daf2fb5..fdb924063 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -21,6 +21,7 @@ import ( "io" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/sirupsen/logrus" @@ -32,7 +33,8 @@ var ( ) var ( - ErrPlatformUnsupported = errors.New("platform is unsupported") + ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch") + ErrPlatformUnsupported = errors.New("platform is unsupported") ) func createNullLogger() *logrus.Logger { @@ -52,7 +54,8 @@ func GenerateCDISpec() (*specs.Spec, error) { logger = logrus.StandardLogger() } - info := info.New(info.WithLogger(logger)) + nvmLib := nvml.New() + info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib)) if ok, reason := info.HasDXCore(); ok { logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason) @@ -60,7 +63,18 @@ func GenerateCDISpec() (*specs.Spec, error) { } hasNvml, reason := info.HasNvml() - if !hasNvml { + if hasNvml { + if err := nvmLib.Init(); err != nvml.SUCCESS { + logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s", + err) + + if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH { + return nil, ErrNVMLDriverLibraryVersionMismatch + } else { + return nil, errors.New("failed to initialize NVIDIA Management Library") + } + } + } else { logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s", reason) } @@ -75,7 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) { return nil, ErrPlatformUnsupported } - cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger)) + cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib)) if err != nil { logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) return nil, errors.New("failed to create Container Device Interface library for NVIDIA")