diff --git a/src/cmd/run.go b/src/cmd/run.go index 39ac8f07a..719c0d6ab 100644 --- a/src/cmd/run.go +++ b/src/cmd/run.go @@ -269,7 +269,14 @@ func runCommand(container string, cdiSpecForNvidia, err := nvidia.GenerateCDISpec() if err != nil { - if !errors.Is(err, nvidia.ErrPlatformUnsupported) { + if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) { + var builder strings.Builder + fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n") + fmt.Fprintf(&builder, "Check the host operating system and systemd journal.") + + errMsg := builder.String() + return errors.New(errMsg) + } else if !errors.Is(err, nvidia.ErrPlatformUnsupported) { return err } } else { diff --git a/src/go.mod b/src/go.mod index 36e8d5017..d6c6055d8 100644 --- a/src/go.mod +++ b/src/go.mod @@ -5,6 +5,7 @@ go 1.20 require ( github.com/HarryMichal/go-version v1.0.1 github.com/NVIDIA/go-nvlib v0.6.1 + github.com/NVIDIA/go-nvml v0.12.4-0 github.com/NVIDIA/nvidia-container-toolkit v1.16.1 github.com/acobaugh/osrelease v0.1.0 github.com/briandowns/spinner v1.18.0 @@ -23,7 +24,6 @@ require ( ) require ( - github.com/NVIDIA/go-nvml v0.12.4-0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/fatih/color v1.13.0 // indirect github.com/google/uuid v1.6.0 // indirect diff --git a/src/pkg/nvidia/nvidia.go b/src/pkg/nvidia/nvidia.go index a707ff776..fdb924063 100644 --- a/src/pkg/nvidia/nvidia.go +++ b/src/pkg/nvidia/nvidia.go @@ -21,6 +21,7 @@ import ( "io" "github.com/NVIDIA/go-nvlib/pkg/nvlib/info" + "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/sirupsen/logrus" @@ -32,7 +33,8 @@ var ( ) var ( - ErrPlatformUnsupported = errors.New("platform is unsupported") + ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch") + ErrPlatformUnsupported = errors.New("platform is unsupported") ) func createNullLogger() *logrus.Logger { @@ -45,7 +47,15 @@ func createNullLogger() *logrus.Logger { func GenerateCDISpec() (*specs.Spec, error) { logrus.Debugf("Generating Container Device Interface for NVIDIA") - info := info.New() + var logger *logrus.Logger + if logLevel < logrus.DebugLevel { + logger = createNullLogger() + } else { + logger = logrus.StandardLogger() + } + + nvmLib := nvml.New() + info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib)) if ok, reason := info.HasDXCore(); ok { logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason) @@ -53,8 +63,20 @@ func GenerateCDISpec() (*specs.Spec, error) { } hasNvml, reason := info.HasNvml() - if !hasNvml { - logrus.Debugf("Generating Container Device Interface for NVIDIA: NVML not found: %s", reason) + if hasNvml { + if err := nvmLib.Init(); err != nvml.SUCCESS { + logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s", + err) + + if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH { + return nil, ErrNVMLDriverLibraryVersionMismatch + } else { + return nil, errors.New("failed to initialize NVIDIA Management Library") + } + } + } else { + logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s", + reason) } isTegra, reason := info.IsTegraSystem() @@ -67,14 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) { return nil, ErrPlatformUnsupported } - var logger *logrus.Logger - if logLevel < logrus.DebugLevel { - logger = createNullLogger() - } else { - logger = logrus.StandardLogger() - } - - cdi, err := nvcdi.New(nvcdi.WithLogger(logger)) + cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib)) if err != nil { logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err) return nil, errors.New("failed to create Container Device Interface library for NVIDIA")