Skip to content

Commit c32956c

Browse files
committed
cmd/run, pkg/nvidia: Detect mismatched NVIDIA kernel & user space driver
The proprietary NVIDIA driver has a kernel space part and a user space part, and the two must always have the same version. Sometimes, the host operating system might end up with mismatched parts. One reason could be that the different third-party repositories used to distribute the driver are incompatible with each other; e.g., in the case of Fedora, these could be RPM Fusion and NVIDIA's own repository.

This shows up in the systemd journal as:

  $ journalctl --dmesg
  ...
  kernel: NVRM: API mismatch: the client has the version 555.58.02, but
          NVRM: this kernel module has the version 560.35.03. Please
          NVRM: make sure that this kernel module and all NVIDIA driver
          NVRM: components have the same version.
  ...

Without any special handling of this scenario, users would be presented with a very misleading error:

  $ toolbox enter
  Error: failed to get Container Device Interface containerEdits for NVIDIA

Instead, improve the error message to be more self-documenting:

  $ toolbox enter
  Error: the proprietary NVIDIA driver's kernel and user space don't match
  Check the systemd journal and the contents of the operating system.

containers#1541
1 parent a94de6c commit c32956c

File tree

3 files changed

+27
-6
lines changed

3 files changed

+27
-6
lines changed

src/cmd/run.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,14 @@ func runCommand(container string,
269269

270270
cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
271271
if err != nil {
272-
if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
272+
if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) {
273+
var builder strings.Builder
274+
fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n")
275+
fmt.Fprintf(&builder, "Check the systemd journal and the contents of the operating system.")
276+
277+
errMsg := builder.String()
278+
return errors.New(errMsg)
279+
} else if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
273280
return err
274281
}
275282
} else {

src/go.mod

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ go 1.20
55
require (
66
github.com/HarryMichal/go-version v1.0.1
77
github.com/NVIDIA/go-nvlib v0.6.1
8+
github.com/NVIDIA/go-nvml v0.12.4-0
89
github.com/NVIDIA/nvidia-container-toolkit v1.16.1
910
github.com/acobaugh/osrelease v0.1.0
1011
github.com/briandowns/spinner v1.18.0
@@ -23,7 +24,6 @@ require (
2324
)
2425

2526
require (
26-
github.com/NVIDIA/go-nvml v0.12.4-0 // indirect
2727
github.com/davecgh/go-spew v1.1.1 // indirect
2828
github.com/fatih/color v1.13.0 // indirect
2929
github.com/google/uuid v1.6.0 // indirect

src/pkg/nvidia/nvidia.go

+18-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"io"
2222

2323
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
24+
"github.com/NVIDIA/go-nvml/pkg/nvml"
2425
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
2526
nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
2627
"github.com/sirupsen/logrus"
@@ -32,7 +33,8 @@ var (
3233
)
3334

3435
var (
35-
ErrPlatformUnsupported = errors.New("platform is unsupported")
36+
ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch")
37+
ErrPlatformUnsupported = errors.New("platform is unsupported")
3638
)
3739

3840
func createNullLogger() *logrus.Logger {
@@ -52,15 +54,27 @@ func GenerateCDISpec() (*specs.Spec, error) {
5254
logger = logrus.StandardLogger()
5355
}
5456

55-
info := info.New(info.WithLogger(logger))
57+
nvmLib := nvml.New()
58+
info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib))
5659

5760
if ok, reason := info.HasDXCore(); ok {
5861
logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason)
5962
return nil, ErrPlatformUnsupported
6063
}
6164

6265
hasNvml, reason := info.HasNvml()
63-
if !hasNvml {
66+
if hasNvml {
67+
if err := nvmLib.Init(); err != nvml.SUCCESS {
68+
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s",
69+
err)
70+
71+
if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH {
72+
return nil, ErrNVMLDriverLibraryVersionMismatch
73+
} else {
74+
return nil, errors.New("failed to initialize NVIDIA Management Library")
75+
}
76+
}
77+
} else {
6478
logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s",
6579
reason)
6680
}
@@ -75,7 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) {
7589
return nil, ErrPlatformUnsupported
7690
}
7791

78-
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger))
92+
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib))
7993
if err != nil {
8094
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err)
8195
return nil, errors.New("failed to create Container Device Interface library for NVIDIA")

0 commit comments

Comments
 (0)