Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

cmd/run, pkg/nvidia: Detect mismatched NVIDIA kernel & user space driver #1541

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/cmd/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,14 @@ func runCommand(container string,

cdiSpecForNvidia, err := nvidia.GenerateCDISpec()
if err != nil {
if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
if errors.Is(err, nvidia.ErrNVMLDriverLibraryVersionMismatch) {
var builder strings.Builder
fmt.Fprintf(&builder, "the proprietary NVIDIA driver's kernel and user space don't match\n")
fmt.Fprintf(&builder, "Check the host operating system and systemd journal.")

errMsg := builder.String()
return errors.New(errMsg)
} else if !errors.Is(err, nvidia.ErrPlatformUnsupported) {
return err
}
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.20
require (
github.com/HarryMichal/go-version v1.0.1
github.com/NVIDIA/go-nvlib v0.6.1
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/NVIDIA/nvidia-container-toolkit v1.16.1
github.com/acobaugh/osrelease v0.1.0
github.com/briandowns/spinner v1.18.0
Expand All @@ -23,7 +24,6 @@ require (
)

require (
github.com/NVIDIA/go-nvml v0.12.4-0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/fatih/color v1.13.0 // indirect
github.com/google/uuid v1.6.0 // indirect
Expand Down
39 changes: 27 additions & 12 deletions src/pkg/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"io"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
nvspec "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/sirupsen/logrus"
Expand All @@ -32,7 +33,8 @@ var (
)

var (
ErrPlatformUnsupported = errors.New("platform is unsupported")
ErrNVMLDriverLibraryVersionMismatch = errors.New("NVML driver/library version mismatch")
ErrPlatformUnsupported = errors.New("platform is unsupported")
)

func createNullLogger() *logrus.Logger {
Expand All @@ -45,16 +47,36 @@ func createNullLogger() *logrus.Logger {
func GenerateCDISpec() (*specs.Spec, error) {
logrus.Debugf("Generating Container Device Interface for NVIDIA")

info := info.New()
var logger *logrus.Logger
if logLevel < logrus.DebugLevel {
logger = createNullLogger()
} else {
logger = logrus.StandardLogger()
}

nvmLib := nvml.New()
info := info.New(info.WithLogger(logger), info.WithNvmlLib(nvmLib))

if ok, reason := info.HasDXCore(); ok {
logrus.Debugf("Generating Container Device Interface for NVIDIA: Windows is unsupported: %s", reason)
return nil, ErrPlatformUnsupported
}

hasNvml, reason := info.HasNvml()
if !hasNvml {
logrus.Debugf("Generating Container Device Interface for NVIDIA: NVML not found: %s", reason)
if hasNvml {
if err := nvmLib.Init(); err != nvml.SUCCESS {
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to initialize NVML: %s",
err)

if err == nvml.ERROR_LIB_RM_VERSION_MISMATCH {
return nil, ErrNVMLDriverLibraryVersionMismatch
} else {
return nil, errors.New("failed to initialize NVIDIA Management Library")
}
}
} else {
logrus.Debugf("Generating Container Device Interface for NVIDIA: Management Library not found: %s",
reason)
}

isTegra, reason := info.IsTegraSystem()
Expand All @@ -67,14 +89,7 @@ func GenerateCDISpec() (*specs.Spec, error) {
return nil, ErrPlatformUnsupported
}

var logger *logrus.Logger
if logLevel < logrus.DebugLevel {
logger = createNullLogger()
} else {
logger = logrus.StandardLogger()
}

cdi, err := nvcdi.New(nvcdi.WithLogger(logger))
cdi, err := nvcdi.New(nvcdi.WithInfoLib(info), nvcdi.WithLogger(logger), nvcdi.WithNvmlLib(nvmLib))
if err != nil {
logrus.Debugf("Generating Container Device Interface for NVIDIA: failed to create library: %s", err)
return nil, errors.New("failed to create Container Device Interface library for NVIDIA")
Expand Down