Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cdi mode resolution #428

Merged
merged 2 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit
go 1.20

require (
github.com/NVIDIA/go-nvlib v0.3.0
github.com/NVIDIA/go-nvlib v0.4.0
github.com/NVIDIA/go-nvml v0.12.0-6
github.com/fsnotify/fsnotify v1.7.0
github.com/opencontainers/runtime-spec v1.2.0
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
github.com/NVIDIA/go-nvlib v0.3.0 h1:vd7jSOthJTqzqIWZrv317xDr1+Mnjoy5X4N69W9YwQM=
github.com/NVIDIA/go-nvlib v0.3.0/go.mod h1:NasUuId9hYFvwzuOHCu9F2X6oTU2tG0JHTfbJYuDAbA=
github.com/NVIDIA/go-nvlib v0.4.0 h1:dvuqjjSamBODFuxttPg4H/xtNVQRZOSlwFtuNKybcGI=
github.com/NVIDIA/go-nvlib v0.4.0/go.mod h1:87z49ULPr4GWPSGfSIp3taU4XENRYN/enIg88MzcL4k=
github.com/NVIDIA/go-nvml v0.12.0-6 h1:FJYc2KrpvX+VOC/8QQvMiQMmZ/nPMRpdJO/Ik4xfcr0=
github.com/NVIDIA/go-nvml v0.12.0-6/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
Expand Down
59 changes: 12 additions & 47 deletions internal/info/auto.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,75 +17,40 @@
package info

import (
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
"github.com/NVIDIA/go-nvml/pkg/nvml"

"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)

// infoInterface provides an alias for mocking.
//
//go:generate moq -stub -out info-interface_mock.go . infoInterface
type infoInterface interface {
info.Interface
// UsesNVGPUModule indicates whether the system is using the nvgpu kernel module
UsesNVGPUModule() (bool, string)
}

type resolver struct {
logger logger.Interface
info infoInterface
}

// ResolveAutoMode determines the correct mode for the platform if set to "auto"
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
nvinfo := info.New()
nvmllib := nvml.New()
devicelib := device.New(
device.WithNvml(nvmllib),
)

info := additionalInfo{
Interface: nvinfo,
nvmllib: nvmllib,
devicelib: devicelib,
}

r := resolver{
logger: logger,
info: info,
}
return r.resolveMode(mode, image)
return resolveMode(logger, mode, image, nil)
}

// resolveMode determines the correct mode for the platform if set to "auto"
func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) {
func resolveMode(logger logger.Interface, mode string, image image.CUDA, propertyExtractor info.PropertyExtractor) (rmode string) {
if mode != "auto" {
r.logger.Infof("Using requested mode '%s'", mode)
logger.Infof("Using requested mode '%s'", mode)
return mode
}
defer func() {
r.logger.Infof("Auto-detected mode as '%v'", rmode)
logger.Infof("Auto-detected mode as '%v'", rmode)
}()

if image.OnlyFullyQualifiedCDIDevices() {
return "cdi"
}

isTegra, reason := r.info.IsTegraSystem()
r.logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)

hasNVML, reason := r.info.HasNvml()
r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason)

usesNVGPUModule, reason := r.info.UsesNVGPUModule()
r.logger.Debugf("Uses nvgpu kernel module? %v: %v", usesNVGPUModule, reason)
nvinfo := info.New(
info.WithLogger(logger),
info.WithPropertyExtractor(propertyExtractor),
)

if (isTegra && !hasNVML) || usesNVGPUModule {
switch nvinfo.ResolvePlatform() {
case info.PlatformNVML, info.PlatformWSL:
return "legacy"
case info.PlatformTegra:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question -- this case aligns with the original conditional because usesNVGPUModule will only be true on a Tegra platform, correct?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the logic for checking the use of the nvgpu module has been pulled into the platform check. PlatformTegra now means that either the Tegra sysfs files are present and NVML is not, or NVML is present and the nvgpu module is used.

return "csv"
}

return "legacy"
}
18 changes: 10 additions & 8 deletions internal/info/auto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package info
import (
"testing"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -202,23 +203,24 @@ func TestResolveAutoMode(t *testing.T) {

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
info := &infoInterfaceMock{
properties := &info.PropertyExtractorMock{
HasNvmlFunc: func() (bool, string) {
return tc.info["nvml"], "nvml"
},
HasDXCoreFunc: func() (bool, string) {
return tc.info["dxcore"], "dxcore"
},
IsTegraSystemFunc: func() (bool, string) {
return tc.info["tegra"], "tegra"
},
UsesNVGPUModuleFunc: func() (bool, string) {
HasTegraFilesFunc: func() (bool, string) {
return tc.info["tegra"], "tegra"
},
UsesOnlyNVGPUModuleFunc: func() (bool, string) {
return tc.info["nvgpu"], "nvgpu"
},
}

r := resolver{
logger: logger,
info: info,
}

var mounts []specs.Mount
for _, d := range tc.mounts {
mount := specs.Mount{
Expand All @@ -231,7 +233,7 @@ func TestResolveAutoMode(t *testing.T) {
image.WithEnvMap(tc.envmap),
image.WithMounts(mounts),
)
mode := r.resolveMode(tc.mode, image)
mode := resolveMode(logger, tc.mode, image, properties)
require.EqualValues(t, tc.expectedMode, mode)
})
}
Expand Down
194 changes: 0 additions & 194 deletions internal/info/info-interface_mock.go

This file was deleted.

32 changes: 15 additions & 17 deletions pkg/nvcdi/lib.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,12 @@ func New(opts ...Option) (Interface, error) {
l.nvidiaCTKPath = "/usr/bin/nvidia-ctk"
}
if l.infolib == nil {
l.infolib = info.New()
l.infolib = info.New(
info.WithRoot(l.driverRoot),
info.WithLogger(l.logger),
info.WithNvmlLib(l.nvmllib),
info.WithDeviceLib(l.devicelib),
)
}

l.driver = root.New(
Expand Down Expand Up @@ -184,26 +189,19 @@ func (l *nvcdilib) resolveMode() (rmode string) {
return l.mode
}
defer func() {
l.logger.Infof("Auto-detected mode as %q", rmode)
l.logger.Infof("Auto-detected mode as '%v'", rmode)
}()

isWSL, reason := l.infolib.HasDXCore()
l.logger.Debugf("Is WSL-based system? %v: %v", isWSL, reason)

if isWSL {
return ModeWsl
}

isNvml, reason := l.infolib.HasNvml()
l.logger.Debugf("Is NVML-based system? %v: %v", isNvml, reason)

isTegra, reason := l.infolib.IsTegraSystem()
l.logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)

if isTegra && !isNvml {
platform := l.infolib.ResolvePlatform()
switch platform {
case info.PlatformNVML:
return ModeNvml
case info.PlatformTegra:
return ModeCSV
case info.PlatformWSL:
return ModeWsl
}

l.logger.Warningf("Unsupported platform detected: %v; assuming %v", platform, ModeNvml)
return ModeNvml
}

Expand Down
Loading