Skip to content

Issue 1165 : nccl + torch.cuda not available #1166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 2, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ignite/distributed/comp_models/native.py
Original file line number Diff line number Diff line change
@@ -74,6 +74,9 @@ def _create_from_backend(self, backend, timeout=None, **kwargs):
if timeout is not None:
init_pg_kwargs["timeout"] = timeout

if backend == dist.Backend.NCCL and not torch.cuda.is_available():
raise RuntimeError("Nccl backend is required but no cuda capable devices")

dist.init_process_group(backend, init_method="env://", **init_pg_kwargs)
# https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
dist.barrier()
9 changes: 9 additions & 0 deletions tests/ignite/conftest.py
Original file line number Diff line number Diff line change
@@ -216,3 +216,12 @@ def _xla_execute(fn, args, nprocs):
@pytest.fixture()
def xmp_executor():
    """Hand the ``_xla_execute`` helper to any test that requests this fixture."""
    executor = _xla_execute
    yield executor


@pytest.fixture()
def mock_gpu_is_not_available():
    """Replace ``torch.cuda`` with a mock whose ``is_available()`` returns ``False``.

    The patch is active for as long as the requesting test runs and is undone
    when the fixture is torn down. The mock standing in for ``torch.cuda`` is
    yielded to the test.
    """
    from unittest import mock

    cuda_patcher = mock.patch("torch.cuda")
    with cuda_patcher as fake_cuda:
        # Only ``is_available`` is configured; every other attribute is a default mock.
        fake_cuda.is_available.return_value = False
        yield fake_cuda
14 changes: 14 additions & 0 deletions tests/ignite/distributed/comp_models/test_native.py
Original file line number Diff line number Diff line change
@@ -32,6 +32,20 @@ def test__native_dist_model():
assert "mpi" not in available_backends


@pytest.mark.distributed
@pytest.mark.skipif(not dist.is_nccl_available(), reason="Skip if nccl not available")
def test__native_nccl_but_no_gpu(mock_gpu_is_not_available):
    """Check that requesting the nccl backend raises ``RuntimeError`` when CUDA is reported unavailable.

    The process environment is snapshotted before the call and restored
    afterwards, whether or not the expected error was raised.
    """
    # Take a real copy: ``os.environ`` is a live mapping, so keeping only a
    # reference to it would leave the "backup" empty after ``clear()`` below.
    env_backup = dict(os.environ)

    try:
        with pytest.raises(RuntimeError, match=r"Nccl backend is required but no cuda capable devices"):
            _NativeDistModel(backend="nccl")
    finally:
        # environ could be corrupted by _NativeDistModel
        os.environ.clear()
        os.environ.update(env_backup)


@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__native_dist_model_create_from_backend_bad_config():