
Commit 6d5518b

fix init reordering bug (#675)
Fix 3 bugs:

1. Reordering the init sequence in the dragon backend resulted in an un-set collection being used.
2. Fix tests that should have been updated to compare set contents instead of individual items.
3. Remove newly added validation on empty host lists that broke existing tests.
1 parent ef034d5 commit 6d5518b

File tree: 4 files changed, +53 −30 lines

- smartsim/_core/launcher/dragon/dragonBackend.py
- smartsim/_core/launcher/dragon/pqueue.py
- tests/test_dragon_run_request.py
- tests/test_node_prioritizer.py

smartsim/_core/launcher/dragon/dragonBackend.py (+5, −5)

@@ -157,7 +157,6 @@ def __init__(self, pid: int) -> None:
         self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count())
         """Incremental ID to assign to new steps prior to execution"""

-        self._initialize_hosts()
         self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = (
             collections.OrderedDict()
         )
@@ -188,11 +187,7 @@ def __init__(self, pid: int) -> None:
             else 5
         )
         """Time in seconds needed to server to complete shutdown"""
-
-        self._view = DragonBackendView(self)
-        logger.debug(self._view.host_desc)
         self._infra_ddict: t.Optional[dragon_ddict.DDict] = None
-        self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock)

         self._nodes: t.List["dragon_machine.Node"] = []
         """Node capability information for hosts in the allocation"""
@@ -205,6 +200,11 @@ def __init__(self, pid: int) -> None:
         self._allocated_hosts: t.Dict[str, t.Set[str]] = {}
         """Mapping with hostnames as keys and a set of running step IDs as the value"""

+        self._initialize_hosts()
+        self._view = DragonBackendView(self)
+        logger.debug(self._view.host_desc)
+        self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock)
+
     @property
     def hosts(self) -> list[str]:
         with self._queue_lock:
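The fix here is pure ordering: in the old `__init__`, `self._initialize_hosts()` ran near the top and `NodePrioritizer(self._nodes, self._queue_lock)` was constructed before the `self._nodes` declaration that followed it, so a consumer could end up holding an un-set collection. A minimal sketch of the hazard and the corrected declare-populate-construct order, using simplified stand-ins rather than the real DragonBackend and NodePrioritizer:

# Simplified stand-ins for the pattern in this diff, not the real classes.
class Prioritizer:
    def __init__(self, nodes: list[str]) -> None:
        self._nodes = nodes  # keeps a reference to the caller's list


class Backend:
    def __init__(self) -> None:
        # Buggy order (before this commit): constructing the consumer first,
        #   self._prioritizer = Prioritizer(self._nodes)
        #   self._nodes = []
        # hands the consumer an un-set (or soon-to-be-rebound) collection.

        # Fixed order: declare all attributes first ...
        self._nodes: list[str] = []
        # ... then populate them ...
        self._initialize_hosts()
        # ... and only then build objects that depend on them.
        self._prioritizer = Prioritizer(self._nodes)

    def _initialize_hosts(self) -> None:
        self._nodes.extend(["node-1", "node-2"])  # hypothetical hostnames


backend = Backend()
assert backend._prioritizer._nodes == ["node-1", "node-2"]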

smartsim/_core/launcher/dragon/pqueue.py (−6)

@@ -143,9 +143,6 @@ def remove(
         :param tracking_id: a unique task identifier executing on the node
         to remove
         :raises ValueError: if tracking_id is already assigned to this node"""
-        if tracking_id and tracking_id not in self.assigned_tasks:
-            raise ValueError("Attempted removal of untracked item")
-
         self._num_refs = max(self._num_refs - 1, 0)
         if tracking_id:
             self._assigned_tasks = self._assigned_tasks - {tracking_id}
@@ -460,8 +457,5 @@ def next_n(
         :param hosts: a list of hostnames used to filter the available nodes
         :returns: Collection of reserved nodes
         :raises ValueError: if the hosts parameter is an empty list"""
-        if hosts is not None and not hosts:
-            raise ValueError("No hostnames provided")
-
         heap = self._create_sub_heap(hosts, filter_on)
         return self._get_next_n_available_nodes(num_items, heap, tracking_id)
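With the guard gone, remove() is now tolerant of untracked IDs: the reference count is clamped at zero and the set difference silently drops an ID that was never assigned. A short sketch of that behavior, using a hypothetical node class that models only the two lines kept by the diff:

# Hypothetical stand-in for the node type in pqueue.py; only the fields
# touched by remove() are modeled here.
class TrackedNode:
    def __init__(self) -> None:
        self._num_refs = 0
        self._assigned_tasks: set = set()

    def remove(self, tracking_id=None) -> None:
        self._num_refs = max(self._num_refs - 1, 0)  # clamped, never negative
        if tracking_id:
            self._assigned_tasks = self._assigned_tasks - {tracking_id}


node = TrackedNode()
node.remove(tracking_id="never-assigned")  # used to raise ValueError; now a no-op
assert node._num_refs == 0 and not node._assigned_tasks

The next_n hunk follows the same philosophy: an empty hosts list now falls through to _create_sub_heap instead of raising, which the updated prioritizer tests below rely on. Note that the unchanged ":raises ValueError:" docstring lines now describe checks that no longer exist.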

tests/test_dragon_run_request.py (+40, −9)

@@ -176,7 +176,7 @@ def set_mock_group_infos(
     }

     monkeypatch.setattr(dragon_backend, "_group_infos", group_infos)
-    monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"})
+    monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: {"abc123-1"}})
     monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"])

     return group_infos
@@ -221,8 +221,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend.free_hosts) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     monkeypatch.setattr(
         dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -233,8 +233,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend.free_hosts) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED

@@ -316,8 +316,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend._prioritizer.unassigned()) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     monkeypatch.setattr(
         dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -328,8 +328,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend._prioritizer.unassigned()) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED
@@ -728,3 +728,34 @@ def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) -> None:
     assert can_honor, error_msg
     # confirm failure message indicates number of nodes requested as cause
     assert error_msg is None, error_msg
+
+
+def test_can_honor_hosts_1_hosts_requested(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Verify that a request naming some invalid hosts reduces the pool of
+    available nodes but still passes when enough validly named nodes remain"""
+    dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+    # supply 2 valid and 1 invalid hostname
+    actual_hosts = list(dragon_backend._hosts)
+    actual_hosts[0] = f"x{actual_hosts[0]}"
+
+    host_list = ",".join(actual_hosts)
+
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=1,  # <----- requesting 1 node
+        hostlist=host_list,  # <--- two valid names are available
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),
+    )
+
+    can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+    # confirm the request can be honored
+    assert can_honor, error_msg
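These assertion changes follow from _allocated_hosts becoming a t.Dict[str, t.Set[str]] (see the dragonBackend.py hunk above): each hostname now maps to the set of step IDs running on it, so the tests check membership rather than equality. A small illustration with hypothetical values:

# Hostname -> set of running step IDs; values are hypothetical, shaped like
# the "abc123-1" IDs used in these tests.
allocated_hosts: dict[str, set[str]] = {
    "node-1": {"abc123-1", "abc123-2"},  # a host may run several steps
    "node-2": {"abc123-1"},
}

step_id = "abc123-1"
assert allocated_hosts["node-1"] != step_id  # equality no longer holds
assert step_id in allocated_hosts["node-1"]  # membership is the right check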

tests/test_node_prioritizer.py (+8, −10)

@@ -457,9 +457,9 @@ def test_node_prioritizer_multi_increment_subheap_assigned() -> None:
     assert len(all_tracking_info) == 0


-def test_node_prioritizer_empty_subheap_next_w_hosts() -> None:
+def test_node_prioritizer_empty_subheap_next_w_no_hosts() -> None:
     """Verify that retrieving multiple nodes via `next_n` API does
-    not allow an empty host list"""
+    with an empty host list uses the entire available host list"""

     num_cpu_nodes, num_gpu_nodes = 8, 0
     cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
@@ -476,15 +476,15 @@ def test_node_prioritizer_empty_subheap_next_w_hosts() -> None:

     # request n == {num_requested} nodes from set of 3 available
     num_requested = 1
-    with pytest.raises(ValueError) as ex:
-        p.next(hosts=hostnames)
+    node = p.next(hosts=hostnames)
+    assert node

-    assert "No hostnames provided" == ex.value.args[0]
+    # assert "No hostnames provided" == ex.value.args[0]


 def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None:
     """Verify that retrieving multiple nodes via `next_n` API does
-    not allow an empty host list"""
+    not blow up with an empty host list"""

     num_cpu_nodes, num_gpu_nodes = 8, 0
     cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
@@ -501,10 +501,8 @@ def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None:

     # request n == {num_requested} nodes from set of 3 available
     num_requested = 1
-    with pytest.raises(ValueError) as ex:
-        p.next_n(num_requested, hosts=hostnames)
-
-    assert "No hostnames provided" == ex.value.args[0]
+    node = p.next_n(num_requested, hosts=hostnames)
+    assert node is not None


 @pytest.mark.parametrize("num_requested", [-100, -1, 0])
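Both rewritten tests pin down the relaxed contract from pqueue.py: per the updated docstring, an empty host filter now falls back to the entire available host list instead of raising. The semantics hinge on Python truthiness: the removed guard treated None (no filter) and [] (error) differently, while _create_sub_heap presumably treats any falsy filter as "no filter". A tiny self-contained illustration of that distinction:

# The removed guard (`if hosts is not None and not hosts`) raised only for [],
# never for None. Both now take the same path, because [] is falsy:
def describe(hosts):
    if hosts:  # [] and None are both falsy
        return f"filter to {hosts}"
    return "use the entire available host list"

assert describe(None) == describe([]) == "use the entire available host list"
assert describe(["node-1"]) == "filter to ['node-1']"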
