
Commit b2739d3: fix init reordering bug
1 parent: ef034d5

4 files changed: +59 -32 lines

smartsim/_core/launcher/dragon/dragonBackend.py (+5 -5)

@@ -157,7 +157,6 @@ def __init__(self, pid: int) -> None:
         self._step_ids = (f"{create_short_id_str()}-{id}" for id in itertools.count())
         """Incremental ID to assign to new steps prior to execution"""

-        self._initialize_hosts()
         self._queued_steps: "collections.OrderedDict[str, DragonRunRequest]" = (
             collections.OrderedDict()
         )
@@ -188,11 +187,7 @@ def __init__(self, pid: int) -> None:
             else 5
         )
         """Time in seconds needed to server to complete shutdown"""
-
-        self._view = DragonBackendView(self)
-        logger.debug(self._view.host_desc)
         self._infra_ddict: t.Optional[dragon_ddict.DDict] = None
-        self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock)

         self._nodes: t.List["dragon_machine.Node"] = []
         """Node capability information for hosts in the allocation"""
@@ -205,6 +200,11 @@ def __init__(self, pid: int) -> None:
         self._allocated_hosts: t.Dict[str, t.Set[str]] = {}
         """Mapping with hostnames as keys and a set of running step IDs as the value"""

+        self._initialize_hosts()
+        self._view = DragonBackendView(self)
+        logger.debug(self._view.host_desc)
+        self._prioritizer = NodePrioritizer(self._nodes, self._queue_lock)
+
     @property
     def hosts(self) -> list[str]:
         with self._queue_lock:
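The fix is a pure reordering: `_initialize_hosts()`, the `DragonBackendView`, and the `NodePrioritizer` were previously constructed near the top of `__init__`, before attributes they rely on (such as `self._nodes` and `self._allocated_hosts`) had been assigned. Moving those calls to the end of `__init__` guarantees every attribute exists before anything reads it. A minimal sketch of the failure mode and the fix, using hypothetical class names rather than the SmartSim code itself:

    import typing as t


    class Prioritizer:
        def __init__(self, nodes: t.List[str]) -> None:
            # captures whatever node list it is handed at construction time
            self._nodes = nodes


    class BuggyBackend:
        def __init__(self) -> None:
            # BUG: the helper is built before the attribute it depends on exists
            self._prioritizer = Prioritizer(self._nodes)  # AttributeError: '_nodes'
            self._nodes: t.List[str] = []


    class FixedBackend:
        def __init__(self) -> None:
            # declare all state first ...
            self._nodes: t.List[str] = []
            # ... then construct the helpers that consume it
            self._prioritizer = Prioritizer(self._nodes)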

smartsim/_core/launcher/dragon/pqueue.py (-6)

@@ -143,9 +143,6 @@ def remove(
         :param tracking_id: a unique task identifier executing on the node
         to remove
         :raises ValueError: if tracking_id is already assigned to this node"""
-        if tracking_id and tracking_id not in self.assigned_tasks:
-            raise ValueError("Attempted removal of untracked item")
-
         self._num_refs = max(self._num_refs - 1, 0)
         if tracking_id:
             self._assigned_tasks = self._assigned_tasks - {tracking_id}
@@ -460,8 +457,5 @@ def next_n(
         :param hosts: a list of hostnames used to filter the available nodes
         :returns: Collection of reserved nodes
         :raises ValueError: if the hosts parameter is an empty list"""
-        if hosts is not None and not hosts:
-            raise ValueError("No hostnames provided")
-
         heap = self._create_sub_heap(hosts, filter_on)
         return self._get_next_n_available_nodes(num_items, heap, tracking_id)
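Both removals relax eager validation: `remove()` no longer raises for an untracked `tracking_id` (the set difference is simply a no-op), and `next_n()` no longer rejects an empty `hosts` list; per the updated prioritizer tests below, an empty list now falls back to the entire available host pool. A standalone sketch of the relaxed behavior (illustrative function names, not the actual pqueue classes):

    import typing as t


    def remove_task(assigned: t.Set[str], tracking_id: t.Optional[str]) -> t.Set[str]:
        # tolerant removal: subtracting an ID that was never tracked is a no-op,
        # so no "untracked item" error is required
        return assigned - {tracking_id} if tracking_id else assigned


    def select_hosts(all_hosts: t.List[str], hosts: t.Optional[t.List[str]]) -> t.List[str]:
        # an empty (or None) filter falls back to the entire host list instead of raising
        return [h for h in all_hosts if h in set(hosts)] if hosts else list(all_hosts)


    assert remove_task({"abc123-1"}, "unknown-id") == {"abc123-1"}
    assert select_hosts(["node1", "node2"], []) == ["node1", "node2"]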

tests/test_dragon_run_request.py (+46 -11)

@@ -176,7 +176,7 @@ def set_mock_group_infos(
     }

     monkeypatch.setattr(dragon_backend, "_group_infos", group_infos)
-    monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: "abc123-1"})
+    monkeypatch.setattr(dragon_backend, "_allocated_hosts", {hosts[0]: {"abc123-1"}})
     monkeypatch.setattr(dragon_backend, "_running_steps", ["abc123-1"])

     return group_infos
@@ -221,8 +221,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend.free_hosts) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     monkeypatch.setattr(
         dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -233,8 +233,8 @@ def test_run_request(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend.free_hosts) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED

@@ -316,8 +316,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend._prioritizer.unassigned()) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     monkeypatch.setattr(
         dragon_backend._group_infos[step_id].process_group, "status", "Running"
@@ -328,8 +328,8 @@ def test_run_request_with_policy(monkeypatch: pytest.MonkeyPatch) -> None:
     assert dragon_backend._running_steps == [step_id]
     assert len(dragon_backend._queued_steps) == 0
     assert len(dragon_backend._prioritizer.unassigned()) == 1
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[0]] == step_id
-    assert dragon_backend._allocated_hosts[dragon_backend.hosts[1]] == step_id
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[0]]
+    assert step_id in dragon_backend._allocated_hosts[dragon_backend.hosts[1]]

     dragon_backend._group_infos[step_id].status = SmartSimStatus.STATUS_CANCELLED

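The assertion changes follow from `_allocated_hosts` now being typed `t.Dict[str, t.Set[str]]` (see the dragonBackend.py docstring above): each hostname maps to a set of running step IDs rather than a single ID, so the tests check membership with `in` instead of equality. A minimal sketch of that bookkeeping pattern (helper names are illustrative, not the backend's API):

    import typing as t

    allocated_hosts: t.Dict[str, t.Set[str]] = {}


    def assign(host: str, step_id: str) -> None:
        # a host may now carry several concurrently running steps
        allocated_hosts.setdefault(host, set()).add(step_id)


    def release(host: str, step_id: str) -> None:
        # discard() tolerates step IDs that were never assigned to this host
        allocated_hosts.get(host, set()).discard(step_id)


    assign("node1", "abc123-1")
    assign("node1", "def456-2")
    assert "abc123-1" in allocated_hosts["node1"]
    release("node1", "abc123-1")
    assert allocated_hosts["node1"] == {"def456-2"}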
@@ -635,7 +635,8 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
     hosts = dragon_backend.hosts
     dragon_backend._prioritizer.increment(hosts[0])

-    expected_msg = textwrap.dedent(f"""\
+    expected_msg = textwrap.dedent(
+        f"""\
         Dragon server backend update
         | Host | Status |
         |--------|----------|
@@ -648,7 +649,8 @@ def test_view(monkeypatch: pytest.MonkeyPatch) -> None:
         | del999-2 | Cancelled | {hosts[1]} | -9 | 1 |
         | c101vz-3 | Completed | {hosts[1]},{hosts[2]} | 0 | 2 |
         | 0ghjk1-4 | Failed | {hosts[2]} | -1 | 1 |
-        | ljace0-5 | NeverStarted | | | 0 |""")
+        | ljace0-5 | NeverStarted | | | 0 |"""
+    )

     # get rid of white space to make the comparison easier
     actual_msg = dragon_backend.status_message.replace(" ", "")
@@ -728,3 +730,36 @@ def test_can_honor_hosts_unavailable_hosts_ok(monkeypatch: pytest.MonkeyPatch) -> None:
     assert can_honor, error_msg
     # confirm failure message indicates number of nodes requested as cause
     assert error_msg is None, error_msg
+
+
+def test_can_honor_hosts_1_hosts_requested(monkeypatch: pytest.MonkeyPatch) -> None:
+    """Verify that supplying invalid hostnames reduces the pool of available
+    nodes, but the check still passes when enough validly named nodes remain"""
+    dragon_backend = get_mock_backend(monkeypatch, num_cpus=8, num_gpus=0)
+
+    # supply 2 valid and 1 invalid hostname
+    actual_hosts = list(dragon_backend._hosts)
+    actual_hosts[0] = f"x{actual_hosts[0]}"
+
+    host_list = ",".join(actual_hosts)
+
+    run_req = DragonRunRequest(
+        exe="sleep",
+        exe_args=["5"],
+        path="/a/fake/path",
+        nodes=1,  # <----- requesting 1 node, fewer than the 2 valid hosts
+        hostlist=host_list,  # <--- two valid names are available
+        tasks=1,
+        tasks_per_node=1,
+        env={},
+        current_env={},
+        pmi_enabled=False,
+        policy=DragonRunPolicy(),
+    )
+
+    can_honor, error_msg = dragon_backend._can_honor(run_req)
+
+    # confirm the request can be honored
+    assert can_honor, error_msg
+    # confirm no failure message is produced
+    assert error_msg is None, error_msg
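The new test feeds `_can_honor` a hostlist that mixes two valid names with one invalid name: the invalid name shrinks the usable pool, but the single-node request can still be satisfied, so no error message is expected. A hypothetical sketch of that kind of check, not the actual `_can_honor` implementation:

    import typing as t


    def can_honor_hostlist(
        known_hosts: t.Set[str], hostlist: str, nodes_requested: int
    ) -> t.Tuple[bool, t.Optional[str]]:
        # keep only hostnames the backend actually knows about
        valid = [host for host in hostlist.split(",") if host in known_hosts]
        if nodes_requested > len(valid):
            return False, f"{nodes_requested} nodes requested, only {len(valid)} valid hosts"
        return True, None


    # two valid names plus one unknown name still satisfy a 1-node request
    ok, msg = can_honor_hostlist({"node1", "node2", "node3"}, "xnode1,node2,node3", 1)
    assert ok and msg is None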

tests/test_node_prioritizer.py (+8 -10)

@@ -457,9 +457,9 @@ def test_node_prioritizer_multi_increment_subheap_assigned() -> None:
     assert len(all_tracking_info) == 0


-def test_node_prioritizer_empty_subheap_next_w_hosts() -> None:
+def test_node_prioritizer_empty_subheap_next_w_no_hosts() -> None:
     """Verify that retrieving multiple nodes via `next_n` API does
-    not allow an empty host list"""
+    not fail with an empty host list and instead uses the entire available host list"""

     num_cpu_nodes, num_gpu_nodes = 8, 0
     cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
@@ -476,15 +476,15 @@ def test_node_prioritizer_empty_subheap_next_w_hosts() -> None:

     # request n == {num_requested} nodes from set of 3 available
     num_requested = 1
-    with pytest.raises(ValueError) as ex:
-        p.next(hosts=hostnames)
+    node = p.next(hosts=hostnames)
+    assert node

-    assert "No hostnames provided" == ex.value.args[0]
+    # the empty host list no longer raises "No hostnames provided"


 def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None:
     """Verify that retrieving multiple nodes via `next_n` API does
-    not allow an empty host list"""
+    not fail with an empty host list"""

     num_cpu_nodes, num_gpu_nodes = 8, 0
     cpu_hosts, gpu_hosts = mock_node_hosts(num_cpu_nodes, num_gpu_nodes)
@@ -501,10 +501,8 @@ def test_node_prioritizer_empty_subheap_next_n_w_hosts() -> None:

     # request n == {num_requested} nodes from set of 3 available
     num_requested = 1
-    with pytest.raises(ValueError) as ex:
-        p.next_n(num_requested, hosts=hostnames)
-
-    assert "No hostnames provided" == ex.value.args[0]
+    node = p.next_n(num_requested, hosts=hostnames)
+    assert node is not None


 @pytest.mark.parametrize("num_requested", [-100, -1, 0])
