Commit f09db01

Fix small bugs with async map (#7445)
* fix async map resuming
* fix with_indices
* fix tests
* fix tests
* again
1 parent 67ffdfb commit f09db01
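For context, here is a minimal sketch of the workflow this commit fixes: resuming a streaming map that uses an async function from a mid-iteration checkpoint. The toy dataset and function are illustrative, not part of the commit.

import asyncio

from datasets import Dataset

async def add_one(example):
    await asyncio.sleep(0)  # stand-in for real async work, e.g. an HTTP call
    return {"id_plus_one": example["id"] + 1}

ds = Dataset.from_dict({"id": list(range(6))}).to_iterable_dataset()
ds = ds.map(add_one)

for idx, example in enumerate(ds):
    if idx == 2:
        state_dict = ds.state_dict()  # checkpoint mid-iteration
        break

ds.load_state_dict(state_dict)  # resuming restarts from the checkpoint
print(next(iter(ds)))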

File tree

2 files changed: +108 -85 lines changed


src/datasets/iterable_dataset.py

+86 -83
@@ -1076,15 +1076,17 @@ def _iter(self):
         num_examples_to_skip = 0
         iterator = iter(self.ex_iterable)

+        # We use the same logic as in Dataset.map, but with less features/formatting
+        # since they're handled by FormattedExamplesIterable
         if self.formatting:
             formatter = get_formatter(self.formatting.format_type)
-            format_dict = (
-                formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else cast_to_python_objects
-            )
+            format_dict = formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else None
         else:
             format_dict = None

         def iter_batched_inputs():
+            nonlocal current_idx
             for key, example in iterator:
                 # If `batched`, first build the batch, if `batch_size` is None or <=0, then the batch is the whole dataset
                 iterator_batch = (
@@ -1104,17 +1106,21 @@ def iter_batched_inputs():
                 ):  # ignore last batch
                     return
                 batch = _examples_to_batch(examples)
+                # we need to format here in case we need to stack tensors together
                 batch = format_dict(batch) if format_dict else batch
                 indices = [current_idx + i for i in range(len(key_examples_list))]
+                current_idx += len(indices)
                 yield indices, (key, batch)

         def iter_inputs():
+            nonlocal current_idx
             for key, example in iterator:
                 # If not batched, we can apply the transform and yield the example directly
                 # first copy the example, since we might drop some keys
                 example = dict(example)
-                example = format_dict(example) if format_dict else example
-                yield current_idx, (key, example)
+                # no need to do formatting here
+                current_idx += 1
+                yield current_idx - 1, (key, example)

         def validate_function_output(processed_inputs):
             if self.batched and processed_inputs:
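The two hunks above move the current_idx bookkeeping into the input iterators, which is what keeps with_indices aligned for async functions. A hedged usage sketch (toy data and expected values are assumptions, not from the diff):

from datasets import Dataset

async def add_idx(example, idx):
    return {"id_plus_idx": example["id"] + idx}

ds = Dataset.from_dict({"id": [10, 11, 12]}).to_iterable_dataset()
ds = ds.map(add_idx, with_indices=True)
print(list(ds))  # expected id_plus_idx values: 10, 12, 14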
@@ -1147,17 +1153,7 @@ def prepare_outputs(key_example, inputs, processed_inputs):
                 if processed_inputs is key_example[1] and c in processed_inputs:
                     del processed_inputs[c]
             transformed_inputs = {**inputs, **processed_inputs}
-            if self.features:
-                for c in self.features.keys():
-                    if c not in transformed_inputs:
-                        transformed_inputs[c] = (
-                            [None] * len(transformed_inputs[next(iter(processed_inputs))]) if self.batched else None
-                        )
-                transformed_inputs = (
-                    self.features.decode_batch(transformed_inputs)
-                    if self.batched
-                    else self.features.decode_example(transformed_inputs)
-                )
+            # no need to do features decoding here
             return transformed_inputs

         def apply_function(key_example, indices):
@@ -1185,6 +1181,11 @@ def iter_outputs():
             nonlocal tasks, loop
             inputs_iterator = iter_batched_inputs() if self.batched else iter_inputs()
             if inspect.iscoroutinefunction(self.function):
+                if self._state_dict:
+                    previous_state = self.ex_iterable.state_dict()
+                    self._state_dict["previous_state"] = previous_state
+                    previous_state_task = None
+                    previous_state_example_idx = self._state_dict["previous_state_example_idx"]
                 indices: Union[list[int], list[list[int]]] = []
                 for i, key_example in inputs_iterator:
                     indices.append(i)
@@ -1198,42 +1199,57 @@ def iter_outputs():
                         done, pending = loop.run_until_complete(
                             asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
                         )
+                        if len(tasks) >= 10 * config.MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL:
+                            loop.run_until_complete(tasks[0])
                     # yield finished tasks
                     while tasks and tasks[0].done():
-                        yield indices.pop(0), tasks.pop(0).result()
+                        i, task = indices.pop(0), tasks.pop(0)
+                        yield i, task.result()
+                        if self._state_dict and task is previous_state_task:
+                            self._state_dict["previous_state"] = previous_state
+                            self._state_dict["num_examples_since_previous_state"] = 0
+                            self._state_dict["previous_state_example_idx"] = previous_state_example_idx
+                            previous_state, previous_state_task = None, None
+                    # checkpoint
+                    if self._state_dict and previous_state_task is None and tasks:
+                        previous_state = self.ex_iterable.state_dict()
+                        previous_state_task = tasks[-1]
+                        previous_state_example_idx = current_idx
                 while tasks:
                     yield indices[0], loop.run_until_complete(tasks[0])
                     indices.pop(0), tasks.pop(0)
             else:
-                for i, key_example in inputs_iterator:
-                    yield i, apply_function(key_example, i)
-
-        try:
-            if self.batched:
                 if self._state_dict:
-                    self._state_dict["previous_state"] = self.ex_iterable.state_dict()
-                    self._state_dict["num_examples_since_previous_state"] = 0
-                    self._state_dict["previous_state_example_idx"] = current_idx
-                for key, transformed_batch in iter_outputs():
-                    # yield one example at a time from the transformed batch
-                    for example in _batch_to_examples(transformed_batch):
-                        current_idx += 1
-                        if self._state_dict:
-                            self._state_dict["num_examples_since_previous_state"] += 1
-                        if num_examples_to_skip > 0:
-                            num_examples_to_skip -= 1
-                            continue
-                        yield key, example
-                    if self._state_dict:
+                    if self.batched:
                         self._state_dict["previous_state"] = self.ex_iterable.state_dict()
                         self._state_dict["num_examples_since_previous_state"] = 0
                         self._state_dict["previous_state_example_idx"] = current_idx
-            else:
-                for key, transformed_example in iter_outputs():
-                    current_idx += 1
+                for i, key_example in inputs_iterator:
                     if self._state_dict:
-                        self._state_dict["previous_state_example_idx"] += 1
-                    yield key, transformed_example
+                        if not self.batched:
+                            self._state_dict["previous_state_example_idx"] = current_idx
+                    yield i, apply_function(key_example, i)
+                    if self._state_dict:
+                        if self.batched:
+                            self._state_dict["previous_state"] = self.ex_iterable.state_dict()
+                            self._state_dict["num_examples_since_previous_state"] = 0
+                            self._state_dict["previous_state_example_idx"] = current_idx
+
+        try:
+            outputs = iter_outputs()
+            if self.batched:
+                outputs = (
+                    (key, transformed_example)
+                    for key, transformed_batch in outputs
+                    for transformed_example in _batch_to_examples(transformed_batch)
+                )
+            for key, transformed_example in outputs:
+                if self._state_dict and self._state_dict["previous_state"] is not None:
+                    self._state_dict["num_examples_since_previous_state"] += 1
+                if num_examples_to_skip > 0:
+                    num_examples_to_skip -= 1
+                    continue
+                yield key, transformed_example
         except (Exception, KeyboardInterrupt):
             if loop:
                 logger.debug(f"Canceling {len(tasks)} async tasks.")
@@ -1800,7 +1816,7 @@ def _init_state_dict(self) -> dict:

     def __iter__(self):
         if not self.formatting or self.formatting.is_table:
-            formatter = PythonFormatter()
+            formatter = PythonFormatter(features=self._features if not self.ex_iterable.is_typed else None)
         else:
             formatter = get_formatter(
                 self.formatting.format_type,
@@ -1817,15 +1833,17 @@ def __iter__(self):
         format_dict = (
             formatter.recursive_tensorize
             if isinstance(formatter, TensorFormatter)
-            else cast_to_python_objects  # cast in case features is None
+            else None  # cast in case features is None
         )
         for key, example in self.ex_iterable:
             # don't apply feature types if already applied by ex_iterable (e.g. in case of chained with_format)
             if self.features and not self.ex_iterable.is_typed:
                 example = _apply_feature_types_on_example(
                     example, self.features, token_per_repo_id=self.token_per_repo_id
                 )
-            yield key, format_dict(example)
+            if format_dict:
+                example = format_dict(example)
+            yield key, example

     def _iter_arrow(self) -> Iterator[tuple[Key, pa.Table]]:
         if not self.features:
@@ -2049,7 +2067,7 @@ def __setstate__(self, d):
         _maybe_add_torch_iterable_dataset_parent_class(self.__class__)

     def _head(self, n=5):
-        return _examples_to_batch(list(self.take(n)))
+        return next(iter(self.iter(batch_size=n)))

     @property
     def epoch(self) -> int:
@@ -2111,15 +2129,8 @@ def _iter_pytorch(self):
         if self._starting_state_dict:
             ex_iterable.load_state_dict(self._starting_state_dict)

-        if self._formatting:
-            formatter = get_formatter(self._formatting.format_type, features=self.features)
-            format_dict = (
-                formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else cast_to_python_objects
-            )
-        else:
-            format_dict = None
-
         if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table):
+            formatter = get_formatter(self._formatting.format_type, features=self.features)
             if ex_iterable.iter_arrow:
                 iterator = ex_iterable.iter_arrow()
             else:
@@ -2129,13 +2140,8 @@ def _iter_pytorch(self):
             return
         else:
             for key, example in ex_iterable:
-                if self.features and not ex_iterable.is_typed:
-                    # `IterableDataset` automatically fills missing columns with None.
-                    # This is done with `_apply_feature_types_on_example`.
-                    example = _apply_feature_types_on_example(
-                        example, self.features, token_per_repo_id=self._token_per_repo_id
-                    )
-                yield format_dict(example) if format_dict else example
+                # no need to format thanks to FormattedExamplesIterable
+                yield example
         logger.debug(
             f"{_log_prefix}dataloader worker#{worker_info.id}, ': Finished iterating over {len(shards_indices)}/{ex_iterable.num_shards} shards."
         )
@@ -2191,6 +2197,14 @@ def _prepare_ex_iterable_for_iteration(
             )
             ex_iterable = StepExamplesIterable(ex_iterable, step=world_size, offset=rank)

+        if self._formatting or (self.features and ex_iterable.features != self.features):
+            ex_iterable = FormattedExamplesIterable(
+                ex_iterable,
+                formatting=self._formatting,
+                features=self.features,
+                token_per_repo_id=self._token_per_repo_id,
+            )
+
         self._state_dict = ex_iterable._init_state_dict()
         if self._starting_state_dict:
             ex_iterable.load_state_dict(self._starting_state_dict)
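With FormattedExamplesIterable inserted into the pipeline here, formatting and feature application happen inside the ex_iterable itself, so the __iter__ methods below can simply re-yield examples. An illustrative check (assumes torch is installed; not part of the diff):

from datasets import Dataset

ds = Dataset.from_dict({"id": [0, 1, 2]}).to_iterable_dataset().with_format("torch")
example = next(iter(ds))
print(type(example["id"]))  # expected: <class 'torch.Tensor'>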
@@ -2207,15 +2221,8 @@ def __iter__(self):
             return

         ex_iterable = self._prepare_ex_iterable_for_iteration()
-        if self._formatting:
-            formatter = get_formatter(self._formatting.format_type, features=self.features)
-            format_dict = (
-                formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else cast_to_python_objects
-            )
-        else:
-            format_dict = None
-
         if self._formatting and (ex_iterable.iter_arrow or self._formatting.is_table):
+            formatter = get_formatter(self._formatting.format_type, features=self.features)
             if ex_iterable.iter_arrow:
                 iterator = ex_iterable.iter_arrow()
             else:
@@ -2225,13 +2232,8 @@ def __iter__(self):
             return

         for key, example in ex_iterable:
-            if self.features and not ex_iterable.is_typed:
-                # `IterableDataset` automatically fills missing columns with None.
-                # This is done with `_apply_feature_types_on_example`.
-                example = _apply_feature_types_on_example(
-                    example, self.features, token_per_repo_id=self._token_per_repo_id
-                )
-            yield format_dict(example) if format_dict else example
+            # no need to format thanks to FormattedExamplesIterable
+            yield example

     def iter(self, batch_size: int, drop_last_batch: bool = False):
         """Iterate through the batches of size `batch_size`.
@@ -2244,9 +2246,7 @@ def iter(self, batch_size: int, drop_last_batch: bool = False):

         if self._formatting:
             formatter = get_formatter(self._formatting.format_type, features=self.features)
-            format_dict = (
-                formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else cast_to_python_objects
-            )
+            format_dict = formatter.recursive_tensorize if isinstance(formatter, TensorFormatter) else None
         else:
             format_dict = None

@@ -2267,10 +2267,7 @@
             if drop_last_batch and len(examples) < batch_size:  # ignore last batch
                 return
             batch = _examples_to_batch(examples)
-            if self.features and not ex_iterable.is_typed:
-                # `IterableDataset` automatically fills missing columns with None.
-                # This is done with `_apply_feature_types_on_batch`.
-                batch = _apply_feature_types_on_batch(batch, self.features, token_per_repo_id=self._token_per_repo_id)
+            # we need to format here in case we need to stack tensors together
             yield format_dict(batch) if format_dict else batch

     @staticmethod
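For reference, a usage sketch of iter(), whose feature-casting branch is removed above; toy data, with the expected batches shown as comments:

from datasets import Dataset

ds = Dataset.from_dict({"id": [0, 1, 2, 3, 4]}).to_iterable_dataset()
for batch in ds.iter(batch_size=2, drop_last_batch=True):
    print(batch)  # {'id': [0, 1]}, then {'id': [2, 3]}; the short last batch is dropped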
@@ -3241,7 +3238,13 @@ def batch(self, batch_size: int, drop_last_batch: bool = False) -> "IterableData
         def batch_fn(unbatched):
             return {k: [v] for k, v in unbatched.items()}

-        return self.map(batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch)
+        if self.features:
+            features = Features({col: [feature] for col, feature in self.features.items()})
+        else:
+            features = None
+        return self.map(
+            batch_fn, batched=True, batch_size=batch_size, drop_last_batch=drop_last_batch, features=features
+        )


 def _concatenate_iterable_datasets(
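A hedged sketch of what the batch() change does: when features are known, each column feature becomes a list feature on the batched dataset. Toy data; the printed values are assumptions about the reprs:

from datasets import Dataset

ds = Dataset.from_dict({"id": list(range(5))}).to_iterable_dataset()
batched = ds._resolve_features().batch(batch_size=3)
print(batched.features)  # expected: {'id': [Value(dtype='int64', id=None)]}
print(list(batched))     # expected: [{'id': [0, 1, 2]}, {'id': [3, 4]}]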

tests/test_iterable_dataset.py

+22 -2
@@ -505,6 +505,13 @@ def test_mapped_examples_iterable_drop_last_batch(n, func, batched, batch_size):
         next(iter(ex_iterable))


+def _wrap_async(func, *args, **kwargs):
+    async def wrapped_func(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return wrapped_func
+
+
 @pytest.mark.parametrize(
     "n, func, batched, batch_size",
     [
@@ -519,10 +526,11 @@ def test_mapped_examples_iterable_drop_last_batch(n, func, batched, batch_size):
         (5, lambda x, indices: {"id+idx": [i + j for i, j in zip(x["id"], indices)]}, True, -1),  # same with bs<=0
     ],
 )
-def test_mapped_examples_iterable_with_indices(n, func, batched, batch_size):
+@pytest.mark.parametrize("wrapper", [lambda x: x, _wrap_async])
+def test_mapped_examples_iterable_with_indices(n, func, batched, batch_size, wrapper):
     base_ex_iterable = ExamplesIterable(generate_examples_fn, {"n": n})
     ex_iterable = MappedExamplesIterable(
-        base_ex_iterable, func, batched=batched, batch_size=batch_size, with_indices=True
+        base_ex_iterable, wrapper(func), batched=batched, batch_size=batch_size, with_indices=True
     )
     all_examples = [x for _, x in generate_examples_fn(n=n)]
     if batched is False:
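In isolation, the _wrap_async helper above turns a plain function into a coroutine function, so the new "wrapper" parameter makes each parametrized case also exercise the async code path of MappedExamplesIterable. A quick illustrative check:

import asyncio

def double(example):
    return {"id": example["id"] * 2}

async_double = _wrap_async(double)
assert asyncio.iscoroutinefunction(async_double)
assert asyncio.run(async_double({"id": 3})) == {"id": 6}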
@@ -2454,3 +2462,15 @@ def test_iterable_dataset_batch():
     assert len(batches[2]["text"]) == 2
     assert batches[2]["id"] == [8, 9]
     assert batches[2]["text"] == ["Text 8", "Text 9"]
+
+    # Test with features
+    batched_ds = ds._resolve_features().batch(batch_size=3)
+    batches = list(batched_ds)
+
+    assert batched_ds.features is not None
+    assert len(batches) == 4  # 3 full batches and 1 partial batch
+    for i, batch in enumerate(batches[:1]):
+        assert len(batch["id"]) == 3
+        assert len(batch["text"]) == 3
+        assert batch["id"] == [3 * i, 3 * i + 1, 3 * i + 2]
+        assert batch["text"] == [f"Text {3 * i}", f"Text {3 * i + 1}", f"Text {3 * i + 2}"]
