
Commit 67ffdfb

Refactor string_to_dict to return None if there is no match instead of raising ValueError (#7435)
* Refactor string_to_dict to return None if there is no match instead of raising ValueError

  Instead of the try-except pattern previously used to handle a missing match, callers can now check whether the return value is None; when a match is known to exist, they can assert that the return value is not None.

* Allow source_url_fields to be None

  The source URLs can also be local file paths, see https://github.com/huggingface/datasets/actions/runs/13683185040/job/38380924390?pr=7435#step:10:9731
1 parent f693f4e commit 67ffdfb
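
The commit flips the calling convention around string_to_dict. A minimal sketch of the two new caller patterns described in the message above, using an illustrative pattern and file names rather than anything taken from the diff:

from datasets.utils.py_utils import string_to_dict

# Illustrative pattern and paths, not constants from the library.
pattern = "data/{split}-{shard}.parquet"

# Before this commit a non-matching string raised ValueError, so call sites wrapped the call in try/except.
# Now a non-matching string returns None, so call sites branch on the return value instead:
fields = string_to_dict("README.md", pattern)
if fields is None:
    print("not a data file")

# When a match is known to exist (e.g. the name already passed an fnmatch check),
# callers assert the result is not None and index it directly:
fields = string_to_dict("data/train-00000.parquet", pattern)
assert fields is not None
print(fields["split"])  # train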

File tree (8 files changed: +98, -65 lines)

  src/datasets/arrow_dataset.py
  src/datasets/data_files.py
  src/datasets/dataset_dict.py
  src/datasets/features/audio.py
  src/datasets/features/image.py
  src/datasets/features/video.py
  src/datasets/utils/py_utils.py
  tests/test_py_utils.py


src/datasets/arrow_dataset.py (+14, -11)

@@ -3179,9 +3179,11 @@ def format_new_fingerprint(new_fingerprint: str, rank: int) -> str:
                         del kwargs["shard"]
                 else:
                     logger.info(f"Loading cached processed dataset at {format_cache_file_name(cache_file_name, '*')}")
-            assert None not in transformed_shards, (
-                f"Failed to retrieve results from map: result list {transformed_shards} still contains None - at least one worker failed to return its results"
-            )
+            if None in transformed_shards:
+                raise ValueError(
+                    f"Failed to retrieve results from map: result list {transformed_shards} still contains None - at "
+                    "least one worker failed to return its results"
+                )
             logger.info(f"Concatenating {num_proc} shards")
             result = _concatenate_map_style_datasets(transformed_shards)
             # update fingerprint if the dataset changed
@@ -5328,7 +5330,7 @@ def _push_parquet_shards_to_hub(
         max_shard_size: Optional[Union[int, str]] = None,
         num_shards: Optional[int] = None,
         embed_external_files: bool = True,
-    ) -> tuple[str, str, int, int, list[str], int]:
+    ) -> tuple[list[CommitOperationAdd], int, int]:
         """Pushes the dataset shards as Parquet files to the hub.

         Returns:
@@ -5374,7 +5376,7 @@ def shards_with_embedded_external_files(shards: Iterator[Dataset]) -> Iterator[Dataset]:
         api = HfApi(endpoint=config.HF_ENDPOINT, token=token)

         uploaded_size = 0
-        additions = []
+        additions: list[CommitOperationAdd] = []
         for index, shard in hf_tqdm(
             enumerate(shards),
             desc="Uploading the dataset shards",
@@ -5559,8 +5561,9 @@ def push_to_hub(
         # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern)
         # and delete old split shards (if they exist)
         repo_with_dataset_card, repo_with_dataset_infos = False, False
-        deletions, deleted_size = [], 0
-        repo_splits = []  # use a list to keep the order of the splits
+        deletions: list[CommitOperationDelete] = []
+        deleted_size = 0
+        repo_splits: list[str] = []  # use a list to keep the order of the splits
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in api.list_repo_tree(
             repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
@@ -5579,10 +5582,10 @@
             elif fnmatch.fnmatch(
                 repo_file.rfilename, PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*")
             ):
-                repo_split = string_to_dict(
-                    repo_file.rfilename,
-                    glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED),
-                )["split"]
+                pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)
+                split_pattern_fields = string_to_dict(repo_file.rfilename, pattern)
+                assert split_pattern_fields is not None
+                repo_split = split_pattern_fields["split"]
                 if repo_split not in repo_splits:
                     repo_splits.append(repo_split)

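The first hunk above also swaps a bare assert for an explicit check and ValueError, so the guard still runs when Python is started with -O and a failure surfaces as a regular exception rather than an AssertionError. A self-contained sketch of that guard, with a hypothetical list standing in for the per-worker map results:

# Hypothetical per-worker results: a None entry means that worker never returned its shard.
transformed_shards = ["shard-0", None, "shard-2"]
num_proc = len(transformed_shards)

if None in transformed_shards:
    raise ValueError(
        f"Failed to retrieve results from map: result list {transformed_shards} still contains None - at "
        "least one worker failed to return its results"
    )
print(f"Concatenating {num_proc} shards")  # only reached once every worker has reported back
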
src/datasets/data_files.py (+7, -5)

@@ -264,14 +264,16 @@ def _get_data_files_patterns(pattern_resolver: Callable[[str], list[str]]) -> dict[str, list[str]]:
         except FileNotFoundError:
             continue
         if len(data_files) > 0:
-            splits: set[str] = {
-                string_to_dict(xbasename(p), glob_pattern_to_regex(xbasename(split_pattern)))["split"]
-                for p in data_files
-            }
+            splits: set[str] = set()
+            for p in data_files:
+                p_parts = string_to_dict(xbasename(p), glob_pattern_to_regex(xbasename(split_pattern)))
+                assert p_parts is not None
+                splits.add(p_parts["split"])
+
             if any(not re.match(_split_re, split) for split in splits):
                 raise ValueError(f"Split name should match '{_split_re}'' but got '{splits}'.")
             sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
-                splits - set(DEFAULT_SPLITS)
+                splits - {str(split) for split in DEFAULT_SPLITS}
             )
             return {split: [split_pattern.format(split=split)] for split in sorted_splits}
     # then check the default patterns based on train/valid/test splits

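The old set comprehension left no room for a None check, so it becomes an explicit loop over the resolved files. A standalone sketch of the same loop, with a toy split pattern and file list in place of the resolver output, and os.path.basename standing in for xbasename:

import os

from datasets.utils.py_utils import glob_pattern_to_regex, string_to_dict

# Illustrative inputs, not the library's real split patterns.
split_pattern = "{split}.parquet"
data_files = ["data/train.parquet", "data/test.parquet"]

splits: set[str] = set()
for p in data_files:
    p_parts = string_to_dict(os.path.basename(p), glob_pattern_to_regex(os.path.basename(split_pattern)))
    assert p_parts is not None  # every resolved file already matched this pattern
    splits.add(p_parts["split"])

print(sorted(splits))  # ['test', 'train']
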
src/datasets/dataset_dict.py (+7, -7)

@@ -1765,8 +1765,8 @@ def push_to_hub(
         # Check if the repo already has a README.md and/or a dataset_infos.json to update them with the new split info (size and pattern)
         # and delete old split shards (if they exist)
         repo_with_dataset_card, repo_with_dataset_infos = False, False
-        repo_splits = []  # use a list to keep the order of the splits
-        deletions = []
+        repo_splits: list[str] = []  # use a list to keep the order of the splits
+        deletions: list[CommitOperationDelete] = []
         repo_files_to_add = [addition.path_in_repo for addition in additions]
         for repo_file in api.list_repo_tree(
             repo_id=repo_id,
@@ -1790,12 +1790,12 @@
                 repo_file.rfilename,
                 PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED.replace("{split}", "*"),
             ):
-                repo_split = string_to_dict(
-                    repo_file.rfilename,
-                    glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED),
-                )["split"]
+                pattern = glob_pattern_to_regex(PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED)
+                split_pattern_fields = string_to_dict(repo_file.rfilename, pattern)
+                assert split_pattern_fields is not None
+                repo_split = split_pattern_fields["split"]
                 if repo_split not in repo_splits:
-                    repo_splits.append(split)
+                    repo_splits.append(repo_split)

         # get the info from the README to update them
         if repo_with_dataset_card:

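The same try-free pattern lands in DatasetDict.push_to_hub, and the second hunk also fixes a wrong variable: the old code appended split instead of the repo_split just parsed from the filename. A standalone sketch of the extraction step, with a hypothetical filename and sharded-split glob standing in for repo_file.rfilename and the PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED constant:

from datasets.utils.py_utils import glob_pattern_to_regex, string_to_dict

# Illustrative stand-ins; the real values come from the repo file listing and a library constant.
rfilename = "data/validation-00003-of-00008.parquet"
sharded_split_pattern = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.parquet"

repo_splits: list[str] = []
pattern = glob_pattern_to_regex(sharded_split_pattern)
split_pattern_fields = string_to_dict(rfilename, pattern)
assert split_pattern_fields is not None  # in the real code the filename already passed an fnmatch check
repo_split = split_pattern_fields["split"]
if repo_split not in repo_splits:
    repo_splits.append(repo_split)
print(repo_splits)  # ['validation']
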
src/datasets/features/audio.py (+2, -5)

@@ -173,11 +173,8 @@ def decode_example(
             pattern = (
                 config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
             )
-            try:
-                repo_id = string_to_dict(source_url, pattern)["repo_id"]
-                token = token_per_repo_id[repo_id]
-            except (ValueError, KeyError):
-                token = None
+            source_url_fields = string_to_dict(source_url, pattern)
+            token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None

             download_config = DownloadConfig(token=token)
             with xopen(path, "rb", download_config=download_config) as f:

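The try/except around the hub-URL parsing collapses into a single None check: if the source URL does not match the hub pattern (for example, because it is a local file path), there is simply no per-repo token to look up. A rough sketch with an illustrative pattern, URL and token mapping in place of config.HUB_DATASETS_URL and the real inputs:

from datasets.utils.py_utils import string_to_dict

# Illustrative values; the real code uses config.HUB_DATASETS_URL / config.HUB_DATASETS_HFFS_URL.
pattern = "https://huggingface.co/datasets/{repo_id}/resolve/{revision}/{path}"
source_url = "https://huggingface.co/datasets/username/my_dataset/resolve/main/data/audio_0.wav"
token_per_repo_id = {"username/my_dataset": "hf_xxx"}

source_url_fields = string_to_dict(source_url, pattern)
# None means the source is not a hub URL (e.g. a local file), so no token is needed.
token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
print(token)  # hf_xxx
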
src/datasets/features/image.py (+4, -5)

@@ -174,11 +174,10 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "PIL.Image.Image":
                 if source_url.startswith(config.HF_ENDPOINT)
                 else config.HUB_DATASETS_HFFS_URL
             )
-            try:
-                repo_id = string_to_dict(source_url, pattern)["repo_id"]
-                token = token_per_repo_id.get(repo_id)
-            except ValueError:
-                token = None
+            source_url_fields = string_to_dict(source_url, pattern)
+            token = (
+                token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
+            )
             download_config = DownloadConfig(token=token)
             with xopen(path, "rb", download_config=download_config) as f:
                 bytes_ = BytesIO(f.read())

src/datasets/features/video.py (+39, -26)

@@ -1,6 +1,6 @@
 import os
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
+from typing import TYPE_CHECKING, Any, ClassVar, Optional, TypedDict, Union

 import numpy as np
 import pyarrow as pa
@@ -18,6 +18,11 @@
     from .features import FeatureType


+class Example(TypedDict):
+    path: Optional[str]
+    bytes: Optional[bytes]
+
+
 @dataclass
 class Video:
     """
@@ -66,7 +71,7 @@ class Video:
     def __call__(self):
         return self.pa_type

-    def encode_example(self, value: Union[str, bytes, dict, np.ndarray, "VideoReader"]) -> dict:
+    def encode_example(self, value: Union[str, bytes, Example, np.ndarray, "VideoReader"]) -> Example:
         """Encode example into a format for Arrow.

         Args:
@@ -92,21 +97,29 @@ def encode_example(self, value: Union[str, bytes, dict, np.ndarray, "VideoReader"]) -> dict:
         elif isinstance(value, np.ndarray):
             # convert the video array to bytes
             return encode_np_array(value)
-        elif VideoReader and isinstance(value, VideoReader):
+        elif VideoReader is not None and isinstance(value, VideoReader):
             # convert the torchvision video reader to bytes
             return encode_torchvision_video(value)
-        elif value.get("path") is not None and os.path.isfile(value["path"]):
-            # we set "bytes": None to not duplicate the data if they're already available locally
-            return {"bytes": None, "path": value.get("path")}
-        elif value.get("bytes") is not None or value.get("path") is not None:
-            # store the video bytes, and path is used to infer the video format using the file extension
-            return {"bytes": value.get("bytes"), "path": value.get("path")}
+        elif isinstance(value, dict):
+            path, bytes_ = value.get("path"), value.get("bytes")
+            if path is not None and os.path.isfile(path):
+                # we set "bytes": None to not duplicate the data if they're already available locally
+                return {"bytes": None, "path": path}
+            elif bytes_ is not None or path is not None:
+                # store the video bytes, and path is used to infer the video format using the file extension
+                return {"bytes": bytes_, "path": path}
+            else:
+                raise ValueError(
+                    f"A video sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
+                )
         else:
-            raise ValueError(
-                f"A video sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
-            )
+            raise TypeError(f"Unsupported encode_example type: {type(value)}")

-    def decode_example(self, value: dict, token_per_repo_id=None) -> "VideoReader":
+    def decode_example(
+        self,
+        value: Union[str, Example],
+        token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None,
+    ) -> "VideoReader":
         """Decode example video file into video data.

         Args:
@@ -136,15 +149,18 @@ def decode_example(self, value: dict, token_per_repo_id=None) -> "VideoReader":
         if token_per_repo_id is None:
             token_per_repo_id = {}

-        path, bytes_ = value["path"], value["bytes"]
+        if isinstance(value, str):
+            path, bytes_ = value, None
+        else:
+            path, bytes_ = value["path"], value["bytes"]
+
         if bytes_ is None:
             if path is None:
                 raise ValueError(f"A video should have one of 'path' or 'bytes' but both are None in {value}.")
+            elif is_local_path(path):
+                video = VideoReader(path)
             else:
-                if is_local_path(path):
-                    video = VideoReader(path)
-                else:
-                    video = hf_video_reader(path, token_per_repo_id=token_per_repo_id)
+                video = hf_video_reader(path, token_per_repo_id=token_per_repo_id)
         else:
             video = VideoReader(bytes_)
         video._hf_encoded = {"path": path, "bytes": bytes_}
@@ -215,7 +231,7 @@ def video_to_bytes(video: "VideoReader") -> bytes:
     raise NotImplementedError()


-def encode_torchvision_video(video: "VideoReader") -> dict:
+def encode_torchvision_video(video: "VideoReader") -> Example:
     if hasattr(video, "_hf_encoded"):
         return video._hf_encoded
     else:
@@ -224,7 +240,7 @@ def encode_torchvision_video(video: "VideoReader") -> dict:
         )


-def encode_np_array(array: np.ndarray) -> dict:
+def encode_np_array(array: np.ndarray) -> Example:
     raise NotImplementedError()


@@ -235,7 +251,7 @@ def encode_np_array(array: np.ndarray) -> dict:


 def hf_video_reader(
-    path: str, token_per_repo_id: Optional[dict[str, str]] = None, stream: str = "video"
+    path: str, token_per_repo_id: Optional[dict[str, Union[bool, str]]] = None, stream: str = "video"
 ) -> "VideoReader":
     import av
     from torchvision import get_video_backend
@@ -246,11 +262,8 @@
         token_per_repo_id = {}
     source_url = path.split("::")[-1]
     pattern = config.HUB_DATASETS_URL if source_url.startswith(config.HF_ENDPOINT) else config.HUB_DATASETS_HFFS_URL
-    try:
-        repo_id = string_to_dict(source_url, pattern)["repo_id"]
-        token = token_per_repo_id.get(repo_id)
-    except ValueError:
-        token = None
+    source_url_fields = string_to_dict(source_url, pattern)
+    token = token_per_repo_id.get(source_url_fields["repo_id"]) if source_url_fields is not None else None
     download_config = DownloadConfig(token=token)
     f = xopen(path, "rb", download_config=download_config)

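The new Example TypedDict just names the {"path", "bytes"} dicts the feature already passes around, and decode_example now also accepts a bare path string. A simplified sketch of the added input handling, with a hypothetical normalize helper standing in for the real method (no torchvision decoding involved):

from typing import Optional, TypedDict, Union


class Example(TypedDict):
    path: Optional[str]
    bytes: Optional[bytes]


def normalize(value: Union[str, Example]) -> Example:
    # Accept either a bare path string or an Example dict, as decode_example now does.
    if isinstance(value, str):
        path, bytes_ = value, None
    else:
        path, bytes_ = value["path"], value["bytes"]
    if bytes_ is None and path is None:
        raise ValueError(f"A video should have one of 'path' or 'bytes' but both are None in {value}.")
    return {"path": path, "bytes": bytes_}


print(normalize("folder/clip.mp4"))                 # {'path': 'folder/clip.mp4', 'bytes': None}
print(normalize({"path": None, "bytes": b"\x00"}))  # {'path': None, 'bytes': b'\x00'}
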
src/datasets/utils/py_utils.py (+5, -6)

@@ -159,7 +159,7 @@ def glob_pattern_to_regex(pattern):
     )


-def string_to_dict(string: str, pattern: str) -> dict[str, str]:
+def string_to_dict(string: str, pattern: str) -> Optional[dict[str, str]]:
     """Un-format a string using a python f-string pattern.
     From https://stackoverflow.com/a/36838374

@@ -177,15 +177,14 @@ def string_to_dict(string: str, pattern: str) -> dict[str, str]:
         pattern (str): pattern formatted like a python f-string

     Returns:
-        Dict[str, str]: dictionary of variable -> value, retrieved from the input using the pattern
-
-    Raises:
-        ValueError: if the string doesn't match the pattern
+        Optional[dict[str, str]]: dictionary of variable -> value, retrieved from the input using the pattern, or
+            `None` if the string does not match the pattern.
     """
+    pattern = re.sub(r"{([^:}]+)(?::[^}]+)?}", r"{\1}", pattern)  # remove format specifiers, e.g. {rank:05d} -> {rank}
     regex = re.sub(r"{(.+?)}", r"(?P<_\1>.+)", pattern)
     result = re.search(regex, string)
     if result is None:
-        raise ValueError(f"String {string} doesn't match the pattern {pattern}")
+        return None
     values = list(result.groups())
     keys = re.findall(r"{(.+?)}", pattern)
     _dict = dict(zip(keys, values))

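Besides the Optional return type, the other functional change here is the pre-processing line that strips format specifiers before the pattern is turned into a regex, which is what lets patterns like the cache-file suffix template _{rank:05d}_of_{num_proc:05d} round-trip. A quick sketch, using an illustrative cache-file pattern rather than a constant from the library:

import re

from datasets.utils.py_utils import string_to_dict

pattern = "cache-{fingerprint}_{rank:05d}_of_{num_proc:05d}.arrow"  # illustrative

# The format specifier is dropped before the pattern becomes a regex, i.e. {rank:05d} is treated like {rank}:
print(re.sub(r"{([^:}]+)(?::[^}]+)?}", r"{\1}", pattern))
# cache-{fingerprint}_{rank}_of_{num_proc}.arrow

print(string_to_dict("cache-abc123_00001_of_00008.arrow", pattern))
# {'fingerprint': 'abc123', 'rank': '00001', 'num_proc': '00008'}

print(string_to_dict("cache-abc123.arrow", pattern))
# None: no match, where the old version raised ValueError
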
tests/test_py_utils.py (+20)

@@ -1,3 +1,4 @@
+import os
 import time
 from dataclasses import dataclass
 from multiprocessing import Pool
@@ -13,6 +14,7 @@
     asdict,
     iflatmap_unordered,
     map_nested,
+    string_to_dict,
     temp_seed,
     temporary_assignment,
     zip_dict,
@@ -267,3 +269,21 @@ def test_iflatmap_unordered():
     assert out.count("a") == 2
     assert out.count("b") == 2
     assert len(out) == 4
+
+
+def test_string_to_dict():
+    file_name = "dataset/cache-3b163736cf4505085d8b5f9b4c266c26.arrow"
+    file_name_prefix, file_name_ext = os.path.splitext(file_name)
+
+    suffix_template = "_{rank:05d}_of_{num_proc:05d}"
+    cache_file_name_pattern = file_name_prefix + suffix_template + file_name_ext
+
+    file_name_parts = string_to_dict(file_name, cache_file_name_pattern)
+    assert file_name_parts is None
+
+    rank = 1
+    num_proc = 2
+    file_name = file_name_prefix + suffix_template.format(rank=rank, num_proc=num_proc) + file_name_ext
+    file_name_parts = string_to_dict(file_name, cache_file_name_pattern)
+    assert file_name_parts is not None
+    assert file_name_parts == {"rank": f"{rank:05d}", "num_proc": f"{num_proc:05d}"}
