
Add support for csv/tsv and documentation #1

Open · wants to merge 5 commits into main
3 changes: 3 additions & 0 deletions README.md
@@ -26,6 +26,9 @@ load(path, load_options)
```
Loads the dataset at the path and **automatically infers its format** (compressed JSON, PyArrow, MDS, etc.) from clues in the file extension and directory structure.

To load a dataset hosted on the Hugging Face Hub, use the `hub` input type and specify the path as `path/to/dataset>name#split`.
For example, `load("tatsu-lab/alpaca_eval>alpaca_eval#eval")` is equivalent to `datasets.load_dataset("tatsu-lab/alpaca_eval", name="alpaca_eval", split="eval")`.
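
As a quick check of the syntax above (a sketch, assuming `load` is imported from `datatools.load`; the repo id, config name, and split come from the example):

```python
from datatools.load import load
from datasets import load_dataset

# Both calls should yield the same split:
ds = load("tatsu-lab/alpaca_eval>alpaca_eval#eval")
ds_hf = load_dataset("tatsu-lab/alpaca_eval", name="alpaca_eval", split="eval")
```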

---

```python
21 changes: 19 additions & 2 deletions datatools/load.py
@@ -26,6 +26,18 @@ def load_from_hub(path: str):
    return load_dataset(path, name=(name[0] if name else None), split=(split[0] if split else None))
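
# Illustrative sketch (an assumption, not code from this PR; the actual parsing
# sits in the folded lines above) of how the `repo>name#split` convention maps
# onto load_dataset's arguments:
def _parse_hub_path(path: str):
    # "tatsu-lab/alpaca_eval>alpaca_eval#eval" -> ("tatsu-lab/alpaca_eval", "alpaca_eval", "eval")
    path, _, split = path.partition("#")
    repo, _, name = path.partition(">")
    return repo, name or None, split or None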


def load_csv(path: Union[Path, str]):
    from datasets import load_dataset
    # HY: load_dataset expects a string path
    path = str(path)

    # Match the ".tsv" suffix rather than "tsv" anywhere in the path, so that
    # e.g. "tsv_exports/data.csv" is still parsed as comma-separated
    if path.endswith(".tsv"):
        return load_dataset("csv", data_files=path, delimiter="\t")['train']
    return load_dataset("csv", data_files=path)['train']


def load_hf_dataset(path: Union[Path, str], input_type: str):
    from datasets import load_from_disk, Dataset
    path = str(path)
@@ -35,6 +47,8 @@ def load_hf_dataset(path: Union[Path, str], input_type: str):
        "arrow": Dataset.from_file,
        "parquet": Dataset.from_parquet,
        "hub": load_from_hub,
        "csv": load_csv,
        "tsv": load_csv,
    }[input_type](path)
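
# Dispatch example (hypothetical paths): load_hf_dataset("data/train.tsv", "tsv")
# calls load_csv("data/train.tsv"), and load_hf_dataset("org/repo>cfg#train", "hub")
# calls load_from_hub("org/repo>cfg#train").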


@@ -54,17 +68,20 @@ def load(*input_paths: List[Union[Path, str]], options: Optional[LoadOptions] =
        # Best guess from the file extension
        # Iterate over suffixes in reverse order to handle cases like .jsonl.zst
        for suffix in path.suffixes[::-1]:
            if suffix in [".arrow", ".parquet", ".npy", ".jsonl", ".tsv", ".csv"]:
                input_type = suffix[1:]
                break
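        # e.g. Path("x.jsonl.zst").suffixes == [".jsonl", ".zst"], so the reverse
        # walk skips the compression suffix and picks input_type = "jsonl"; plain
        # "x.tsv" and "x.csv" now resolve to "tsv" and "csv"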
    elif not path.exists():
        # HY: if the path does not exist (it is neither a file nor a directory),
        # assume it should be loaded from the hub
        input_type = "hub"

if input_type == "mosaic":
return LocalDatasets(input_paths)
elif input_type == "jsonl":
return JsonlDataset(input_paths)
elif input_type == "npy":
return np.concatenate([np.load(path) for path in input_paths])
elif input_type in {"hf", "arrow", "parquet", "hub"}:
elif input_type in {"hf", "arrow", "parquet", "hub", "csv", "tsv"}:
from datasets import concatenate_datasets
return concatenate_datasets([load_hf_dataset(path, input_type) for path in input_paths])
else:
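
Taken together, a hedged end-to-end sketch of the new code path (hypothetical file names; assumes the module layout above, i.e. `load` lives in `datatools.load`):

```python
from datatools.load import load

# Suffix inference yields input_type "csv", load_hf_dataset dispatches each file
# to load_csv, and the per-file datasets are concatenated into one Dataset:
train = load("data/part1.csv", "data/part2.csv")

# A single .tsv file takes the same path with a tab delimiter:
scores = load("data/scores.tsv")
```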