Making listing lazy in DatasetQuery (#976)
Changes from all commits: 8a0fed2, e0751fb, b969ca5, dff695a, 1eaef9e, 2e9ada3, 4863c89, e23f383, de8dcbf.
The first changed file, where `from_storage` is defined:
```diff
@@ -6,7 +6,6 @@
 )

 from datachain.lib.file import (
     File,
     FileType,
     get_file_type,
 )
```
```diff
@@ -95,24 +94,28 @@ def from_storage(
         dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
         return dc

-    dc = from_dataset(list_ds_name, session=session, settings=settings)
-    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
-
     if update or not list_ds_exists:
-        # disable prefetch for listing, as it pre-downloads all files
-        (
-            from_records(
-                DataChain.DEFAULT_FILE_RECORD,
-                session=session,
-                settings=settings,
-                in_memory=in_memory,
-            )
-            .settings(prefetch=0)
-            .gen(
-                list_bucket(list_uri, cache, client_config=client_config),
-                output={f"{object_name}": File},
-            )
-            .save(list_ds_name, listing=True)
-        )
+
+        def lst_fn():
+            # disable prefetch for listing, as it pre-downloads all files
+            (
+                from_records(
+                    DataChain.DEFAULT_FILE_RECORD,
+                    session=session,
+                    settings=settings,
+                    in_memory=in_memory,
+                )
+                .settings(prefetch=0)
+                .gen(
+                    list_bucket(list_uri, cache, client_config=client_config),
+                    output={f"{object_name}": file_type},
+                )
+                .save(list_ds_name, listing=True)
+            )
+
+    dc = from_dataset(list_ds_name, session=session, settings=settings)
+    dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
+    dc._query.add_before_steps(lst_fn)

     return ls(dc, list_path, recursive=recursive, object_name=object_name)
```

Review thread (on the listing body inside `lst_fn`):

**Comment:** This seems to be called every time I use the datachain to apply steps. Shouldn't this be applied only once?

**Reply:** It should be called every time you apply steps. The whole idea is for the user to apply steps only once anyway, as it's a very expensive operation.
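The shape of the change is easier to see outside the codebase. Below is a minimal, self-contained model of the same pattern — all names (`Query`, `DATASETS`, the toy `from_storage`) are illustrative stand-ins, not the PR's actual API: a closure captures the expensive listing work, and the query runs every registered callback before materializing results, so nothing touches storage until the first materialization.

```python
from typing import Callable

# toy stand-in for the warehouse: dataset name -> rows
DATASETS: dict[str, list] = {}


class Query:
    """Toy stand-in for DatasetQuery: callbacks run before steps apply."""

    def __init__(self, source: str) -> None:
        self.source = source
        self.before_steps: list[Callable[[], None]] = []
        self.steps: list[Callable[[list], list]] = []

    def add_before_steps(self, fn: Callable[[], None]) -> None:
        self.before_steps.append(fn)

    def apply_steps(self) -> list:
        # every materialization re-runs the registered callbacks,
        # which is the behavior discussed in the thread above
        for fn in self.before_steps:
            fn()
        rows = DATASETS[self.source]  # raises KeyError if listing never ran
        for step in self.steps:
            rows = step(rows)
        return rows


def from_storage(uri: str) -> Query:
    list_ds_name = f"lst__{uri}"

    def lst_fn() -> None:
        # expensive bucket enumeration, deferred until first materialization
        DATASETS[list_ds_name] = [f"{uri}/file{i}" for i in range(3)]

    q = Query(list_ds_name)
    q.add_before_steps(lst_fn)
    return q


q = from_storage("s3://bucket")        # no listing has happened yet
q.steps.append(lambda rows: rows[:2])  # building the chain is still cheap
print(q.apply_steps())                 # listing runs here, then steps apply
```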
The second changed file, where `DatasetQuery` is defined:
```diff
@@ -47,6 +47,7 @@
     QueryScriptCancelError,
 )
 from datachain.func.base import Function
+from datachain.lib.listing import is_listing_dataset
 from datachain.lib.udf import UDFAdapter, _get_cache
 from datachain.progress import CombinedDownloadCallback, TqdmCombinedDownloadCallback
 from datachain.query.schema import C, UDFParamSpec, normalize_param
```
```diff
@@ -151,13 +152,6 @@ def step_result(
     )


-class StartingStep(ABC):
-    """An initial query processing step, referencing a data source."""
-
-    @abstractmethod
-    def apply(self) -> "StepResult": ...
-
-
 @frozen
 class Step(ABC):
     """A query processing step (filtering, mutation, etc.)"""
```

```diff
@@ -170,7 +164,7 @@ def apply(


 @frozen
-class QueryStep(StartingStep):
+class QueryStep:
     catalog: "Catalog"
     dataset_name: str
     dataset_version: int
```
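With the `StartingStep` ABC gone, `QueryStep` stands alone as an attrs-frozen class. For readers unfamiliar with the decorator, a minimal sketch of what `@frozen` provides — the class name and trimmed field list here are illustrative, not the PR's code:

```python
from attrs import frozen


@frozen
class QueryStepSketch:
    # attrs generates __init__, __eq__, etc., and forbids mutation
    dataset_name: str
    dataset_version: int


step = QueryStepSketch("animals", 1)
print(step.dataset_name)      # "animals"
# step.dataset_version = 2    # would raise attrs.exceptions.FrozenInstanceError
```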
```diff
@@ -1097,26 +1091,42 @@ def __init__(
         self.temp_table_names: list[str] = []
         self.dependencies: set[DatasetDependencyType] = set()
         self.table = self.get_table()
-        self.starting_step: StartingStep
+        self.starting_step: Optional[QueryStep] = None
+        self.name: Optional[str] = None
+        self.version: Optional[int] = None
+        self.feature_schema: Optional[dict] = None
+        self.column_types: Optional[dict[str, Any]] = None
+        self.before_steps: list[Callable] = []
+        self.list_ds_name: Optional[str] = None

-        self.name = name
+        self.name = name
+        self.dialect = self.catalog.warehouse.db.dialect
+        if version:
+            self.version = version

-        if fallback_to_studio and is_token_set():
-            ds = self.catalog.get_dataset_with_remote_fallback(name, version)
+        if is_listing_dataset(name):
+            # not setting query step yet as listing dataset might not exist at
+            # this point
+            self.list_ds_name = name
+        elif fallback_to_studio and is_token_set():
+            self._set_starting_step(
+                self.catalog.get_dataset_with_remote_fallback(name, version)
+            )
         else:
-            ds = self.catalog.get_dataset(name)
+            self._set_starting_step(self.catalog.get_dataset(name))

-        self.version = version or ds.latest_version
+    def _set_starting_step(self, ds: "DatasetRecord") -> None:
+        if not self.version:
+            self.version = ds.latest_version
+
+        self.starting_step = QueryStep(self.catalog, ds.name, self.version)

+        # at this point we know our starting dataset so setting up schemas
         self.feature_schema = ds.get_version(self.version).feature_schema
         self.column_types = copy(ds.schema)
         if "sys__id" in self.column_types:
             self.column_types.pop("sys__id")
-        self.starting_step = QueryStep(self.catalog, name, self.version)
-        self.dialect = self.catalog.warehouse.db.dialect

     def __iter__(self):
         return iter(self.db_results())
```

Review thread (on the `fallback_to_studio` branch):

**Comment:** Not related to this PR, but we may want to import it as … above, for example, just for the better readability of the code here.

**Reply:** Yes, agreed, cc @amritghimire ... it is still not a good idea to have Studio exposed this way; ideally it should be just …

**Reply:** Let's push really, really hard to keep Studio contained, it is important ... in the same way as, for example, using DC itself for the implementations (e.g. I wonder if `from_storage` can be done via …)

**Reply:** Listing is already done with DC itself:

```python
def from_storage():
    return (
        cls.from_records(DEFAULT_FILE_RECORDS)
        .gen(list_bucket(...))
        .save(list_ds_name, listing=True)
    )
```

and then:

```python
ds = DataChain.from_storage("s3://ldb-public").filter(...).map(...).save("my_dataset")
```

This is similar to how it was before this PR, but it's not lazy; to make it lazy we need to add some step in …
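The `is_listing_dataset` check is what lets `__init__` avoid touching the catalog for listing datasets. The diff doesn't show its implementation; a plausible minimal shape, assuming listing datasets are recognized purely by a name prefix (the `lst__` prefix below is an assumption, not taken from the diff):

```python
LISTING_PREFIX = "lst__"  # assumed naming convention, not shown in the diff


def is_listing_dataset(name: str) -> bool:
    # a pure string check: no catalog lookup, so it is safe to call
    # before the listing dataset actually exists
    return name.startswith(LISTING_PREFIX)
```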
```diff
@@ -1180,11 +1190,23 @@ def c(self, column: Union[C, str]) -> "ColumnClause[Any]":
         col.table = self.table
         return col

+    def add_before_steps(self, fn: Callable) -> None:
+        """
+        Setting custom function to be run before applying steps
+        """
+        self.before_steps.append(fn)
+
     def apply_steps(self) -> QueryGenerator:
         """
         Apply the steps in the query and return the resulting
         sqlalchemy.SelectBase.
         """
+        for fn in self.before_steps:
+            fn()
+
+        if self.list_ds_name:
+            # at this point we know what is our starting listing dataset name
+            self._set_starting_step(self.catalog.get_dataset(self.list_ds_name))  # type: ignore [arg-type]
         query = self.clone()

         index = os.getenv("DATACHAIN_QUERY_CHUNK_INDEX", self._chunk_index)
```

Review thread (on the `before_steps` loop):

**Comment:** With this I saw a caveat: `fn` seems to be called every time a step is performed, since we don't clear the before-steps at any time. So whenever I use `collect` or the chain, the query re-fetches the table.

**Reply:** This is expected, and it's how it was before, when listing was lazy (before we refactored it using …)

**Comment:** Yes, but it seems to run every time I run …

**Comment:** Yes, my question was: should we rerun listing every time `collect` is called when `update` is passed? Or should once suffice?

```diff
@@ -1203,6 +1225,7 @@ def apply_steps(self) -> QueryGenerator:
             query = query.filter(C.sys__rand % total == index)
             query.steps = query.steps[-1:] + query.steps[:-1]

+        assert query.starting_step
         result = query.starting_step.apply()
         self.dependencies.update(result.dependencies)
```
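The open question in that thread — rerun listing on every materialization or only once — comes down to whether the callback list is drained after use. Purely as an illustration of the alternative (not what the PR does), run-once semantics would look like this sketch:

```python
from typing import Callable


class RunOnceCallbacks:
    """Illustrative alternative: each registered callback fires at most once."""

    def __init__(self) -> None:
        self._callbacks: list[Callable[[], None]] = []

    def add(self, fn: Callable[[], None]) -> None:
        self._callbacks.append(fn)

    def run(self) -> None:
        # draining the list is the difference: the PR keeps before_steps
        # intact, so its callbacks re-fire on every apply_steps() call
        while self._callbacks:
            self._callbacks.pop(0)()
```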
Review thread (on the `from_dataset(list_ds_name, ...)` call in `from_storage`):

**Comment:** Calling `from_dataset` when `list_ds_exists` is false also doesn't seem right.

**Reply:** Lower-level code (`DatasetQuery`) is aware of listing being lazy, so this is OK. We will start the chain with the listing dataset, and the fact that it doesn't exist yet is just the nature of its "laziness".

**Comment:** I mean, we could get a dataset-not-found error when `list_ds_name` doesn't exist.
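In terms of the toy model sketched earlier (same hypothetical `Query`/`DATASETS` names), the failure mode this comment points at is easy to reproduce: build a query over a listing dataset without ever registering the callback that creates it.

```python
# continuing the toy model from the earlier sketch
q = Query("lst__s3://never-listed")
try:
    q.apply_steps()  # no before-step ever created the dataset
except KeyError as exc:
    # in datachain this would surface as a dataset-not-found error
    print(f"dataset not found: {exc}")
```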