diff --git a/conftest.py b/conftest.py index 3a91de643..31c19ccbf 100644 --- a/conftest.py +++ b/conftest.py @@ -27,35 +27,34 @@ from __future__ import annotations import asyncio -from collections import defaultdict -from dataclasses import dataclass import json import os import pathlib import shutil -import subprocess import signal import socket +import subprocess import sys import tempfile import time import typing as t import uuid import warnings +from collections import defaultdict +from dataclasses import dataclass from subprocess import run -import time import psutil import pytest import smartsim from smartsim import Experiment -from smartsim._core.launcher.dragon.dragonConnector import DragonConnector -from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.config import CONFIG from smartsim._core.config.config import Config +from smartsim._core.launcher.dragon.dragonConnector import DragonConnector +from smartsim._core.launcher.dragon.dragonLauncher import DragonLauncher from smartsim._core.utils.telemetry.telemetry import JobEntity -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Application from smartsim.error import SSConfigError, SSInternalError from smartsim.log import get_logger @@ -468,13 +467,13 @@ def check_output_dir() -> None: @pytest.fixture -def dbutils() -> t.Type[DBUtils]: - return DBUtils +def fsutils() -> t.Type[FSUtils]: + return FSUtils -class DBUtils: +class FSUtils: @staticmethod - def get_db_configs() -> t.Dict[str, t.Any]: + def get_fs_configs() -> t.Dict[str, t.Any]: config_settings = { "enable_checkpoints": 1, "set_max_memory": "3gb", @@ -488,7 +487,7 @@ def get_db_configs() -> t.Dict[str, t.Any]: return config_settings @staticmethod - def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: + def get_smartsim_error_fs_configs() -> t.Dict[str, t.Any]: bad_configs = { "save": [ "-1", # frequency must be positive @@ -515,7 +514,7 
@@ def get_smartsim_error_db_configs() -> t.Dict[str, t.Any]: return bad_configs @staticmethod - def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: + def get_type_error_fs_configs() -> t.Dict[t.Union[int, str], t.Any]: bad_configs: t.Dict[t.Union[int, str], t.Any] = { "save": [2, True, ["2"]], # frequency must be specified as a string "maxmemory": [99, True, ["99"]], # memory form must be a string @@ -536,15 +535,15 @@ def get_type_error_db_configs() -> t.Dict[t.Union[int, str], t.Any]: @staticmethod def get_config_edit_method( - db: Orchestrator, config_setting: str + fs: FeatureStore, config_setting: str ) -> t.Optional[t.Callable[..., None]]: - """Get a db configuration file edit method from a str""" + """Get a fs configuration file edit method from a str""" config_edit_methods: t.Dict[str, t.Callable[..., None]] = { - "enable_checkpoints": db.enable_checkpoints, - "set_max_memory": db.set_max_memory, - "set_eviction_strategy": db.set_eviction_strategy, - "set_max_clients": db.set_max_clients, - "set_max_message_size": db.set_max_message_size, + "enable_checkpoints": fs.enable_checkpoints, + "set_max_memory": fs.set_max_memory, + "set_eviction_strategy": fs.set_eviction_strategy, + "set_max_clients": fs.set_max_clients, + "set_max_message_size": fs.set_max_message_size, } return config_edit_methods.get(config_setting, None) @@ -650,21 +649,21 @@ class ColoUtils: @staticmethod def setup_test_colo( fileutils: t.Type[FileUtils], - db_type: str, + fs_type: str, exp: Experiment, application_file: str, - db_args: t.Dict[str, t.Any], + fs_args: t.Dict[str, t.Any], colo_settings: t.Optional[RunSettings] = None, colo_application_name: str = "colocated_application", port: t.Optional[int] = None, on_wlm: bool = False, ) -> Application: - """Setup database needed for the colo pinning tests""" + """Setup feature store needed for the colo pinning tests""" # get test setup sr_test_script = fileutils.get_test_conf_path(application_file) - # Create an app with 
a colo_db which uses 1 db_cpu + # Create an app with a colo_fs which uses 1 fs_cpu if colo_settings is None: colo_settings = exp.create_run_settings( exe=sys.executable, exe_args=[sr_test_script] @@ -675,28 +674,28 @@ def setup_test_colo( colo_application = exp.create_application(colo_application_name, colo_settings) - if db_type in ["tcp", "deprecated"]: - db_args["port"] = port if port is not None else _find_free_port(test_ports) - db_args["ifname"] = "lo" - if db_type == "uds" and colo_application_name is not None: + if fs_type in ["tcp", "deprecated"]: + fs_args["port"] = port if port is not None else _find_free_port(test_ports) + fs_args["ifname"] = "lo" + if fs_type == "uds" and colo_application_name is not None: tmp_dir = tempfile.gettempdir() socket_suffix = str(uuid.uuid4())[:7] socket_name = f"{colo_application_name}_{socket_suffix}.socket" - db_args["unix_socket"] = os.path.join(tmp_dir, socket_name) + fs_args["unix_socket"] = os.path.join(tmp_dir, socket_name) colocate_fun: t.Dict[str, t.Callable[..., None]] = { - "tcp": colo_application.colocate_db_tcp, - "deprecated": colo_application.colocate_db, - "uds": colo_application.colocate_db_uds, + "tcp": colo_application.colocate_fs_tcp, + "deprecated": colo_application.colocate_fs, + "uds": colo_application.colocate_fs_uds, } with warnings.catch_warnings(): - if db_type == "deprecated": - message = "`colocate_db` has been deprecated" + if fs_type == "deprecated": + message = "`colocate_fs` has been deprecated" warnings.filterwarnings("ignore", message=message) - colocate_fun[db_type](**db_args) - # assert application will launch with colocated db + colocate_fun[fs_type](**fs_args) + # assert application will launch with colocated fs assert colo_application.colocated - # Check to make sure that limit_db_cpus made it into the colo settings + # Check to make sure that limit_fs_cpus made it into the colo settings return colo_application @@ -747,7 +746,7 @@ def mock_sink() -> t.Type[MockSink]: @pytest.fixture 
def mock_con() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db connection telemetry""" + """Generates mock fs connection telemetry""" def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: for i in range(min, max): @@ -761,7 +760,7 @@ def _mock_con(min: int = 1, max: int = 254) -> t.Iterable[t.Any]: @pytest.fixture def mock_mem() -> t.Callable[[int, int], t.Iterable[t.Any]]: - """Generates mock db memory usage telemetry""" + """Generates mock fs memory usage telemetry""" def _mock_mem(min: int = 1, max: int = 1000) -> t.Iterable[t.Any]: for i in range(min, max): @@ -879,9 +878,9 @@ def details(self) -> t.List[t.Tuple[t.Tuple[t.Any, ...], t.Dict[str, t.Any]]]: return self._details -## Reuse database across tests +## Reuse feature store across tests -database_registry: t.DefaultDict[str, t.Optional[Orchestrator]] = defaultdict( +feature_store_registry: t.DefaultDict[str, t.Optional[FeatureStore]] = defaultdict( lambda: None ) @@ -902,14 +901,14 @@ def wlm_experiment(test_dir: str, wlmutils: WLMUtils) -> smartsim.Experiment: ) -def _cleanup_db(name: str) -> None: - global database_registry - db = database_registry[name] - if db and db.is_active(): +def _cleanup_fs(name: str) -> None: + global feature_store_registry + fs = feature_store_registry[name] + if fs and fs.is_active(): exp = Experiment("cleanup") try: - db = exp.reconnect_orchestrator(db.checkpoint_file) - exp.stop(db) + fs = exp.reconnect_feature_store(fs.checkpoint_file) + exp.stop(fs) except: pass @@ -925,15 +924,15 @@ class DBConfiguration: @dataclass -class PrepareDatabaseOutput: - orchestrator: t.Optional[Orchestrator] # The actual orchestrator object - new_db: bool # True if a new database was created when calling prepare_db +class PrepareFeatureStoreOutput: + featurestore: t.Optional[FeatureStore] # The actual feature store object + new_fs: bool # True if a new feature store was created when calling prepare_fs -# Reuse databases +# Reuse feature stores 
@pytest.fixture(scope="session") -def local_db() -> t.Generator[DBConfiguration, None, None]: - name = "local_db_fixture" +def local_fs() -> t.Generator[DBConfiguration, None, None]: + name = "local_fs_fixture" config = DBConfiguration( name, "local", @@ -943,14 +942,15 @@ def local_db() -> t.Generator[DBConfiguration, None, None]: _find_free_port(tuple(reversed(test_ports))), ) yield config - _cleanup_db(name) + _cleanup_fs(name) + @pytest.fixture(scope="session") -def single_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: +def single_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: hostlist = wlmutils.get_test_hostlist() hostlist = hostlist[-1:] if hostlist is not None else None - name = "single_db_fixture" + name = "single_fs_fixture" config = DBConfiguration( name, wlmutils.get_test_launcher(), @@ -960,14 +960,14 @@ def single_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: _find_free_port(tuple(reversed(test_ports))), ) yield config - _cleanup_db(name) + _cleanup_fs(name) @pytest.fixture(scope="session") -def clustered_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: +def clustered_fs(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None]: hostlist = wlmutils.get_test_hostlist() hostlist = hostlist[-4:-1] if hostlist is not None else None - name = "clustered_db_fixture" + name = "clustered_fs_fixture" config = DBConfiguration( name, wlmutils.get_test_launcher(), @@ -977,12 +977,12 @@ def clustered_db(wlmutils: WLMUtils) -> t.Generator[DBConfiguration, None, None] _find_free_port(tuple(reversed(test_ports))), ) yield config - _cleanup_db(name) + _cleanup_fs(name) @pytest.fixture -def register_new_db() -> t.Callable[[DBConfiguration], Orchestrator]: - def _register_new_db(config: DBConfiguration) -> Orchestrator: +def register_new_fs() -> t.Callable[[DBConfiguration], FeatureStore]: + def _register_new_fs(config: DBConfiguration) -> FeatureStore: exp_path = 
pathlib.Path(test_output_root, config.name) exp_path.mkdir(exist_ok=True) exp = Experiment( @@ -990,40 +990,40 @@ def _register_new_db(config: DBConfiguration) -> Orchestrator: exp_path=str(exp_path), launcher=config.launcher, ) - orc = exp.create_database( + feature_store = exp.create_feature_store( port=config.port, batch=False, interface=config.interface, hosts=config.hostlist, - db_nodes=config.num_nodes, + fs_nodes=config.num_nodes, ) - exp.generate(orc, overwrite=True) - exp.start(orc) - global database_registry - database_registry[config.name] = orc - return orc + exp.generate(feature_store, overwrite=True) + exp.start(feature_store) + global feature_store_registry + feature_store_registry[config.name] = feature_store + return feature_store - return _register_new_db + return _register_new_fs @pytest.fixture(scope="function") -def prepare_db( - register_new_db: t.Callable[[DBConfiguration], Orchestrator] -) -> t.Callable[[DBConfiguration], PrepareDatabaseOutput]: - def _prepare_db(db_config: DBConfiguration) -> PrepareDatabaseOutput: - global database_registry - db = database_registry[db_config.name] +def prepare_fs( + register_new_fs: t.Callable[[DBConfiguration], FeatureStore] +) -> t.Callable[[DBConfiguration], PrepareFeatureStoreOutput]: + def _prepare_fs(fs_config: DBConfiguration) -> PrepareFeatureStoreOutput: + global feature_store_registry + fs = feature_store_registry[fs_config.name] - new_db = False - db_up = False + new_fs = False + fs_up = False - if db: - db_up = db.is_active() + if fs: + fs_up = fs.is_active() - if not db_up or db is None: - db = register_new_db(db_config) - new_db = True + if not fs_up or fs is None: + fs = register_new_fs(fs_config) + new_fs = True - return PrepareDatabaseOutput(db, new_db) + return PrepareFeatureStoreOutput(fs, new_fs) - return _prepare_db + return _prepare_fs diff --git a/setup.py b/setup.py index 96f98bc2c..f0c0b045d 100644 --- a/setup.py +++ b/setup.py @@ -140,13 +140,13 @@ def finalize_options(self): 
class SmartSimBuild(build_py): def run(self): - database_builder = builder.DatabaseBuilder( + feature_store_builder = builder.FeatureStoreBuilder( build_env(), build_env.MALLOC, build_env.JOBS ) - if not database_builder.is_built: - database_builder.build_from_git(versions.REDIS_URL, versions.REDIS) + if not feature_store_builder.is_built: + feature_store_builder.build_from_git(versions.REDIS_URL, versions.REDIS) - database_builder.cleanup() + feature_store_builder.cleanup() # run original build_py command super().run() diff --git a/smartsim/_core/_cli/build.py b/smartsim/_core/_cli/build.py index 951521f17..ea5f2177c 100644 --- a/smartsim/_core/_cli/build.py +++ b/smartsim/_core/_cli/build.py @@ -107,12 +107,12 @@ def check_backends_install() -> bool: return not bool(msg) -def build_database( +def build_feature_store( build_env: BuildEnv, versions: Versioner, keydb: bool, verbose: bool ) -> None: - # check database installation - database_name = "KeyDB" if keydb else "Redis" - database_builder = builder.DatabaseBuilder( + # check feature store installation + feature_store_name = "KeyDB" if keydb else "Redis" + feature_store_builder = builder.FeatureStoreBuilder( build_env(), jobs=build_env.JOBS, _os=builder.OperatingSystem.from_str(platform.system()), @@ -120,14 +120,14 @@ def build_database( malloc=build_env.MALLOC, verbose=verbose, ) - if not database_builder.is_built: + if not feature_store_builder.is_built: logger.info( - f"Building {database_name} version {versions.REDIS} " + f"Building {feature_store_name} version {versions.REDIS} " f"from {versions.REDIS_URL}" ) - database_builder.build_from_git(versions.REDIS_URL, versions.REDIS_BRANCH) - database_builder.cleanup() - logger.info(f"{database_name} build complete!") + feature_store_builder.build_from_git(versions.REDIS_URL, versions.REDIS_BRANCH) + feature_store_builder.cleanup() + logger.info(f"{feature_store_name} build complete!") def build_redis_ai( @@ -403,9 +403,9 @@ def execute( 
_configure_keydb_build(versions) if verbose: - db_name: DbEngine = "KEYDB" if keydb else "REDIS" + fs_name: DbEngine = "KEYDB" if keydb else "REDIS" logger.info("Version Information:") - vers = versions.as_dict(db_name=db_name) + vers = versions.as_dict(fs_name=fs_name) version_names = list(vers.keys()) print(tabulate(vers, headers=version_names, tablefmt="github"), "\n") @@ -423,7 +423,7 @@ def execute( try: if not args.only_python_packages: # REDIS/KeyDB - build_database(build_env, versions, keydb, verbose) + build_feature_store(build_env, versions, keydb, verbose) # REDISAI build_redis_ai( diff --git a/smartsim/_core/_cli/dbcli.py b/smartsim/_core/_cli/dbcli.py index 733c2fe4d..b06e5984f 100644 --- a/smartsim/_core/_cli/dbcli.py +++ b/smartsim/_core/_cli/dbcli.py @@ -28,14 +28,14 @@ import os import typing as t -from smartsim._core._cli.utils import get_db_path +from smartsim._core._cli.utils import get_fs_path def execute( _args: argparse.Namespace, _unparsed_args: t.Optional[t.List[str]] = None, / ) -> int: - if db_path := get_db_path(): - print(db_path) + if fs_path := get_fs_path(): + print(fs_path) return os.EX_OK - print("Database (Redis or KeyDB) dependencies not found") + print("Feature store (Redis or KeyDB) dependencies not found") return os.EX_SOFTWARE diff --git a/smartsim/_core/_cli/info.py b/smartsim/_core/_cli/info.py index c08fcb1a3..4f4137cd2 100644 --- a/smartsim/_core/_cli/info.py +++ b/smartsim/_core/_cli/info.py @@ -29,12 +29,12 @@ def execute( end="\n\n", ) - print("Orchestrator Configuration:") - db_path = _utils.get_db_path() - db_table = [["Installed", _fmt_installed_db(db_path)]] - if db_path: - db_table.append(["Location", str(db_path)]) - print(tabulate(db_table, tablefmt="fancy_outline"), end="\n\n") + print("FeatureStore Configuration:") + fs_path = _utils.get_fs_path() + fs_table = [["Installed", _fmt_installed_fs(fs_path)]] + if fs_path: + fs_table.append(["Location", str(fs_path)]) + print(tabulate(fs_table, 
tablefmt="fancy_outline"), end="\n\n") print("Redis AI Configuration:") rai_path = _helpers.redis_install_base().parent / "redisai.so" @@ -72,11 +72,11 @@ def execute( return os.EX_OK -def _fmt_installed_db(db_path: t.Optional[pathlib.Path]) -> str: - if db_path is None: +def _fmt_installed_fs(fs_path: t.Optional[pathlib.Path]) -> str: + if fs_path is None: return _MISSING_DEP - db_name, _ = db_path.name.split("-", 1) - return _helpers.colorize(db_name.upper(), "green") + fs_name, _ = fs_path.name.split("-", 1) + return _helpers.colorize(fs_name.upper(), "green") def _fmt_installed_redis_ai(rai_path: pathlib.Path) -> str: diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 9c9b46cab..6c2a40911 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -113,12 +113,12 @@ def clean(core_path: Path, _all: bool = False) -> int: removed = True file_path.unlink() if removed: - logger.info("Successfully removed SmartSim database installation") + logger.info("Successfully removed SmartSim feature store installation") return os.EX_OK -def get_db_path() -> t.Optional[Path]: +def get_fs_path() -> t.Optional[Path]: bin_path = get_install_path() / "_core" / "bin" for option in bin_path.iterdir(): if option.name in ("redis-cli", "keydb-cli"): diff --git a/smartsim/_core/_cli/validate.py b/smartsim/_core/_cli/validate.py index 96d46d6ee..709968c11 100644 --- a/smartsim/_core/_cli/validate.py +++ b/smartsim/_core/_cli/validate.py @@ -128,7 +128,7 @@ def configure_parser(parser: argparse.ArgumentParser) -> None: type=int, default=None, help=( - "The port on which to run the orchestrator for the mini experiment. " + "The port on which to run the feature store for the mini experiment. 
" "If not provided, `smart` will attempt to automatically select an " "open port" ), @@ -154,7 +154,7 @@ def test_install( exp.telemetry.disable() port = find_free_port() if port is None else port - with _make_managed_local_orc(exp, port) as client: + with _make_managed_local_feature_store(exp, port) as client: logger.info("Verifying Tensor Transfer") client.put_tensor("plain-tensor", np.ones((1, 1, 3, 3))) client.get_tensor("plain-tensor") @@ -192,18 +192,18 @@ def _set_or_del_env_var(var: str, val: t.Optional[str]) -> None: @contextlib.contextmanager -def _make_managed_local_orc( +def _make_managed_local_feature_store( exp: Experiment, port: int ) -> t.Generator[Client, None, None]: - """Context managed orc that will be stopped if an exception is raised""" - orc = exp.create_database(db_nodes=1, interface="lo", port=port) - exp.generate(orc) - exp.start(orc) + """Context managed feature store that will be stopped if an exception is raised""" + feature_store = exp.create_feature_store(fs_nodes=1, interface="lo", port=port) + exp.generate(feature_store) + exp.start(feature_store) try: - (client_addr,) = orc.get_address() + (client_addr,) = feature_store.get_address() yield Client(False, address=client_addr) finally: - exp.stop(orc) + exp.stop(feature_store) def _test_tf_install(client: Client, tmp_dir: str, device: Device) -> None: diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index edb1ff116..dadd31c1d 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -281,11 +281,11 @@ class Versioner: TENSORFLOW = Version_(REDISAI.tensorflow) ONNX = Version_(REDISAI.onnx) - def as_dict(self, db_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: + def as_dict(self, fs_name: DbEngine = "REDIS") -> t.Dict[str, t.Tuple[str, ...]]: pkg_map = { "SMARTSIM": self.SMARTSIM, "SMARTREDIS": self.SMARTREDIS, - db_name: self.REDIS, + fs_name: self.REDIS, "REDISAI": self.REDISAI, "TORCH": self.TORCH, 
"TENSORFLOW": self.TENSORFLOW, diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index 8f5bdc557..20d025773 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -243,7 +243,7 @@ def run_command( raise BuildError(e) from e -class DatabaseBuilder(Builder): +class FeatureStoreBuilder(Builder): """Class to build Redis or KeyDB from Source Supported build methods: - from git @@ -285,8 +285,8 @@ def build_from_git( :param branch: branch to checkout """ # pylint: disable=too-many-locals - database_name = "keydb" if "KeyDB" in git_url else "redis" - database_build_path = Path(self.build_dir, database_name.lower()) + feature_store_name = "keydb" if "KeyDB" in git_url else "redis" + feature_store_build_path = Path(self.build_dir, feature_store_name.lower()) # remove git directory if it exists as it should # really never exist as we delete after build @@ -297,9 +297,9 @@ def build_from_git( if keydb_build_path.is_dir(): shutil.rmtree(str(keydb_build_path)) - # Check database URL + # Check feature store URL if not self.is_valid_url(git_url): - raise BuildError(f"Malformed {database_name} URL: {git_url}") + raise BuildError(f"Malformed {feature_store_name} URL: {git_url}") clone_cmd = config_git_command( self._platform, @@ -311,7 +311,7 @@ def build_from_git( branch, "--depth", "1", - database_name, + feature_store_name, ], ) @@ -325,14 +325,14 @@ def build_from_git( str(self.jobs), f"MALLOC={self.malloc}", ] - self.run_command(build_cmd, cwd=str(database_build_path)) + self.run_command(build_cmd, cwd=str(feature_store_build_path)) # move redis binaries to smartsim/smartsim/_core/bin - database_src_dir = database_build_path / "src" - server_source = database_src_dir / (database_name.lower() + "-server") - server_destination = self.bin_path / (database_name.lower() + "-server") - cli_source = database_src_dir / (database_name.lower() + "-cli") - cli_destination = self.bin_path / (database_name.lower() + 
"-cli") + feature_store_src_dir = feature_store_build_path / "src" + server_source = feature_store_src_dir / (feature_store_name.lower() + "-server") + server_destination = self.bin_path / (feature_store_name.lower() + "-server") + cli_source = feature_store_src_dir / (feature_store_name.lower() + "-cli") + cli_destination = self.bin_path / (feature_store_name.lower() + "-cli") self.copy_file(server_source, server_destination, set_exe=True) self.copy_file(cli_source, cli_destination, set_exe=True) @@ -342,8 +342,8 @@ def build_from_git( bin_path = Path(dependency_path, "bin").resolve() try: database_exe = next(bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() - _ = expand_exe_path(str(database)) + feature_store = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + _ = expand_exe_path(str(feature_store)) except (TypeError, FileNotFoundError) as e: raise BuildError("Installation of redis-server failed!") from e diff --git a/smartsim/_core/config/config.py b/smartsim/_core/config/config.py index 9cf950b21..374457f3a 100644 --- a/smartsim/_core/config/config.py +++ b/smartsim/_core/config/config.py @@ -118,7 +118,7 @@ def database_conf(self) -> str: conf = Path(os.environ.get("REDIS_CONF", self.conf_path)).resolve() if not conf.is_file(): raise SSConfigError( - "Database configuration file at REDIS_CONF could not be found" + "Feature store configuration file at REDIS_CONF could not be found" ) return str(conf) @@ -126,12 +126,12 @@ def database_conf(self) -> str: def database_exe(self) -> str: try: database_exe = next(self.bin_path.glob("*-server")) - database = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() - exe = expand_exe_path(str(database)) + feature_store = Path(os.environ.get("REDIS_PATH", database_exe)).resolve() + exe = expand_exe_path(str(feature_store)) return exe except (TypeError, FileNotFoundError) as e: raise SSConfigError( - "Specified database binary at REDIS_PATH could not be used" 
+ "Specified feature store binary at REDIS_PATH could not be used" ) from e @property diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 5f53db8fa..302a51e96 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -45,16 +45,16 @@ from ..._core.launcher.step import Step from ..._core.utils.helpers import ( SignalInterceptionStack, - unpack_colo_db_identifier, - unpack_db_identifier, + unpack_colo_fs_identifier, + unpack_fs_identifier, ) from ..._core.utils.redis import ( - db_is_active, + fs_is_active, set_ml_model, set_script, - shutdown_db_node, + shutdown_fs_node, ) -from ...database import Orchestrator +from ...database import FeatureStore from ...entity import Application, Ensemble, EntitySequence, SmartSimEntity from ...error import ( LauncherError, @@ -144,21 +144,21 @@ def start( launched.map(_look_up_launched_data(self._launcher)) ) - # block until all non-database jobs are complete + # block until all non-feature store jobs are complete if block: # poll handles its own keyboard interrupt as # it may be called separately self.poll(5, True, kill_on_interrupt=kill_on_interrupt) @property - def active_orchestrator_jobs(self) -> t.Dict[str, Job]: - """Return active orchestrator jobs.""" - return {**self._jobs.db_jobs} + def active_feature_store_jobs(self) -> t.Dict[str, Job]: + """Return active feature store jobs.""" + return {**self._jobs.fs_jobs} @property - def orchestrator_active(self) -> bool: + def feature_store_active(self) -> bool: with JM_LOCK: - if len(self._jobs.db_jobs) > 0: + if len(self._jobs.fs_jobs) > 0: return True return False @@ -193,8 +193,8 @@ def finished( :raises ValueError: if entity has not been launched yet """ try: - if isinstance(entity, Orchestrator): - raise TypeError("Finished() does not support Orchestrator instances") + if isinstance(entity, FeatureStore): + raise TypeError("Finished() does not support FeatureStore instances") if 
isinstance(entity, EntitySequence): return all(self.finished(ent) for ent in entity.entities) if not isinstance(entity, SmartSimEntity): @@ -243,21 +243,21 @@ def stop_entity( ) self._jobs.move_to_completed(job) - def stop_db(self, db: Orchestrator) -> None: - """Stop an orchestrator + def stop_fs(self, fs: FeatureStore) -> None: + """Stop an FeatureStore - :param db: orchestrator to be stopped + :param fs: FeatureStore to be stopped """ - if db.batch: - self.stop_entity(db) + if fs.batch: + self.stop_entity(fs) else: with JM_LOCK: - for node in db.entities: + for node in fs.entities: for host_ip, port in itertools.product( - (get_ip_from_host(host) for host in node.hosts), db.ports + (get_ip_from_host(host) for host in node.hosts), fs.ports ): - retcode, _, _ = shutdown_db_node(host_ip, port) - # Sometimes the DB will not shutdown (unless we force NOSAVE) + retcode, _, _ = shutdown_fs_node(host_ip, port) + # Sometimes the fs will not shutdown (unless we force NOSAVE) if retcode != 0: self.stop_entity(node) continue @@ -272,7 +272,7 @@ def stop_db(self, db: Orchestrator) -> None: ) self._jobs.move_to_completed(job) - db.reset_hosts() + fs.reset_hosts() def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list @@ -397,8 +397,8 @@ def _launch( ) -> LaunchedManifest[t.Tuple[str, Step]]: """Main launching function of the controller - Orchestrators are always launched first so that the - address of the database can be given to following entities + FeatureStores are always launched first so that the + address of the feature store can be given to following entities :param exp_name: The name of the launching experiment :param exp_path: path to location of ``Experiment`` directory if generated @@ -410,27 +410,27 @@ def _launch( exp_path=exp_path, launcher_name=str(self._launcher), ) - # Loop over deployables to launch and launch multiple orchestrators - for orchestrator in manifest.dbs: - for key in 
self._jobs.get_db_host_addresses(): - _, db_id = unpack_db_identifier(key, "_") - if orchestrator.db_identifier == db_id: + # Loop over deployables to launch and launch multiple FeatureStores + for featurestore in manifest.fss: + for key in self._jobs.get_fs_host_addresses(): + _, fs_id = unpack_fs_identifier(key, "_") + if featurestore.fs_identifier == fs_id: raise SSDBIDConflictError( - f"Database identifier {orchestrator.db_identifier}" + f"Feature store identifier {featurestore.fs_identifier}" " has already been used. Pass in a unique" - " name for db_identifier" + " name for fs_identifier" ) - if orchestrator.num_shards > 1 and isinstance( + if featurestore.num_shards > 1 and isinstance( self._launcher, LocalLauncher ): raise SmartSimError( - "Local launcher does not support multi-host orchestrators" + "Local launcher does not support multi-host feature stores" ) - self._launch_orchestrator(orchestrator, manifest_builder) + self._launch_feature_store(featurestore, manifest_builder) - if self.orchestrator_active: - self._set_dbobjects(manifest) + if self.feature_store_active: + self._set_fsobjects(manifest) # create all steps prior to launch steps: t.List[ @@ -498,70 +498,80 @@ def _launch( return manifest_builder.finalize() - def _launch_orchestrator( + def _launch_feature_store( self, - orchestrator: Orchestrator, + featurestore: FeatureStore, manifest_builder: LaunchedManifestBuilder[t.Tuple[str, Step]], ) -> None: - """Launch an Orchestrator instance + """Launch an FeatureStore instance - This function will launch the Orchestrator instance and + This function will launch the FeatureStore instance and if on WLM, find the nodes where it was launched and set them in the JobManager - :param orchestrator: orchestrator to launch + :param featurestore: FeatureStore to launch :param manifest_builder: An `LaunchedManifestBuilder` to record the - names and `Step`s of the launched orchestrator + names and `Step`s of the launched featurestore """ - 
orchestrator.remove_stale_files() - orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" + featurestore.remove_stale_files() + feature_store_telem_dir = ( + manifest_builder.run_telemetry_subdirectory / "database" + ) - # if the orchestrator was launched as a batch workload - if orchestrator.batch: - orc_batch_step, substeps = self._create_batch_job_step( - orchestrator, orc_telem_dir + # if the featurestore was launched as a batch workload + if featurestore.batch: + feature_store_batch_step, substeps = self._create_batch_job_step( + featurestore, feature_store_telem_dir ) - manifest_builder.add_database( - orchestrator, [(orc_batch_step.name, step) for step in substeps] + manifest_builder.add_feature_store( + featurestore, + [(feature_store_batch_step.name, step) for step in substeps], ) - self._launch_step(orc_batch_step, orchestrator) - self.symlink_output_files(orc_batch_step, orchestrator) + self._launch_step(feature_store_batch_step, featurestore) + self.symlink_output_files(feature_store_batch_step, featurestore) # symlink substeps to maintain directory structure - for substep, substep_entity in zip(substeps, orchestrator.entities): + for substep, substep_entity in zip(substeps, featurestore.entities): self.symlink_output_files(substep, substep_entity) - # if orchestrator was run on existing allocation, locally, or in allocation + # if featurestore was run on existing allocation, locally, or in allocation else: - db_steps = [ - (self._create_job_step(db, orc_telem_dir / orchestrator.name), db) - for db in orchestrator.entities + fs_steps = [ + ( + self._create_job_step( + fs, feature_store_telem_dir / featurestore.name + ), + fs, + ) + for fs in featurestore.entities ] - manifest_builder.add_database( - orchestrator, [(step.name, step) for step, _ in db_steps] + manifest_builder.add_feature_store( + featurestore, [(step.name, step) for step, _ in fs_steps] ) - for db_step in db_steps: - self._launch_step(*db_step) - 
self.symlink_output_files(*db_step) + for fs_step in fs_steps: + self._launch_step(*fs_step) + self.symlink_output_files(*fs_step) - # wait for orchestrator to spin up - self._orchestrator_launch_wait(orchestrator) + # wait for featurestore to spin up + self._feature_store_launch_wait(featurestore) # set the jobs in the job manager to provide SSDB variable to entities # if _host isnt set within each - self._jobs.set_db_hosts(orchestrator) + self._jobs.set_fs_hosts(featurestore) - # create the database cluster - if orchestrator.num_shards > 2: + # create the feature store cluster + if featurestore.num_shards > 2: num_trials = 5 cluster_created = False while not cluster_created: try: - create_cluster(orchestrator.hosts, orchestrator.ports) - check_cluster_status(orchestrator.hosts, orchestrator.ports) - num_shards = orchestrator.num_shards - logger.info(f"Database cluster created with {num_shards} shards") + create_cluster(featurestore.hosts, featurestore.ports) + check_cluster_status(featurestore.hosts, featurestore.ports) + num_shards = featurestore.num_shards + logger.info( + f"Feature store cluster created with {num_shards} shards" + ) cluster_created = True except SSInternalError: if num_trials > 0: @@ -573,8 +583,8 @@ def _launch_orchestrator( else: # surface SSInternalError as we have no way to recover raise - self._save_orchestrator(orchestrator) - logger.debug(f"Orchestrator launched on nodes: {orchestrator.hosts}") + self._save_feature_store(featurestore) + logger.debug(f"FeatureStore launched on nodes: {featurestore.hosts}") def _launch_step( self, @@ -591,10 +601,10 @@ def _launch_step( completed_job = self._jobs.completed.get(entity.name, None) # if completed job DNE and is the entity name is not - # running in JobManager.jobs or JobManager.db_jobs, + # running in JobManager.jobs or JobManager.fs_jobs, # launch the job if completed_job is None and ( - entity.name not in self._jobs.jobs and entity.name not in self._jobs.db_jobs + entity.name not in 
self._jobs.jobs and entity.name not in self._jobs.fs_jobs
        ):
            try:
                job_id = self._launcher.run(job_step)
@@ -636,7 +646,7 @@ def _launch_step(
     def _create_batch_job_step(
         self,
-        entity_list: t.Union[Orchestrator, Ensemble, _AnonymousBatchJob],
+        entity_list: t.Union[FeatureStore, Ensemble, _AnonymousBatchJob],
         telemetry_dir: pathlib.Path,
     ) -> t.Tuple[Step, t.List[Step]]:
         """Use launcher to create batch job step
@@ -695,16 +705,16 @@ def _prep_entity_client_env(self, entity: Application) -> None:
         :param entity: The entity to retrieve connections from
         """
         client_env: t.Dict[str, t.Union[str, int, float, bool]] = {}
-        address_dict = self._jobs.get_db_host_addresses()
+        address_dict = self._jobs.get_fs_host_addresses()

-        for db_id, addresses in address_dict.items():
-            db_name, _ = unpack_db_identifier(db_id, "_")
+        for fs_id, addresses in address_dict.items():
+            fs_name, _ = unpack_fs_identifier(fs_id, "_")
             if addresses:
                 # Cap max length of SSDB
-                client_env[f"SSDB{db_name}"] = ",".join(addresses[:128])
+                client_env[f"SSDB{fs_name}"] = ",".join(addresses[:128])

                 # Retrieve num_shards to append to client env
-                client_env[f"SR_DB_TYPE{db_name}"] = (
+                client_env[f"SR_DB_TYPE{fs_name}"] = (
                     CLUSTERED if len(addresses) > 1 else STANDALONE
                 )
@@ -716,20 +726,20 @@ def _prep_entity_client_env(self, entity: Application) -> None:
             client_env["SSKEYOUT"] = entity.name

         # Set address to local if it's a colocated application
-        if entity.colocated and entity.run_settings.colocated_db_settings is not None:
-            db_name_colo = entity.run_settings.colocated_db_settings["db_identifier"]
-            assert isinstance(db_name_colo, str)
+        if entity.colocated and entity.run_settings.colocated_fs_settings is not None:
+            fs_name_colo = entity.run_settings.colocated_fs_settings["fs_identifier"]
+            assert isinstance(fs_name_colo, str)
             for key in address_dict:
-                _, db_id = unpack_db_identifier(key, "_")
-                if db_name_colo == db_id:
+                _, fs_id = unpack_fs_identifier(key, "_")
+                if fs_name_colo == fs_id:
                    raise
SSDBIDConflictError(
-                        f"Database identifier {db_name_colo}"
+                        f"Feature store identifier {fs_name_colo}"
                         " has already been used. Pass in a unique"
-                        " name for db_identifier"
+                        " name for fs_identifier"
                     )

-            db_name_colo = unpack_colo_db_identifier(db_name_colo)
-            if colo_cfg := entity.run_settings.colocated_db_settings:
+            fs_name_colo = unpack_colo_fs_identifier(fs_name_colo)
+            if colo_cfg := entity.run_settings.colocated_fs_settings:
                 port = colo_cfg.get("port", None)
                 socket = colo_cfg.get("unix_socket", None)
                 if socket and port:
@@ -737,62 +747,81 @@ def _prep_entity_client_env(self, entity: Application) -> None:
                         "Co-located was configured for both TCP/IP and UDS"
                     )
                 if port:
-                    client_env[f"SSDB{db_name_colo}"] = f"127.0.0.1:{str(port)}"
+                    client_env[f"SSDB{fs_name_colo}"] = f"127.0.0.1:{str(port)}"
                 elif socket:
-                    client_env[f"SSDB{db_name_colo}"] = f"unix://{socket}"
+                    client_env[f"SSDB{fs_name_colo}"] = f"unix://{socket}"
                 else:
                     raise SSInternalError(
-                        "Colocated database was not configured for either TCP or UDS"
+                        "Colocated feature store was not configured for either TCP or UDS"
                     )
-            client_env[f"SR_DB_TYPE{db_name_colo}"] = STANDALONE
+            client_env[f"SR_DB_TYPE{fs_name_colo}"] = STANDALONE
         entity.run_settings.update_env(client_env)

-    def _save_orchestrator(self, orchestrator: Orchestrator) -> None:
-        """Save the orchestrator object via pickle
+    def _save_feature_store(self, feature_store: FeatureStore) -> None:
+        """Save the FeatureStore object via pickle

-        This function saves the orchestrator information to a pickle
+        This function saves the feature store information to a pickle
         file that can be imported by subsequent experiments to reconnect
-        to the orchestrator.
+        to the featurestore.
-        :param orchestrator: Orchestrator configuration to be saved
+        :param featurestore: FeatureStore configuration to be saved
         """
-        if not orchestrator.is_active():
-            raise Exception("Orchestrator is not running")
+        if not feature_store.is_active():
+            raise Exception("Feature store is not running")

-        # Extract only the db_jobs associated with this particular orchestrator
-        if orchestrator.batch:
-            job_names = [orchestrator.name]
+        # Extract only the fs_jobs associated with this particular featurestore
+        if feature_store.batch:
+            job_names = [feature_store.name]
         else:
-            job_names = [dbnode.name for dbnode in orchestrator.entities]
-        db_jobs = {
-            name: job for name, job in self._jobs.db_jobs.items() if name in job_names
+            job_names = [fsnode.name for fsnode in feature_store.entities]
+        fs_jobs = {
+            name: job for name, job in self._jobs.fs_jobs.items() if name in job_names
         }

         # Extract the associated steps
         steps = [
-            self._launcher.step_mapping[db_job.name] for db_job in db_jobs.values()
+            self._launcher.step_mapping[fs_job.name] for fs_job in fs_jobs.values()
         ]

-        orc_data = {"db": orchestrator, "db_jobs": db_jobs, "steps": steps}
+        feature_store_data = {"fs": feature_store, "fs_jobs": fs_jobs, "steps": steps}

-        with open(orchestrator.checkpoint_file, "wb") as pickle_file:
-            pickle.dump(orc_data, pickle_file)
+        with open(feature_store.checkpoint_file,
"wb") as pickle_file: + pickle.dump(feature_store_data, pickle_file) - def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: - """Wait for the orchestrator instances to run + def _feature_store_launch_wait(self, featurestore: FeatureStore) -> None: + """Wait for the featurestore instances to run - In the case where the orchestrator is launched as a batch - through a WLM, we wait for the orchestrator to exit the + In the case where the featurestore is launched as a batch + through a WLM, we wait for the featurestore to exit the queue before proceeding so new launched entities can be launched with SSDB address - :param orchestrator: orchestrator instance + :param featurestore: FeatureStore instance :raises SmartSimError: if launch fails or manually stopped by user """ - if orchestrator.batch: - logger.info("Orchestrator launched as a batch") - logger.info("While queued, SmartSim will wait for Orchestrator to run") + if featurestore.batch: + logger.info("FeatureStore launched as a batch") + logger.info("While queued, SmartSim will wait for FeatureStore to run") logger.info("CTRL+C interrupt to abort and cancel launch") ready = False @@ -804,20 +833,20 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: self._jobs.check_jobs() # _jobs.get_status acquires JM lock for main thread, no need for locking - statuses = self.get_entity_list_status(orchestrator) + statuses = self.get_entity_list_status(featurestore) if all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses): ready = True # TODO: Add a node status check elif any(stat in TERMINAL_STATUSES for stat in statuses): - self.stop_db(orchestrator) - msg = "Orchestrator failed during startup" - msg += f" See {orchestrator.path} for details" + self.stop_fs(featurestore) + msg = "FeatureStore failed during startup" + msg += f" See {featurestore.path} for details" raise SmartSimError(msg) else: - logger.debug("Waiting for orchestrator instances to spin up...") + 
logger.debug("Waiting for featurestore instances to spin up...") except KeyboardInterrupt: - logger.info("Orchestrator launch cancelled - requesting to stop") - self.stop_db(orchestrator) + logger.info("FeatureStore launch cancelled - requesting to stop") + self.stop_fs(featurestore) # re-raise keyboard interrupt so the job manager will display # any running and un-killed jobs as this method is only called @@ -825,82 +854,82 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: # launch explicitly raise - def reload_saved_db( + def reload_saved_fs( self, checkpoint_file: t.Union[str, os.PathLike[str]] - ) -> Orchestrator: + ) -> FeatureStore: with JM_LOCK: if not osp.exists(checkpoint_file): raise FileNotFoundError( - f"The SmartSim database config file {os.fspath(checkpoint_file)} " + f"The SmartSim feature store config file {os.fspath(checkpoint_file)} " "cannot be found." ) try: with open(checkpoint_file, "rb") as pickle_file: - db_config = pickle.load(pickle_file) + fs_config = pickle.load(pickle_file) except (OSError, IOError) as e: - msg = "Database checkpoint corrupted" + msg = "Feature store checkpoint corrupted" raise SmartSimError(msg) from e err_message = ( - "The SmartSim database checkpoint is incomplete or corrupted. " + "The SmartSim feature store checkpoint is incomplete or corrupted. " ) - if not "db" in db_config: + if not "fs" in fs_config: raise SmartSimError( - err_message + "Could not find the orchestrator object." + err_message + "Could not find the featurestore object." ) - if not "db_jobs" in db_config: + if not "fs_jobs" in fs_config: raise SmartSimError( - err_message + "Could not find database job objects." + err_message + "Could not find feature store job objects." ) - if not "steps" in db_config: + if not "steps" in fs_config: raise SmartSimError( - err_message + "Could not find database job objects." + err_message + "Could not find feature store job objects." 
) - orc: Orchestrator = db_config["db"] + feature_store: FeatureStore = fs_config["fs"] - # TODO check that each db_object is running + # TODO check that each fs_object is running - job_steps = zip(db_config["db_jobs"].values(), db_config["steps"]) + job_steps = zip(fs_config["fs_jobs"].values(), fs_config["steps"]) try: - for db_job, step in job_steps: - self._jobs.db_jobs[db_job.ename] = db_job - self._launcher.add_step_to_mapping_table(db_job.name, step) + for fs_job, step in job_steps: + self._jobs.fs_jobs[fs_job.ename] = fs_job + self._launcher.add_step_to_mapping_table(fs_job.name, step) if step.task_id: self._launcher.task_manager.add_existing(int(step.task_id)) except LauncherError as e: - raise SmartSimError("Failed to reconnect orchestrator") from e + raise SmartSimError("Failed to reconnect feature store") from e # start job manager if not already started if not self._jobs.actively_monitoring: self._jobs.start() - return orc + return feature_store - def _set_dbobjects(self, manifest: Manifest) -> None: - if not manifest.has_db_objects: + def _set_fsobjects(self, manifest: Manifest) -> None: + if not manifest.has_fs_objects: return - address_dict = self._jobs.get_db_host_addresses() + address_dict = self._jobs.get_fs_host_addresses() for ( - db_id, - db_addresses, + fs_id, + fs_addresses, ) in address_dict.items(): - db_name, name = unpack_db_identifier(db_id, "_") + fs_name, name = unpack_fs_identifier(fs_id, "_") - hosts = list({address.split(":")[0] for address in db_addresses}) - ports = list({int(address.split(":")[-1]) for address in db_addresses}) + hosts = list({address.split(":")[0] for address in fs_addresses}) + ports = list({int(address.split(":")[-1]) for address in fs_addresses}) - if not db_is_active(hosts=hosts, ports=ports, num_shards=len(db_addresses)): - raise SSInternalError("Cannot set DB Objects, DB is not running") + if not fs_is_active(hosts=hosts, ports=ports, num_shards=len(fs_addresses)): + raise SSInternalError("Cannot set FS 
Objects, FS is not running")

-            os.environ[f"SSDB{db_name}"] = db_addresses[0]
+            os.environ[f"SSDB{fs_name}"] = fs_addresses[0]

-            os.environ[f"SR_DB_TYPE{db_name}"] = (
-                CLUSTERED if len(db_addresses) > 1 else STANDALONE
+            os.environ[f"SR_DB_TYPE{fs_name}"] = (
+                CLUSTERED if len(fs_addresses) > 1 else STANDALONE
             )

             options = ConfigOptions.create_from_environment(name)
@@ -908,27 +937,27 @@ def _set_fsobjects(self, manifest: Manifest) -> None:

         for application in manifest.applications:
             if not application.colocated:
-                for db_model in application.db_models:
-                    set_ml_model(db_model, client)
-                for db_script in application.db_scripts:
-                    set_script(db_script, client)
+                for fs_model in application.fs_models:
+                    set_ml_model(fs_model, client)
+                for fs_script in application.fs_scripts:
+                    set_script(fs_script, client)

         for ensemble in manifest.ensembles:
-            for db_model in ensemble.db_models:
-                set_ml_model(db_model, client)
-            for db_script in ensemble.db_scripts:
-                set_script(db_script, client)
+            for fs_model in ensemble.fs_models:
+                set_ml_model(fs_model, client)
+            for fs_script in ensemble.fs_scripts:
+                set_script(fs_script, client)
             for entity in ensemble.applications:
                 if not entity.colocated:
                     # Set models which could belong only
                     # to the entities and not to the ensemble
                     # but avoid duplicates
-                    for db_model in entity.db_models:
-                        if db_model not in ensemble.db_models:
-                            set_ml_model(db_model, client)
-                    for db_script in entity.db_scripts:
-                        if db_script not in ensemble.db_scripts:
-                            set_script(db_script, client)
+                    for fs_model in entity.fs_models:
+                        if fs_model not in ensemble.fs_models:
+                            set_ml_model(fs_model, client)
+                    for fs_script in entity.fs_scripts:
+                        if fs_script not in ensemble.fs_scripts:
+                            set_script(fs_script, client)

     def _start_telemetry_monitor(self, exp_dir: str) -> None:
         """Spawns a telemetry monitor process to keep track of the life times
diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py
index 6941d7607..7a9db0927 100644
---
a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -76,9 +76,9 @@ def __init__(self) -> None: """Flag indicating if the entity has completed execution""" @property - def is_db(self) -> bool: - """Returns `True` if the entity represents a database or database shard""" - return self.type in ["orchestrator", "dbnode"] + def is_fs(self) -> bool: + """Returns `True` if the entity represents a feature store or feature store shard""" + return self.type in ["featurestore", "fsnode"] @property def is_managed(self) -> bool: @@ -112,13 +112,13 @@ def check_completion_status(self) -> None: self._is_complete = True @staticmethod - def _map_db_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: - """Map DB-specific properties from a runtime manifest onto a `JobEntity` + def _map_fs_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> None: + """Map FS-specific properties from a runtime manifest onto a `JobEntity` :param entity_dict: The raw dictionary deserialized from manifest JSON :param entity: The entity instance to modify """ - if entity.is_db: + if entity.is_fs: # add collectors if they're configured to be enabled in the manifest entity.collectors = { "client": entity_dict.get("client_file", ""), @@ -184,7 +184,7 @@ def from_manifest( cls._map_standard_metadata( entity_type, entity_dict, entity, exp_dir, raw_experiment ) - cls._map_db_metadata(entity_dict, entity) + cls._map_fs_metadata(entity_dict, entity) return entity @@ -222,7 +222,7 @@ def __init__( # output is only populated if it's system related (e.g. 
cmd failed immediately) self.output: t.Optional[str] = None self.error: t.Optional[str] = None # same as output - self.hosts: t.List[str] = [] # currently only used for DB jobs + self.hosts: t.List[str] = [] # currently only used for FS jobs self.launched_with = launcher self.is_task = is_task self.start_time = time.time() diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 645950a93..37f379024 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -33,8 +33,8 @@ from types import FrameType from ..._core.launcher.step import Step -from ...database import Orchestrator -from ...entity import DBNode, EntitySequence, SmartSimEntity +from ...database import FeatureStore +from ...entity import EntitySequence, FSNode, SmartSimEntity from ...log import ContextThread, get_logger from ...status import TERMINAL_STATUSES, SmartSimStatus from ..config import CONFIG @@ -67,7 +67,7 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: # active jobs self.jobs: t.Dict[str, Job] = {} - self.db_jobs: t.Dict[str, Job] = {} + self.fs_jobs: t.Dict[str, Job] = {} # completed jobs self.completed: t.Dict[str, Job] = {} @@ -130,8 +130,8 @@ def move_to_completed(self, job: Job) -> None: job.record_history() # remove from actively monitored jobs - if job.ename in self.db_jobs: - del self.db_jobs[job.ename] + if job.ename in self.fs_jobs: + del self.fs_jobs[job.ename] elif job.ename in self.jobs: del self.jobs[job.ename] @@ -143,7 +143,7 @@ def __getitem__(self, entity_name: str) -> Job: :returns: the Job associated with the entity_name """ with self._lock: - entities = ChainMap(self.db_jobs, self.jobs, self.completed) + entities = ChainMap(self.fs_jobs, self.jobs, self.completed) return entities[entity_name] def __call__(self) -> t.Dict[str, Job]: @@ -151,7 +151,7 @@ def __call__(self) -> t.Dict[str, Job]: :returns: Dictionary of all jobs """ - all_jobs = {**self.jobs, 
**self.db_jobs} + all_jobs = {**self.jobs, **self.fs_jobs} return all_jobs def __contains__(self, key: str) -> bool: @@ -177,10 +177,10 @@ def add_job( launcher = str(self._launcher) # all operations here should be atomic job = Job(step.name, job_id, step.entity, launcher, is_task) - if isinstance(step.entity, (DBNode, Orchestrator)): - self.db_jobs[step.entity.name] = job - elif isinstance(step.entity, JobEntity) and step.entity.is_db: - self.db_jobs[step.entity.name] = job + if isinstance(step.entity, (FSNode, FeatureStore)): + self.fs_jobs[step.entity.name] = job + elif isinstance(step.entity, JobEntity) and step.entity.is_fs: + self.fs_jobs[step.entity.name] = job else: self.jobs[step.entity.name] = job @@ -282,50 +282,50 @@ def restart_job( del self.completed[entity_name] job.reset(job_name, job_id, is_task) - if isinstance(job.entity, (DBNode, Orchestrator)): - self.db_jobs[entity_name] = job + if isinstance(job.entity, (FSNode, FeatureStore)): + self.fs_jobs[entity_name] = job else: self.jobs[entity_name] = job - def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: - """Retrieve the list of hosts for the database - for corresponding database identifiers + def get_fs_host_addresses(self) -> t.Dict[str, t.List[str]]: + """Retrieve the list of hosts for the feature store + for corresponding feature store identifiers :return: dictionary of host ip addresses """ address_dict: t.Dict[str, t.List[str]] = {} - for db_job in self.db_jobs.values(): + for fs_job in self.fs_jobs.values(): addresses = [] - if isinstance(db_job.entity, (DBNode, Orchestrator)): - db_entity = db_job.entity - for combine in itertools.product(db_job.hosts, db_entity.ports): + if isinstance(fs_job.entity, (FSNode, FeatureStore)): + fs_entity = fs_job.entity + for combine in itertools.product(fs_job.hosts, fs_entity.ports): ip_addr = get_ip_from_host(combine[0]) addresses.append(":".join((ip_addr, str(combine[1])))) - dict_entry: t.List[str] = address_dict.get(db_entity.db_identifier, 
[]) + dict_entry: t.List[str] = address_dict.get(fs_entity.fs_identifier, []) dict_entry.extend(addresses) - address_dict[db_entity.db_identifier] = dict_entry + address_dict[fs_entity.fs_identifier] = dict_entry return address_dict - def set_db_hosts(self, orchestrator: Orchestrator) -> None: - """Set the DB hosts in db_jobs so future entities can query this + def set_fs_hosts(self, FeatureStore: FeatureStore) -> None: + """Set the fs hosts in fs_jobs so future entities can query this - :param orchestrator: orchestrator instance + :param FeatureStore: FeatureStore instance """ # should only be called during launch in the controller with self._lock: - if orchestrator.batch: - self.db_jobs[orchestrator.name].hosts = orchestrator.hosts + if FeatureStore.batch: + self.fs_jobs[FeatureStore.name].hosts = FeatureStore.hosts else: - for dbnode in orchestrator.entities: - if not dbnode.is_mpmd: - self.db_jobs[dbnode.name].hosts = [dbnode.host] + for fsnode in FeatureStore.entities: + if not fsnode.is_mpmd: + self.fs_jobs[fsnode.name].hosts = [fsnode.host] else: - self.db_jobs[dbnode.name].hosts = dbnode.hosts + self.fs_jobs[fsnode.name].hosts = fsnode.hosts def signal_interrupt(self, signo: int, _frame: t.Optional[FrameType]) -> None: """Custom handler for whenever SIGINT is received""" @@ -361,4 +361,4 @@ def _thread_sleep(self) -> None: def __len__(self) -> int: # number of active jobs - return len(self.db_jobs) + len(self.jobs) + return len(self.fs_jobs) + len(self.jobs) diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index bf99bb050..36b030504 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -29,8 +29,8 @@ import typing as t from dataclasses import dataclass, field -from ...database import Orchestrator -from ...entity import Application, DBNode, Ensemble, EntitySequence, SmartSimEntity +from ...database import FeatureStore +from ...entity import Application, Ensemble, EntitySequence, 
FSNode, SmartSimEntity from ...error import SmartSimError from ..config import CONFIG from ..utils import helpers as _helpers @@ -38,7 +38,7 @@ _T = t.TypeVar("_T") _U = t.TypeVar("_U") -_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Application, DBNode) +_AtomicLaunchableT = t.TypeVar("_AtomicLaunchableT", Application, FSNode) if t.TYPE_CHECKING: import os @@ -50,7 +50,7 @@ class Manifest: `SmartSimEntity`-derived objects or `EntitySequence`-derived objects) can be accessed by using the corresponding accessor. - Instances of ``Application``, ``Ensemble`` and ``Orchestrator`` + Instances of ``Application``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments """ @@ -63,14 +63,14 @@ def __init__( self._check_entity_lists_nonempty() @property - def dbs(self) -> t.List[Orchestrator]: - """Return a list of Orchestrator instances in Manifest + def fss(self) -> t.List[FeatureStore]: + """Return a list of FeatureStore instances in Manifest - :raises SmartSimError: if user added to databases to manifest - :return: List of orchestrator instances + :raises SmartSimError: if user added to feature stores to manifest + :return: List of feature store instances """ - dbs = [item for item in self._deployables if isinstance(item, Orchestrator)] - return dbs + fss = [item for item in self._deployables if isinstance(item, FeatureStore)] + return fss @property def applications(self) -> t.List[Application]: @@ -94,14 +94,14 @@ def ensembles(self) -> t.List[Ensemble]: @property def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: """All entity lists, including ensembles and - exceptional ones like Orchestrator + exceptional ones like FeatureStore :return: list of entity lists """ _all_entity_lists: t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) - for db in self.dbs: - _all_entity_lists.append(db) + for fs in self.fss: + _all_entity_lists.append(fs) return _all_entity_lists @@ -144,7 +144,7 @@ def __str__(self) -> str: output = "" 
e_header = "=== Ensembles ===\n"
         m_header = "=== Applications ===\n"
-        db_header = "=== Database ===\n"
+        fs_header = "=== Feature Stores ===\n"

         if self.ensembles:
             output += e_header
@@ -168,27 +168,27 @@ def __str__(self) -> str:
                 output += f"Parameters: \n{_helpers.fmt_dict(application.params)}\n"
             output += "\n"

-        for adb in self.dbs:
-            output += db_header
-            output += f"Shards: {adb.num_shards}\n"
-            output += f"Port: {str(adb.ports[0])}\n"
-            output += f"Network: {adb._interfaces}\n"
-            output += f"Batch Launch: {adb.batch}\n"
-            if adb.batch:
-                output += f"{str(adb.batch_settings)}\n"
+        for afs in self.fss:
+            output += fs_header
+            output += f"Shards: {afs.num_shards}\n"
+            output += f"Port: {str(afs.ports[0])}\n"
+            output += f"Network: {afs._interfaces}\n"
+            output += f"Batch Launch: {afs.batch}\n"
+            if afs.batch:
+                output += f"{str(afs.batch_settings)}\n"
             output += "\n"

         return output

     @property
-    def has_db_objects(self) -> bool:
-        """Check if any entity has DBObjects to set"""
+    def has_fs_objects(self) -> bool:
+        """Check if any entity has FSObjects to set"""
         ents: t.Iterable[t.Union[Application, Ensemble]] = itertools.chain(
             self.applications,
             self.ensembles,
             (member for ens in self.ensembles for member in ens.entities),
         )
-        return any(any(ent.db_models) or any(ent.db_scripts) for ent in ents)
+        return any(any(ent.fs_models) or any(ent.fs_scripts) for ent in ents)


class _LaunchedManifestMetadata(t.NamedTuple):
@@ -222,7 +222,9 @@ class LaunchedManifest(t.Generic[_T]):
     metadata: _LaunchedManifestMetadata
     applications: t.Tuple[t.Tuple[Application, _T], ...]
     ensembles: t.Tuple[t.Tuple[Ensemble, t.Tuple[t.Tuple[Application, _T], ...]], ...]
-    databases: t.Tuple[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]], ...]
+    featurestores: t.Tuple[
+        t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]], ...
+ ] def map(self, func: t.Callable[[_T], _U]) -> "LaunchedManifest[_U]": def _map_entity_data( @@ -238,9 +240,9 @@ def _map_entity_data( (ens, _map_entity_data(func, application_data)) for ens, application_data in self.ensembles ), - databases=tuple( - (db_, _map_entity_data(func, node_data)) - for db_, node_data in self.databases + featurestores=tuple( + (fs_, _map_entity_data(func, node_data)) + for fs_, node_data in self.featurestores ), ) @@ -263,7 +265,7 @@ class LaunchedManifestBuilder(t.Generic[_T]): _ensembles: t.List[t.Tuple[Ensemble, t.Tuple[t.Tuple[Application, _T], ...]]] = ( field(default_factory=list, init=False) ) - _databases: t.List[t.Tuple[Orchestrator, t.Tuple[t.Tuple[DBNode, _T], ...]]] = ( + _featurestores: t.List[t.Tuple[FeatureStore, t.Tuple[t.Tuple[FSNode, _T], ...]]] = ( field(default_factory=list, init=False) ) @@ -281,8 +283,8 @@ def add_application(self, application: Application, data: _T) -> None: def add_ensemble(self, ens: Ensemble, data: t.Sequence[_T]) -> None: self._ensembles.append((ens, self._entities_to_data(ens.entities, data))) - def add_database(self, db_: Orchestrator, data: t.Sequence[_T]) -> None: - self._databases.append((db_, self._entities_to_data(db_.entities, data))) + def add_feature_store(self, fs_: FeatureStore, data: t.Sequence[_T]) -> None: + self._featurestores.append((fs_, self._entities_to_data(fs_.entities, data))) @staticmethod def _entities_to_data( @@ -307,7 +309,7 @@ def finalize(self) -> LaunchedManifest[_T]: ), applications=tuple(self._applications), ensembles=tuple(self._ensembles), - databases=tuple(self._databases), + featurestores=tuple(self._featurestores), ) diff --git a/smartsim/_core/control/previewrenderer.py b/smartsim/_core/control/previewrenderer.py index 857a70397..d3e4f6be2 100644 --- a/smartsim/_core/control/previewrenderer.py +++ b/smartsim/_core/control/previewrenderer.py @@ -65,7 +65,7 @@ def as_toggle(_eval_ctx: u.F, value: bool) -> str: @pass_eval_context def get_ifname(_eval_ctx: 
u.F, value: t.List[str]) -> str: - """Extract Network Interface from orchestrator run settings.""" + """Extract Network Interface from feature store run settings.""" if value: for val in value: if "ifname=" in val: @@ -75,12 +75,12 @@ def get_ifname(_eval_ctx: u.F, value: t.List[str]) -> str: @pass_eval_context -def get_dbtype(_eval_ctx: u.F, value: str) -> str: - """Extract data base type.""" +def get_fstype(_eval_ctx: u.F, value: str) -> str: + """Extract feature store type.""" if value: if "-cli" in value: - db_type, _ = value.split("/")[-1].split("-", 1) - return db_type + fs_type, _ = value.split("/")[-1].split("-", 1) + return fs_type return "" @@ -112,7 +112,7 @@ def render( verbosity_level: Verbosity = Verbosity.INFO, output_format: Format = Format.PLAINTEXT, output_filename: t.Optional[str] = None, - active_dbjobs: t.Optional[t.Dict[str, Job]] = None, + active_fsjobs: t.Optional[t.Dict[str, Job]] = None, ) -> str: """ Render the template from the supplied entities. @@ -133,7 +133,7 @@ def render( env.filters["as_toggle"] = as_toggle env.filters["get_ifname"] = get_ifname - env.filters["get_dbtype"] = get_dbtype + env.filters["get_fstype"] = get_fstype env.filters["is_list"] = is_list env.globals["Verbosity"] = Verbosity @@ -150,7 +150,7 @@ def render( rendered_preview = tpl.render( exp_entity=exp, - active_dbjobs=active_dbjobs, + active_dbjobs=active_fsjobs, manifest=manifest, config=CONFIG, verbosity_level=verbosity_level, diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index 508251fe0..44429adaf 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -58,14 +58,14 @@ def handle_signal(signo: int, _frame: t.Optional[FrameType]) -> None: cleanup() -def launch_db_model(client: Client, db_model: t.List[str]) -> str: +def launch_fs_model(client: Client, fs_model: t.List[str]) -> str: """Parse options to launch model on local cluster - :param client: SmartRedis client 
connected to local DB
-    :param db_model: List of arguments defining the model
+    :param client: SmartRedis client connected to local FS
+    :param fs_model: List of arguments defining the model
     :return: Name of model
     """
-    parser = argparse.ArgumentParser("Set ML model on DB")
+    parser = argparse.ArgumentParser("Set ML model on FS")
     parser.add_argument("--name", type=str)
     parser.add_argument("--file", type=str)
     parser.add_argument("--backend", type=str)
@@ -78,7 +78,7 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
     parser.add_argument("--tag", type=str, default="")
     parser.add_argument("--inputs", nargs="+", default=None)
     parser.add_argument("--outputs", nargs="+", default=None)
-    args = parser.parse_args(db_model)
+    args = parser.parse_args(fs_model)

     inputs = None
     outputs = None
@@ -122,14 +122,14 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str:
     return name


-def launch_db_script(client: Client, db_script: t.List[str]) -> str:
+def launch_fs_script(client: Client, fs_script: t.List[str]) -> str:
     """Parse options to launch script on local cluster

-    :param client: SmartRedis client connected to local DB
-    :param db_model: List of arguments defining the script
+    :param client: SmartRedis client connected to local FS
+    :param fs_script: List of arguments defining the script
     :return: Name of model
     """
-    parser = argparse.ArgumentParser("Set script on DB")
+    parser = argparse.ArgumentParser("Set script on FS")
     parser.add_argument("--name", type=str)
     parser.add_argument("--func", type=str)
     parser.add_argument("--file", type=str)
@@ -137,7 +137,7 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str:
     parser.add_argument("--device", type=str)
     parser.add_argument("--devices_per_node", type=int, default=1)
     parser.add_argument("--first_device", type=int, default=0)
-    args = parser.parse_args(db_script)
+    args = parser.parse_args(fs_script)

     if args.file and args.func:
         raise ValueError("Both file and func cannot be
provided.") @@ -165,11 +165,11 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: def main( network_interface: str, - db_cpus: int, + fs_cpus: int, command: t.List[str], - db_models: t.List[t.List[str]], - db_scripts: t.List[t.List[str]], - db_identifier: str, + fs_models: t.List[t.List[str]], + fs_scripts: t.List[t.List[str]], + fs_identifier: str, ) -> None: # pylint: disable=too-many-statements global DBPID # pylint: disable=global-statement @@ -198,7 +198,7 @@ def main( try: hostname = socket.gethostname() filename = ( - f"colo_orc_{hostname}.log" + f"colo_feature_store_{hostname}.log" if os.getenv("SMARTSIM_LOG_LEVEL") == "debug" else os.devnull ) @@ -210,66 +210,68 @@ def main( except Exception as e: cleanup() - logger.error(f"Failed to start database process: {str(e)}") + logger.error(f"Failed to start feature store process: {str(e)}") raise SSInternalError("Colocated process failed to start") from e try: logger.debug( - "\n\nColocated database information\n" + "\n\nColocated feature store information\n" f"\n\tIP Address(es): {' '.join(ip_addresses + [lo_address])}" f"\n\tCommand: {' '.join(cmd)}\n\n" - f"\n\t# of Database CPUs: {db_cpus}" - f"\n\tDatabase Identifier: {db_identifier}" + f"\n\t# of Feature Store CPUs: {fs_cpus}" + f"\n\tFeature Store Identifier: {fs_identifier}" ) except Exception as e: cleanup() - logger.error(f"Failed to start database process: {str(e)}") + logger.error(f"Failed to start feature store process: {str(e)}") raise SSInternalError("Colocated process failed to start") from e - def launch_models(client: Client, db_models: t.List[t.List[str]]) -> None: - for i, db_model in enumerate(db_models): + def launch_models(client: Client, fs_models: t.List[t.List[str]]) -> None: + for i, fs_model in enumerate(fs_models): logger.debug("Uploading model") - model_name = launch_db_model(client, db_model) - logger.debug(f"Added model {model_name} ({i+1}/{len(db_models)})") + model_name = launch_fs_model(client, fs_model) + 
logger.debug(f"Added model {model_name} ({i+1}/{len(fs_models)})") - def launch_db_scripts(client: Client, db_scripts: t.List[t.List[str]]) -> None: - for i, db_script in enumerate(db_scripts): + def launch_fs_scripts(client: Client, fs_scripts: t.List[t.List[str]]) -> None: + for i, fs_script in enumerate(fs_scripts): logger.debug("Uploading script") - script_name = launch_db_script(client, db_script) - logger.debug(f"Added script {script_name} ({i+1}/{len(db_scripts)})") + script_name = launch_fs_script(client, fs_script) + logger.debug(f"Added script {script_name} ({i+1}/{len(fs_scripts)})") try: - if db_models or db_scripts: + if fs_models or fs_scripts: try: - options = ConfigOptions.create_from_environment(db_identifier) + options = ConfigOptions.create_from_environment(fs_identifier) client = Client(options, logger_name="SmartSim") - launch_models(client, db_models) - launch_db_scripts(client, db_scripts) + launch_models(client, fs_models) + launch_fs_scripts(client, fs_scripts) except (RedisConnectionError, RedisReplyError) as ex: raise SSInternalError( - "Failed to set model or script, could not connect to database" + "Failed to set model or script, could not connect to feature store" ) from ex # Make sure we don't keep this around del client except Exception as e: cleanup() - logger.error(f"Colocated database process failed: {str(e)}") + logger.error(f"Colocated feature store process failed: {str(e)}") raise SSInternalError("Colocated entrypoint raised an error") from e def cleanup() -> None: try: - logger.debug("Cleaning up colocated database") - # attempt to stop the database process - db_proc = psutil.Process(DBPID) - db_proc.terminate() + logger.debug("Cleaning up colocated feature store") + # attempt to stop the feature store process + fs_proc = psutil.Process(DBPID) + fs_proc.terminate() except psutil.NoSuchProcess: - logger.warning("Couldn't find database process to kill.") + logger.warning("Couldn't find feature store process to kill.") except 
OSError as e: - logger.warning(f"Failed to clean up colocated database gracefully: {str(e)}") + logger.warning( + f"Failed to clean up colocated feature store gracefully: {str(e)}" + ) finally: if LOCK.is_locked: LOCK.release() @@ -294,27 +296,27 @@ def register_signal_handlers() -> None: "+lockfile", type=str, help="Filename to create for single proc per host" ) arg_parser.add_argument( - "+db_cpus", type=int, default=2, help="Number of CPUs to use for DB" + "+fs_cpus", type=int, default=2, help="Number of CPUs to use for FS" ) arg_parser.add_argument( - "+db_identifier", type=str, default="", help="Database Identifier" + "+fs_identifier", type=str, default="", help="Feature Store Identifier" ) arg_parser.add_argument("+command", nargs="+", help="Command to run") arg_parser.add_argument( - "+db_model", + "+fs_model", nargs="+", action="append", default=[], - help="Model to set on DB", + help="Model to set on FS", ) arg_parser.add_argument( - "+db_script", + "+fs_script", nargs="+", action="append", default=[], - help="Script to set on DB", + help="Script to set on FS", ) os.environ["PYTHONUNBUFFERED"] = "1" @@ -325,20 +327,22 @@ def register_signal_handlers() -> None: LOCK = filelock.FileLock(tmp_lockfile) LOCK.acquire(timeout=0.1) - logger.debug(f"Starting colocated database on host: {socket.gethostname()}") + logger.debug( + f"Starting colocated feature store on host: {socket.gethostname()}" + ) # make sure to register the cleanup before we start # the proecss so our signaller will be able to stop - # the database process. + # the feature store process. 
register_signal_handlers() main( parsed_args.ifname, - parsed_args.db_cpus, + parsed_args.fs_cpus, parsed_args.command, - parsed_args.db_model, - parsed_args.db_script, - parsed_args.db_identifier, + parsed_args.fs_model, + parsed_args.fs_script, + parsed_args.fs_identifier, ) # gracefully exit the processes in the distributed application that diff --git a/smartsim/_core/entrypoints/dragon.py b/smartsim/_core/entrypoints/dragon.py index 92ebd735f..2bfde74f2 100644 --- a/smartsim/_core/entrypoints/dragon.py +++ b/smartsim/_core/entrypoints/dragon.py @@ -297,7 +297,7 @@ def cleanup() -> None: def register_signal_handlers() -> None: # make sure to register the cleanup before the start # the process so our signaller will be able to stop - # the database process. + # the feature store process. for sig in SIGNALS: signal.signal(sig, handle_signal) diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index 6944f3a38..38dc9a7ec 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -61,7 +61,7 @@ def main( :param cmd: a base64 encoded cmd to execute :param entity_type: `SmartSimEntity` entity class. Valid values - include: orchestrator, dbnode, ensemble, application + include: feature store, fsnode, ensemble, application :param cwd: working directory to execute the cmd from :param status_dir: path to the output directory for status updates """ @@ -233,7 +233,7 @@ def get_parser() -> argparse.ArgumentParser: logger.debug("Starting indirect step execution") # make sure to register the cleanup before the start the process - # so our signaller will be able to stop the database process. + # so our signaller will be able to stop the feature store process. 
register_signal_handlers() rc = main( diff --git a/smartsim/_core/entrypoints/redis.py b/smartsim/_core/entrypoints/redis.py index c4d8cbbd6..995c6faa0 100644 --- a/smartsim/_core/entrypoints/redis.py +++ b/smartsim/_core/entrypoints/redis.py @@ -121,21 +121,21 @@ def main(args: argparse.Namespace) -> int: print(line.decode("utf-8").rstrip(), flush=True) except Exception: cleanup() - logger.error("Database process starter raised an exception", exc_info=True) + logger.error("Feature store process starter raised an exception", exc_info=True) return 1 return 0 def cleanup() -> None: - logger.debug("Cleaning up database instance") + logger.debug("Cleaning up feature store instance") try: - # attempt to stop the database process + # attempt to stop the feature store process if DBPID is not None: psutil.Process(DBPID).terminate() except psutil.NoSuchProcess: - logger.warning("Couldn't find database process to kill.") + logger.warning("Couldn't find feature store process to kill.") except OSError as e: - logger.warning(f"Failed to clean up database gracefully: {str(e)}") + logger.warning(f"Failed to clean up feature store gracefully: {str(e)}") if __name__ == "__main__": @@ -145,12 +145,12 @@ def cleanup() -> None: prefix_chars="+", description="SmartSim Process Launcher" ) parser.add_argument( - "+orc-exe", type=str, help="Path to the orchestrator executable", required=True + "+orc-exe", type=str, help="Path to the feature store executable", required=True ) parser.add_argument( "+conf-file", type=str, - help="Path to the orchestrator configuration file", + help="Path to the feature store configuration file", required=True, ) parser.add_argument( @@ -169,7 +169,7 @@ def cleanup() -> None: parser.add_argument( "+port", type=int, - help="The port on which to launch the shard of the orchestrator", + help="The port on which to launch the shard of the feature store", required=True, ) parser.add_argument( @@ -178,14 +178,14 @@ def cleanup() -> None: parser.add_argument( 
"+cluster", action="store_true", - help="Specify if this orchestrator shard is part of a cluster", + help="Specify if this feature store shard is part of a cluster", ) args_ = parser.parse_args() # make sure to register the cleanup before the start # the process so our signaller will be able to stop - # the database process. + # the feature store process. for sig in SIGNALS: signal.signal(sig, handle_signal) diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index a34219026..e17f43b85 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -35,7 +35,7 @@ from tabulate import tabulate -from ...database import Orchestrator +from ...database import FeatureStore from ...entity import Application, Ensemble, TaggedFilesHierarchy from ...log import get_logger from ..control import Manifest @@ -105,7 +105,7 @@ def generate_experiment(self, *args: t.Any) -> None: generator_manifest = Manifest(*args) self._gen_exp_dir() - self._gen_orc_dir(generator_manifest.dbs) + self._gen_feature_store_dir(generator_manifest.fss) self._gen_entity_list_dir(generator_manifest.ensembles) self._gen_entity_dirs(generator_manifest.applications) @@ -154,21 +154,23 @@ def _gen_exp_dir(self) -> None: dt_string = datetime.now().strftime("%d/%m/%Y %H:%M:%S") log_file.write(f"Generation start date and time: {dt_string}\n") - def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: + def _gen_feature_store_dir(self, feature_store_list: t.List[FeatureStore]) -> None: """Create the directory that will hold the error, output and - configuration files for the orchestrator. + configuration files for the feature store. 
- :param orchestrator: Orchestrator instance + :param featurestore: FeatureStore instance """ - # Loop through orchestrators - for orchestrator in orchestrator_list: - orc_path = path.join(self.gen_path, orchestrator.name) - - orchestrator.set_path(orc_path) - # Always remove orchestrator files if present. - if path.isdir(orc_path): - shutil.rmtree(orc_path, ignore_errors=True) - pathlib.Path(orc_path).mkdir(exist_ok=self.overwrite, parents=True) + # Loop through feature stores + for featurestore in feature_store_list: + feature_store_path = path.join(self.gen_path, featurestore.name) + + featurestore.set_path(feature_store_path) + # Always remove featurestore files if present. + if path.isdir(feature_store_path): + shutil.rmtree(feature_store_path, ignore_errors=True) + pathlib.Path(feature_store_path).mkdir( + exist_ok=self.overwrite, parents=True + ) def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index c69a9cef1..9f307968b 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -27,14 +27,14 @@ import sys import typing as t -from ...entity.dbobject import DBModel, DBScript +from ...entity.dbobject import FSModel, FSScript from ...error import SSInternalError from ..config import CONFIG from ..utils.helpers import create_lockfile_name def write_colocated_launch_script( - file_name: str, db_log: str, colocated_settings: t.Dict[str, t.Any] + file_name: str, fs_log: str, colocated_settings: t.Dict[str, t.Any] ) -> None: """Write the colocated launch script @@ -42,11 +42,11 @@ def write_colocated_launch_script( is created for this entity. 
:param file_name: name of the script to write - :param db_log: log file for the db - :param colocated_settings: db settings from entity run_settings + :param fs_log: log file for the fs + :param colocated_settings: fs settings from entity run_settings """ - colocated_cmd = _build_colocated_wrapper_cmd(db_log, **colocated_settings) + colocated_cmd = _build_colocated_wrapper_cmd(fs_log, **colocated_settings) with open(file_name, "w", encoding="utf-8") as script_file: script_file.write("#!/bin/bash\n") @@ -78,24 +78,24 @@ def write_colocated_launch_script( def _build_colocated_wrapper_cmd( - db_log: str, + fs_log: str, cpus: int = 1, rai_args: t.Optional[t.Dict[str, str]] = None, - extra_db_args: t.Optional[t.Dict[str, str]] = None, + extra_fs_args: t.Optional[t.Dict[str, str]] = None, port: int = 6780, ifname: t.Optional[t.Union[str, t.List[str]]] = None, custom_pinning: t.Optional[str] = None, **kwargs: t.Any, ) -> str: - """Build the command use to run a colocated DB application + """Build the command use to run a colocated fs application - :param db_log: log file for the db - :param cpus: db cpus + :param fs_log: log file for the fs + :param cpus: fs cpus :param rai_args: redisai args - :param extra_db_args: extra redis args - :param port: port to bind DB to - :param ifname: network interface(s) to bind DB to - :param db_cpu_list: The list of CPUs that the database should be limited to + :param extra_fs_args: extra redis args + :param port: port to bind fs to + :param ifname: network interface(s) to bind fs to + :param fs_cpu_list: The list of CPUs that the feature store should be limited to :return: the command to run """ # pylint: disable=too-many-locals @@ -108,8 +108,8 @@ def _build_colocated_wrapper_cmd( lockfile = create_lockfile_name() # create the command that will be used to launch the - # database with the python entrypoint for starting - # up the backgrounded db process + # feature store with the python entrypoint for starting + # up the backgrounded fs 
process cmd = [ sys.executable, @@ -117,7 +117,7 @@ def _build_colocated_wrapper_cmd( "smartsim._core.entrypoints.colocated", "+lockfile", lockfile, - "+db_cpus", + "+fs_cpus", str(cpus), ] # Add in the interface if using TCP/IP @@ -126,12 +126,12 @@ def _build_colocated_wrapper_cmd( ifname = [ifname] cmd.extend(["+ifname", ",".join(ifname)]) cmd.append("+command") - # collect DB binaries and libraries from the config + # collect fs binaries and libraries from the config - db_cmd = [] + fs_cmd = [] if custom_pinning: - db_cmd.extend(["taskset", "-c", custom_pinning]) - db_cmd.extend( + fs_cmd.extend(["taskset", "-c", custom_pinning]) + fs_cmd.extend( [CONFIG.database_exe, CONFIG.database_conf, "--loadmodule", CONFIG.redisai] ) @@ -140,16 +140,16 @@ def _build_colocated_wrapper_cmd( if value: # RAI wants arguments for inference in all caps # ex. THREADS_PER_QUEUE=1 - db_cmd.append(f"{arg.upper()} {str(value)}") + fs_cmd.append(f"{arg.upper()} {str(value)}") - db_cmd.extend(["--port", str(port)]) + fs_cmd.extend(["--port", str(port)]) # Add socket and permissions for UDS unix_socket = kwargs.get("unix_socket", None) socket_permissions = kwargs.get("socket_permissions", None) if unix_socket and socket_permissions: - db_cmd.extend( + fs_cmd.extend( [ "--unixsocket", str(unix_socket), @@ -162,72 +162,72 @@ def _build_colocated_wrapper_cmd( "`unix_socket` and `socket_permissions` must both be defined or undefined." ) - db_cmd.extend( - ["--logfile", db_log] + fs_cmd.extend( + ["--logfile", fs_log] ) # usually /dev/null, unless debug was specified - if extra_db_args: - for db_arg, value in extra_db_args.items(): - # replace "_" with "-" in the db_arg because we use kwargs + if extra_fs_args: + for fs_arg, value in extra_fs_args.items(): + # replace "_" with "-" in the fs_arg because we use kwargs # for the extra configurations and Python doesn't allow a hyphen # in a variable name. All redis and KeyDB configuration options # use hyphens in their names. 
- db_arg = db_arg.replace("_", "-") - db_cmd.extend([f"--{db_arg}", value]) + fs_arg = fs_arg.replace("_", "-") + fs_cmd.extend([f"--{fs_arg}", value]) - db_models = kwargs.get("db_models", None) - if db_models: - db_model_cmd = _build_db_model_cmd(db_models) - db_cmd.extend(db_model_cmd) + fs_models = kwargs.get("fs_models", None) + if fs_models: + fs_model_cmd = _build_fs_model_cmd(fs_models) + fs_cmd.extend(fs_model_cmd) - db_scripts = kwargs.get("db_scripts", None) - if db_scripts: - db_script_cmd = _build_db_script_cmd(db_scripts) - db_cmd.extend(db_script_cmd) + fs_scripts = kwargs.get("fs_scripts", None) + if fs_scripts: + fs_script_cmd = _build_fs_script_cmd(fs_scripts) + fs_cmd.extend(fs_script_cmd) - cmd.extend(db_cmd) + cmd.extend(fs_cmd) return " ".join(cmd) -def _build_db_model_cmd(db_models: t.List[DBModel]) -> t.List[str]: +def _build_fs_model_cmd(fs_models: t.List[FSModel]) -> t.List[str]: cmd = [] - for db_model in db_models: - cmd.append("+db_model") - cmd.append(f"--name={db_model.name}") - - # Here db_model.file is guaranteed to exist - # because we don't allow the user to pass a serialized DBModel - cmd.append(f"--file={db_model.file}") - - cmd.append(f"--backend={db_model.backend}") - cmd.append(f"--device={db_model.device}") - cmd.append(f"--devices_per_node={db_model.devices_per_node}") - cmd.append(f"--first_device={db_model.first_device}") - if db_model.batch_size: - cmd.append(f"--batch_size={db_model.batch_size}") - if db_model.min_batch_size: - cmd.append(f"--min_batch_size={db_model.min_batch_size}") - if db_model.min_batch_timeout: - cmd.append(f"--min_batch_timeout={db_model.min_batch_timeout}") - if db_model.tag: - cmd.append(f"--tag={db_model.tag}") - if db_model.inputs: - cmd.append("--inputs=" + ",".join(db_model.inputs)) - if db_model.outputs: - cmd.append("--outputs=" + ",".join(db_model.outputs)) + for fs_model in fs_models: + cmd.append("+fs_model") + cmd.append(f"--name={fs_model.name}") + + # Here fs_model.file is 
guaranteed to exist + # because we don't allow the user to pass a serialized FSModel + cmd.append(f"--file={fs_model.file}") + + cmd.append(f"--backend={fs_model.backend}") + cmd.append(f"--device={fs_model.device}") + cmd.append(f"--devices_per_node={fs_model.devices_per_node}") + cmd.append(f"--first_device={fs_model.first_device}") + if fs_model.batch_size: + cmd.append(f"--batch_size={fs_model.batch_size}") + if fs_model.min_batch_size: + cmd.append(f"--min_batch_size={fs_model.min_batch_size}") + if fs_model.min_batch_timeout: + cmd.append(f"--min_batch_timeout={fs_model.min_batch_timeout}") + if fs_model.tag: + cmd.append(f"--tag={fs_model.tag}") + if fs_model.inputs: + cmd.append("--inputs=" + ",".join(fs_model.inputs)) + if fs_model.outputs: + cmd.append("--outputs=" + ",".join(fs_model.outputs)) return cmd -def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: +def _build_fs_script_cmd(fs_scripts: t.List[FSScript]) -> t.List[str]: cmd = [] - for db_script in db_scripts: - cmd.append("+db_script") - cmd.append(f"--name={db_script.name}") - if db_script.func: - # Notice that here db_script.func is guaranteed to be a str + for fs_script in fs_scripts: + cmd.append("+fs_script") + cmd.append(f"--name={fs_script.name}") + if fs_script.func: + # Notice that here fs_script.func is guaranteed to be a str # because we don't allow the user to pass a serialized function - func = db_script.func + func = fs_script.func sanitized_func = func.replace("\n", "\\n") if not ( sanitized_func.startswith("'") @@ -236,9 +236,9 @@ def _build_db_script_cmd(db_scripts: t.List[DBScript]) -> t.List[str]: ): sanitized_func = '"' + sanitized_func + '"' cmd.append(f"--func={sanitized_func}") - elif db_script.file: - cmd.append(f"--file={db_script.file}") - cmd.append(f"--device={db_script.device}") - cmd.append(f"--devices_per_node={db_script.devices_per_node}") - cmd.append(f"--first_device={db_script.first_device}") + elif fs_script.file: + 
cmd.append(f"--file={fs_script.file}") + cmd.append(f"--device={fs_script.device}") + cmd.append(f"--devices_per_node={fs_script.devices_per_node}") + cmd.append(f"--first_device={fs_script.first_device}") return cmd diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 0753354b4..047e75d2c 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -29,7 +29,7 @@ import typing as t from shlex import split as sh_split -from ....entity import Application, DBNode +from ....entity import Application, FSNode from ....error import AllocationError from ....log import get_logger from ....settings import AprunSettings, RunSettings, Singularity @@ -40,7 +40,7 @@ class AprunStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: AprunSettings + self, entity: t.Union[Application, FSNode], run_settings: AprunSettings ) -> None: """Initialize a ALPS aprun job step @@ -77,9 +77,9 @@ def get_launch_cmd(self) -> t.List[str]: aprun_cmd.extend(self.run_settings.format_env_vars()) aprun_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now aprun_cmd.extend(["--cc", "none"]) # Replace the command with the entrypoint wrapper script diff --git a/smartsim/_core/launcher/step/dragonStep.py b/smartsim/_core/launcher/step/dragonStep.py index 036a9e565..a0a3e038d 100644 --- a/smartsim/_core/launcher/step/dragonStep.py +++ b/smartsim/_core/launcher/step/dragonStep.py @@ -68,7 +68,7 @@ def get_launch_cmd(self) -> t.List[str]: run_settings = self.run_settings exe_cmd = [] - if run_settings.colocated_db_settings: + if run_settings.colocated_fs_settings: # Replace the command with the entrypoint wrapper script bash = shutil.which("bash") 
if not bash: diff --git a/smartsim/_core/launcher/step/localStep.py b/smartsim/_core/launcher/step/localStep.py index 06fa57459..7d02ca70f 100644 --- a/smartsim/_core/launcher/step/localStep.py +++ b/smartsim/_core/launcher/step/localStep.py @@ -28,14 +28,14 @@ import shutil import typing as t -from ....entity import Application, DBNode +from ....entity import Application, FSNode from ....settings import Singularity from ....settings.base import RunSettings from .step import Step, proxyable_launch_cmd class LocalStep(Step): - def __init__(self, entity: t.Union[Application, DBNode], run_settings: RunSettings): + def __init__(self, entity: t.Union[Application, FSNode], run_settings: RunSettings): super().__init__(entity, run_settings) self.run_settings = entity.run_settings self._env = self._set_env() @@ -55,7 +55,7 @@ def get_launch_cmd(self) -> t.List[str]: run_args = self.run_settings.format_run_args() cmd.extend(run_args) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # Replace the command with the entrypoint wrapper script if not (bash := shutil.which("bash")): raise RuntimeError("Unable to locate bash interpreter") diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 8c3951bd1..c7e56d2ec 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -28,7 +28,7 @@ import shutil import typing as t -from ....entity import Application, DBNode +from ....entity import Application, FSNode from ....error import AllocationError from ....log import get_logger from ....settings import BsubBatchSettings, JsrunSettings @@ -40,7 +40,7 @@ class BsubBatchStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], batch_settings: BsubBatchSettings + self, entity: t.Union[Application, FSNode], batch_settings: BsubBatchSettings ) -> None: """Initialize a LSF bsub step @@ -106,7 +106,7 @@ def _write_script(self) -> str: class 
JsrunStep(Step): - def __init__(self, entity: t.Union[Application, DBNode], run_settings: RunSettings): + def __init__(self, entity: t.Union[Application, FSNode], run_settings: RunSettings): """Initialize a LSF jsrun job step :param name: name of the entity to be launched @@ -173,9 +173,9 @@ def get_launch_cmd(self) -> t.List[str]: jsrun_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now jsrun_cmd.extend(["--bind", "none"]) # Replace the command with the entrypoint wrapper script diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 4ee10e4d2..931f901b4 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -29,7 +29,7 @@ import typing as t from shlex import split as sh_split -from ....entity import Application, DBNode +from ....entity import Application, FSNode from ....error import AllocationError, SmartSimError from ....log import get_logger from ....settings import MpiexecSettings, MpirunSettings, OrterunSettings @@ -41,7 +41,7 @@ class _BaseMPIStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: RunSettings + self, entity: t.Union[Application, FSNode], run_settings: RunSettings ) -> None: """Initialize a job step conforming to the MPI standard @@ -76,9 +76,9 @@ def get_launch_cmd(self) -> t.List[str]: # add mpi settings to command mpi_cmd.extend(self.run_settings.format_run_args()) - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # disable cpu binding as the entrypoint will set that - # for the application and database process now + # for the application and feature store process now # mpi_cmd.extend(["--cpu-bind", "none"]) # Replace the command with 
the entrypoint wrapper script @@ -156,7 +156,7 @@ def _make_mpmd(self) -> t.List[str]: class MpiexecStep(_BaseMPIStep): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: MpiexecSettings + self, entity: t.Union[Application, FSNode], run_settings: MpiexecSettings ) -> None: """Initialize an mpiexec job step @@ -172,7 +172,7 @@ def __init__( class MpirunStep(_BaseMPIStep): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: MpirunSettings + self, entity: t.Union[Application, FSNode], run_settings: MpirunSettings ) -> None: """Initialize an mpirun job step @@ -188,7 +188,7 @@ def __init__( class OrterunStep(_BaseMPIStep): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: OrterunSettings + self, entity: t.Union[Application, FSNode], run_settings: OrterunSettings ) -> None: """Initialize an orterun job step diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 34cc0587c..b9e3b3f0c 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -26,7 +26,7 @@ import typing as t -from ....entity import Application, DBNode +from ....entity import Application, FSNode from ....log import get_logger from ....settings import QsubBatchSettings from .step import Step @@ -36,7 +36,7 @@ class QsubBatchStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], batch_settings: QsubBatchSettings + self, entity: t.Union[Application, FSNode], batch_settings: QsubBatchSettings ) -> None: """Initialize a PBSpro qsub step diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 58fcdf97f..3f178d974 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -29,7 +29,7 @@ import typing as t from shlex import split as sh_split -from ....entity import Application, DBNode, Ensemble +from ....entity import Application, Ensemble, FSNode 
from ....error import AllocationError from ....log import get_logger from ....settings import RunSettings, SbatchSettings, Singularity, SrunSettings @@ -40,7 +40,7 @@ class SbatchStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], batch_settings: SbatchSettings + self, entity: t.Union[Application, FSNode], batch_settings: SbatchSettings ) -> None: """Initialize a Slurm Sbatch step @@ -102,7 +102,7 @@ def _write_script(self) -> str: class SrunStep(Step): def __init__( - self, entity: t.Union[Application, DBNode], run_settings: SrunSettings + self, entity: t.Union[Application, FSNode], run_settings: SrunSettings ) -> None: """Initialize a srun job step @@ -146,7 +146,7 @@ def get_launch_cmd(self) -> t.List[str]: srun_cmd += self.run_settings.format_run_args() - if self.run_settings.colocated_db_settings: + if self.run_settings.colocated_fs_settings: # Replace the command with the entrypoint wrapper script bash = shutil.which("bash") if not bash: @@ -190,7 +190,7 @@ def _get_mpmd(self) -> t.List[RunSettings]: return self.run_settings.mpmd @staticmethod - def _get_exe_args_list(entity: t.Union[Application, DBNode]) -> t.List[str]: + def _get_exe_args_list(entity: t.Union[Application, FSNode]) -> t.List[str]: """Convenience function to encapsulate checking the runsettings.exe_args type to always return a list """ diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index 556e21972..c2aa444c0 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -37,7 +37,7 @@ from smartsim._core.config import CONFIG from smartsim.error.errors import SmartSimError, UnproxyableStepError -from ....entity import Application, DBNode, Ensemble +from ....entity import Application, Ensemble, FSNode from ....log import get_logger from ....settings.base import RunSettings, SettingsBase from ...utils.helpers import encode_cmd, get_base_36_repr @@ -48,7 +48,7 @@ class Step: def __init__( - self, 
entity: t.Union[Application, DBNode], step_settings: SettingsBase + self, entity: t.Union[Application, FSNode], step_settings: SettingsBase ) -> None: self.name = self._create_unique_name(entity.name) self.entity = entity @@ -109,20 +109,20 @@ def get_colocated_launch_script(self) -> str: ) makedirs(osp.dirname(script_path), exist_ok=True) - db_settings = {} + fs_settings = {} if isinstance(self.step_settings, RunSettings): - db_settings = self.step_settings.colocated_db_settings or {} + fs_settings = self.step_settings.colocated_fs_settings or {} - # db log file causes write contention and kills performance so by + # fs log file causes write contention and kills performance so by # default we turn off logging unless user specified debug=True - if db_settings.get("debug", False): - db_log_file = self.get_step_file(ending="-db.log") + if fs_settings.get("debug", False): + fs_log_file = self.get_step_file(ending="-fs.log") else: - db_log_file = "/dev/null" + fs_log_file = "/dev/null" # write the colocated wrapper shell script to the directory for this # entity currently being prepped to launch - write_colocated_launch_script(script_path, db_log_file, db_settings) + write_colocated_launch_script(script_path, fs_log_file, fs_settings) return script_path # pylint: disable=no-self-use diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index 3ea928797..584a417a2 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -32,4 +32,4 @@ installed_redisai_backends, is_crayex_platform, ) -from .redis import check_cluster_status, create_cluster, db_is_active +from .redis import check_cluster_status, create_cluster, fs_is_active diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index def6220a8..a56517dbf 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -48,27 +48,27 @@ _TSignalHandlerFn = t.Callable[[int, t.Optional["FrameType"]], object] -def 
unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: - """Unpack the unformatted database identifier +def unpack_fs_identifier(fs_id: str, token: str) -> t.Tuple[str, str]: + """Unpack the unformatted feature store identifier and format for env variable suffix using the token - :param db_id: the unformatted database identifier eg. identifier_1 - :param token: character to use to construct the db suffix - :return: db id suffix and formatted db_id e.g. ("_identifier_1", "identifier_1") + :param fs_id: the unformatted feature store identifier eg. identifier_1 + :param token: character to use to construct the fs suffix + :return: fs id suffix and formatted fs_id e.g. ("_identifier_1", "identifier_1") """ - if db_id == "orchestrator": + if fs_id == "featurestore": return "", "" - db_name_suffix = token + db_id - return db_name_suffix, db_id + fs_name_suffix = token + fs_id + return fs_name_suffix, fs_id -def unpack_colo_db_identifier(db_id: str) -> str: - """Create database identifier suffix for colocated database +def unpack_colo_fs_identifier(fs_id: str) -> str: + """Create feature store identifier suffix for colocated feature store - :param db_id: the unformatted database identifier - :return: db suffix + :param fs_id: the unformatted feature store identifier + :return: fs suffix """ - return "_" + db_id if db_id else "" + return "_" + fs_id if fs_id else "" def create_short_id_str() -> str: diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index 7fa59ad83..d033cd067 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -35,7 +35,7 @@ from smartredis import Client from smartredis.error import RedisReplyError -from ...entity import DBModel, DBScript +from ...entity import FSModel, FSScript from ...error import SSInternalError from ...log import get_logger from ..config import CONFIG @@ -73,7 +73,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm if returncode != 0: 
logger.error(out) logger.error(err) - raise SSInternalError("Database '--cluster create' command failed") + raise SSInternalError("Feature store '--cluster create' command failed") logger.debug(out) @@ -95,10 +95,10 @@ def check_cluster_status( if not cluster_nodes: raise SSInternalError( - "No cluster nodes have been set for database status check." + "No cluster nodes have been set for feature store status check." ) - logger.debug("Beginning database cluster status check...") + logger.debug("Beginning feature store cluster status check...") while trials > 0: # wait for cluster to spin up time.sleep(5) @@ -117,16 +117,16 @@ def check_cluster_status( raise SSInternalError("Cluster setup could not be verified") -def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> bool: - """Check if a DB is running +def fs_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> bool: + """Check if a FS is running - if the DB is clustered, check cluster status, otherwise - just ping DB. + if the FS is clustered, check cluster status, otherwise + just ping FS. 
:param hosts: list of hosts :param ports: list of ports - :param num_shards: Number of DB shards - :return: Whether DB is running + :param num_shards: Number of FS shards + :return: Whether FS is running """ # if single shard if num_shards < 2: @@ -149,71 +149,71 @@ def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> boo return False -def set_ml_model(db_model: DBModel, client: Client) -> None: - logger.debug(f"Adding DBModel named {db_model.name}") +def set_ml_model(fs_model: FSModel, client: Client) -> None: + logger.debug(f"Adding FSModel named {fs_model.name}") - for device in db_model.devices: + for device in fs_model.devices: try: - if db_model.is_file: + if fs_model.is_file: client.set_model_from_file( - name=db_model.name, - model_file=str(db_model.file), - backend=db_model.backend, + name=fs_model.name, + model_file=str(fs_model.file), + backend=fs_model.backend, device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - min_batch_timeout=db_model.min_batch_timeout, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs, + batch_size=fs_model.batch_size, + min_batch_size=fs_model.min_batch_size, + min_batch_timeout=fs_model.min_batch_timeout, + tag=fs_model.tag, + inputs=fs_model.inputs, + outputs=fs_model.outputs, ) else: - if db_model.model is None: - raise ValueError(f"No model attacted to {db_model.name}") + if fs_model.model is None: + raise ValueError(f"No model attacted to {fs_model.name}") client.set_model( - name=db_model.name, - model=db_model.model, - backend=db_model.backend, + name=fs_model.name, + model=fs_model.model, + backend=fs_model.backend, device=device, - batch_size=db_model.batch_size, - min_batch_size=db_model.min_batch_size, - min_batch_timeout=db_model.min_batch_timeout, - tag=db_model.tag, - inputs=db_model.inputs, - outputs=db_model.outputs, + batch_size=fs_model.batch_size, + min_batch_size=fs_model.min_batch_size, + 
min_batch_timeout=fs_model.min_batch_timeout, + tag=fs_model.tag, + inputs=fs_model.inputs, + outputs=fs_model.outputs, ) except RedisReplyError as error: # pragma: no cover - logger.error("Error while setting model on orchestrator.") + logger.error("Error while setting model on feature store.") raise error -def set_script(db_script: DBScript, client: Client) -> None: - logger.debug(f"Adding DBScript named {db_script.name}") +def set_script(fs_script: FSScript, client: Client) -> None: + logger.debug(f"Adding FSScript named {fs_script.name}") - for device in db_script.devices: + for device in fs_script.devices: try: - if db_script.is_file: + if fs_script.is_file: client.set_script_from_file( - name=db_script.name, file=str(db_script.file), device=device + name=fs_script.name, file=str(fs_script.file), device=device ) - elif db_script.script: - if isinstance(db_script.script, str): + elif fs_script.script: + if isinstance(fs_script.script, str): client.set_script( - name=db_script.name, script=db_script.script, device=device + name=fs_script.name, script=fs_script.script, device=device ) - elif callable(db_script.script): + elif callable(fs_script.script): client.set_function( - name=db_script.name, function=db_script.script, device=device + name=fs_script.name, function=fs_script.script, device=device ) else: - raise ValueError(f"No script or file attached to {db_script.name}") + raise ValueError(f"No script or file attached to {fs_script.name}") except RedisReplyError as error: # pragma: no cover - logger.error("Error while setting model on orchestrator.") + logger.error("Error while setting model on feature store.") raise error -def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm - """Send shutdown signal to DB node. +def shutdown_fs_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov-wlm + """Send shutdown signal to FS node. Should only be used in the case where cluster deallocation needs to occur manually. 
Usually, the SmartSim job manager diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index 6082ce4c0..aad38c778 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -36,9 +36,9 @@ if t.TYPE_CHECKING: from smartsim._core.control.manifest import LaunchedManifest as _Manifest - from smartsim.database.orchestrator import Orchestrator - from smartsim.entity import Application, DBNode, Ensemble - from smartsim.entity.dbobject import DBModel, DBScript + from smartsim.database.orchestrator import FeatureStore + from smartsim.entity import Application, Ensemble, FSNode + from smartsim.entity.dbobject import FSModel, FSScript from smartsim.settings.base import BatchSettings, RunSettings @@ -62,8 +62,8 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: _dictify_application(application, *telemetry_metadata) for application, telemetry_metadata in manifest.applications ], - "orchestrator": [ - _dictify_db(db, nodes_info) for db, nodes_info in manifest.databases + "featurestore": [ + _dictify_fs(fs, nodes_info) for fs, nodes_info in manifest.featurestores ], "ensemble": [ _dictify_ensemble(ens, member_info) @@ -105,11 +105,11 @@ def _dictify_application( telemetry_data_path: Path, ) -> t.Dict[str, t.Any]: if application.run_settings is not None: - colo_settings = (application.run_settings.colocated_db_settings or {}).copy() + colo_settings = (application.run_settings.colocated_fs_settings or {}).copy() else: colo_settings = ({}).copy() - db_scripts = t.cast("t.List[DBScript]", colo_settings.pop("db_scripts", [])) - db_models = t.cast("t.List[DBModel]", colo_settings.pop("db_models", [])) + fs_scripts = t.cast("t.List[FSScript]", colo_settings.pop("fs_scripts", [])) + fs_models = t.cast("t.List[FSModel]", colo_settings.pop("fs_models", [])) return { "name": application.name, "path": application.path, @@ -135,7 +135,7 @@ def _dictify_application( "Copy": [], } ), - "colocated_db": ( + 
"colocated_fs": ( { "settings": colo_settings, "scripts": [ @@ -145,7 +145,7 @@ def _dictify_application( "device": script.device, } } - for script in db_scripts + for script in fs_scripts ], "models": [ { @@ -154,7 +154,7 @@ def _dictify_application( "device": model.device, } } - for model in db_models + for model in fs_models ], } if colo_settings @@ -217,20 +217,20 @@ def _dictify_batch_settings(batch_settings: BatchSettings) -> t.Dict[str, t.Any] } -def _dictify_db( - db: Orchestrator, - nodes: t.Sequence[t.Tuple[DBNode, TStepLaunchMetaData]], +def _dictify_fs( + fs: FeatureStore, + nodes: t.Sequence[t.Tuple[FSNode, TStepLaunchMetaData]], ) -> t.Dict[str, t.Any]: - db_path = _utils.get_db_path() - if db_path: - db_type, _ = db_path.name.split("-", 1) + fs_path = _utils.get_fs_path() + if fs_path: + fs_type, _ = fs_path.name.split("-", 1) else: - db_type = "Unknown" + fs_type = "Unknown" return { - "name": db.name, - "type": db_type, - "interface": db._interfaces, # pylint: disable=protected-access + "name": fs.name, + "type": fs_type, + "interface": fs._interfaces, # pylint: disable=protected-access "shards": [ { **shard.to_dict(), @@ -238,14 +238,14 @@ def _dictify_db( "out_file": out_file, "err_file": err_file, "memory_file": ( - str(status_dir / "memory.csv") if db.telemetry.is_enabled else "" + str(status_dir / "memory.csv") if fs.telemetry.is_enabled else "" ), "client_file": ( - str(status_dir / "client.csv") if db.telemetry.is_enabled else "" + str(status_dir / "client.csv") if fs.telemetry.is_enabled else "" ), "client_count_file": ( str(status_dir / "client_count.csv") - if db.telemetry.is_enabled + if fs.telemetry.is_enabled else "" ), "telemetry_metadata": { @@ -255,7 +255,7 @@ def _dictify_db( "managed": managed, }, } - for dbnode, ( + for fsnode, ( step_id, task_id, managed, @@ -263,6 +263,6 @@ def _dictify_db( err_file, status_dir, ) in nodes - for shard in dbnode.get_launched_shard_info() + for shard in fsnode.get_launched_shard_info() ], } diff 
--git a/smartsim/_core/utils/telemetry/collector.py b/smartsim/_core/utils/telemetry/collector.py index 178126dec..4d0a79af3 100644 --- a/smartsim/_core/utils/telemetry/collector.py +++ b/smartsim/_core/utils/telemetry/collector.py @@ -95,8 +95,8 @@ class _DBAddress: def __init__(self, host: str, port: int) -> None: """Initialize the instance - :param host: host address for database connections - :param port: port number for database connections + :param host: host address for feature store connections + :param port: port number for feature store connections """ self.host = host.strip() if host else "" self.port = port @@ -115,7 +115,7 @@ def __str__(self) -> str: class DBCollector(Collector): - """A base class for collectors that retrieve statistics from an orchestrator""" + """A base class for collectors that retrieve statistics from a feature store""" def __init__(self, entity: JobEntity, sink: Sink) -> None: """Initialize the `DBCollector` @@ -131,7 +131,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: ) async def _configure_client(self) -> None: - """Configure the client connection to the target database""" + """Configure the client connection to the target feature store""" try: if not self._client: self._client = redisa.Redis( @@ -146,7 +146,7 @@ async def _configure_client(self) -> None: ) async def prepare(self) -> None: - """Initialization logic for the DB collector. Creates a database + """Initialization logic for the FS collector. 
Creates a feature store connection then executes the `post_prepare` callback function.""" if self._client: return @@ -157,7 +157,7 @@ async def prepare(self) -> None: @abc.abstractmethod async def _post_prepare(self) -> None: """Hook function to enable subclasses to perform actions - after a db client is ready""" + after a fss client is ready""" @abc.abstractmethod async def _perform_collection( @@ -171,7 +171,7 @@ async def _perform_collection( """ async def collect(self) -> None: - """Execute database metric collection if the collector is enabled. Writes + """Execute feature store metric collection if the collector is enabled. Writes the resulting metrics to the associated output sink. Calling `collect` when `self.enabled` is `False` performs no actions.""" if not self.enabled: @@ -186,8 +186,8 @@ async def collect(self) -> None: return try: - # if we can't communicate w/the db, exit - if not await self._check_db(): + # if we can't communicate w/the fs, exit + if not await self._check_fs(): return all_metrics = await self._perform_collection() @@ -197,7 +197,7 @@ async def collect(self) -> None: logger.warning(f"Collect failed for {type(self).__name__}", exc_info=ex) async def shutdown(self) -> None: - """Execute cleanup of database client connections""" + """Execute cleanup of feature store client connections""" try: if self._client: logger.info( @@ -210,8 +210,8 @@ async def shutdown(self) -> None: f"An error occurred during {type(self).__name__} shutdown", exc_info=ex ) - async def _check_db(self) -> bool: - """Check if the target database is reachable. + async def _check_fs(self) -> bool: + """Check if the target feature store is reachable. :return: `True` if connection succeeds, `False` otherwise. 
""" @@ -219,7 +219,7 @@ async def _check_db(self) -> bool: if self._client: return await self._client.ping() except redisex.ConnectionError: - logger.warning(f"Cannot ping db {self._address}") + logger.warning(f"Cannot ping fs {self._address}") return False @@ -233,7 +233,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -247,11 +247,11 @@ async def _perform_collection( if self._client is None: return [] - db_info = await self._client.info("memory") + fs_info = await self._client.info("memory") - used = float(db_info["used_memory"]) - peak = float(db_info["used_memory_peak"]) - total = float(db_info["total_system_memory"]) + used = float(fs_info["used_memory"]) + peak = float(fs_info["used_memory_peak"]) + total = float(fs_info["total_system_memory"]) value = (get_ts_ms(), used, peak, total) @@ -261,7 +261,7 @@ async def _perform_collection( class DBConnectionCollector(DBCollector): - """A `DBCollector` that collects database client-connection metrics""" + """A `DBCollector` that collects feature store client-connection metrics""" def __init__(self, entity: JobEntity, sink: Sink) -> None: super().__init__(entity, sink) @@ -269,7 +269,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -306,7 +306,7 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: async def _post_prepare(self) -> None: """Write column headers for a CSV formatted output sink after - the database connection is 
established""" + the feature store connection is established""" await self._sink.save("timestamp", *self._columns) async def _perform_collection( @@ -457,9 +457,9 @@ def register_collectors(self, entity: JobEntity) -> None: """ collectors: t.List[Collector] = [] - # ONLY db telemetry is implemented at this time. This resolver must - # be updated when non-database or always-on collectors are introduced - if entity.is_db and entity.telemetry_on: + # ONLY fs telemetry is implemented at this time. This resolver must + # be updated when non-feature store or always-on collectors are introduced + if entity.is_fs and entity.telemetry_on: if mem_out := entity.collectors.get("memory", None): collectors.append(DBMemoryCollector(entity, FileSink(mem_out))) @@ -469,7 +469,7 @@ def register_collectors(self, entity: JobEntity) -> None: if num_out := entity.collectors.get("client_count", None): collectors.append(DBConnectionCountCollector(entity, FileSink(num_out))) else: - logger.debug(f"Collectors disabled for db {entity.name}") + logger.debug(f"Collectors disabled for fs {entity.name}") self.add_all(collectors) diff --git a/smartsim/_core/utils/telemetry/manifest.py b/smartsim/_core/utils/telemetry/manifest.py index f5b6a92e0..4cf067f08 100644 --- a/smartsim/_core/utils/telemetry/manifest.py +++ b/smartsim/_core/utils/telemetry/manifest.py @@ -45,8 +45,8 @@ class Run: """the timestamp at the time the `Experiment.start` is called""" applications: t.List[JobEntity] """applications started in this run""" - orchestrators: t.List[JobEntity] - """orchestrators started in this run""" + featurestores: t.List[JobEntity] + """featurestores started in this run""" ensembles: t.List[JobEntity] """ensembles started in this run""" @@ -58,7 +58,7 @@ def flatten( :param filter_fn: optional boolean filter that returns True for entities to include in the result """ - entities = self.applications + self.orchestrators + self.ensembles + entities = self.applications + self.featurestores + 
self.ensembles if filter_fn: entities = [entity for entity in entities if filter_fn(entity)] return entities @@ -86,7 +86,7 @@ def load_entity( parent_keys = parent_keys.intersection(entity_dict.keys()) if parent_keys: container = "shards" if "shards" in parent_keys else "applications" - child_type = "orchestrator" if container == "shards" else "application" + child_type = "featurestore" if container == "shards" else "application" for child_entity in entity_dict[container]: entity = JobEntity.from_manifest( child_type, child_entity, str(exp_dir), raw_experiment @@ -119,7 +119,7 @@ def load_entities( """ persisted: t.Dict[str, t.List[JobEntity]] = { "application": [], - "orchestrator": [], + "featurestore": [], } for item in run[entity_type]: entities = Run.load_entity(entity_type, item, exp_dir, raw_experiment) @@ -145,7 +145,7 @@ def load_run( # create an output mapping to hold the deserialized entities run_entities: t.Dict[str, t.List[JobEntity]] = { "application": [], - "orchestrator": [], + "featurestore": [], "ensemble": [], } @@ -165,7 +165,7 @@ def load_run( loaded_run = Run( raw_run["timestamp"], run_entities["application"], - run_entities["orchestrator"], + run_entities["featurestore"], run_entities["ensemble"], ) return loaded_run diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index e9e4c46bc..8a9a99aed 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -458,7 +458,7 @@ def __init__(self, telemetry_monitor_args: TelemetryMonitorArgs): def _can_shutdown(self) -> bool: """Determines if the telemetry monitor can perform shutdown. An automatic shutdown will occur if there are no active jobs being monitored. 
- Managed jobs and databases are considered separately due to the way they + Managed jobs and feature stores are considered separately due to the way they are stored in the job manager :return: return True if capable of automatically shutting down @@ -471,20 +471,20 @@ def _can_shutdown(self) -> bool: unmanaged_jobs = ( list(self._action_handler.tracked_jobs) if self._action_handler else [] ) - # get an individual count of databases for logging - n_dbs: int = len( + # get an individual count of feature stores for logging + n_fss: int = len( [ job for job in managed_jobs + unmanaged_jobs - if isinstance(job, JobEntity) and job.is_db + if isinstance(job, JobEntity) and job.is_fs ] ) # if we have no jobs currently being monitored we can shutdown - n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_dbs - shutdown_ok = n_jobs + n_dbs == 0 + n_jobs = len(managed_jobs) + len(unmanaged_jobs) - n_fss + shutdown_ok = n_jobs + n_fss == 0 - logger.debug(f"{n_jobs} active job(s), {n_dbs} active db(s)") + logger.debug(f"{n_jobs} active job(s), {n_fss} active fs(s)") return shutdown_ok async def monitor(self) -> None: diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index e716af150..1e7e2d83c 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -55,7 +55,7 @@ def write_event( :param task_id: the task_id of a managed task :param step_id: the step_id of an unmanaged task :param entity_type: the SmartSimEntity subtype - (e.g. `orchestrator`, `ensemble`, `application`, `dbnode`, ...) + (e.g. `featurestore`, `ensemble`, `application`, `fsnode`, ...) 
:param event_type: the event subtype :param status_dir: path where the SmartSimEntity outputs are written :param detail: (optional) additional information to write with the event diff --git a/smartsim/database/__init__.py b/smartsim/database/__init__.py index 106f8e1e2..0801c682b 100644 --- a/smartsim/database/__init__.py +++ b/smartsim/database/__init__.py @@ -24,4 +24,4 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from .orchestrator import Orchestrator +from .orchestrator import FeatureStore diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index 3309c591c..75b4bca95 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -38,10 +38,10 @@ from smartredis.error import RedisReplyError from .._core.config import CONFIG -from .._core.utils import db_is_active -from .._core.utils.helpers import is_valid_cmd, unpack_db_identifier +from .._core.utils import fs_is_active +from .._core.utils.helpers import is_valid_cmd, unpack_fs_identifier from .._core.utils.network import get_ip_from_host -from ..entity import DBNode, EntityList, TelemetryConfiguration +from ..entity import EntityList, FSNode, TelemetryConfiguration from ..error import ( SmartSimError, SSConfigError, @@ -126,7 +126,7 @@ def _get_single_command( if run_command == "srun" and getenv("SLURM_HET_SIZE") is not None: msg = ( - "srun can not launch an orchestrator with single_cmd=True in " + "srun can not launch an FeatureStore with single_cmd=True in " + "a hetereogeneous job. Automatically switching to single_cmd=False." ) logger.info(msg) @@ -137,7 +137,7 @@ def _get_single_command( if run_command == "aprun": msg = ( - "aprun can not launch an orchestrator with batch=True and " + "aprun can not launch an FeatureStore with batch=True and " + "single_cmd=True. Automatically switching to single_cmd=False." 
) logger.info(msg) @@ -149,13 +149,13 @@ def _get_single_command( def _check_local_constraints(launcher: str, batch: bool) -> None: """Check that the local launcher is not launched with invalid batch config""" if launcher == "local" and batch: - msg = "Local orchestrator can not be launched with batch=True" + msg = "Local FeatureStore can not be launched with batch=True" raise SmartSimError(msg) # pylint: disable-next=too-many-public-methods -class Orchestrator(EntityList[DBNode]): - """The Orchestrator is an in-memory database that can be launched +class FeatureStore(EntityList[FSNode]): + """The FeatureStore is an in-memory database that can be launched alongside entities in SmartSim. Data can be transferred between entities by using one of the Python, C, C++ or Fortran clients within an entity. @@ -168,7 +168,7 @@ def __init__( interface: t.Union[str, t.List[str]] = "lo", launcher: str = "local", run_command: str = "auto", - db_nodes: int = 1, + fs_nodes: int = 1, batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, account: t.Optional[str] = None, @@ -179,16 +179,16 @@ def __init__( threads_per_queue: t.Optional[int] = None, inter_op_threads: t.Optional[int] = None, intra_op_threads: t.Optional[int] = None, - db_identifier: str = "orchestrator", + fs_identifier: str = "featurestore", **kwargs: t.Any, ) -> None: - """Initialize an ``Orchestrator`` reference for local launch + """Initialize an ``FeatureStore`` reference for local launch Extra configurations for RedisAI See https://oss.redis.com/redisai/configuration/ - :param path: path to location of ``Orchestrator`` directory + :param path: path to location of ``FeatureStore`` directory :param port: TCP/IP port :param interface: network interface(s) :param launcher: type of launcher being used, options are "slurm", "pbs", @@ -196,18 +196,18 @@ def __init__( an attempt will be made to find an available launcher on the system. 
:param run_command: specify launch binary or detect automatically - :param db_nodes: number of database shards + :param fs_nodes: number of feature store shards :param batch: run as a batch workload :param hosts: specify hosts to launch on :param account: account to run batch on :param time: walltime for batch 'HH:MM:SS' format - :param alloc: allocation to launch database on + :param alloc: allocation to launch feature store on :param single_cmd: run all shards with one (MPMD) command :param threads_per_queue: threads per GPU device :param inter_op_threads: threads across CPU operations :param intra_op_threads: threads per CPU operation - :param db_identifier: an identifier to distinguish this orchestrator in - multiple-database experiments + :param fs_identifier: an identifier to distinguish this FeatureStore in + multiple-feature store experiments """ self.launcher, self.run_command = _autodetect(launcher, run_command) _check_run_command(self.launcher, self.run_command) @@ -233,11 +233,11 @@ def __init__( gpus_per_shard = int(kwargs.pop("gpus_per_shard", 0)) cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) super().__init__( - name=db_identifier, + name=fs_identifier, path=str(path), port=port, interface=interface, - db_nodes=db_nodes, + fs_nodes=fs_nodes, batch=batch, launcher=self.launcher, run_command=self.run_command, @@ -270,7 +270,7 @@ def __init__( if self.launcher != "local": self.batch_settings = self._build_batch_settings( - db_nodes, + fs_nodes, alloc or "", batch, account or "", @@ -282,52 +282,52 @@ def __init__( self.set_hosts(hosts) elif not hosts and self.run_command == "mpirun": raise SmartSimError( - "hosts argument is required when launching Orchestrator with mpirun" + "hosts argument is required when launching FeatureStore with mpirun" ) self._reserved_run_args: t.Dict[t.Type[RunSettings], t.List[str]] = {} self._reserved_batch_args: t.Dict[t.Type[BatchSettings], t.List[str]] = {} self._fill_reserved() @property - def db_identifier(self) 
-> str: - """Return the DB identifier, which is common to a DB and all of its nodes + def fs_identifier(self) -> str: + """Return the FS identifier, which is common to a FS and all of its nodes - :return: DB identifier + :return: FS identifier """ return self.name @property def num_shards(self) -> int: - """Return the number of DB shards contained in the Orchestrator. - This might differ from the number of ``DBNode`` objects, as each - ``DBNode`` may start more than one shard (e.g. with MPMD). + """Return the number of FS shards contained in the FeatureStore. + This might differ from the number of ``FSNode`` objects, as each + ``FSNode`` may start more than one shard (e.g. with MPMD). - :returns: the number of DB shards contained in the Orchestrator + :returns: the number of FS shards contained in the FeatureStore """ return sum(node.num_shards for node in self.entities) @property - def db_nodes(self) -> int: - """Read only property for the number of nodes an ``Orchestrator`` is + def fs_nodes(self) -> int: + """Read only property for the number of nodes an ``FeatureStore`` is launched across. Notice that SmartSim currently assumes that each shard will be launched on its own node. Therefore this property is currently an alias to the ``num_shards`` attribute. - :returns: Number of database nodes + :returns: Number of feature store nodes """ return self.num_shards @property def hosts(self) -> t.List[str]: - """Return the hostnames of Orchestrator instance hosts + """Return the hostnames of FeatureStore instance hosts - Note that this will only be populated after the orchestrator + Note that this will only be populated after the FeatureStore has been launched by SmartSim. 
- :return: the hostnames of Orchestrator instance hosts + :return: the hostnames of FeatureStore instance hosts """ if not self._hosts: - self._hosts = self._get_db_hosts() + self._hosts = self._get_fs_hosts() return self._hosts @property @@ -348,22 +348,22 @@ def reset_hosts(self) -> None: self.set_hosts(self._user_hostlist) def remove_stale_files(self) -> None: - """Can be used to remove database files of a previous launch""" + """Can be used to remove feature store files of a previous launch""" - for db in self.entities: - db.remove_stale_dbnode_files() + for fs in self.entities: + fs.remove_stale_fsnode_files() def get_address(self) -> t.List[str]: - """Return database addresses + """Return feature store addresses :return: addresses - :raises SmartSimError: If database address cannot be found or is not active + :raises SmartSimError: If feature store address cannot be found or is not active """ if not self._hosts: - raise SmartSimError("Could not find database address") + raise SmartSimError("Could not find feature store address") if not self.is_active(): - raise SmartSimError("Database is not active") + raise SmartSimError("Feature store is not active") return self._get_address() def _get_address(self) -> t.List[str]: @@ -373,21 +373,21 @@ def _get_address(self) -> t.List[str]: ] def is_active(self) -> bool: - """Check if the database is active + """Check if the feature store is active - :return: True if database is active, False otherwise + :return: True if feature store is active, False otherwise """ try: hosts = self.hosts except SSDBFilesNotParseable: return False - return db_is_active(hosts, self.ports, self.num_shards) + return fs_is_active(hosts, self.ports, self.num_shards) @property def _rai_module(self) -> t.Tuple[str, ...]: """Get the RedisAI module from third-party installations - :return: Tuple of args to pass to the orchestrator exe + :return: Tuple of args to pass to the FeatureStore exe to load and configure the RedisAI """ module = 
["--loadmodule", CONFIG.redisai] @@ -409,14 +409,14 @@ def _redis_conf(self) -> str: @property def checkpoint_file(self) -> str: - """Get the path to the checkpoint file for this Orchestrator + """Get the path to the checkpoint file for this Feature Store :return: Path to the checkpoint file if it exists, otherwise a None """ return osp.join(self.path, "smartsim_db.dat") def set_cpus(self, num_cpus: int) -> None: - """Set the number of CPUs available to each database shard + """Set the number of CPUs available to each feature store shard This effectively will determine how many cpus can be used for compute threads, background threads, and network I/O. @@ -433,19 +433,19 @@ def set_cpus(self, num_cpus: int) -> None: if hasattr(self.batch_settings, "set_cpus_per_task"): self.batch_settings.set_cpus_per_task(num_cpus) - for db in self.entities: - db.run_settings.set_cpus_per_task(num_cpus) - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for mpmd in db.run_settings.mpmd: + for fs in self.entities: + fs.run_settings.set_cpus_per_task(num_cpus) + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for mpmd in fs.run_settings.mpmd: mpmd.set_cpus_per_task(num_cpus) def set_walltime(self, walltime: str) -> None: - """Set the batch walltime of the orchestrator + """Set the batch walltime of the FeatureStore - Note: This will only effect orchestrators launched as a batch + Note: This will only effect FeatureStores launched as a batch :param walltime: amount of time e.g. 
10 hours is 10:00:00 - :raises SmartSimError: if orchestrator isn't launching as batch + :raises SmartSimError: if FeatureStore isn't launching as batch """ if not self.batch: raise SmartSimError("Not running as batch, cannot set walltime") @@ -454,7 +454,7 @@ def set_walltime(self, walltime: str) -> None: self.batch_settings.set_walltime(walltime) def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: - """Specify the hosts for the ``Orchestrator`` to launch on + """Specify the hosts for the ``FeatureStore`` to launch on :param host_list: list of host (compute node names) :raises TypeError: if wrong type @@ -471,8 +471,8 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: self.batch_settings.set_hostlist(host_list) if self.launcher == "lsf": - for db in self.entities: - db.set_hosts(host_list) + for fs in self.entities: + fs.set_hosts(host_list) elif ( self.launcher == "pals" and isinstance(self.entities[0].run_settings, PalsMpiexecSettings) @@ -481,26 +481,26 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: # In this case, --hosts is a global option, set it to first run command self.entities[0].run_settings.set_hostlist(host_list) else: - for host, db in zip(host_list, self.entities): - if isinstance(db.run_settings, AprunSettings): + for host, fs in zip(host_list, self.entities): + if isinstance(fs.run_settings, AprunSettings): if not self.batch: - db.run_settings.set_hostlist([host]) + fs.run_settings.set_hostlist([host]) else: - db.run_settings.set_hostlist([host]) + fs.run_settings.set_hostlist([host]) - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for i, mpmd_runsettings in enumerate(db.run_settings.mpmd, 1): + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for i, mpmd_runsettings in enumerate(fs.run_settings.mpmd, 1): mpmd_runsettings.set_hostlist(host_list[i]) def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: - """Set a batch argument the orchestrator should launch 
with + """Set a batch argument the FeatureStore should launch with Some commonly used arguments such as --job-name are used by SmartSim and will not be allowed to be set. :param arg: batch argument to set e.g. "exclusive" :param value: batch param - set to None if no param value - :raises SmartSimError: if orchestrator not launching as batch + :raises SmartSimError: if FeatureStore not launching as batch """ if not hasattr(self, "batch_settings") or not self.batch_settings: raise SmartSimError("Not running as batch, cannot set batch_arg") @@ -508,13 +508,13 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: if arg in self._reserved_batch_args[type(self.batch_settings)]: logger.warning( f"Can not set batch argument {arg}: " - "it is a reserved keyword in Orchestrator" + "it is a reserved keyword in FeatureStore" ) else: self.batch_settings.batch_args[arg] = value def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: - """Set a run argument the orchestrator should launch + """Set a run argument the FeatureStore should launch each node with (it will be passed to `jrun`) Some commonly used arguments are used @@ -527,24 +527,24 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: if arg in self._reserved_run_args[type(self.entities[0].run_settings)]: logger.warning( f"Can not set batch argument {arg}: " - "it is a reserved keyword in Orchestrator" + "it is a reserved keyword in FeatureStore" ) else: - for db in self.entities: - db.run_settings.run_args[arg] = value - if db.is_mpmd and hasattr(db.run_settings, "mpmd"): - for mpmd in db.run_settings.mpmd: + for fs in self.entities: + fs.run_settings.run_args[arg] = value + if fs.is_mpmd and hasattr(fs.run_settings, "mpmd"): + for mpmd in fs.run_settings.mpmd: mpmd.run_args[arg] = value def enable_checkpoints(self, frequency: int) -> None: - """Sets the database's save configuration to save the DB every 'frequency' - seconds given that at least one write 
operation against the DB occurred in - that time. E.g., if `frequency` is 900, then the database will save to disk + """Sets the feature store's save configuration to save the fs every 'frequency' + seconds given that at least one write operation against the fs occurred in + that time. E.g., if `frequency` is 900, then the feature store will save to disk after 900 seconds if there is at least 1 change to the dataset. - :param frequency: the given number of seconds before the DB saves + :param frequency: the given number of seconds before the FS saves """ - self.set_db_conf("save", f"{frequency} 1") + self.set_fs_conf("save", f"{frequency} 1") def set_max_memory(self, mem: str) -> None: """Sets the max memory configuration. By default there is no memory limit. @@ -561,33 +561,33 @@ def set_max_memory(self, mem: str) -> None: :param mem: the desired max memory size e.g. 3gb :raises SmartSimError: If 'mem' is an invalid memory value - :raises SmartSimError: If database is not active + :raises SmartSimError: If feature store is not active """ - self.set_db_conf("maxmemory", mem) + self.set_fs_conf("maxmemory", mem) def set_eviction_strategy(self, strategy: str) -> None: - """Sets how the database will select what to remove when + """Sets how the feature store will select what to remove when 'maxmemory' is reached. The default is noeviction. :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. :raises SmartSimError: If 'strategy' is an invalid maxmemory policy - :raises SmartSimError: If database is not active + :raises SmartSimError: If feature store is not active """ - self.set_db_conf("maxmemory-policy", strategy) + self.set_fs_conf("maxmemory-policy", strategy) def set_max_clients(self, clients: int = 50_000) -> None: """Sets the max number of connected clients at the same time. 
- When the number of DB shards contained in the orchestrator is + When the number of FS shards contained in the feature store is more than two, then every node will use two connections, one incoming and another outgoing. :param clients: the maximum number of connected clients """ - self.set_db_conf("maxclients", str(clients)) + self.set_fs_conf("maxclients", str(clients)) def set_max_message_size(self, size: int = 1_073_741_824) -> None: - """Sets the database's memory size limit for bulk requests, + """Sets the feature store's memory size limit for bulk requests, which are elements representing single strings. The default is 1 gigabyte. Message size must be greater than or equal to 1mb. The specified memory size should be an integer that represents @@ -596,16 +596,16 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: :param size: maximum message size in bytes """ - self.set_db_conf("proto-max-bulk-len", str(size)) + self.set_fs_conf("proto-max-bulk-len", str(size)) - def set_db_conf(self, key: str, value: str) -> None: + def set_fs_conf(self, key: str, value: str) -> None: """Set any valid configuration at runtime without the need - to restart the database. All configuration parameters - that are set are immediately loaded by the database and + to restart the feature store. All configuration parameters + that are set are immediately loaded by the feature store and will take effect starting with the next command executed. 
:param key: the configuration parameter - :param value: the database configuration parameter's new value + :param value: the feature store configuration parameter's new value """ if self.is_active(): addresses = [] @@ -613,12 +613,12 @@ def set_db_conf(self, key: str, value: str) -> None: for port in self.ports: addresses.append(":".join([get_ip_from_host(host), str(port)])) - db_name, name = unpack_db_identifier(self.db_identifier, "_") + fs_name, name = unpack_fs_identifier(self.fs_identifier, "_") - environ[f"SSDB{db_name}"] = addresses[0] + environ[f"SSDB{fs_name}"] = addresses[0] - db_type = CLUSTERED if self.num_shards > 2 else STANDALONE - environ[f"SR_DB_TYPE{db_name}"] = db_type + fs_type = CLUSTERED if self.num_shards > 2 else STANDALONE + environ[f"SR_DB_TYPE{fs_name}"] = fs_type options = ConfigOptions.create_from_environment(name) client = Client(options) @@ -634,17 +634,17 @@ def set_db_conf(self, key: str, value: str) -> None: except TypeError: raise TypeError( "Incompatible function arguments. The key and value used for " - "setting the database configurations must be strings." + "setting the feature store configurations must be strings." ) from None else: raise SmartSimError( - "The SmartSim Orchestrator must be active in order to set the " - "database's configurations." + "The SmartSim FeatureStore must be active in order to set the " + "feature store's configurations." 
) @staticmethod def _build_batch_settings( - db_nodes: int, + fs_nodes: int, alloc: str, batch: bool, account: str, @@ -662,7 +662,7 @@ def _build_batch_settings( # on or if user specified batch=False (alloc will be found through env) if not alloc and batch: batch_settings = create_batch_settings( - launcher, nodes=db_nodes, time=time, account=account, **kwargs + launcher, nodes=fs_nodes, time=time, account=account, **kwargs ) return batch_settings @@ -673,12 +673,12 @@ def _build_run_settings( exe_args: t.List[t.List[str]], *, run_args: t.Optional[t.Dict[str, t.Any]] = None, - db_nodes: int = 1, + fs_nodes: int = 1, single_cmd: bool = True, **kwargs: t.Any, ) -> RunSettings: run_args = {} if run_args is None else run_args - mpmd_nodes = single_cmd and db_nodes > 1 + mpmd_nodes = single_cmd and fs_nodes > 1 if mpmd_nodes: run_settings = create_run_settings( @@ -728,7 +728,7 @@ def _build_run_settings_lsf( if gpus_per_shard is None: raise ValueError("Expected an integer number of gpus per shard") - # We always run the DB on cpus 0:cpus_per_shard-1 + # We always run the fs on cpus 0:cpus_per_shard-1 # and gpus 0:gpus_per_shard-1 for shard_id, args in enumerate(exe_args): host = shard_id @@ -737,8 +737,8 @@ def _build_run_settings_lsf( run_settings = JsrunSettings(exe, args, run_args=run_args.copy()) run_settings.set_binding("none") - # This makes sure output is written to orchestrator_0.out, - # orchestrator_1.out, and so on + # This makes sure output is written to featurestore_0.out, + # featurestore_1.out, and so on run_settings.set_individual_output("_%t") erf_sets = { @@ -765,93 +765,93 @@ def _build_run_settings_lsf( def _initialize_entities( self, *, - db_nodes: int = 1, + fs_nodes: int = 1, single_cmd: bool = True, port: int = 6379, **kwargs: t.Any, ) -> None: - db_nodes = int(db_nodes) - if db_nodes == 2: - raise SSUnsupportedError("Orchestrator does not support clusters of size 2") + fs_nodes = int(fs_nodes) + if fs_nodes == 2: + raise 
SSUnsupportedError("FeatureStore does not support clusters of size 2") - if self.launcher == "local" and db_nodes > 1: + if self.launcher == "local" and fs_nodes > 1: raise ValueError( - "Local Orchestrator does not support multiple database shards" + "Local FeatureStore does not support multiple feature store shards" ) - mpmd_nodes = (single_cmd and db_nodes > 1) or self.launcher == "lsf" + mpmd_nodes = (single_cmd and fs_nodes > 1) or self.launcher == "lsf" if mpmd_nodes: self._initialize_entities_mpmd( - db_nodes=db_nodes, single_cmd=single_cmd, port=port, **kwargs + fs_nodes=fs_nodes, single_cmd=single_cmd, port=port, **kwargs ) else: - cluster = db_nodes >= 3 + cluster = fs_nodes >= 3 - for db_id in range(db_nodes): - db_node_name = "_".join((self.name, str(db_id))) + for fs_id in range(fs_nodes): + fs_node_name = "_".join((self.name, str(fs_id))) - # create the exe_args list for launching multiple databases - # per node. also collect port range for dbnode + # create the exe_args list for launching multiple feature stores + # per node. 
also collect port range for fsnode start_script_args = self._get_start_script_args( - db_node_name, port, cluster + fs_node_name, port, cluster ) - # if only launching 1 db per command, we don't need a + # if only launching 1 fs per command, we don't need a # list of exe args lists run_settings = self._build_run_settings( sys.executable, [start_script_args], port=port, **kwargs ) - node = DBNode( - db_node_name, + node = FSNode( + fs_node_name, self.path, exe=sys.executable, exe_args=[start_script_args], run_settings=run_settings, ports=[port], - output_files=[db_node_name + ".out"], - db_identifier=self.db_identifier, + output_files=[fs_node_name + ".out"], + fs_identifier=self.fs_identifier, ) self.entities.append(node) self.ports = [port] def _initialize_entities_mpmd( - self, *, db_nodes: int = 1, port: int = 6379, **kwargs: t.Any + self, *, fs_nodes: int = 1, port: int = 6379, **kwargs: t.Any ) -> None: - cluster = db_nodes >= 3 + cluster = fs_nodes >= 3 mpmd_node_name = self.name + "_0" exe_args_mpmd: t.List[t.List[str]] = [] - for db_id in range(db_nodes): - db_shard_name = "_".join((self.name, str(db_id))) - # create the exe_args list for launching multiple databases - # per node. also collect port range for dbnode + for fs_id in range(fs_nodes): + fs_shard_name = "_".join((self.name, str(fs_id))) + # create the exe_args list for launching multiple feature stores + # per node. 
also collect port range for fsnode start_script_args = self._get_start_script_args( - db_shard_name, port, cluster + fs_shard_name, port, cluster ) exe_args = " ".join(start_script_args) exe_args_mpmd.append(sh_split(exe_args)) run_settings: t.Optional[RunSettings] = None if self.launcher == "lsf": run_settings = self._build_run_settings_lsf( - sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs + sys.executable, exe_args_mpmd, fs_nodes=fs_nodes, port=port, **kwargs ) - output_files = [f"{self.name}_{db_id}.out" for db_id in range(db_nodes)] + output_files = [f"{self.name}_{fs_id}.out" for fs_id in range(fs_nodes)] else: run_settings = self._build_run_settings( - sys.executable, exe_args_mpmd, db_nodes=db_nodes, port=port, **kwargs + sys.executable, exe_args_mpmd, fs_nodes=fs_nodes, port=port, **kwargs ) output_files = [mpmd_node_name + ".out"] if not run_settings: raise ValueError(f"Could not build run settings for {self.launcher}") - node = DBNode( + node = FSNode( mpmd_node_name, self.path, run_settings, [port], output_files, - db_identifier=self.db_identifier, + fs_identifier=self.fs_identifier, ) self.entities.append(node) self.ports = [port] @@ -875,13 +875,13 @@ def _get_start_script_args( return cmd - def _get_db_hosts(self) -> t.List[str]: + def _get_fs_hosts(self) -> t.List[str]: hosts = [] - for db in self.entities: - if not db.is_mpmd: - hosts.append(db.host) + for fs in self.entities: + if not fs.is_mpmd: + hosts.append(fs.host) else: - hosts.extend(db.hosts) + hosts.extend(fs.hosts) return hosts def _check_network_interface(self) -> None: diff --git a/smartsim/entity/__init__.py b/smartsim/entity/__init__.py index 38162ac42..ce6140844 100644 --- a/smartsim/entity/__init__.py +++ b/smartsim/entity/__init__.py @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .dbnode import DBNode +from .dbnode import FSNode from .dbobject import * from .ensemble import Ensemble from .entity import SmartSimEntity, TelemetryConfiguration diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 9e370f64d..91bffdb79 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -43,12 +43,12 @@ logger = get_logger(__name__) -class DBNode(SmartSimEntity): - """DBNode objects are the entities that make up the orchestrator. - Each database node can be launched in a cluster configuration - and take launch multiple databases per node. +class FSNode(SmartSimEntity): + """FSNode objects are the entities that make up the feature store. + Each feature store node can be launched in a cluster configuration + and take launch multiple feature stores per node. - To configure how each instance of the database operates, look + To configure how each instance of the feature store operates, look into the smartsimdb.conf. """ @@ -61,9 +61,9 @@ def __init__( run_settings: RunSettings, ports: t.List[int], output_files: t.List[str], - db_identifier: str = "", + fs_identifier: str = "", ) -> None: - """Initialize a database node within an orchestrator.""" + """Initialize a feature store node within an feature store.""" super().__init__(name, path, run_settings) self.exe = [exe] if run_settings.container else [expand_exe_path(exe)] self.exe_args = exe_args or [] @@ -77,7 +77,7 @@ def __init__( ): raise ValueError("output_files must be of type list[str]") self._output_files = output_files - self.db_identifier = db_identifier + self.fs_identifier = fs_identifier @property def num_shards(self) -> int: @@ -93,14 +93,14 @@ def host(self) -> str: (host,) = self.hosts except ValueError: raise ValueError( - f"Multiple hosts detected for this DB Node: {', '.join(self.hosts)}" + f"Multiple hosts detected for this FS Node: {', '.join(self.hosts)}" ) from None return host @property def hosts(self) -> t.List[str]: if not self._hosts: - 
self._hosts = self._parse_db_hosts() + self._hosts = self._parse_fs_hosts() return self._hosts def clear_hosts(self) -> None: @@ -117,9 +117,9 @@ def is_mpmd(self) -> bool: def set_hosts(self, hosts: t.List[str]) -> None: self._hosts = [str(host) for host in hosts] - def remove_stale_dbnode_files(self) -> None: + def remove_stale_fsnode_files(self) -> None: """This function removes the .conf, .err, and .out files that - have the same names used by this dbnode that may have been + have the same names used by this fsnode that may have been created from a previous experiment execution. """ @@ -151,7 +151,7 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf This function should bu used if and only if ``_mpmd==True`` :param port: port number - :return: the dbnode configuration file name + :return: the fsnode configuration file name """ if self.num_shards == 1: return [f"nodes-{self.name}-{port}.conf"] @@ -187,7 +187,7 @@ def _parse_launched_shard_info_from_files( return cls._parse_launched_shard_info_from_iterable(ifstream, num_shards) def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": - """Parse the launched database shard info from the output files + """Parse the launched feature store shard info from the output files :raises SSDBFilesNotParseable: if all shard info could not be found :return: The found launched shard info @@ -211,16 +211,16 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": if len(ips) < self.num_shards: msg = ( - f"Failed to parse the launched DB shard information from file(s) " + f"Failed to parse the launched FS shard information from file(s) " f"{', '.join(output_files)}. Found the information for " - f"{len(ips)} out of {self.num_shards} DB shards." + f"{len(ips)} out of {self.num_shards} FS shards." 
) logger.error(msg) raise SSDBFilesNotParseable(msg) return ips - def _parse_db_hosts(self) -> t.List[str]: - """Parse the database hosts/IPs from the output files + def _parse_fs_hosts(self) -> t.List[str]: + """Parse the feature store hosts/IPs from the output files The IP address is preferred, but if hostname is only present then a lookup to /etc/hosts is done through the socket library. @@ -233,7 +233,7 @@ def _parse_db_hosts(self) -> t.List[str]: @dataclass(frozen=True) class LaunchedShardData: - """Data class to write and parse data about a launched database shard""" + """Data class to write and parse data about a launched feature store shard""" name: str hostname: str diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index 5cb0d061f..f82aeea18 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -30,28 +30,28 @@ from .._core._install.builder import Device from ..error import SSUnsupportedError -__all__ = ["DBObject", "DBModel", "DBScript"] +__all__ = ["FSObject", "FSModel", "FSScript"] -_DBObjectFuncT = t.TypeVar("_DBObjectFuncT", str, bytes) +_FSObjectFuncT = t.TypeVar("_FSObjectFuncT", str, bytes) -class DBObject(t.Generic[_DBObjectFuncT]): - """Base class for ML objects residing on DB. Should not +class FSObject(t.Generic[_FSObjectFuncT]): + """Base class for ML objects residing on FS. Should not be instantiated. 
""" def __init__( self, name: str, - func: t.Optional[_DBObjectFuncT], + func: t.Optional[_FSObjectFuncT], file_path: t.Optional[str], device: str, devices_per_node: int, first_device: int, ) -> None: self.name = name - self.func: t.Optional[_DBObjectFuncT] = func + self.func: t.Optional[_FSObjectFuncT] = func self.file: t.Optional[Path] = ( None # Need to have this explicitly to check on it ) @@ -107,9 +107,9 @@ def _check_device(device: str) -> str: return device def _enumerate_devices(self) -> t.List[str]: - """Enumerate devices for a DBObject + """Enumerate devices for a FSObject - :param dbobject: DBObject to enumerate + :param FSObject: FSObject to enumerate :return: list of device names """ @@ -149,7 +149,7 @@ def _check_devices( raise ValueError(msg) -class DBScript(DBObject[str]): +class FSScript(FSObject[str]): def __init__( self, name: str, @@ -204,7 +204,7 @@ def __str__(self) -> str: return desc_str -class DBModel(DBObject[bytes]): +class FSModel(FSObject[bytes]): def __init__( self, name: str, @@ -221,7 +221,7 @@ def __init__( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the FS at runtime One of either model (in memory representation) or model_path (file) must be provided diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 4ce7239fa..c6b6fad3a 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -41,7 +41,7 @@ ) from ..log import get_logger from ..settings.base import BatchSettings, RunSettings -from .dbobject import DBModel, DBScript +from .dbobject import FSModel, FSScript from .entity import SmartSimEntity from .entityList import EntityList from .model import Application @@ -208,10 +208,10 @@ def add_application(self, application: Application) -> None: f"Application {application.name} already exists in ensemble {self.name}" ) - if 
self._db_models: - self._extend_entity_db_models(application, self._db_models) - if self._db_scripts: - self._extend_entity_db_scripts(application, self._db_scripts) + if self._fs_models: + self._extend_entity_fs_models(application, self._fs_models) + if self._fs_scripts: + self._extend_entity_fs_scripts(application, self._fs_scripts) self.entities.append(application) @@ -368,10 +368,10 @@ def add_ml_model( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the fs at runtime - Each ML Model added will be loaded into an - orchestrator (converged or not) prior to the execution + Each ML Model added will be loaded into a + feature store (converged or not) prior to the execution of every entity belonging to this ensemble One of either model (in memory representation) or model_path (file) @@ -392,7 +392,7 @@ def add_ml_model( :param inputs: model inputs (TF only) :param outputs: model outupts (TF only) """ - db_model = DBModel( + fs_model = FSModel( name=name, backend=backend, model=model, @@ -409,19 +409,19 @@ def add_ml_model( ) dupe = next( ( - db_model.name - for ensemble_ml_model in self._db_models - if ensemble_ml_model.name == db_model.name + fs_model.name + for ensemble_ml_model in self._fs_models + if ensemble_ml_model.name == fs_model.name ), None, ) if dupe: raise SSUnsupportedError( - f'An ML Model with name "{db_model.name}" already exists' + f'An ML Model with name "{fs_model.name}" already exists' ) - self._db_models.append(db_model) + self._fs_models.append(fs_model) for entity in self.applications: - self._extend_entity_db_models(entity, [db_model]) + self._extend_entity_fs_models(entity, [fs_model]) def add_script( self, @@ -435,7 +435,7 @@ def add_script( """TorchScript to launch with every entity belonging to this ensemble Each script added to the application will be loaded into an - 
orchestrator (converged or not) prior to the execution + feature store (converged or not) prior to the execution of every entity belonging to this ensemble Device selection is either "GPU" or "CPU". If many devices are @@ -454,7 +454,7 @@ def add_script( :param devices_per_node: number of devices on each host :param first_device: first device to use on each host """ - db_script = DBScript( + fs_script = FSScript( name=name, script=script, script_path=script_path, @@ -464,19 +464,19 @@ def add_script( ) dupe = next( ( - db_script.name - for ensemble_script in self._db_scripts - if ensemble_script.name == db_script.name + fs_script.name + for ensemble_script in self._fs_scripts + if ensemble_script.name == fs_script.name ), None, ) if dupe: raise SSUnsupportedError( - f'A Script with name "{db_script.name}" already exists' + f'A Script with name "{fs_script.name}" already exists' ) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) for entity in self.applications: - self._extend_entity_db_scripts(entity, [db_script]) + self._extend_entity_fs_scripts(entity, [fs_script]) def add_function( self, @@ -489,10 +489,10 @@ def add_function( """TorchScript function to launch with every entity belonging to this ensemble Each script function to the application will be loaded into a - non-converged orchestrator prior to the execution + non-converged feature store prior to the execution of every entity belonging to this ensemble. - For converged orchestrators, the :meth:`add_script` method should be used. + For converged feature stores, the :meth:`add_script` method should be used. Device selection is either "GPU" or "CPU". If many devices are present, a number can be passed for specification e.g. "GPU:1". 
@@ -508,7 +508,7 @@ def add_function( :param devices_per_node: number of devices on each host :param first_device: first device to use on each host """ - db_script = DBScript( + fs_script = FSScript( name=name, script=function, device=device, @@ -517,23 +517,23 @@ def add_function( ) dupe = next( ( - db_script.name - for ensemble_script in self._db_scripts - if ensemble_script.name == db_script.name + fs_script.name + for ensemble_script in self._fs_scripts + if ensemble_script.name == fs_script.name ), None, ) if dupe: raise SSUnsupportedError( - f'A Script with name "{db_script.name}" already exists' + f'A Script with name "{fs_script.name}" already exists' ) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) for entity in self.applications: - self._extend_entity_db_scripts(entity, [db_script]) + self._extend_entity_fs_scripts(entity, [fs_script]) @staticmethod - def _extend_entity_db_models( - application: Application, db_models: t.List[DBModel] + def _extend_entity_fs_models( + application: Application, fs_models: t.List[FSModel] ) -> None: """ Ensures that the Machine Learning model names being added to the Ensemble @@ -541,17 +541,17 @@ def _extend_entity_db_models( This static method checks if the provided ML model names already exist in the Ensemble. An SSUnsupportedError is raised if any duplicate names are - found. Otherwise, it appends the given list of DBModels to the Ensemble. + found. Otherwise, it appends the given list of FSModel to the Ensemble. :param application: SmartSim Application object. - :param db_models: List of DBModels to append to the Ensemble. + :param fs_models: List of FSModels to append to the Ensemble. 
""" - for add_ml_model in db_models: + for add_ml_model in fs_models: dupe = next( ( - db_model.name - for db_model in application.db_models - if db_model.name == add_ml_model.name + fs_model.name + for fs_model in application.fs_models + if fs_model.name == add_ml_model.name ), None, ) @@ -562,26 +562,26 @@ def _extend_entity_db_models( application.add_ml_model_object(add_ml_model) @staticmethod - def _extend_entity_db_scripts( - application: Application, db_scripts: t.List[DBScript] + def _extend_entity_fs_scripts( + application: Application, fs_scripts: t.List[FSScript] ) -> None: """ Ensures that the script/function names being added to the Ensemble are unique. This static method checks if the provided script/function names already exist in the Ensemble. An SSUnsupportedError is raised if any duplicate names - are found. Otherwise, it appends the given list of DBScripts to the + are found. Otherwise, it appends the given list of FSScripts to the Ensemble. :param application: SmartSim Application object. - :param db_scripts: List of DBScripts to append to the Ensemble. + :param fs_scripts: List of FSScripts to append to the Ensemble. 
""" - for add_script in db_scripts: + for add_script in fs_scripts: dupe = next( ( add_script.name - for db_script in application.db_scripts - if db_script.name == add_script.name + for fs_script in application.fs_scripts + if fs_script.name == add_script.name ), None, ) diff --git a/smartsim/entity/entityList.py b/smartsim/entity/entityList.py index c578d84e3..461ce56ab 100644 --- a/smartsim/entity/entityList.py +++ b/smartsim/entity/entityList.py @@ -68,8 +68,8 @@ def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: # --------------------------------------------------------------------- # self.entities: t.Sequence[_T_co] = [] - self._db_models: t.Sequence["smartsim.entity.DBModel"] = [] - self._db_scripts: t.Sequence["smartsim.entity.DBScript"] = [] + self._fs_models: t.Sequence["smartsim.entity.FSModel"] = [] + self._fs_scripts: t.Sequence["smartsim.entity.FSScript"] = [] # # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @@ -80,14 +80,14 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: raise NotImplementedError @property - def db_models(self) -> t.Iterable["smartsim.entity.DBModel"]: + def fs_models(self) -> t.Iterable["smartsim.entity.FSModel"]: """Return an immutable collection of attached models""" - return (model for model in self._db_models) + return (model for model in self._fs_models) @property - def db_scripts(self) -> t.Iterable["smartsim.entity.DBScript"]: + def fs_scripts(self) -> t.Iterable["smartsim.entity.FSScript"]: """Return an immutable collection of attached scripts""" - return (script for script in self._db_scripts) + return (script for script in self._fs_scripts) @property def batch(self) -> bool: @@ -131,8 +131,8 @@ def __init__(self, name: str, path: str, **kwargs: t.Any) -> None: super().__init__(name=name, path=path, **kwargs) # Change container types to be invariant ``list``s self.entities: t.List[_T] = list(self.entities) - self._db_models: t.List["smartsim.entity.DBModel"] = 
list(self._db_models) - self._db_scripts: t.List["smartsim.entity.DBScript"] = list(self._db_scripts) + self._fs_models: t.List["smartsim.entity.FSModel"] = list(self._fs_models) + self._fs_scripts: t.List["smartsim.entity.FSScript"] = list(self._fs_scripts) def _initialize_entities(self, **kwargs: t.Any) -> None: """Initialize the SmartSimEntity objects in the container""" diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 98c254a80..76af42152 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -40,7 +40,7 @@ from ..error import EntityExistsError, SSUnsupportedError from ..log import get_logger from ..settings.base import BatchSettings, RunSettings -from .dbobject import DBModel, DBScript +from .dbobject import FSModel, FSScript from .entity import SmartSimEntity from .files import EntityFiles @@ -82,8 +82,8 @@ def __init__( self.incoming_entities: t.List[SmartSimEntity] = [] self._key_prefixing_enabled = False self.batch_settings = batch_settings - self._db_models: t.List[DBModel] = [] - self._db_scripts: t.List[DBScript] = [] + self._fs_models: t.List[FSModel] = [] + self._fs_scripts: t.List[FSScript] = [] self.files: t.Optional[EntityFiles] = None @property @@ -103,31 +103,31 @@ def exe_args(self, value: t.Union[str, t.List[str], None]) -> None: self._exe_args = self._build_exe_args(value) @property - def db_models(self) -> t.Iterable[DBModel]: + def fs_models(self) -> t.Iterable[FSModel]: """Retrieve an immutable collection of attached models :return: Return an immutable collection of attached models """ - return (model for model in self._db_models) + return (model for model in self._fs_models) @property - def db_scripts(self) -> t.Iterable[DBScript]: + def fs_scripts(self) -> t.Iterable[FSScript]: """Retrieve an immutable collection attached of scripts :return: Return an immutable collection of attached scripts """ - return (script for script in self._db_scripts) + return (script for script in self._fs_scripts) 
@property def colocated(self) -> bool: - """Return True if this Model will run with a colocated Orchestrator + """Return True if this Model will run with a colocated FeatureStore - :return: Return True of the Model will run with a colocated Orchestrator + :return: Return True of the Model will run with a colocated FeatureStore """ if self.run_settings is None: return False else: - return bool(self.run_settings.colocated_db_settings) + return bool(self.run_settings.colocated_fs_settings) def add_exe_args(self, args: t.Union[str, t.List[str]]) -> None: """Add executable arguments to executable @@ -232,34 +232,34 @@ def print_attached_files(self) -> None: """Print a table of the attached files on std out""" print(self.attached_files_table) - def colocate_db(self, *args: t.Any, **kwargs: t.Any) -> None: - """An alias for ``Application.colocate_db_tcp``""" + def colocate_fs(self, *args: t.Any, **kwargs: t.Any) -> None: + """An alias for ``Application.colocate_fs_tcp``""" warnings.warn( ( - "`colocate_db` has been deprecated and will be removed in a \n" - "future release. Please use `colocate_db_tcp` or `colocate_db_uds`." + "`colocate_fs` has been deprecated and will be removed in a \n" + "future release. Please use `colocate_fs_tcp` or `colocate_fs_uds`." ), FutureWarning, ) - self.colocate_db_tcp(*args, **kwargs) + self.colocate_fs_tcp(*args, **kwargs) - def colocate_db_uds( + def colocate_fs_uds( self, unix_socket: str = "/tmp/redis.socket", socket_permissions: int = 755, - db_cpus: int = 1, + fs_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, - db_identifier: str = "", + fs_identifier: str = "", **kwargs: t.Any, ) -> None: - """Colocate an Orchestrator instance with this Application over UDS. + """Colocate an FeatureStore instance with this Application over UDS. This method will initialize settings which add an unsharded - database to this Application instance. 
Only this Application will be able to communicate - with this colocated database by using Unix Domain sockets. + feature store to this Application instance. Only this Application will be able to communicate + with this colocated feature store by using Unix Domain sockets. - Extra parameters for the db can be passed through kwargs. This includes + Extra parameters for the fs can be passed through kwargs. This includes many performance, caching and inference settings. .. highlight:: python @@ -277,11 +277,11 @@ def colocate_db_uds( :param unix_socket: path to where the socket file will be created :param socket_permissions: permissions for the socketfile - :param db_cpus: number of cpus to use for orchestrator - :param custom_pinning: CPUs to pin the orchestrator to. Passing an empty + :param fs_cpus: number of cpus to use for FeatureStore + :param custom_pinning: CPUs to pin the FeatureStore to. Passing an empty iterable disables pinning - :param debug: launch Application with extra debug information about the colocated db - :param kwargs: additional keyword arguments to pass to the orchestrator database + :param debug: launch Application with extra debug information about the colocated fs + :param kwargs: additional keyword arguments to pass to the FeatureStore feature store """ if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): @@ -297,30 +297,30 @@ def colocate_db_uds( } common_options = { - "cpus": db_cpus, + "cpus": fs_cpus, "custom_pinning": custom_pinning, "debug": debug, - "db_identifier": db_identifier, + "fs_identifier": fs_identifier, } - self._set_colocated_db_settings(uds_options, common_options, **kwargs) + self._set_colocated_fs_settings(uds_options, common_options, **kwargs) - def colocate_db_tcp( + def colocate_fs_tcp( self, port: int = 6379, ifname: t.Union[str, list[str]] = "lo", - db_cpus: int = 1, + fs_cpus: int = 1, custom_pinning: t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]] = None, debug: bool = False, - db_identifier: str = "", 
+ fs_identifier: str = "", **kwargs: t.Any, ) -> None: - """Colocate an Orchestrator instance with this Application over TCP/IP. + """Colocate an FeatureStore instance with this Application over TCP/IP. This method will initialize settings which add an unsharded - database to this Application instance. Only this Application will be able to communicate - with this colocated database by using the loopback TCP interface. + feature store to this Application instance. Only this Application will be able to communicate + with this colocated feature store by using the loopback TCP interface. - Extra parameters for the db can be passed through kwargs. This includes + Extra parameters for the fs can be passed through kwargs. This includes many performance, caching and inference settings. .. highlight:: python @@ -336,25 +336,25 @@ def colocate_db_tcp( Generally these don't need to be changed. - :param port: port to use for orchestrator database - :param ifname: interface to use for orchestrator - :param db_cpus: number of cpus to use for orchestrator - :param custom_pinning: CPUs to pin the orchestrator to. Passing an empty + :param port: port to use for FeatureStore feature store + :param ifname: interface to use for FeatureStore + :param fs_cpus: number of cpus to use for FeatureStore + :param custom_pinning: CPUs to pin the FeatureStore to. 
Passing an empty iterable disables pinning - :param debug: launch Application with extra debug information about the colocated db - :param kwargs: additional keyword arguments to pass to the orchestrator database + :param debug: launch Application with extra debug information about the colocated fs + :param kwargs: additional keyword arguments to pass to the FeatureStore feature store """ tcp_options = {"port": port, "ifname": ifname} common_options = { - "cpus": db_cpus, + "cpus": fs_cpus, "custom_pinning": custom_pinning, "debug": debug, - "db_identifier": db_identifier, + "fs_identifier": fs_identifier, } - self._set_colocated_db_settings(tcp_options, common_options, **kwargs) + self._set_colocated_fs_settings(tcp_options, common_options, **kwargs) - def _set_colocated_db_settings( + def _set_colocated_fs_settings( self, connection_options: t.Mapping[str, t.Union[int, t.List[str], str]], common_options: t.Dict[ @@ -371,17 +371,17 @@ def _set_colocated_db_settings( ) -> None: """ Ingest the connection-specific options (UDS/TCP) and set the final settings - for the colocated database + for the colocated feature store """ if hasattr(self.run_settings, "mpmd") and len(self.run_settings.mpmd) > 0: raise SSUnsupportedError( - "Applications colocated with databases cannot be run as a mpmd workload" + "Applications colocated with feature stores cannot be run as a mpmd workload" ) - if hasattr(self.run_settings, "_prep_colocated_db"): + if hasattr(self.run_settings, "_prep_colocated_fs"): # pylint: disable-next=protected-access - self.run_settings._prep_colocated_db(common_options["cpus"]) + self.run_settings._prep_colocated_fs(common_options["cpus"]) if "limit_app_cpus" in kwargs: raise SSUnsupportedError( @@ -389,7 +389,7 @@ def _set_colocated_db_settings( "RunSettings using the correct binding option for your launcher." 
) - # TODO list which db settings can be extras + # TODO list which fs settings can be extras custom_pinning_ = t.cast( t.Optional[t.Iterable[t.Union[int, t.Iterable[int]]]], common_options.get("custom_pinning"), @@ -399,7 +399,7 @@ def _set_colocated_db_settings( custom_pinning_, cpus_ ) - colo_db_config: t.Dict[ + colo_fs_config: t.Dict[ str, t.Union[ bool, @@ -408,14 +408,14 @@ def _set_colocated_db_settings( None, t.List[str], t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - t.List[DBScript], + t.List[FSModel], + t.List[FSScript], t.Dict[str, t.Union[int, None]], t.Dict[str, str], ], ] = {} - colo_db_config.update(connection_options) - colo_db_config.update(common_options) + colo_fs_config.update(connection_options) + colo_fs_config.update(common_options) redis_ai_temp = { "threads_per_queue": kwargs.get("threads_per_queue", None), @@ -423,16 +423,16 @@ def _set_colocated_db_settings( "intra_op_parallelism": kwargs.get("intra_op_parallelism", None), } # redisai arguments for inference settings - colo_db_config["rai_args"] = redis_ai_temp - colo_db_config["extra_db_args"] = { + colo_fs_config["rai_args"] = redis_ai_temp + colo_fs_config["extra_fs_args"] = { k: str(v) for k, v in kwargs.items() if k not in redis_ai_temp } - self._check_db_objects_colo() - colo_db_config["db_models"] = self._db_models - colo_db_config["db_scripts"] = self._db_scripts + self._check_fs_objects_colo() + colo_fs_config["fs_models"] = self._fs_models + colo_fs_config["fs_scripts"] = self._fs_scripts - self.run_settings.colocated_db_settings = colo_db_config + self.run_settings.colocated_fs_settings = colo_fs_config @staticmethod def _create_pinning_string( @@ -514,10 +514,10 @@ def add_ml_model( inputs: t.Optional[t.List[str]] = None, outputs: t.Optional[t.List[str]] = None, ) -> None: - """A TF, TF-lite, PT, or ONNX model to load into the DB at runtime + """A TF, TF-lite, PT, or ONNX model to load into the fs at runtime Each ML Model added will be loaded into an - 
orchestrator (converged or not) prior to the execution + FeatureStore (converged or not) prior to the execution of this Model instance One of either model (in memory representation) or model_path (file) @@ -525,7 +525,7 @@ def add_ml_model( :param name: key to store model under :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :param model: A model in memory (only supported for non-colocated orchestrators) + :param model: A model in memory (only supported for non-colocated feature stores) :param model_path: serialized model :param device: name of device for execution :param devices_per_node: The number of GPU devices available on the host. @@ -541,7 +541,7 @@ def add_ml_model( :param inputs: model inputs (TF only) :param outputs: model outupts (TF only) """ - db_model = DBModel( + fs_model = FSModel( name=name, backend=backend, model=model, @@ -556,7 +556,7 @@ def add_ml_model( inputs=inputs, outputs=outputs, ) - self.add_ml_model_object(db_model) + self.add_ml_model_object(fs_model) def add_script( self, @@ -570,7 +570,7 @@ def add_script( """TorchScript to launch with this Model instance Each script added to the application will be loaded into an - orchestrator (converged or not) prior to the execution + FeatureStore (converged or not) prior to the execution of this Model instance Device selection is either "GPU" or "CPU". If many devices are @@ -585,7 +585,7 @@ def add_script( must be provided :param name: key to store script under - :param script: TorchScript code (only supported for non-colocated orchestrators) + :param script: TorchScript code (only supported for non-colocated featurestores) :param script_path: path to TorchScript code :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. @@ -595,7 +595,7 @@ def add_script( This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
""" - db_script = DBScript( + fs_script = FSScript( name=name, script=script, script_path=script_path, @@ -603,7 +603,7 @@ def add_script( devices_per_node=devices_per_node, first_device=first_device, ) - self.add_script_object(db_script) + self.add_script_object(fs_script) def add_function( self, @@ -616,10 +616,10 @@ def add_function( """TorchScript function to launch with this Application instance Each script function to the application will be loaded into a - non-converged orchestrator prior to the execution + non-converged FeatureStore prior to the execution of this Application instance. - For converged orchestrators, the :meth:`add_script` method should be used. + For converged featurestores, the :meth:`add_script` method should be used. Device selection is either "GPU" or "CPU". If many devices are present, a number can be passed for specification e.g. "GPU:1". @@ -637,14 +637,14 @@ def add_function( This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
""" - db_script = DBScript( + fs_script = FSScript( name=name, script=function, device=device, devices_per_node=devices_per_node, first_device=first_device, ) - self.add_script_object(db_script) + self.add_script_object(fs_script) def __hash__(self) -> int: return hash(self.name) @@ -661,52 +661,54 @@ def __str__(self) -> str: # pragma: no cover entity_str = "Name: " + self.name + "\n" entity_str += "Type: " + self.type + "\n" entity_str += str(self.run_settings) + "\n" - if self._db_models: - entity_str += "DB Models: \n" + str(len(self._db_models)) + "\n" - if self._db_scripts: - entity_str += "DB Scripts: \n" + str(len(self._db_scripts)) + "\n" + if self._fs_models: + entity_str += "FS Models: \n" + str(len(self._fs_models)) + "\n" + if self._fs_scripts: + entity_str += "FS Scripts: \n" + str(len(self._fs_scripts)) + "\n" return entity_str - def add_ml_model_object(self, db_model: DBModel) -> None: - if not db_model.is_file and self.colocated: - err_msg = "ML model can not be set from memory for colocated databases.\n" + def add_ml_model_object(self, fs_model: FSModel) -> None: + if not fs_model.is_file and self.colocated: + err_msg = ( + "ML model can not be set from memory for colocated feature stores.\n" + ) err_msg += ( - f"Please store the ML model named {db_model.name} in binary format " + f"Please store the ML model named {fs_model.name} in binary format " ) err_msg += "and add it to the SmartSim Application as file." 
raise SSUnsupportedError(err_msg) - self._db_models.append(db_model) + self._fs_models.append(fs_model) - def add_script_object(self, db_script: DBScript) -> None: - if db_script.func and self.colocated: - if not isinstance(db_script.func, str): + def add_script_object(self, fs_script: FSScript) -> None: + if fs_script.func and self.colocated: + if not isinstance(fs_script.func, str): err_msg = ( - "Functions can not be set from memory for colocated databases.\n" - f"Please convert the function named {db_script.name} " + "Functions can not be set from memory for colocated feature stores.\n" + f"Please convert the function named {fs_script.name} " "to a string or store it as a text file and add it to the " "SmartSim Application with add_script." ) raise SSUnsupportedError(err_msg) - self._db_scripts.append(db_script) + self._fs_scripts.append(fs_script) - def _check_db_objects_colo(self) -> None: - for db_model in self._db_models: - if not db_model.is_file: + def _check_fs_objects_colo(self) -> None: + for fs_model in self._fs_models: + if not fs_model.is_file: err_msg = ( - "ML model can not be set from memory for colocated databases.\n" - f"Please store the ML model named {db_model.name} in binary " + "ML model can not be set from memory for colocated feature stores.\n" + f"Please store the ML model named {fs_model.name} in binary " "format and add it to the SmartSim Application as file." ) raise SSUnsupportedError(err_msg) - for db_script in self._db_scripts: - if db_script.func: - if not isinstance(db_script.func, str): + for fs_script in self._fs_scripts: + if fs_script.func: + if not isinstance(fs_script.func, str): err_msg = ( "Functions can not be set from memory for colocated " - "databases.\nPlease convert the function named " - f"{db_script.name} to a string or store it as a text" + "feature stores.\nPlease convert the function named " + f"{fs_script.name} to a string or store it as a text" "file and add it to the SmartSim Application with add_script." 
) raise SSUnsupportedError(err_msg) diff --git a/smartsim/error/errors.py b/smartsim/error/errors.py index 9a5d0c92d..1e10ac3bb 100644 --- a/smartsim/error/errors.py +++ b/smartsim/error/errors.py @@ -82,8 +82,14 @@ class SSReservedKeywordError(SmartSimError): class SSDBIDConflictError(SmartSimError): - """Raised in the event that a database identifier - is not unique when multiple databases are created + """Raised in the event that a feature store identifier + is not unique when multiple feature stores are created + """ + + +class SSDBFilesNotParseable(SmartSimError): + """Raised when the files related to the feature store cannot be parsed. + Includes the case when the files do not exist. """ diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 288d50a5c..0caad3bbf 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -38,7 +38,7 @@ from smartsim.status import SmartSimStatus from ._core import Controller, Generator, Manifest, previewrenderer -from .database import Orchestrator +from .database import FeatureStore from .entity import ( Application, Ensemble, @@ -87,8 +87,8 @@ class Experiment: The instances created by an Experiment represent executable code that is either user-specified, like the ``Application`` instance created - by ``Experiment.create_application``, or pre-configured, like the ``Orchestrator`` - instance created by ``Experiment.create_database``. + by ``Experiment.create_application``, or pre-configured, like the ``FeatureStore`` + instance created by ``Experiment.create_feature_store``. 
Experiment methods that accept a variable list of arguments, such as ``Experiment.start`` or ``Experiment.stop``, accept any number of the @@ -172,7 +172,7 @@ def __init__( self._control = Controller(launcher=self._launcher) - self.db_identifiers: t.Set[str] = set() + self.fs_identifiers: t.Set[str] = set() self._telemetry_cfg = ExperimentTelemetryConfiguration() def _set_dragon_server_path(self) -> None: @@ -192,7 +192,7 @@ def start( ) -> None: """Start passed instances using Experiment launcher - Any instance ``Application``, ``Ensemble`` or ``Orchestrator`` + Any instance ``Application``, ``Ensemble`` or ``FeatureStore`` instance created by the Experiment can be passed as an argument to the start method. @@ -211,17 +211,17 @@ def start( .. highlight:: python .. code-block:: python - exp.start(application_1, application_2, db, ensemble, block=True) + exp.start(application_1, application_2, fs, ensemble, block=True) # alternatively - stage_1 = [application_1, application_2, db, ensemble] + stage_1 = [application_1, application_2, fs, ensemble] exp.start(*stage_1, block=True) If `block==True` the Experiment will poll the launched instances - at runtime until all non-database jobs have completed. Database + at runtime until all non-feature store jobs have completed. Feature store jobs *must* be killed by the user by passing them to ``Experiment.stop``. This allows for multiple stages of a workflow - to produce to and consume from the same Orchestrator database. + to produce to and consume from the same FeatureStore feature store. If `kill_on_interrupt=True`, then all jobs launched by this experiment are guaranteed to be killed when ^C (SIGINT) signal is @@ -229,7 +229,7 @@ def start( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. 
- :param block: block execution until all non-database + :param block: block execution until all non-feature store jobs are finished :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) @@ -257,7 +257,7 @@ def stop( ) -> None: """Stop specific instances launched by this ``Experiment`` - Instances of ``Application``, ``Ensemble`` and ``Orchestrator`` + Instances of ``Application``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments to the stop method. Whichever launcher was specified at Experiment initialization @@ -272,7 +272,7 @@ def stop( exp.stop(application) # multiple - exp.stop(application_1, application_2, db, ensemble) + exp.stop(application_1, application_2, fs, ensemble) :param args: One or more SmartSimEntity or EntitySequence objects. :raises TypeError: if wrong type @@ -284,9 +284,9 @@ def stop( self._control.stop_entity(entity) for entity_list in stop_manifest.ensembles: self._control.stop_entity_list(entity_list) - dbs = stop_manifest.dbs - for db in dbs: - self._control.stop_db(db) + fss = stop_manifest.fss + for fs in fss: + self._control.stop_fs(fs) except SmartSimError as e: logger.error(e) raise @@ -309,7 +309,7 @@ def generate( directories will be symlinked, copied, or configured and written into the created directory for that instance. - Instances of ``application``, ``Ensemble`` and ``Orchestrator`` + Instances of ``application``, ``Ensemble`` and ``FeatureStore`` can all be passed as arguments to the generate method. :param tag: tag used in `to_configure` generator files @@ -372,8 +372,8 @@ def finished(self, entity: SmartSimEntity) -> bool: An instance of ``application`` or ``Ensemble`` can be passed as an argument. - Passing ``Orchestrator`` will return an error as a - database deployment is never finished until stopped + Passing ``FeatureStore`` will return an error as a + feature store deployment is never finished until stopped by the user. 
:param entity: object launched by this ``Experiment`` @@ -408,7 +408,7 @@ def get_status( .. highlight:: python .. code-block:: python - statuses = exp.get_status(application, ensemble, orchestrator) + statuses = exp.get_status(application, ensemble, featurestore) complete = [s == smartsim.status.STATUS_COMPLETED for s in statuses] assert all(complete) @@ -428,21 +428,21 @@ def get_status( raise @_contextualize - def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: - """Reconnect to a running ``Orchestrator`` + def reconnect_feature_store(self, checkpoint: str) -> FeatureStore: + """Reconnect to a running ``FeatureStore`` - This method can be used to connect to a ``Orchestrator`` deployment + This method can be used to connect to a ``FeatureStore`` deployment that was launched by a previous ``Experiment``. This can be helpful in the case where separate runs of an ``Experiment`` - wish to use the same ``Orchestrator`` instance currently + wish to use the same ``FeatureStore`` instance currently running on a system. :param checkpoint: the `smartsim_db.dat` file created - when an ``Orchestrator`` is launched + when an ``FeatureStore`` is launched """ try: - orc = self._control.reload_saved_db(checkpoint) - return orc + feature_store = self._control.reload_saved_fs(checkpoint) + return feature_store except SmartSimError as e: logger.error(e) raise @@ -457,7 +457,7 @@ def preview( """Preview entity information prior to launch. This method aggregates multiple pieces of information to give users insight into what and how entities will be launched. Any instance of - ``Model``, ``Ensemble``, or ``Orchestrator`` created by the + ``Model``, ``Ensemble``, or ``Feature Store`` created by the Experiment can be passed as an argument to the preview method. Verbosity levels: @@ -476,8 +476,8 @@ def preview( output to stdout. Defaults to None. 
""" - # Retrieve any active orchestrator jobs - active_dbjobs = self._control.active_orchestrator_jobs + # Retrieve any active feature store jobs + active_fsjobs = self._control.active_active_feature_store_jobs preview_manifest = Manifest(*args) @@ -487,7 +487,7 @@ def preview( verbosity_level, output_format, output_filename, - active_dbjobs, + active_fsjobs, ) @property @@ -559,12 +559,12 @@ def _launch_summary(self, manifest: Manifest) -> None: if manifest.applications: summary += f"Applications: {len(manifest.applications)}\n" - if self._control.orchestrator_active: - summary += "Database Status: active\n" - elif manifest.dbs: - summary += "Database Status: launching\n" + if self._control.feature_store_active: + summary += "Feature Store Status: active\n" + elif manifest.fss: + summary += "Feature Store Status: launching\n" else: - summary += "Database Status: inactive\n" + summary += "Feature Store Status: inactive\n" summary += f"\n{str(manifest)}" @@ -572,7 +572,7 @@ def _launch_summary(self, manifest: Manifest) -> None: def _create_entity_dir(self, start_manifest: Manifest) -> None: def create_entity_dir( - entity: t.Union[Orchestrator, Application, Ensemble] + entity: t.Union[FeatureStore, Application, Ensemble] ) -> None: if not os.path.isdir(entity.path): os.makedirs(entity.path) @@ -580,8 +580,8 @@ def create_entity_dir( for application in start_manifest.applications: create_entity_dir(application) - for orch in start_manifest.dbs: - create_entity_dir(orch) + for feature_store in start_manifest.fss: + create_entity_dir(feature_store) for ensemble in start_manifest.ensembles: create_entity_dir(ensemble) @@ -592,13 +592,13 @@ def create_entity_dir( def __str__(self) -> str: return self.name - def _append_to_db_identifier_list(self, db_identifier: str) -> None: - """Check if db_identifier already exists when calling create_database""" - if db_identifier in self.db_identifiers: + def _append_to_fs_identifier_list(self, fs_identifier: str) -> None: + """Check 
if fs_identifier already exists when calling create_feature_store""" + if fs_identifier in self.fs_identifiers: logger.warning( - f"A database with the identifier {db_identifier} has already been made " - "An error will be raised if multiple databases are started " + f"A feature store with the identifier {fs_identifier} has already been made " + "An error will be raised if multiple Feature Stores are started " "with the same identifier" ) # Otherwise, add - self.db_identifiers.add(db_identifier) + self.fs_identifiers.add(fs_identifier) diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 6175259b2..36c0ae415 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -76,12 +76,12 @@ def __init__( self._ds_name = form_name(self.list_name, "info") def publish(self, client: Client) -> None: - """Upload DataInfo information to Orchestrator + """Upload DataInfo information to FeatureStore The information is put on the DB as a DataSet, with strings stored as metastrings and integers stored as metascalars. - :param client: Client to connect to Database + :param client: Client to connect to Feature Store """ info_ds = Dataset(self._ds_name) info_ds.add_meta_string("sample_name", self.sample_name) @@ -92,13 +92,13 @@ def publish(self, client: Client) -> None: client.put_dataset(info_ds) def download(self, client: Client) -> None: - """Download DataInfo information from Orchestrator + """Download DataInfo information from FeatureStore The information retrieved from the DB is used to populate this object's members. If the information is not available on the DB, the object members are not modified. - :param client: Client to connect to Database + :param client: Client to connect to Feature Store """ try: info_ds = client.get_dataset(self._ds_name) @@ -134,7 +134,7 @@ class TrainingDataUploader: This class can be used to upload samples following a simple convention for naming. 
Once created, the function `publish_info` can be used - to put all details about the data set on the Orchestrator. A training + to put all details about the data set on the FeatureStore. A training process can thus access them and get all relevant information to download the batches which are uploaded. @@ -142,11 +142,11 @@ class TrainingDataUploader: and the data will be stored following the naming convention specified by the attributes of this class. - :param list_name: Name of the dataset as stored on the Orchestrator + :param list_name: Name of the dataset as stored on the FeatureStore :param sample_name: Name of samples tensor in uploaded Datasets :param target_name: Name of targets tensor (if needed) in uploaded Datasets :param num_classes: Number of classes of targets, if categorical - :param cluster: Whether the SmartSim Orchestrator is being run as a cluster + :param cluster: Whether the SmartSim FeatureStore is being run as a cluster :param address: Address of Redis DB as : :param rank: Rank of DataUploader in multi-process application (e.g. MPI rank). :param verbose: If output should be logged to screen. @@ -261,7 +261,7 @@ class DataDownloader: download, if a string is passed, it is used to download DataInfo data from DB, assuming it was stored with ``list_name=data_info_or_list_name`` :param list_name: Name of aggregation list used to upload data - :param cluster: Whether the Orchestrator will be run as a cluster + :param cluster: Whether the FeatureStore will be run as a cluster :param address: Address of Redis client as : :param replica_rank: When StaticDataDownloader is used distributedly, indicates the rank of this object diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index cf69b65e5..9e16a21dc 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -44,7 +44,7 @@ def freeze_model( smartredis.client.set_model_from_file() method. 
This utiliy function provides everything users need to take - a trained model and put it inside an ``orchestrator`` instance + a trained model and put it inside a ``featurestore`` instance :param model: TensorFlow or Keras model :param output_dir: output dir to save model file to @@ -86,7 +86,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] smartredis.client.set_model() method. This utiliy function provides everything users need to take - a trained model and put it inside an ``orchestrator`` instance. + a trained model and put it inside a ``featurestore`` instance. :param model: TensorFlow or Keras model :return: serialized model, model input layer names, model output layer names diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 9e22a42b4..b3a086c7c 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -62,7 +62,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: ``AprunSettings`` instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated applications cannot be run as a mpmd workload" ) diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index ed87e223a..fa1bed036 100644 --- a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -30,7 +30,7 @@ from smartsim.settings.containers import Container from .._core.utils.helpers import expand_exe_path, fmt_dict, is_valid_cmd -from ..entity.dbobject import DBModel, DBScript +from ..entity.dbobject import FSModel, FSScript from ..log import get_logger logger = get_logger(__name__) @@ -83,7 +83,7 @@ def __init__( self.container = container self._run_command = run_command self.in_batch = False - self.colocated_db_settings: t.Optional[ + self.colocated_fs_settings: t.Optional[ t.Dict[ str, t.Union[ @@ -93,8 +93,8 @@ def __init__( None, t.List[str], t.Iterable[t.Union[int, t.Iterable[int]]], - t.List[DBModel], - 
t.List[DBScript], + t.List[FSModel], + t.List[FSScript], t.Dict[str, t.Union[int, None]], t.Dict[str, str], ], @@ -535,8 +535,8 @@ def __str__(self) -> str: # pragma: no-cover string += f"\nRun Command: {self.run_command}" if self.run_args: string += f"\nRun Arguments:\n{fmt_dict(self.run_args)}" - if self.colocated_db_settings: - string += "\nCo-located Database: True" + if self.colocated_fs_settings: + string += "\nCo-located Feature Store: True" return string diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index c9a93c40f..841505ca5 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -86,15 +86,15 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: :param cpus_per_rs: number of cpus to use per resource set or ALL_CPUS """ - if self.colocated_db_settings: - db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) - if not db_cpus: - raise ValueError("db_cpus must be configured on colocated_db_settings") + if self.colocated_fs_settings: + fs_cpus = int(t.cast(int, self.colocated_fs_settings.get("fs_cpus", 0))) + if not fs_cpus: + raise ValueError("fs_cpus must be configured on colocated_fs_settings") - if cpus_per_rs < db_cpus: + if cpus_per_rs < fs_cpus: raise ValueError( f"Cannot set cpus_per_rs ({cpus_per_rs}) to less than " - + f"db_cpus ({db_cpus})" + + f"fs_cpus ({fs_cpus})" ) if isinstance(cpus_per_rs, str): self.run_args["cpu_per_rs"] = cpus_per_rs @@ -195,7 +195,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: ``JsrunSettings`` instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated applications cannot be run as a mpmd workload" ) @@ -325,25 +325,25 @@ def __str__(self) -> str: string += "\nERF settings: " + pformat(self.erf_sets) return string - def _prep_colocated_db(self, db_cpus: int) -> None: + def _prep_colocated_fs(self, fs_cpus: int) -> None: cpus_per_flag_set = False for 
cpu_per_rs_flag in ["cpu_per_rs", "c"]: if run_arg_value := self.run_args.get(cpu_per_rs_flag, 0): cpus_per_flag_set = True cpu_per_rs = int(run_arg_value) - if cpu_per_rs < db_cpus: + if cpu_per_rs < fs_cpus: msg = ( f"{cpu_per_rs_flag} flag was set to {cpu_per_rs}, but " - f"colocated DB requires {db_cpus} CPUs per RS. Automatically " - f"setting {cpu_per_rs_flag} flag to {db_cpus}" + f"colocated db requires {fs_cpus} CPUs per RS. Automatically " + f"setting {cpu_per_rs_flag} flag to {fs_cpus}" ) logger.info(msg) - self.run_args[cpu_per_rs_flag] = db_cpus + self.run_args[cpu_per_rs_flag] = fs_cpus if not cpus_per_flag_set: - msg = f"Colocated DB requires {db_cpus} CPUs per RS. Automatically setting " - msg += f"--cpus_per_rs=={db_cpus}" + msg = f"Colocated fs requires {fs_cpus} CPUs per RS. Automatically setting " + msg += f"--cpus_per_rs=={fs_cpus}" logger.info(msg) - self.set_cpus_per_rs(db_cpus) + self.set_cpus_per_rs(fs_cpus) rs_per_host_set = False for rs_per_host_flag in ["rs_per_host", "r"]: @@ -353,13 +353,13 @@ def _prep_colocated_db(self, db_cpus: int) -> None: if rs_per_host != 1: msg = f"{rs_per_host_flag} flag was set to {rs_per_host}, " msg += ( - "but colocated DB requires running ONE resource set per host. " + "but colocated fs requires running ONE resource set per host. " ) msg += f"Automatically setting {rs_per_host_flag} flag to 1" logger.info(msg) self.run_args[rs_per_host_flag] = "1" if not rs_per_host_set: - msg = "Colocated DB requires one resource set per host. " + msg = "Colocated fs requires one resource set per host. 
" msg += " Automatically setting --rs_per_host==1" logger.info(msg) self.set_rs_per_host(1) diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index fd7909ec6..66b965938 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -91,7 +91,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: MpirunSettings instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated applications cannot be run as a mpmd workload" ) diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 1b54e37b8..2e9f8fb46 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -175,7 +175,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: :param value: value """ # TODO add error checking here - # TODO include option to overwrite place (warning for orchestrator?) + # TODO include option to overwrite place (warning for featurestore?) 
updated_dict = self.resources print(f"name of resource: {resource_name}") updated_dict.update({resource_name: value}) diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 983f5329f..1d05169b0 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -84,7 +84,7 @@ def make_mpmd(self, settings: RunSettings) -> None: :param settings: SrunSettings instance """ - if self.colocated_db_settings: + if self.colocated_fs_settings: raise SSUnsupportedError( "Colocated applications cannot be run as a mpmd workload" ) diff --git a/smartsim/templates/templates/preview/plain_text/activeinfra.template b/smartsim/templates/templates/preview/plain_text/activeinfra.template index 8f403fbc0..3e9ed6a2e 100644 --- a/smartsim/templates/templates/preview/plain_text/activeinfra.template +++ b/smartsim/templates/templates/preview/plain_text/activeinfra.template @@ -1,9 +1,9 @@ - = Database Identifier: {{ db.entity.db_identifier }} = - Shards: {{ db.entity.num_shards }} + = Feature Store Identifier: {{ fs.entity.fs_identifier }} = + Shards: {{ fs.entity.num_shards }} TCP/IP Port(s): - {%- for port in db.entity.ports %} + {%- for port in fs.entity.ports %} {{ port }} {%- endfor %} - Network Interface: {{ db.entity.run_settings.exe_args | get_ifname }} - Type: {{ config.database_cli | get_dbtype }} + Network Interface: {{ fs.entity.run_settings.exe_args | get_ifname }} + Type: {{ config.database_cli | get_fstype }} diff --git a/smartsim/templates/templates/preview/plain_text/base.template b/smartsim/templates/templates/preview/plain_text/base.template index 511712554..5686b8676 100644 --- a/smartsim/templates/templates/preview/plain_text/base.template +++ b/smartsim/templates/templates/preview/plain_text/base.template @@ -1,22 +1,22 @@ {% include "experiment.template" %} -{%- if manifest.has_deployable or active_dbjobs %} +{%- if manifest.has_deployable or active_fsjobs %} === Entity Preview === - {%- if active_dbjobs %} 
+ {%- if active_fsjobs %} == Active Infrastructure == - {%- for name, db in active_dbjobs.items() %} + {%- for name, fs in active_fsjobs.items() %} {% include "activeinfra.template" %} {%- endfor %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} - == Orchestrators == - {%- for db in manifest.dbs %} - {%- if db.is_active() %} - WARNING: Cannot preview {{ db.name }}, because it is already started. + == Feature Stores == + {%- for fs in manifest.fss %} + {%- if fs.is_active() %} + WARNING: Cannot preview {{ fs.name }}, because it is already started. {%- else %} {% include "orchestrator.template" %} {%- endif %} @@ -29,12 +29,12 @@ = Model Name: {{ model.name }} = {%- include "model.template" %} - {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} {%- include "clientconfig.template" %} {%- endif %} {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template index 51dafd0d1..12e647cdc 100644 --- a/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_debug.template @@ -1,12 +1,12 @@ - {%- for db in manifest.dbs %} - {%- if db.name %} - Database Identifier: {{ db.name }} + {%- for fs in manifest.fss %} + {%- if fs.name %} + Feature Store Identifier: {{ fs.name }} {%- endif %} {%- if verbosity_level == Verbosity.DEBUG or verbosity_level == Verbosity.DEVELOPER %} - Database Backend: {{ config.database_cli | get_dbtype }} + Feature Store Backend: {{ config.database_cli | get_fstype }} TCP/IP Port(s): - {%- for port in db.ports %} + {%- for 
port in fs.ports %} {{ port }} {%- endfor %} Type: Standalone diff --git a/smartsim/templates/templates/preview/plain_text/clientconfig_info.template b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template index 164f4bd4a..998b68707 100644 --- a/smartsim/templates/templates/preview/plain_text/clientconfig_info.template +++ b/smartsim/templates/templates/preview/plain_text/clientconfig_info.template @@ -1,11 +1,11 @@ - {%- for db in manifest.dbs %} - {%- if db.name %} - Database Identifier: {{ db.name }} + {%- for fs in manifest.fss %} + {%- if fs.name %} + Feature Store Identifier: {{ fs.name }} {%- endif %} - Database Backend: {{ config.database_cli | get_dbtype }} + Feature Store Backend: {{ config.database_cli | get_fstype }} TCP/IP Port(s): - {%- for port in db.ports %} + {%- for port in fs.ports %} {{ port }} {%- endfor %} Type: Standalone diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template index 303fd0dca..93ad8aa7b 100644 --- a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_debug.template @@ -1,25 +1,25 @@ - {%- if model.run_settings.colocated_db_settings.db_identifier %} - Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} {%- else %} - Database Identifier: N/A + Feature Store Identifier: N/A {%- endif %} - Database Backend: {{ config.database_cli | get_dbtype }} - {%- if model.run_settings.colocated_db_settings %} - {%- if model.run_settings.colocated_db_settings.port %} + Feature Store Backend: {{ config.database_cli | get_fstype }} + {%- if model.run_settings.colocated_fs_settings %} + {%- if 
model.run_settings.colocated_fs_settings.port %} Connection Type: TCP TCP/IP Port(s): - {{ model.run_settings.colocated_db_settings.port }} + {{ model.run_settings.colocated_fs_settings.port }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.unix_socket %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} Connection Type: UDS - Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.ifname %} - {%- if model.run_settings.colocated_db_settings.ifname | is_list %} - Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname[0] }} + {%- if model.run_settings.colocated_fs_settings.ifname %} + {%- if model.run_settings.colocated_fs_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname[0] }} {%- else %} - Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname }} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname }} {%- endif %} {%- endif %} Type: Colocated diff --git a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template index e03d7ce3b..3b630f85a 100644 --- a/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template +++ b/smartsim/templates/templates/preview/plain_text/clientconfigcolo_info.template @@ -1,16 +1,16 @@ - {%- if model.run_settings.colocated_db_settings.db_identifier %} - Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} {%- endif %} - Database Backend: {{ config.database_cli | get_dbtype }} - {%- if model.run_settings.colocated_db_settings.port %} + 
Feature Store Backend: {{ config.database_cli | get_fstype }} + {%- if model.run_settings.colocated_fs_settings.port %} Connection Type: TCP TCP/IP Port(s): - {{ model.run_settings.colocated_db_settings.port }} + {{ model.run_settings.colocated_fs_settings.port }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.unix_socket %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} Connection Type: UDS - Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} {%- endif %} Type: Colocated {%- if model.query_key_prefixing() %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_debug.template b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template index 862db6032..c458813ca 100644 --- a/smartsim/templates/templates/preview/plain_text/ensemble_debug.template +++ b/smartsim/templates/templates/preview/plain_text/ensemble_debug.template @@ -32,12 +32,12 @@ - Model Name: {{ model.name }} - {%- include 'model.template' %} - {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} {%- include "clientconfig.template" %} {%- endif %} {%- endif %} @@ -48,12 +48,12 @@ - Model Name: {{ model.name }} - {%- include 'model_debug.template' %} - {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} {%- include 
"clientconfig.template" %} {%- endif %} {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/ensemble_info.template b/smartsim/templates/templates/preview/plain_text/ensemble_info.template index 17d1a4054..a7b9c2296 100644 --- a/smartsim/templates/templates/preview/plain_text/ensemble_info.template +++ b/smartsim/templates/templates/preview/plain_text/ensemble_info.template @@ -12,12 +12,12 @@ {% set model = ensemble.models[0] %} - Model Name: {{ model.name }} - {%- include 'model.template' %} - {%- if model.run_settings.colocated_db_settings or manifest.dbs %} + {%- if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} {%- include "clientconfig.template" %} {%- endif %} {%- endif %} @@ -25,12 +25,12 @@ {% set model = ensemble.models[(ensemble.models | length)-1] %} - Model Name: {{ model.name }} - {%- include 'model.template' %} - {% if model.run_settings.colocated_db_settings or manifest.dbs %} + {% if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} + {%- if manifest.fss %} {%- include "clientconfig.template" %} {%- endif %} {%- endif %} @@ -38,12 +38,12 @@ {% for model in ensemble %} - Model Name: {{ model.name }} - {%- include 'model.template' %} - {% if model.run_settings.colocated_db_settings or manifest.dbs %} + {% if model.run_settings.colocated_fs_settings or manifest.fss %} Client Configuration: - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} {%- include "clientconfigcolo.template" %} {%- endif %} - {%- if manifest.dbs %} 
+ {%- if manifest.fss %} {%- include "clientconfig.template" %} {%- endif %} {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/model_debug.template b/smartsim/templates/templates/preview/plain_text/model_debug.template index 186746186..6605d50ab 100644 --- a/smartsim/templates/templates/preview/plain_text/model_debug.template +++ b/smartsim/templates/templates/preview/plain_text/model_debug.template @@ -54,42 +54,42 @@ {%- endfor %} {%- endif %} {%- endif %} - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} Colocated: - {%- if model.run_settings.colocated_db_settings.db_identifier %} - Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.port %} + {%- if model.run_settings.colocated_fs_settings.port %} Connection Type: TCP TCP/IP Port(s): - {{ model.run_settings.colocated_db_settings.port }} + {{ model.run_settings.colocated_fs_settings.port }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.unix_socket %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} Connection Type: UDS - Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.ifname %} - {%- if model.run_settings.colocated_db_settings.ifname | is_list %} - Network Interface Name: {{ model.run_settings.colocated_db_settings.ifname[0] }} + {%- if model.run_settings.colocated_fs_settings.ifname %} + {%- if model.run_settings.colocated_fs_settings.ifname | is_list %} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname[0] }} {%- else %} - Network Interface Name: {{ 
model.run_settings.colocated_db_settings.ifname }} + Network Interface Name: {{ model.run_settings.colocated_fs_settings.ifname }} {%- endif %} {%- endif %} - CPUs: {{ model.run_settings.colocated_db_settings.cpus }} - Custom Pinning: {{ model.run_settings.colocated_db_settings.custom_pinning }} + CPUs: {{ model.run_settings.colocated_fs_settings.cpus }} + Custom Pinning: {{ model.run_settings.colocated_fs_settings.custom_pinning }} {%- endif %} - {%- if model._db_scripts %} + {%- if model._fs_scripts %} Torch Scripts: - {%- for script in model._db_scripts%} + {%- for script in model._fs_scripts%} Name: {{ script.name }} Path: {{ script.file }} Backend: {{ script.device }} Devices Per Node: {{ script.devices_per_node }} {%- endfor %} {%- endif %} - {%- if model._db_models %} + {%- if model._fs_models %} ML Models: - {%- for mlmodel in model._db_models %} + {%- for mlmodel in model._fs_models %} Name: {{ mlmodel.name }} Path: {{ mlmodel.file }} Backend: {{ mlmodel.backend }} diff --git a/smartsim/templates/templates/preview/plain_text/model_info.template b/smartsim/templates/templates/preview/plain_text/model_info.template index f746208e5..dc961ae95 100644 --- a/smartsim/templates/templates/preview/plain_text/model_info.template +++ b/smartsim/templates/templates/preview/plain_text/model_info.template @@ -10,32 +10,32 @@ {%- endfor %} {%- endif %} - {%- if model.run_settings.colocated_db_settings %} + {%- if model.run_settings.colocated_fs_settings %} Colocated: - {%- if model.run_settings.colocated_db_settings.db_identifier %} - Database Identifier: {{ model.run_settings.colocated_db_settings.db_identifier }} + {%- if model.run_settings.colocated_fs_settings.fs_identifier %} + Feature Store Identifier: {{ model.run_settings.colocated_fs_settings.fs_identifier }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.port %} + {%- if model.run_settings.colocated_fs_settings.port %} Connection Type: TCP TCP/IP Port(s): - {{ 
model.run_settings.colocated_db_settings.port }} + {{ model.run_settings.colocated_fs_settings.port }} {%- endif %} - {%- if model.run_settings.colocated_db_settings.unix_socket %} + {%- if model.run_settings.colocated_fs_settings.unix_socket %} Connection Type: UDS - Unix Socket: {{ model.run_settings.colocated_db_settings.unix_socket }} + Unix Socket: {{ model.run_settings.colocated_fs_settings.unix_socket }} {%- endif %} {%- endif %} - {%- if model.run_settings.colocated_db_settings['db_scripts'] %} + {%- if model.run_settings.colocated_fs_settings['fs_scripts'] %} Torch Scripts: - {%- for script in model.run_settings.colocated_db_settings['db_scripts'] %} + {%- for script in model.run_settings.colocated_fs_settings['fs_scripts'] %} Name: {{ script.name }} Path: {{ script.script_path }} {%- endfor %} {%- endif %} - {%- if model.run_settings.colocated_db_settings['db_models'] %} + {%- if model.run_settings.colocated_fs_settings['fs_models'] %} ML Models: - {%- for mlmodel in model.run_settings.colocated_db_settings['db_models'] %} + {%- for mlmodel in model.run_settings.colocated_fs_settings['fs_models'] %} Name: {{ mlmodel.name }} Path: {{ mlmodel.model_file }} Backend: {{ mlmodel.backend }} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template index 127a4949e..8dfa6ae9a 100644 --- a/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_debug.template @@ -1,33 +1,33 @@ - = Database Identifier: {{ db.name }} = - {%- if db.path %} - Path: {{ db.path }} + = Feature Store Identifier: {{ fs.name }} = + {%- if fs.path %} + Path: {{ fs.path }} {%- endif %} - Shards: {{ db.num_shards }} + Shards: {{ fs.num_shards }} TCP/IP Port(s): - {%- for port in db.ports %} + {%- for port in fs.ports %} {{ port }} {%- endfor %} - Network Interface: {{ db._interfaces[0] }} - Type: {{ 
config.database_cli | get_dbtype }} + Network Interface: {{ fs._interfaces[0] }} + Type: {{ config.database_cli | get_fstype }} Executable: {{ config.database_exe }} - {%- if db.run_settings %} - Run Command: {{ db.run_settings.run_command }} - {%- if db.run_settings.run_args %} + {%- if fs.run_settings %} + Run Command: {{ fs.run_settings.run_command }} + {%- if fs.run_settings.run_args %} Run Arguments: - {%- for key, value in db.run_settings.run_args.items() %} + {%- for key, value in fs.run_settings.run_args.items() %} {{ key }}: {{ value }} {%- endfor %} {%- endif %} {%- endif %} - {%- if db.run_command %} - Run Command: {{ db.run_command }} + {%- if fs.run_command %} + Run Command: {{ fs.run_command }} {%- endif %} - {%- if db.batch_settings %} + {%- if fs.batch_settings %} Batch Launch: True - Batch Command: {{ db.batch_settings.batch_cmd }} + Batch Command: {{ fs.batch_settings.batch_cmd }} Batch Arguments: - {%- for key, value in db.batch_settings.batch_args.items() %} + {%- for key, value in fs.batch_settings.batch_args.items() %} {{ key }}: {{ value }} {%- endfor %} {%- endif %} diff --git a/smartsim/templates/templates/preview/plain_text/orchestrator_info.template b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template index 11608d6c5..7964d126e 100644 --- a/smartsim/templates/templates/preview/plain_text/orchestrator_info.template +++ b/smartsim/templates/templates/preview/plain_text/orchestrator_info.template @@ -1,11 +1,11 @@ - = Database Identifier: {{ db.name }} = + = Feature Store Identifier: {{ fs.name }} = TCP/IP Port(s): - {%- for port in db.ports %} + {%- for port in fs.ports %} {{ port }} {%- endfor %} - Network Interface: {{ db._interfaces[0] }} - Type: {{ config.database_cli | get_dbtype }} - {%- if db.batch %} - Batch Launch: {{ db.batch }} + Network Interface: {{ fs._interfaces[0] }} + Type: {{ config.database_cli | get_fstype }} + {%- if fs.batch %} + Batch Launch: {{ fs.batch }} {%- endif %} diff --git 
a/tests/_legacy/backends/run_sklearn_onnx.py b/tests/_legacy/backends/run_sklearn_onnx.py index f10c8c7fb..77683ee90 100644 --- a/tests/_legacy/backends/run_sklearn_onnx.py +++ b/tests/_legacy/backends/run_sklearn_onnx.py @@ -75,7 +75,7 @@ def run_model(client, model_name, device, model, model_input, in_name, out_names def run(device): - # connect a client to the database + # connect a client to the feature store client = Client(cluster=False) # linreg test diff --git a/tests/_legacy/backends/run_torch.py b/tests/_legacy/backends/run_torch.py index 6e9ba2859..83c8a9a8e 100644 --- a/tests/_legacy/backends/run_torch.py +++ b/tests/_legacy/backends/run_torch.py @@ -75,7 +75,7 @@ def calc_svd(input_tensor): def run(device): - # connect a client to the database + # connect a client to the feature store client = Client(cluster=False) # test the SVD function diff --git a/tests/_legacy/backends/test_cli_mini_exp.py b/tests/_legacy/backends/test_cli_mini_exp.py index 2fde2ff5f..1fd110721 100644 --- a/tests/_legacy/backends/test_cli_mini_exp.py +++ b/tests/_legacy/backends/test_cli_mini_exp.py @@ -48,8 +48,8 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( - prepare_db, - local_db, + prepare_fs, + local_fs, test_dir, monkeypatch, ): @@ -58,26 +58,26 @@ def test_cli_mini_exp_doesnt_error_out_with_dev_build( to ensure that it does not accidentally report false positive/negatives """ - db = prepare_db(local_db).orchestrator + fs = prepare_fs(local_fs).featurestore @contextmanager - def _mock_make_managed_local_orc(*a, **kw): - (client_addr,) = db.get_address() + def _mock_make_managed_local_feature_store(*a, **kw): + (client_addr,) = fs.get_address() yield smartredis.Client(False, address=client_addr) monkeypatch.setattr( smartsim._core._cli.validate, - "_make_managed_local_orc", - _mock_make_managed_local_orc, + "_make_managed_local_feature_store", + _mock_make_managed_local_feature_store, ) backends = installed_redisai_backends() - (db_port,) = db.ports + (fs_port,) 
= fs.ports smartsim._core._cli.validate.test_install( # Shouldn't matter bc we are stubbing creation of orc # but best to give it "correct" vals for safety location=test_dir, - port=db_port, + port=fs_port, # Always test on CPU, heads don't always have GPU device=build.Device.CPU, # Test the backends the dev has installed diff --git a/tests/_legacy/backends/test_dataloader.py b/tests/_legacy/backends/test_dataloader.py index 95016f4aa..e299e72c1 100644 --- a/tests/_legacy/backends/test_dataloader.py +++ b/tests/_legacy/backends/test_dataloader.py @@ -30,7 +30,7 @@ import numpy as np import pytest -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error.errors import SSInternalError from smartsim.experiment import Experiment from smartsim.log import get_logger @@ -167,11 +167,11 @@ def train_tf(generator): @pytest.mark.skipif(not shouldrun_tf, reason="Test needs TensorFlow to run") -def test_tf_dataloaders(wlm_experiment, prepare_db, single_db, monkeypatch): +def test_tf_dataloaders(wlm_experiment, prepare_fs, single_fs, monkeypatch): - db = prepare_db(single_db).orchestrator - orc = wlm_experiment.reconnect_orchestrator(db.checkpoint_file) - monkeypatch.setenv("SSDB", orc.get_address()[0]) + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + monkeypatch.setenv("SSDB", feature_store.get_address()[0]) monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: @@ -229,12 +229,12 @@ def create_trainer_torch(experiment: Experiment, filedir, wlmutils): @pytest.mark.skipif(not shouldrun_torch, reason="Test needs Torch to run") def test_torch_dataloaders( - wlm_experiment, prepare_db, single_db, fileutils, test_dir, wlmutils, monkeypatch + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, wlmutils, monkeypatch ): config_dir = fileutils.get_test_dir_path("ml") - db = prepare_db(single_db).orchestrator - orc = 
wlm_experiment.reconnect_orchestrator(db.checkpoint_file) - monkeypatch.setenv("SSDB", orc.get_address()[0]) + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) + monkeypatch.setenv("SSDB", feature_store.get_address()[0]) monkeypatch.setenv("SSKEYIN", "test_uploader_0,test_uploader_1") try: @@ -320,22 +320,22 @@ def test_data_info_repr(): @pytest.mark.skipif( not (shouldrun_torch or shouldrun_tf), reason="Requires TF or PyTorch" ) -def test_wrong_dataloaders(wlm_experiment, prepare_db, single_db): - db = prepare_db(single_db).orchestrator - orc = wlm_experiment.reconnect_orchestrator(db.checkpoint_file) +def test_wrong_dataloaders(wlm_experiment, prepare_fs, single_fs): + fs = prepare_fs(single_fs).featurestore + feature_store = wlm_experiment.reconnect_feature_store(fs.checkpoint_file) if shouldrun_tf: with pytest.raises(SSInternalError): _ = TFDataGenerator( data_info_or_list_name="test_data_list", - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, max_fetch_trials=1, ) with pytest.raises(TypeError): _ = TFStaticDataGenerator( test_data_info_repr=1, - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, max_fetch_trials=1, ) @@ -344,7 +344,7 @@ def test_wrong_dataloaders(wlm_experiment, prepare_db, single_db): with pytest.raises(SSInternalError): torch_data_gen = TorchDataGenerator( data_info_or_list_name="test_data_list", - address=orc.get_address()[0], + address=feature_store.get_address()[0], cluster=False, ) torch_data_gen.init_samples(init_trials=1) diff --git a/tests/_legacy/backends/test_dbmodel.py b/tests/_legacy/backends/test_dbmodel.py index 9d12126ab..0672f3ad6 100644 --- a/tests/_legacy/backends/test_dbmodel.py +++ b/tests/_legacy/backends/test_dbmodel.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends from smartsim.entity import Ensemble -from 
smartsim.entity.dbobject import DBModel +from smartsim.entity.dbobject import FSModel from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.status import SmartSimStatus @@ -146,10 +146,10 @@ def save_torch_cnn(path, file_name): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_tf_db_model( - wlm_experiment, prepare_db, single_db, fileutils, test_dir, mlutils +def test_tf_fs_model( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, mlutils ): - """Test TensorFlow DB Models on remote DB""" + """Test TensorFlow FS Models on remote FS""" # Retrieve parameters from testing environment test_device = mlutils.get_test_device() @@ -167,9 +167,9 @@ def test_tf_db_model( # Create Model smartsim_model = wlm_experiment.create_application("smartsim_model", run_settings) - # Create database - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + # Create feature store + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -200,11 +200,11 @@ def test_tf_db_model( ) logger.debug("The following ML models have been added:") - for db_model in smartsim_model._db_models: - logger.debug(db_model) + for fs_model in smartsim_model._fs_models: + logger.debug(fs_model) # Assert we have added both models - assert len(smartsim_model._db_models) == 2 + assert len(smartsim_model._fs_models) == 2 wlm_experiment.generate(smartsim_model) @@ -217,10 +217,10 @@ def test_tf_db_model( @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_pt_db_model( - wlm_experiment, prepare_db, single_db, fileutils, test_dir, mlutils +def test_pt_fs_model( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, mlutils ): - """Test PyTorch DB Models on remote DB""" + """Test PyTorch FS Models on remote FS""" # 
Retrieve parameters from testing environment test_device = mlutils.get_test_device() @@ -238,9 +238,9 @@ def test_pt_db_model( # Create Model smartsim_model = wlm_experiment.create_applicationl("smartsim_model", run_settings) - # Create database - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + # Create feature store + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to filesystem save_torch_cnn(test_dir, "model1.pt") @@ -258,11 +258,11 @@ def test_pt_db_model( ) logger.debug("The following ML models have been added:") - for db_model in smartsim_model._db_models: - logger.debug(db_model) + for fs_model in smartsim_model._fs_models: + logger.debug(fs_model) # Assert we have added both models - assert len(smartsim_model._db_models) == 1 + assert len(smartsim_model._fs_models) == 1 wlm_experiment.generate(smartsim_model) @@ -275,10 +275,10 @@ def test_pt_db_model( @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble( - wlm_experiment, prepare_db, single_db, fileutils, test_dir, wlmutils, mlutils +def test_fs_model_ensemble( + wlm_experiment, prepare_fs, single_fs, fileutils, test_dir, wlmutils, mlutils ): - """Test DBModels on remote DB, with an ensemble""" + """Test FSModels on remote FS, with an ensemble""" # Retrieve parameters from testing environment test_device = mlutils.get_test_device() @@ -301,9 +301,9 @@ def test_db_model_ensemble( # Create Model smartsim_model = wlm_experiment.create_application("smartsim_model", run_settings) - # Create database - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + # Create feature store + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() @@ -352,9 +352,9 @@ def 
test_db_model_ensemble( ) # Assert we have added one model to the ensemble - assert len(smartsim_ensemble._db_models) == 1 + assert len(smartsim_ensemble._fs_models) == 1 # Assert we have added two models to each entity - assert all([len(entity._db_models) == 2 for entity in smartsim_ensemble]) + assert all([len(entity._fs_models) == 2 for entity in smartsim_ensemble]) wlm_experiment.generate(smartsim_ensemble) @@ -367,11 +367,11 @@ def test_db_model_ensemble( @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): - """Test DB Models on colocated DB (TensorFlow backend)""" +def test_colocated_fs_model_tf(fileutils, test_dir, wlmutils, mlutils): + """Test fs Models on colocated fs (TensorFlow backend)""" # Set experiment name - exp_name = "test-colocated-db-model-tf" + exp_name = "test-colocated-fs-model-tf" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -392,8 +392,8 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): # Create colocated Model colo_model = exp.create_application("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -423,7 +423,7 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_model._db_models) == 2 + assert len(colo_model._fs_models) == 2 exp.generate(colo_model) @@ -439,11 +439,11 @@ def test_colocated_db_model_tf(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_pt, reason="Test needs PyTorch to run") -def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): - """Test DB Models on colocated DB (PyTorch backend)""" +def 
test_colocated_fs_model_pytorch(fileutils, test_dir, wlmutils, mlutils): + """Test fs Models on colocated fs (PyTorch backend)""" # Set experiment name - exp_name = "test-colocated-db-model-pytorch" + exp_name = "test-colocated-fs-model-pytorch" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -464,8 +464,8 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_application("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save ML model to filesystem @@ -483,7 +483,7 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_model._db_models) == 1 + assert len(colo_model._fs_models) == 1 exp.generate(colo_model) @@ -499,13 +499,13 @@ def test_colocated_db_model_pytorch(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DBModel on colocated ensembles, first colocating DB, - then adding DBModel. +def test_colocated_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test fsModel on colocated ensembles, first colocating fs, + then adding fsModel. 
""" # Set experiment name - exp_name = "test-colocated-db-model-ensemble" + exp_name = "test-colocated-fs-model-ensemble" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -529,20 +529,20 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): "colocated_ens", run_settings=colo_settings, replicas=2 ) - # Create a third model with a colocated database + # Create a third model with a colocated feature store colo_model = exp.create_application("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create and save the ML models to the filesystem model_file, inputs, outputs = save_tf_cnn(test_dir, "model1.pb") model_file2, inputs2, outputs2 = save_tf_cnn(test_dir, "model2.pb") - # Colocate a database with the ensemble with two ensemble members + # Colocate a feature store with the ensemble with two ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i + 1, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i + 1, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML model to each ensemble member individual to test that they # do not conflict with models add to the Ensemble object @@ -600,13 +600,13 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): - """Test DBModel on colocated ensembles, first adding the DBModel to the - ensemble, then colocating DB. +def test_colocated_fs_model_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): + """Test fsModel on colocated ensembles, first adding the fsModel to the + ensemble, then colocating fs. 
""" # Set experiment name - exp_name = "test-colocated-db-model-ensemble-reordered" + exp_name = "test-colocated-fs-model-ensemble-reordered" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -649,10 +649,10 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml outputs=outputs, ) - # Colocate a database with the first ensemble members + # Colocate a feature store with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML models to each ensemble member to make sure they # do not conflict with other ML models @@ -671,10 +671,10 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml # Add another ensemble member colo_ensemble.add_application(colo_model) - # Colocate a database with the new ensemble member - colo_model.colocate_db_tcp( + # Colocate a feature store with the new ensemble member + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble) - 1, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -704,11 +704,11 @@ def test_colocated_db_model_ensemble_reordered(fileutils, test_dir, wlmutils, ml @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): - """Test error when colocated db model has no file.""" +def test_colocated_fs_model_errors(fileutils, test_dir, wlmutils, mlutils): + """Test error when colocated fs model has no file.""" # Set experiment name - exp_name = "test-colocated-db-model-error" + exp_name = "test-colocated-fs-model-error" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -729,8 +729,8 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Create 
colocated SmartSim Model colo_model = exp.create_application("colocated_model", colo_settings) - colo_model.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_model.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Get and save TF model @@ -755,10 +755,10 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): "colocated_ens", run_settings=colo_settings, replicas=2 ) - # Colocate a db with each ensemble member + # Colocate a fs with each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Check that an error is raised because in-memory models @@ -777,11 +777,11 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Check error is still thrown if an in-memory model is used # with a colocated deployment. This test varies by adding - # the SmartSIm model with a colocated database to the ensemble + # the SmartSIm model with a colocated feature store to the ensemble # after the ML model was been added to the ensemble. 
colo_settings2 = exp.create_run_settings(exe=sys.executable, exe_args=test_script) - # Reverse order of DBModel and model + # Reverse order of fsModel and model colo_ensemble2 = exp.create_ensemble( "colocated_ens", run_settings=colo_settings2, replicas=2 ) @@ -797,9 +797,9 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): ) for i, entity in enumerate(colo_ensemble2): with pytest.raises(SSUnsupportedError): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -809,13 +809,13 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run_tf, reason="Test needs TensorFlow to run") -def test_inconsistent_params_db_model(): - """Test error when devices_per_node parameter>1 when devices is set to CPU in DBModel""" +def test_inconsistent_params_fs_model(): + """Test error when devices_per_node parameter>1 when devices is set to CPU in fsModel""" # Create and save ML model to filesystem model, inputs, outputs = create_tf_cnn() with pytest.raises(SSUnsupportedError) as ex: - DBModel( + FSModel( "cnn", "TF", model=model, @@ -833,11 +833,11 @@ def test_inconsistent_params_db_model(): @pytest.mark.skipif(not should_run_tf, reason="Test needs TF to run") -def test_db_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): - """Test DBModels on remote DB, with an ensemble""" +def test_fs_model_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test fsModels on remote fs, with an ensemble""" # Set experiment name - exp_name = "test-db-model-ensemble-duplicate" + exp_name = "test-fs-model-ensemble-duplicate" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() diff --git a/tests/_legacy/backends/test_dbscript.py b/tests/_legacy/backends/test_dbscript.py index 66b71baca..b9c27b8a0 100644 --- a/tests/_legacy/backends/test_dbscript.py +++ 
b/tests/_legacy/backends/test_dbscript.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.entity.dbobject import DBScript +from smartsim.entity.dbobject import FSScript from smartsim.error.errors import SSUnsupportedError from smartsim.log import get_logger from smartsim.settings import MpiexecSettings, MpirunSettings @@ -42,7 +42,7 @@ should_run = True -supported_dbs = ["uds", "tcp"] +supported_fss = ["uds", "tcp"] try: import torch @@ -57,8 +57,8 @@ def timestwo(x): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script(wlm_experiment, prepare_db, single_db, fileutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script(wlm_experiment, prepare_fs, single_fs, fileutils, mlutils): + """Test FS scripts on remote FS""" test_device = mlutils.get_test_device() test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 @@ -78,9 +78,9 @@ def test_db_script(wlm_experiment, prepare_db, single_db, fileutils, mlutils): "smartsim_application", run_settings ) - # Create the SmartSim database - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + # Create the SmartSim feature store + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) wlm_experiment.generate(smartsim_application) # Define the torch script string @@ -114,7 +114,7 @@ def test_db_script(wlm_experiment, prepare_db, single_db, fileutils, mlutils): ) # Assert we have all three scripts - assert len(smartsim_application._db_scripts) == 3 + assert len(smartsim_application._fs_scripts) == 3 # Launch and check successful completion wlm_experiment.start(smartsim_application, block=True) @@ -123,11 +123,11 @@ def test_db_script(wlm_experiment, prepare_db, single_db, fileutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def 
test_db_script_ensemble(wlm_experiment, prepare_db, single_db, fileutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script_ensemble(wlm_experiment, prepare_fs, single_fs, fileutils, mlutils): + """Test FS scripts on remote FS""" # Set wlm_experimenteriment name - wlm_experiment_name = "test-db-script" + wlm_experiment_name = "test-fs-script" # Retrieve parameters from testing environment test_device = mlutils.get_test_device() @@ -143,12 +143,12 @@ def test_db_script_ensemble(wlm_experiment, prepare_db, single_db, fileutils, ml run_settings.set_nodes(1) run_settings.set_tasks(1) - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) # Create Ensemble with two identical applications ensemble = wlm_experiment.create_ensemble( - "dbscript_ensemble", run_settings=run_settings, replicas=2 + "fsscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim application @@ -199,10 +199,10 @@ def test_db_script_ensemble(wlm_experiment, prepare_db, single_db, fileutils, ml ) # Assert we have added both models to the ensemble - assert len(ensemble._db_scripts) == 2 + assert len(ensemble._fs_scripts) == 2 # Assert we have added all three models to entities in ensemble - assert all([len(entity._db_scripts) == 3 for entity in ensemble]) + assert all([len(entity._fs_scripts) == 3 for entity in ensemble]) wlm_experiment.generate(ensemble) @@ -212,11 +212,11 @@ def test_db_script_ensemble(wlm_experiment, prepare_db, single_db, fileutils, ml @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts on colocated DB""" +def test_colocated_fs_script(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs""" # Set the experiment name - exp_name = "test-colocated-db-script" + exp_name = 
"test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -236,10 +236,10 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): colo_settings.set_nodes(1) colo_settings.set_tasks(1) - # Create application with colocated database + # Create application with colocated feature store colo_application = exp.create_application("colocated_application", colo_settings) - colo_application.colocate_db_tcp( - port=test_port, db_cpus=1, debug=True, ifname=test_interface + colo_application.colocate_fs_tcp( + port=test_port, fs_cpus=1, debug=True, ifname=test_interface ) # Create string for script creation @@ -263,12 +263,12 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added both models - assert len(colo_application._db_scripts) == 2 + assert len(colo_application._fs_scripts) == 2 exp.generate(colo_application) - for db_script in colo_application._db_scripts: - logger.debug(db_script) + for fs_script in colo_application._fs_scripts: + logger.debug(fs_script) try: exp.start(colo_application, block=True) @@ -279,13 +279,13 @@ def test_colocated_db_script(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts on colocated DB from ensemble, first colocating DB, +def test_colocated_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs from ensemble, first colocating fs, then adding script. 
""" # Set experiment name - exp_name = "test-colocated-db-script" + exp_name = "test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -313,13 +313,13 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create a SmartSim application colo_application = exp.create_application("colocated_application", colo_settings) - # Colocate a db with each ensemble entity and add a script + # Colocate a fs with each ensemble entity and add a script # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -332,10 +332,10 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): first_device=0, ) - # Colocate a db with the non-ensemble Application - colo_application.colocate_db_tcp( + # Colocate a feature store with the non-ensemble Application + colo_application.colocate_fs_tcp( port=test_port + len(colo_ensemble), - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -363,9 +363,9 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): ) # Assert we have added one application to the ensemble - assert len(colo_ensemble._db_scripts) == 1 + assert len(colo_ensemble._fs_scripts) == 1 # Assert we have added both applications to each entity - assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) exp.generate(colo_ensemble) @@ -379,12 +379,12 @@ def test_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts on colocated DB from ensemble, first adding the - script to the 
ensemble, then colocating the DB""" +def test_colocated_fs_script_ensemble_reordered(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts on colocated fs from ensemble, first adding the + script to the ensemble, then colocating the fs""" # Set Experiment name - exp_name = "test-colocated-db-script-reord" + exp_name = "test-colocated-fs-script-reord" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -422,13 +422,13 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m first_device=0, ) - # Add a colocated database to the ensemble members + # Add a colocated feature store to the ensemble members # and then add a script via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -441,10 +441,10 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m first_device=0, ) - # Add a colocated database to the non-ensemble SmartSim Application - colo_application.colocate_db_tcp( + # Add a colocated feature store to the non-ensemble SmartSim Application + colo_application.colocate_fs_tcp( port=test_port + len(colo_ensemble), - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -461,9 +461,9 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m ) # Assert we have added one application to the ensemble - assert len(colo_ensemble._db_scripts) == 1 + assert len(colo_ensemble._fs_scripts) == 1 # Assert we have added both applications to each entity - assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) exp.generate(colo_ensemble) @@ -477,11 +477,11 @@ def test_colocated_db_script_ensemble_reordered(fileutils, test_dir, wlmutils, m @pytest.mark.skipif(not should_run, reason="Test 
needs Torch to run") -def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): - """Test DB Scripts error when setting a serialized function on colocated DB""" +def test_fs_script_errors(fileutils, test_dir, wlmutils, mlutils): + """Test fs Scripts error when setting a serialized function on colocated fs""" # Set Experiment name - exp_name = "test-colocated-db-script" + exp_name = "test-colocated-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -500,11 +500,11 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): colo_settings.set_nodes(1) colo_settings.set_tasks(1) - # Create a SmartSim application with a colocated database + # Create a SmartSim application with a colocated feature store colo_application = exp.create_application("colocated_application", colo_settings) - colo_application.colocate_db_tcp( + colo_application.colocate_fs_tcp( port=test_port, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -526,17 +526,17 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): "colocated_ensemble", run_settings=colo_settings, replicas=2 ) - # Add a colocated database for each ensemble member + # Add a colocated feature store for each ensemble member for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) # Check that an exception is raised when adding an in-memory - # function to the ensemble with colocated databases + # function to the ensemble with colocated feature stores with pytest.raises(SSUnsupportedError): colo_ensemble.add_function( "test_func", @@ -562,31 +562,31 @@ def test_db_script_errors(fileutils, test_dir, wlmutils, mlutils): ) # Check that an error is raised when trying to add - # a colocated database to ensemble members that have + # a colocated feature store to ensemble members that have # an in-memory script for i, 
entity in enumerate(colo_ensemble): with pytest.raises(SSUnsupportedError): - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) # Check that an error is raised when trying to add - # a colocated database to an Ensemble that has + # a colocated feature store to an Ensemble that has # an in-memory script with pytest.raises(SSUnsupportedError): colo_ensemble.add_application(colo_application) -def test_inconsistent_params_db_script(fileutils): - """Test error when devices_per_node>1 and when devices is set to CPU in DBScript constructor""" +def test_inconsistent_params_fs_script(fileutils): + """Test error when devices_per_node>1 and when devices is set to CPU in FSScript constructor""" torch_script = fileutils.get_test_conf_path("torchscript.py") with pytest.raises(SSUnsupportedError) as ex: - _ = DBScript( - name="test_script_db", + _ = FSScript( + name="test_script_fs", script_path=torch_script, device="CPU", devices_per_node=2, @@ -597,8 +597,8 @@ def test_inconsistent_params_db_script(fileutils): == "Cannot set devices_per_node>1 if CPU is specified under devices" ) with pytest.raises(SSUnsupportedError) as ex: - _ = DBScript( - name="test_script_db", + _ = FSScript( + name="test_script_fs", script_path=torch_script, device="CPU", devices_per_node=1, @@ -611,11 +611,11 @@ def test_inconsistent_params_db_script(fileutils): @pytest.mark.skipif(not should_run, reason="Test needs Torch to run") -def test_db_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): - """Test DB scripts on remote DB""" +def test_fs_script_ensemble_duplicate(fileutils, test_dir, wlmutils, mlutils): + """Test fs scripts on remote fs""" # Set experiment name - exp_name = "test-db-script" + exp_name = "test-fs-script" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -637,7 +637,7 @@ def test_db_script_ensemble_duplicate(fileutils, test_dir, wlmutils, 
mlutils): # Create Ensemble with two identical applications ensemble = exp.create_ensemble( - "dbscript_ensemble", run_settings=run_settings, replicas=2 + "fsscript_ensemble", run_settings=run_settings, replicas=2 ) # Create SmartSim application diff --git a/tests/_legacy/backends/test_onnx.py b/tests/_legacy/backends/test_onnx.py index dd666e567..2c196df29 100644 --- a/tests/_legacy/backends/test_onnx.py +++ b/tests/_legacy/backends/test_onnx.py @@ -57,8 +57,8 @@ ) -def test_sklearn_onnx(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): - """This test needs two free nodes, 1 for the db and 1 some sklearn models +def test_sklearn_onnx(wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils): + """This test needs two free nodes, 1 for the fs and 1 some sklearn models here we test the following sklearn models: - LinearRegression @@ -75,8 +75,8 @@ def test_sklearn_onnx(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ test_device = mlutils.get_test_device() - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) run_settings = wlm_experiment.create_run_settings( sys.executable, f"run_sklearn_onnx.py --device={test_device}" diff --git a/tests/_legacy/backends/test_tf.py b/tests/_legacy/backends/test_tf.py index 3d94f2008..52f5bea95 100644 --- a/tests/_legacy/backends/test_tf.py +++ b/tests/_legacy/backends/test_tf.py @@ -50,7 +50,7 @@ (not tf_backend_available) or (not tf_available), reason="Requires RedisAI TF backend", ) -def test_keras_model(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): +def test_keras_model(wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils): """This test needs two free nodes, 1 for the db and 1 for a keras model script this test can run on CPU/GPU by setting SMARTSIM_TEST_DEVICE=GPU @@ -61,8 
+61,8 @@ def test_keras_model(wlm_experiment, prepare_db, single_db, mlutils, wlmutils): """ test_device = mlutils.get_test_device() - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) run_settings = wlm_experiment.create_run_settings( "python", f"run_tf.py --device={test_device}" diff --git a/tests/_legacy/backends/test_torch.py b/tests/_legacy/backends/test_torch.py index 8008fa719..196ae96e4 100644 --- a/tests/_legacy/backends/test_torch.py +++ b/tests/_legacy/backends/test_torch.py @@ -49,9 +49,9 @@ def test_torch_model_and_script( - wlm_experiment, prepare_db, single_db, mlutils, wlmutils + wlm_experiment, prepare_fs, single_fs, mlutils, wlmutils ): - """This test needs two free nodes, 1 for the db and 1 for a torch model script + """This test needs two free nodes, 1 for the fs and 1 for a torch model script Here we test both the torchscipt API and the NN API from torch @@ -62,8 +62,8 @@ def test_torch_model_and_script( You may need to put CUDNN in your LD_LIBRARY_PATH if running on GPU """ - db = prepare_db(single_db).orchestrator - wlm_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(single_fs).featurestore + wlm_experiment.reconnect_feature_store(fs.checkpoint_file) test_device = mlutils.get_test_device() run_settings = wlm_experiment.create_run_settings( diff --git a/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py b/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py index 2a5627d6d..b437303b5 100644 --- a/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/_legacy/full_wlm/test_generic_orc_launch_batch.py @@ -40,7 +40,7 @@ if (pytest.test_launcher == "pbs") and (not pytest.has_aprun): pytestmark = pytest.mark.skip( - reason="Launching orchestrators in a batch job is not supported on PBS without ALPS" + reason="Launching feature stores in a batch job is 
not supported on PBS without ALPS" ) @@ -53,179 +53,180 @@ def add_batch_resources(wlmutils, batch_settings): batch_settings.set_resource(key, value) -def test_launch_orc_auto_batch(test_dir, wlmutils): - """test single node orchestrator""" +def test_launch_feature_store_auto_batch(test_dir, wlmutils): + """test single node feature store""" launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-orc-batch" + exp_name = "test-launch-auto-feature-store-batch" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), batch=True, interface=network_interface, single_cmd=False, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) - add_batch_resources(wlmutils, orc.batch_settings) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - orc.batch_settings.set_walltime("00:05:00") - orc.set_path(test_dir) + feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) # don't use assert so that we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_single(test_dir, wlmutils): - """test clustered 3-node orchestrator with single command""" +def test_launch_cluster_feature_store_batch_single(test_dir, wlmutils): + """test clustered 3-node feature store with single command""" # TODO detect number of 
nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-batch-single" + exp_name = "test-launch-auto-cluster-feature-store-batch-single" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface=network_interface, single_cmd=True, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) - add_batch_resources(wlmutils, orc.batch_settings) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - orc.batch_settings.set_walltime("00:05:00") - orc.set_path(test_dir) + feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): - """test clustered 3-node orchestrator""" +def test_launch_cluster_feature_store_batch_multi(test_dir, wlmutils): + """test clustered 3-node feature store""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-batch-multi" + exp_name = 
"test-launch-auto-cluster-feature-store-batch-multi" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface=network_interface, single_cmd=False, ) - orc.batch_settings.set_account(wlmutils.get_test_account()) - add_batch_resources(wlmutils, orc.batch_settings) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - orc.batch_settings.set_walltime("00:05:00") - orc.set_path(test_dir) + feature_store.batch_settings.set_walltime("00:05:00") + feature_store.set_path(test_dir) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_reconnect(test_dir, wlmutils): - """test reconnecting to clustered 3-node orchestrator""" +def test_launch_cluster_feature_store_reconnect(test_dir, wlmutils): + """test reconnecting to clustered 3-node feature store""" p_test_dir = pathlib.Path(test_dir) launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-cluster-orc-batch-reconect" + exp_name = "test-launch-cluster-feature-store-batch-reconect" exp_1_dir = p_test_dir / exp_name exp_1_dir.mkdir() exp = Experiment(exp_name, launcher=launcher, exp_path=str(exp_1_dir)) # batch = False to launch on existing 
allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( - wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface + feature_store = exp.create_feature_store( + wlmutils.get_test_port(), fs_nodes=3, batch=True, interface=network_interface ) - orc.batch_settings.set_account(wlmutils.get_test_account()) - add_batch_resources(wlmutils, orc.batch_settings) + feature_store.batch_settings.set_account(wlmutils.get_test_account()) + add_batch_resources(wlmutils, feature_store.batch_settings) - orc.batch_settings.set_walltime("00:05:00") + feature_store.batch_settings.set_walltime("00:05:00") - exp.start(orc, block=True) + exp.start(feature_store, block=True) - statuses = exp.get_status(orc) + statuses = exp.get_status(feature_store) try: assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) except Exception: - exp.stop(orc) + exp.stop(feature_store) raise - exp_name = "test-orc-cluster-orc-batch-reconnect-2nd" + exp_name = "test-feature_store-cluster-feature-store-batch-reconnect-2nd" exp_2_dir = p_test_dir / exp_name exp_2_dir.mkdir() exp_2 = Experiment(exp_name, launcher=launcher, exp_path=str(exp_2_dir)) try: - checkpoint = osp.join(orc.path, "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + checkpoint = osp.join(feature_store.path, "smartsim_db.dat") + reloaded_feature_store = exp_2.reconnect_feature_store(checkpoint) # let statuses update once time.sleep(5) - statuses = exp_2.get_status(reloaded_orc) + statuses = exp_2.get_status(reloaded_feature_store) assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) except Exception: - # Something went wrong! Let the experiment that started the DB - # clean up the DB - exp.stop(orc) + # Something went wrong! 
Let the experiment that started the FS + # clean up the FS + exp.stop(feature_store) raise try: - # Test experiment 2 can stop the DB - exp_2.stop(reloaded_orc) + # Test experiment 2 can stop the FS + exp_2.stop(reloaded_feature_store) assert all( stat == SmartSimStatus.STATUS_CANCELLED - for stat in exp_2.get_status(reloaded_orc) + for stat in exp_2.get_status(reloaded_feature_store) ) except Exception: - # Something went wrong! Let the experiment that started the DB - # clean up the DB - exp.stop(orc) + # Something went wrong! Let the experiment that started the FS + # clean up the FS + exp.stop(feature_store) raise else: - # Ensure it is the same DB that Experiment 1 was tracking + # Ensure it is the same FS that Experiment 1 was tracking time.sleep(5) assert not any( - stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(orc) + stat == SmartSimStatus.STATUS_RUNNING + for stat in exp.get_status(feature_store) ) diff --git a/tests/_legacy/full_wlm/test_symlinking.py b/tests/_legacy/full_wlm/test_symlinking.py index b122c4172..feb5f25f3 100644 --- a/tests/_legacy/full_wlm/test_symlinking.py +++ b/tests/_legacy/full_wlm/test_symlinking.py @@ -146,13 +146,13 @@ def test_batch_application_symlinks(test_dir, wlmutils): ) -def test_batch_orchestrator_symlinks(test_dir, wlmutils): +def test_batch_feature_store_symlinks(test_dir, wlmutils): exp_name = "test-batch-orc" launcher = wlmutils.get_test_launcher() exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) port = 2424 - db = exp.create_database( - db_nodes=3, + db = exp.create_feature_store( + fs_nodes=3, port=port, batch=True, interface=wlmutils.get_test_interface(), @@ -166,7 +166,7 @@ def test_batch_orchestrator_symlinks(test_dir, wlmutils): _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.err"), False) - for i in range(db.db_nodes): + for i in range(db.fs_nodes): _should_be_symlinked(pathlib.Path(db.path, 
f"{db.name}_{i}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) _should_not_be_symlinked( diff --git a/tests/_legacy/on_wlm/test_colocated_model.py b/tests/_legacy/on_wlm/test_colocated_model.py index a615c91da..8ba0fdfc4 100644 --- a/tests/_legacy/on_wlm/test_colocated_model.py +++ b/tests/_legacy/on_wlm/test_colocated_model.py @@ -33,12 +33,12 @@ from smartsim.status import SmartSimStatus if sys.platform == "darwin": - supported_dbs = ["tcp", "deprecated"] + supported_fss = ["tcp", "deprecated"] else: - supported_dbs = ["uds", "tcp", "deprecated"] + supported_fss = ["uds", "tcp", "deprecated"] -# Set to true if DB logs should be generated for debugging -DEBUG_DB = False +# Set to true if fs logs should be generated for debugging +DEBUG_fs = False # retrieved from pytest fixtures launcher = pytest.test_launcher @@ -46,20 +46,20 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -@pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_colocated_application_defaults(fileutils, test_dir, coloutils, db_type): - """Test the launch of a application with a colocated database and local launcher""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_launch_colocated_application_defaults(fileutils, test_dir, coloutils, fs_type): + """Test the launch of a application with a colocated feature store and local launcher""" - db_args = {"debug": DEBUG_DB} + fs_args = {"debug": DEBUG_fs} exp = Experiment( "colocated_application_defaults", launcher=launcher, exp_path=test_dir ) colo_application = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) exp.generate(colo_application) - assert colo_application.run_settings.colocated_db_settings["custom_pinning"] == "0" + assert colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0" 
exp.start(colo_application, block=True) statuses = exp.get_status(colo_application) assert all( @@ -74,22 +74,22 @@ def test_launch_colocated_application_defaults(fileutils, test_dir, coloutils, d ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_application_disable_pinning(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_application_disable_pinning(fileutils, test_dir, coloutils, fs_type): exp = Experiment( "colocated_application_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 1, + fs_args = { + "fs_cpus": 1, "custom_pinning": [], - "debug": DEBUG_DB, + "debug": DEBUG_fs, } # Check to make sure that the CPU mask was correctly generated colo_application = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) - assert colo_application.run_settings.colocated_db_settings["custom_pinning"] is None + assert colo_application.run_settings.colocated_fs_settings["custom_pinning"] is None exp.generate(colo_application) exp.start(colo_application, block=True) statuses = exp.get_status(colo_application) @@ -98,9 +98,9 @@ def test_colocated_application_disable_pinning(fileutils, test_dir, coloutils, d ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_application_pinning_auto_2cpu( - fileutils, test_dir, coloutils, db_type + fileutils, test_dir, coloutils, fs_type ): exp = Experiment( "colocated_application_pinning_auto_2cpu", @@ -108,14 +108,14 @@ def test_colocated_application_pinning_auto_2cpu( exp_path=test_dir, ) - db_args = {"db_cpus": 2, "debug": DEBUG_DB} + fs_args = {"fs_cpus": 2, "debug": DEBUG_fs} # Check to make sure that the CPU mask was correctly generated colo_application = 
coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0,1" ) exp.generate(colo_application) exp.start(colo_application, block=True) @@ -125,8 +125,8 @@ def test_colocated_application_pinning_auto_2cpu( ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_application_pinning_range(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_application_pinning_range(fileutils, test_dir, coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # Assume that there are at least 4 cpus on the node @@ -136,13 +136,13 @@ def test_colocated_application_pinning_range(fileutils, test_dir, coloutils, db_ exp_path=test_dir, ) - db_args = {"db_cpus": 4, "custom_pinning": range(4), "debug": DEBUG_DB} + fs_args = {"fs_cpus": 4, "custom_pinning": range(4), "debug": DEBUG_fs} colo_application = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0,1,2,3" ) exp.generate(colo_application) @@ -153,8 +153,8 @@ def test_colocated_application_pinning_range(fileutils, test_dir, coloutils, db_ ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_application_pinning_list(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_application_pinning_list(fileutils, test_dir, 
coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this has more than 2 CPUs on the supercomputer node @@ -164,13 +164,13 @@ def test_colocated_application_pinning_list(fileutils, test_dir, coloutils, db_t exp_path=test_dir, ) - db_args = {"db_cpus": 2, "custom_pinning": [0, 2]} + fs_args = {"fs_cpus": 2, "custom_pinning": [0, 2]} colo_application = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] == "0,2" + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0,2" ) exp.generate(colo_application) exp.start(colo_application, block=True) @@ -180,8 +180,8 @@ def test_colocated_application_pinning_list(fileutils, test_dir, coloutils, db_t ), f"Statuses: {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) -def test_colocated_application_pinning_mixed(fileutils, test_dir, coloutils, db_type): +@pytest.mark.parametrize("fs_type", supported_fss) +def test_colocated_application_pinning_mixed(fileutils, test_dir, coloutils, fs_type): # Check to make sure that the CPU mask was correctly generated # note we presume that this at least 4 CPUs on the supercomputer node @@ -191,13 +191,13 @@ def test_colocated_application_pinning_mixed(fileutils, test_dir, coloutils, db_ exp_path=test_dir, ) - db_args = {"db_cpus": 2, "custom_pinning": [range(2), 3]} + fs_args = {"fs_cpus": 2, "custom_pinning": [range(2), 3]} colo_application = coloutils.setup_test_colo( - fileutils, db_type, exp, "send_data_local_smartredis.py", db_args, on_wlm=True + fileutils, fs_type, exp, "send_data_local_smartredis.py", fs_args, on_wlm=True ) assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] == "0,1,3" + 
colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0,1,3" ) exp.generate(colo_application) exp.start(colo_application, block=True) diff --git a/tests/_legacy/on_wlm/test_containers_wlm.py b/tests/_legacy/on_wlm/test_containers_wlm.py index 50b35dde1..12e3564d6 100644 --- a/tests/_legacy/on_wlm/test_containers_wlm.py +++ b/tests/_legacy/on_wlm/test_containers_wlm.py @@ -59,12 +59,12 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): "smartredis_ensemble_exchange", exp_path=test_dir, launcher=launcher ) - # create and start a database - orc = exp.create_database( + # create and start a feature store + feature_store = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() ) - exp.generate(orc) - exp.start(orc, block=False) + exp.generate(feature_store) + exp.start(feature_store, block=False) container = Singularity(containerURI) rs = exp.create_run_settings( @@ -93,10 +93,10 @@ def test_singularity_wlm_smartredis(fileutils, test_dir, wlmutils): # get and confirm statuses statuses = exp.get_status(ensemble) if not all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]): - exp.stop(orc) + exp.stop(feature_store) assert False # client ensemble failed - # stop the orchestrator - exp.stop(orc) + # stop the feature store + exp.stop(feature_store) print(exp.summary()) diff --git a/tests/_legacy/on_wlm/test_generic_orc_launch.py b/tests/_legacy/on_wlm/test_generic_orc_launch.py index cacdd5be5..fc475a7e2 100644 --- a/tests/_legacy/on_wlm/test_generic_orc_launch.py +++ b/tests/_legacy/on_wlm/test_generic_orc_launch.py @@ -34,16 +34,16 @@ pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") -def test_launch_orc_auto(test_dir, wlmutils): - """test single node orchestrator""" +def test_launch_feature_store_auto(test_dir, wlmutils): + """test single node feature store""" launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-orc" + exp_name = 
"test-launch-auto-feature_store" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), batch=False, interface=network_interface, @@ -51,78 +51,78 @@ def test_launch_orc_auto(test_dir, wlmutils): hosts=wlmutils.get_test_hostlist(), ) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) # don't use assert so that we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_single(test_dir, wlmutils): - """test clustered 3-node orchestrator with single command""" +def test_launch_cluster_feature_store_single(test_dir, wlmutils): + """test clustered 3-node feature store with single command""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-single" + exp_name = "test-launch-auto-cluster-feature_store-single" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an 
orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) -def test_launch_cluster_orc_multi(test_dir, wlmutils): - """test clustered 3-node orchestrator with multiple commands""" +def test_launch_cluster_feature_store_multi(test_dir, wlmutils): + """test clustered 3-node feature store with multiple commands""" # TODO detect number of nodes in allocation and skip if not sufficent launcher = wlmutils.get_test_launcher() - exp_name = "test-launch-auto-cluster-orc-multi" + exp_name = "test-launch-auto-cluster-feature-store-multi" exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=False, hosts=wlmutils.get_test_hostlist(), ) - exp.start(orc, block=True) - statuses = exp.get_status(orc) + exp.start(feature_store, block=True) + statuses = exp.get_status(feature_store) - # don't use assert so that orc we don't leave an orphan process + # don't use assert so that feature_store we don't leave an orphan process if SmartSimStatus.STATUS_FAILED in statuses: - exp.stop(orc) + exp.stop(feature_store) assert False - exp.stop(orc) - statuses = exp.get_status(orc) + exp.stop(feature_store) + statuses = exp.get_status(feature_store) assert all([stat == SmartSimStatus.STATUS_CANCELLED for stat in statuses]) diff --git a/tests/_legacy/on_wlm/test_het_job.py b/tests/_legacy/on_wlm/test_het_job.py index aeea7b474..459f2a952 100644 --- a/tests/_legacy/on_wlm/test_het_job.py +++ 
b/tests/_legacy/on_wlm/test_het_job.py @@ -63,19 +63,19 @@ def test_set_het_groups(monkeypatch, test_dir): rs.set_het_group([4]) -def test_orch_single_cmd(monkeypatch, wlmutils, test_dir): +def test_feature_store_single_cmd(monkeypatch, wlmutils, test_dir): """Test that single cmd is rejected in a heterogeneous job""" monkeypatch.setenv("SLURM_HET_SIZE", "1") - exp_name = "test-orch-single-cmd" + exp_name = "test-feature-store-single-cmd" exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=wlmutils.get_test_interface(), single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - for node in orc: + for node in feature_store: assert node.is_mpmd == False diff --git a/tests/_legacy/on_wlm/test_preview_wlm.py b/tests/_legacy/on_wlm/test_preview_wlm.py index 77cd938e6..bea865359 100644 --- a/tests/_legacy/on_wlm/test_preview_wlm.py +++ b/tests/_legacy/on_wlm/test_preview_wlm.py @@ -33,7 +33,7 @@ from smartsim import Experiment from smartsim._core import Manifest, previewrenderer from smartsim._core.config import CONFIG -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.settings import QsubBatchSettings, RunSettings pytestmark = pytest.mark.slow_tests @@ -62,44 +62,44 @@ def add_batch_resources(wlmutils, batch_settings): pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -def test_preview_wlm_run_commands_cluster_orc_model( +def test_preview_wlm_run_commands_cluster_feature_store_model( test_dir, coloutils, fileutils, wlmutils ): """ Test preview of wlm run command and run aruguments on a - orchestrator and model + feature store and model """ - exp_name = "test-preview-orc-model" + exp_name = "test-preview-feature-store-model" launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = 
fileutils.get_test_conf_path("smartredis/multidbid.py") exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) network_interface = wlmutils.get_test_interface() - orc = exp.create_database( + feature_store = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=True, hosts=wlmutils.get_test_hostlist(), - db_identifier="testdb_reg", + fs_identifier="testfs_reg", ) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testfs_colo", } # Create model with colocated database smartsim_model = coloutils.setup_test_colo( - fileutils, "uds", exp, test_script, db_args, on_wlm=on_wlm + fileutils, "uds", exp, test_script, fs_args, on_wlm=on_wlm ) - preview_manifest = Manifest(orc, smartsim_model) + preview_manifest = Manifest(feature_store, smartsim_model) # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") @@ -216,7 +216,7 @@ def test_preview_batch_ensemble(fileutils, test_dir, wlmutils): reason="Not testing WLM integrations", ) def test_preview_launch_command(test_dir, wlmutils, choose_host): - """Test preview launch command for orchestrator, models, and + """Test preview launch command for feature store, models, and ensembles""" # Prepare entities test_launcher = wlmutils.get_test_launcher() @@ -225,7 +225,7 @@ def test_preview_launch_command(test_dir, wlmutils, choose_host): exp_name = "test_preview_launch_command" exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) # create regular database - orc = exp.create_database( + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), @@ -256,12 +256,14 @@ def test_preview_launch_command(test_dir, wlmutils, choose_host): n_models=4, ) - preview_manifest = Manifest(orc, spam_eggs_model, hello_world_model, ensemble) + 
preview_manifest = Manifest( + feature_store, spam_eggs_model, hello_world_model, ensemble + ) # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") - assert "orchestrator" in output + assert "feature store" in output assert "echo-spam" in output assert "echo-hello" in output @@ -293,17 +295,17 @@ def test_preview_batch_launch_command(fileutils, test_dir, wlmutils): ) model.set_path(test_dir) - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface="lo", launcher="slurm", run_command="srun", ) - orc.set_batch_arg("account", "ACCOUNT") + feature_store.set_batch_arg("account", "ACCOUNT") - preview_manifest = Manifest(orc, model) + preview_manifest = Manifest(feature_store, model) # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") @@ -326,9 +328,9 @@ def test_ensemble_batch(test_dir, wlmutils): exp = Experiment( "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher ) - # Create Orchestrator - db = exp.create_database(port=6780, interface="lo") - exp.generate(db, overwrite=True) + # Create feature store + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) rs1 = exp.create_run_settings("echo", ["hello", "world"]) # Create ensemble batch_settings = exp.create_batch_settings(nodes=1, time="00:01:00") @@ -349,15 +351,15 @@ def test_ensemble_batch(test_dir, wlmutils): exp.generate(ml_model, overwrite=True) - preview_manifest = Manifest(db, ml_model, ensemble) + preview_manifest = Manifest(fs, ml_model, ensemble) # Call preview renderer for testing output output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") # Evaluate output assert "Client Configuration" in output - assert "Database Identifier" in output - assert "Database Backend" in output + assert "Feature Store Identifier" in output + assert "Feature 
Store Backend" in output assert "Type" in output @@ -365,7 +367,7 @@ def test_ensemble_batch(test_dir, wlmutils): pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -def test_preview_ensemble_db_script(wlmutils, test_dir): +def test_preview_ensemble_fs_script(wlmutils, test_dir): """ Test preview of a torch script on a model in an ensemble. """ @@ -373,8 +375,8 @@ def test_preview_ensemble_db_script(wlmutils, test_dir): test_launcher = wlmutils.get_test_launcher() exp = Experiment("getting-started", launcher=test_launcher) - orch = exp.create_database(db_identifier="test_db1") - orch_2 = exp.create_database(db_identifier="test_db2", db_nodes=3) + feature_store = exp.create_feature_store(fs_identifier="test_fs1") + feature_store_2 = exp.create_feature_store(fs_identifier="test_fs2", fs_nodes=3) # Initialize a RunSettings object model_settings = exp.create_run_settings(exe="python", exe_args="params.py") model_settings_2 = exp.create_run_settings(exe="python", exe_args="params.py") @@ -400,7 +402,7 @@ def test_preview_ensemble_db_script(wlmutils, test_dir): devices_per_node=2, first_device=0, ) - preview_manifest = Manifest(ensemble, orch, orch_2) + preview_manifest = Manifest(ensemble, feature_store, feature_store_2) # Call preview renderer for testing output output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") diff --git a/tests/_legacy/on_wlm/test_wlm_orc_config_settings.py b/tests/_legacy/on_wlm/test_wlm_orc_config_settings.py index c74f2a497..f4f14fbb7 100644 --- a/tests/_legacy/on_wlm/test_wlm_orc_config_settings.py +++ b/tests/_legacy/on_wlm/test_wlm_orc_config_settings.py @@ -43,61 +43,61 @@ pytestmark = pytest.mark.skip(reason="SmartRedis version is < 0.3.1") -def test_config_methods_on_wlm_single(dbutils, prepare_db, single_db): +def test_config_methods_on_wlm_single(fsutils, prepare_db, single_db): """Test all configuration file edit methods on single node WLM db""" - db = 
prepare_db(single_db).orchestrator + fs = prepare_fs(single_fs).featurestore # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): logger.debug(f"Setting {setting}={value}") - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) - # ensure SmartSimError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + # ensure SmartSimError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) - # ensure TypeError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - type_error_configs = dbutils.get_type_error_db_configs() + # ensure TypeError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) -def test_config_methods_on_wlm_cluster(dbutils, prepare_db, clustered_db): +def test_config_methods_on_wlm_cluster(fsutils, prepare_fs, clustered_fs): """Test all configuration file edit methods on an active clustered db""" - db = prepare_db(clustered_db).orchestrator + fs = prepare_fs(clustered_fs).featurestore # test the happy path and ensure all configuration file edit 
methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): logger.debug(f"Setting {setting}={value}") - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) config_set_method(value) - # ensure SmartSimError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + # ensure SmartSimError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): logger.debug(f"Setting {key}={value}") - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) - # ensure TypeError is raised when a clustered database's - # Orchestrator.set_db_conf is given invalid CONFIG key-value pairs - type_error_configs = dbutils.get_type_error_db_configs() + # ensure TypeError is raised when a clustered feature store's + # FeatureStore.set_fs_conf is given invalid CONFIG key-value pairs + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): logger.debug(f"Setting {key}={value}") - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) diff --git a/tests/_legacy/test_alps_settings.py b/tests/_legacy/test_alps_settings.py index b3c4c3bdb..f96d0e60d 100644 --- a/tests/_legacy/test_alps_settings.py +++ b/tests/_legacy/test_alps_settings.py @@ -67,7 +67,7 @@ def test_aprun_add_mpmd(): def test_catch_colo_mpmd(): settings = AprunSettings("python") - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} 
settings_2 = AprunSettings("python") with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/_legacy/test_cli.py b/tests/_legacy/test_cli.py index 710a9a659..397f1196c 100644 --- a/tests/_legacy/test_cli.py +++ b/tests/_legacy/test_cli.py @@ -232,7 +232,7 @@ def test_cli_command_execution(capsys): exp_b_help = "this is my mock help text for build" exp_b_cmd = "build" - dbcli_exec = lambda x, y: mock_execute_custom(msg="Database", good=True) + dbcli_exec = lambda x, y: mock_execute_custom(msg="FeatureStore", good=True) build_exec = lambda x, y: mock_execute_custom(msg="Builder", good=True) menu = [ @@ -249,7 +249,7 @@ def test_cli_command_execution(capsys): captured = capsys.readouterr() # capture new output # show that `smart dbcli` calls the build parser and build execute function - assert "Database" in captured.out + assert "FeatureStore" in captured.out assert ret_val == 0 build_args = ["smart", exp_b_cmd] @@ -670,13 +670,13 @@ def mock_operation(*args, **kwargs) -> int: def test_cli_full_dbcli_execute(capsys, monkeypatch): """Ensure that the execute method of dbcli is called""" exp_retval = 0 - exp_output = "mocked-get_db_path utility" + exp_output = "mocked-get_fs_path utility" def mock_operation(*args, **kwargs) -> int: return exp_output - # mock out the internal get_db_path method so we don't actually do file system ops - monkeypatch.setattr(smartsim._core._cli.dbcli, "get_db_path", mock_operation) + # mock out the internal get_fs_path method so we don't actually do file system ops + monkeypatch.setattr(smartsim._core._cli.dbcli, "get_fs_path", mock_operation) command = "dbcli" cfg = MenuItemConfig(command, f"test {command} help text", dbcli_execute) @@ -703,7 +703,7 @@ def mock_operation(*args, **kwargs) -> int: print(exp_output) return exp_retval - # mock out the internal get_db_path method so we don't actually do file system ops + # mock out the internal get_fs_path method so we don't actually do file system ops 
monkeypatch.setattr(smartsim._core._cli.site, "get_install_path", mock_operation) command = "site" @@ -731,9 +731,11 @@ def mock_operation(*args, **kwargs) -> int: print(exp_output) return exp_retval - # mock out the internal get_db_path method so we don't actually do file system ops + # mock out the internal get_fs_path method so we don't actually do file system ops monkeypatch.setattr(smartsim._core._cli.build, "tabulate", mock_operation) - monkeypatch.setattr(smartsim._core._cli.build, "build_database", mock_operation) + monkeypatch.setattr( + smartsim._core._cli.build, "build_feature_store", mock_operation + ) monkeypatch.setattr(smartsim._core._cli.build, "build_redis_ai", mock_operation) monkeypatch.setattr( smartsim._core._cli.build, "check_py_torch_version", mock_operation diff --git a/tests/_legacy/test_collector_manager.py b/tests/_legacy/test_collector_manager.py index 7cc475afe..98e87c2ad 100644 --- a/tests/_legacy/test_collector_manager.py +++ b/tests/_legacy/test_collector_manager.py @@ -246,13 +246,13 @@ async def test_collector_manager_collect_filesink( @pytest.mark.asyncio async def test_collector_manager_collect_integration( - test_dir: str, mock_entity: MockCollectorEntityFunc, prepare_db, local_db, mock_sink + test_dir: str, mock_entity: MockCollectorEntityFunc, prepare_fs, local_fs, mock_sink ) -> None: """Ensure that all collectors are executed and some metric is retrieved""" - db = prepare_db(local_db).orchestrator - entity1 = mock_entity(port=db.ports[0], name="e1", telemetry_on=True) - entity2 = mock_entity(port=db.ports[0], name="e2", telemetry_on=True) + fs = prepare_fs(local_fs).featurestore + entity1 = mock_entity(port=fs.ports[0], name="e1", telemetry_on=True) + entity2 = mock_entity(port=fs.ports[0], name="e2", telemetry_on=True) # todo: consider a MockSink so i don't have to save the last value in the collector sinks = [mock_sink(), mock_sink(), mock_sink()] @@ -341,20 +341,20 @@ async def snooze() -> None: 
pytest.param("application", True, id="applications, telemetry enabled"), pytest.param("ensemble", False, id="ensemble"), pytest.param("ensemble", True, id="ensemble, telemetry enabled"), - pytest.param("orchestrator", False, id="orchestrator"), - pytest.param("orchestrator", True, id="orchestrator, telemetry enabled"), - pytest.param("dbnode", False, id="dbnode"), - pytest.param("dbnode", True, id="dbnode, telemetry enabled"), + pytest.param("featurestore", False, id="featurestore"), + pytest.param("featurestore", True, id="featurestore, telemetry enabled"), + pytest.param("fsnode", False, id="fsnode"), + pytest.param("fsnode", True, id="fsnode, telemetry enabled"), ], ) @pytest.mark.asyncio -async def test_collector_manager_find_nondb( +async def test_collector_manager_find_nonfs( mock_entity: MockCollectorEntityFunc, e_type: str, telemetry_on: bool, ) -> None: """Ensure that the number of collectors returned for entity types match expectations - NOTE: even orchestrator returns 0 mapped collectors because no collector output + NOTE: even featurestore returns 0 mapped collectors because no collector output paths are set on the entity""" entity = mock_entity(port=1234, name="e1", type=e_type, telemetry_on=telemetry_on) manager = CollectorManager(timeout_ms=10000) @@ -383,7 +383,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # 1. ensure DBConnectionCountCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["client"] = "mock/path.csv" manager = CollectorManager() @@ -397,7 +397,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # 3. 
ensure DBConnectionCountCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["client_count"] = "mock/path.csv" manager = CollectorManager() @@ -411,7 +411,7 @@ async def test_collector_manager_find_db(mock_entity: MockCollectorEntityFunc) - # ensure DbMemoryCollector is mapped entity = mock_entity( - port=1234, name="entity1", type="orchestrator", telemetry_on=True + port=1234, name="entity1", type="featurestore", telemetry_on=True ) entity.collectors["memory"] = "mock/path.csv" manager = CollectorManager() @@ -429,7 +429,7 @@ async def test_collector_manager_find_entity_disabled( mock_entity: MockCollectorEntityFunc, ) -> None: """Ensure that disabling telemetry on the entity results in no collectors""" - entity: JobEntity = mock_entity(port=1234, name="entity1", type="orchestrator") + entity: JobEntity = mock_entity(port=1234, name="entity1", type="featurestore") # set paths for all known collectors entity.collectors["client"] = "mock/path.csv" diff --git a/tests/_legacy/test_collectors.py b/tests/_legacy/test_collectors.py index 2eb61d62d..fdc8f6780 100644 --- a/tests/_legacy/test_collectors.py +++ b/tests/_legacy/test_collectors.py @@ -42,7 +42,7 @@ # The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -PrepareDB = t.Callable[[dict], smartsim.experiment.Orchestrator] +PrepareFS = t.Callable[[dict], smartsim.experiment.FeatureStore] @pytest.mark.asyncio @@ -173,15 +173,15 @@ async def test_dbmemcollector_collect( async def test_dbmemcollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - prepare_db: PrepareDB, - local_db: dict, + prepare_fs: PrepareFS, + local_fs: dict, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Integration test with a real orchestrator instance to ensure + """Integration test with a real feature store instance to ensure output data 
matches expectations and proper db client API uage""" - db = prepare_db(local_db).orchestrator - entity = mock_entity(port=db.ports[0], telemetry_on=True) + fs = prepare_fs(local_fs).featurestore + entity = mock_entity(port=fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBMemoryCollector(entity, sink) @@ -273,15 +273,15 @@ async def test_dbconn_count_collector_collect( async def test_dbconncollector_integration( mock_entity: MockCollectorEntityFunc, mock_sink: MockSink, - prepare_db: PrepareDB, - local_db: dict, + prepare_fs: PrepareFS, + local_fs: dict, monkeypatch: pytest.MonkeyPatch, ) -> None: - """Integration test with a real orchestrator instance to ensure + """Integration test with a real feature store instance to ensure output data matches expectations and proper db client API uage""" - db = prepare_db(local_db).orchestrator - entity = mock_entity(port=db.ports[0], telemetry_on=True) + fs = prepare_fs(local_fs).featurestore + entity = mock_entity(port=fs.ports[0], telemetry_on=True) sink = mock_sink() collector = DBConnectionCollector(entity, sink) diff --git a/tests/_legacy/test_colo_model_local.py b/tests/_legacy/test_colo_model_local.py index d4d8c97f3..34e8f1b70 100644 --- a/tests/_legacy/test_colo_model_local.py +++ b/tests/_legacy/test_colo_model_local.py @@ -38,17 +38,17 @@ if sys.platform == "darwin": - supported_dbs = ["tcp", "deprecated"] + supported_fss = ["tcp", "deprecated"] else: - supported_dbs = ["uds", "tcp", "deprecated"] + supported_fss = ["uds", "tcp", "deprecated"] is_mac = sys.platform == "darwin" @pytest.mark.skipif(not is_mac, reason="MacOS-only test") def test_macosx_warning(fileutils, test_dir, coloutils): - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment( "colocated_application_defaults", launcher="local", exp_path=test_dir @@ -59,16 +59,16 @@ def 
test_macosx_warning(fileutils, test_dir, coloutils): ): _ = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) def test_unsupported_limit_app(fileutils, test_dir, coloutils): - db_args = {"limit_app_cpus": True} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"limit_app_cpus": True} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment( "colocated_application_defaults", launcher="local", exp_path=test_dir @@ -76,18 +76,18 @@ def test_unsupported_limit_app(fileutils, test_dir, coloutils): with pytest.raises(SSUnsupportedError): coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) @pytest.mark.skipif(is_mac, reason="Unsupported on MacOSX") @pytest.mark.parametrize("custom_pinning", [1, "10", "#", 1.0, ["a"], [1.0]]) def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinning): - db_type = "uds" # Test is insensitive to choice of db - db_args = {"custom_pinning": custom_pinning} + fs_type = "uds" # Test is insensitive to choice of fs + fs_args = {"custom_pinning": custom_pinning} exp = Experiment( "colocated_application_defaults", launcher="local", exp_path=test_dir @@ -95,10 +95,10 @@ def test_unsupported_custom_pinning(fileutils, test_dir, coloutils, custom_pinni with pytest.raises(TypeError): coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) @@ -119,23 +119,23 @@ def test_create_pinning_string(pin_list, num_cpus, expected): assert Application._create_pinning_string(pin_list, num_cpus) == expected -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_launch_colocated_application_defaults( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): - """Test the launch of a 
application with a colocated database and local launcher""" + """Test the launch of a application with a colocated feature store and local launcher""" - db_args = {} + fs_args = {} exp = Experiment( "colocated_application_defaults", launcher=launcher, exp_path=test_dir ) colo_application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) if is_mac: @@ -143,7 +143,7 @@ def test_launch_colocated_application_defaults( else: true_pinning = "0" assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == true_pinning ) exp.generate(colo_application) @@ -159,31 +159,31 @@ def test_launch_colocated_application_defaults( ), f"Statuses {statuses}" -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_launch_multiple_colocated_applications( - fileutils, test_dir, coloutils, wlmutils, db_type, launcher="local" + fileutils, test_dir, coloutils, wlmutils, fs_type, launcher="local" ): - """Test the concurrent launch of two applications with a colocated database and local launcher""" + """Test the concurrent launch of two applications with a colocated feature store and local launcher""" - db_args = {} + fs_args = {} exp = Experiment("multi_colo_applications", launcher=launcher, exp_path=test_dir) colo_applications = [ coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_application_name="colo0", port=wlmutils.get_test_port(), ), coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_application_name="colo1", port=wlmutils.get_test_port() + 1, ), @@ -199,58 +199,58 @@ def test_launch_multiple_colocated_applications( assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) 
-@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_application_disable_pinning( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): exp = Experiment( "colocated_application_pinning_auto_1cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 1, + fs_args = { + "fs_cpus": 1, "custom_pinning": [], } # Check to make sure that the CPU mask was correctly generated colo_application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) - assert colo_application.run_settings.colocated_db_settings["custom_pinning"] is None + assert colo_application.run_settings.colocated_fs_settings["custom_pinning"] is None exp.generate(colo_application) exp.start(colo_application, block=True) statuses = exp.get_status(colo_application) assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_application_pinning_auto_2cpu( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): exp = Experiment( "colocated_application_pinning_auto_2cpu", launcher=launcher, exp_path=test_dir ) - db_args = { - "db_cpus": 2, + fs_args = { + "fs_cpus": 2, } # Check to make sure that the CPU mask was correctly generated colo_application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) if is_mac: true_pinning = None else: true_pinning = "0,1" assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == true_pinning ) exp.generate(colo_application) @@ -260,9 +260,9 @@ def test_colocated_application_pinning_auto_2cpu( 
@pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_application_pinning_range( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): # Check to make sure that the CPU mask was correctly generated @@ -270,17 +270,17 @@ def test_colocated_application_pinning_range( "colocated_application_pinning_manual", launcher=launcher, exp_path=test_dir ) - db_args = {"db_cpus": 2, "custom_pinning": range(2)} + fs_args = {"fs_cpus": 2, "custom_pinning": range(2)} colo_application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) assert ( - colo_application.run_settings.colocated_db_settings["custom_pinning"] == "0,1" + colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "0,1" ) exp.generate(colo_application) exp.start(colo_application, block=True) @@ -289,9 +289,9 @@ def test_colocated_application_pinning_range( @pytest.mark.skipif(is_mac, reason="unsupported on MacOSX") -@pytest.mark.parametrize("db_type", supported_dbs) +@pytest.mark.parametrize("fs_type", supported_fss) def test_colocated_application_pinning_list( - fileutils, test_dir, coloutils, db_type, launcher="local" + fileutils, test_dir, coloutils, fs_type, launcher="local" ): # Check to make sure that the CPU mask was correctly generated @@ -299,16 +299,16 @@ def test_colocated_application_pinning_list( "colocated_application_pinning_manual", launcher=launcher, exp_path=test_dir ) - db_args = {"db_cpus": 1, "custom_pinning": [1]} + fs_args = {"fs_cpus": 1, "custom_pinning": [1]} colo_application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, ) - assert colo_application.run_settings.colocated_db_settings["custom_pinning"] == "1" + assert 
colo_application.run_settings.colocated_fs_settings["custom_pinning"] == "1" exp.generate(colo_application) exp.start(colo_application, block=True) statuses = exp.get_status(colo_application) @@ -323,4 +323,4 @@ def test_colo_uds_verifies_socket_file_name(test_dir, launcher="local"): colo_application = exp.create_application("wrong_uds_socket_name", colo_settings) with pytest.raises(ValueError): - colo_application.colocate_db_uds(unix_socket="this is not a valid name!") + colo_application.colocate_fs_uds(unix_socket="this is not a valid name!") diff --git a/tests/_legacy/test_colo_model_lsf.py b/tests/_legacy/test_colo_model_lsf.py index afa843ef2..17e75caee 100644 --- a/tests/_legacy/test_colo_model_lsf.py +++ b/tests/_legacy/test_colo_model_lsf.py @@ -47,29 +47,29 @@ class ExpectationMet(Exception): def show_expectation_met(*args, **kwargs): - raise ExpectationMet("mock._prep_colocated_db") + raise ExpectationMet("mock._prep_colocated_fs") def test_jsrun_prep(fileutils, coloutils, monkeypatch): """Ensure that JsrunSettings prep method is executed as expected""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # mock the prep method to raise an exception that short circuits test when goal is met - monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) + monkeypatch.setattr(JsrunSettings, "_prep_colocated_fs", show_expectation_met) - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") - with pytest.raises(ExpectationMet, match="mock._prep_colocated_db") as ex: + with pytest.raises(ExpectationMet, match="mock._prep_colocated_fs") as ex: run_settings = JsrunSettings("foo") coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) 
@@ -78,10 +78,10 @@ def test_non_js_run_prep(fileutils, coloutils, monkeypatch): """Ensure that RunSettings does not attempt to call a prep method""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") # mock prep method to ensure that the exception isn't thrown w/non-JsrunSettings arg - monkeypatch.setattr(JsrunSettings, "_prep_colocated_db", show_expectation_met) + monkeypatch.setattr(JsrunSettings, "_prep_colocated_fs", show_expectation_met) - db_args = {"custom_pinning": [1]} - db_type = "tcp" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") @@ -89,10 +89,10 @@ def test_non_js_run_prep(fileutils, coloutils, monkeypatch): colo_application: Application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -119,14 +119,14 @@ def test_jsrun_prep_cpu_per_flag_set_check( exp_value, test_value, ): - """Ensure that _prep_colocated_db honors basic cpu_per_rs config and allows a + """Ensure that _prep_colocated_fs honors basic cpu_per_rs config and allows a valid input parameter to result in the correct output. 
If no expected input (or incorrect key) is given, the default should be returned using default config key""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - # excluding "db_cpus" should result in default value in comparison & output - db_args = {"custom_pinning": [1]} - db_type = "uds" # Test is insensitive to choice of db + # excluding "fs_cpus" should result in default value in comparison & output + fs_args = {"custom_pinning": [1]} + fs_type = "uds" # Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") @@ -135,10 +135,10 @@ def test_jsrun_prep_cpu_per_flag_set_check( colo_application: Application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -151,14 +151,14 @@ def test_jsrun_prep_cpu_per_flag_set_check( pytest.param("cpu_per_rs", "cpu_per_rs", 11, 11, id="cpu_per_rs matches input"), pytest.param("c", "c", 22, 22, id="c matches input"), pytest.param( - "cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: db_cpus out (not default)" + "cpu_per_rs", "cpu_per_rsx", 3, 33, id="key typo: fs_cpus out (not default)" ), pytest.param( - "cpu_per_rs", "cx", 3, 44, id="key typo: get db_cpus out (not default)" + "cpu_per_rs", "cx", 3, 44, id="key typo: get fs_cpus out (not default)" ), ], ) -def test_jsrun_prep_db_cpu_override( +def test_jsrun_prep_fs_cpu_override( fileutils, coloutils, monkeypatch, @@ -167,12 +167,12 @@ def test_jsrun_prep_db_cpu_override( exp_value, test_value, ): - """Ensure that both cpu_per_rs and c input config override db_cpus""" + """Ensure that both cpu_per_rs and c input config override fs_cpus""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - # setting "db_cpus" should result in non-default value in comparison & output - db_args = {"custom_pinning": [1], "db_cpus": 3} - db_type = "tcp" # Test is insensitive to choice of 
db + # setting "fs_cpus" should result in non-default value in comparison & output + fs_args = {"custom_pinning": [1], "fs_cpus": 3} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") @@ -181,10 +181,10 @@ def test_jsrun_prep_db_cpu_override( colo_application: Application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -195,14 +195,14 @@ def test_jsrun_prep_db_cpu_override( "exp_run_arg_key,run_arg_key,exp_value,test_value", [ pytest.param( - "cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to db_cpus" + "cpu_per_rs", "cpu_per_rs", 8, 3, id="cpu_per_rs swaps to fs_cpus" ), - pytest.param("c", "c", 8, 4, id="c swaps to db_cpus"), - pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: db_cpus out"), - pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get db_cpus out"), + pytest.param("c", "c", 8, 4, id="c swaps to fs_cpus"), + pytest.param("cpu_per_rs", "cpu_per_rsx", 8, 5, id="key typo: fs_cpus out"), + pytest.param("cpu_per_rs", "cx", 8, 6, id="key typo: get fs_cpus out"), ], ) -def test_jsrun_prep_db_cpu_replacement( +def test_jsrun_prep_fs_cpu_replacement( fileutils, coloutils, monkeypatch, @@ -211,12 +211,12 @@ def test_jsrun_prep_db_cpu_replacement( exp_value, test_value, ): - """Ensure that db_cpus default is used if user config suggests underutilizing resources""" + """Ensure that fs_cpus default is used if user config suggests underutilizing resources""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - # setting "db_cpus" should result in non-default value in comparison & output - db_args = {"custom_pinning": [1], "db_cpus": 8} - db_type = "uds" # Test is insensitive to choice of db + # setting "fs_cpus" should result in non-default value in comparison & output + fs_args = {"custom_pinning": [1], "fs_cpus": 8} + fs_type = "uds" # 
Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") @@ -225,10 +225,10 @@ def test_jsrun_prep_db_cpu_replacement( colo_application: Application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) @@ -265,8 +265,8 @@ def test_jsrun_prep_rs_per_host( required to meet limitations (e.g. rs_per_host MUST equal 1)""" monkeypatch.setattr(smartsim.settings.base, "expand_exe_path", lambda x: "/bin/{x}") - db_args = {"custom_pinning": [1]} - db_type = "tcp" # Test is insensitive to choice of db + fs_args = {"custom_pinning": [1]} + fs_type = "tcp" # Test is insensitive to choice of fs exp = Experiment("colocated_application_lsf", launcher="lsf") @@ -275,14 +275,14 @@ def test_jsrun_prep_rs_per_host( colo_application: Application = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, "send_data_local_smartredis.py", - db_args, + fs_args, colo_settings=run_settings, ) - # NOTE: _prep_colocated_db sets this to a string & not an integer + # NOTE: _prep_colocated_fs sets this to a string & not an integer assert str(colo_application.run_settings.run_args[exp_run_arg_key]) == str( exp_value ) diff --git a/tests/_legacy/test_containers.py b/tests/_legacy/test_containers.py index 8957f223d..215ab721d 100644 --- a/tests/_legacy/test_containers.py +++ b/tests/_legacy/test_containers.py @@ -142,7 +142,7 @@ def test_singularity_args(fileutils, test_dir): @pytest.mark.skipif(not singularity_exists, reason="Test needs singularity to run") -def test_singularity_smartredis(local_experiment, prepare_db, local_db, fileutils): +def test_singularity_smartredis(local_experiment, prepare_fs, local_fs, fileutils): """Run two processes, each process puts a tensor on the DB, then accesses the other process's tensor. Finally, the tensor is used to run a application. 
@@ -151,8 +151,8 @@ def test_singularity_smartredis(local_experiment, prepare_db, local_db, fileutil """ # create and start a database - db = prepare_db(local_db).orchestrator - local_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) container = Singularity(containerURI) diff --git a/tests/_legacy/test_controller.py b/tests/_legacy/test_controller.py index 149872708..19325c933 100644 --- a/tests/_legacy/test_controller.py +++ b/tests/_legacy/test_controller.py @@ -30,7 +30,7 @@ from smartsim._core.control.controller import Controller from smartsim._core.launcher.step import Step -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from smartsim.entity.ensemble import Ensemble from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings @@ -40,7 +40,9 @@ bs = SbatchSettings() ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) class MockStep(Step): @@ -58,7 +60,7 @@ def get_launch_cmd(self): "collection", [ pytest.param(ens, id="Ensemble"), - pytest.param(orc, id="Database"), + pytest.param(feature_store, id="FeatureStore"), ], ) def test_controller_batch_step_creation_preserves_entity_order(collection, monkeypatch): diff --git a/tests/_legacy/test_controller_errors.py b/tests/_legacy/test_controller_errors.py index 0f16c4c6f..60b757f0b 100644 --- a/tests/_legacy/test_controller_errors.py +++ b/tests/_legacy/test_controller_errors.py @@ -30,7 +30,7 @@ from smartsim._core.control import Controller, Manifest from smartsim._core.launcher.step import Step from smartsim._core.launcher.step.dragonStep import DragonStep -from smartsim.database import Orchestrator +from smartsim.database 
import FeatureStore from smartsim.entity import Application from smartsim.entity.ensemble import Ensemble from smartsim.error import SmartSimError, SSUnsupportedError @@ -52,15 +52,17 @@ ens = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=2) # Ensemble entity slightly different but with same name ens_2 = Ensemble("ensemble_name", params={}, run_settings=entity_settings, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) -def test_finished_entity_orc_error(): - """Orchestrators are never 'finished', either run forever or stopped by user""" - orc = Orchestrator() +def test_finished_entity_feature_store_error(): + """FeatureStores are never 'finished', either run forever or stopped by user""" + feature_store = FeatureStore() cont = Controller(launcher="local") with pytest.raises(TypeError): - cont.finished(orc) + cont.finished(feature_store) def test_finished_entity_wrong_type(): @@ -105,26 +107,26 @@ def test_no_launcher(): cont.init_launcher(None) -def test_wrong_orchestrator(wlmutils): +def test_wrong_feature_store(wlmutils): # lo interface to avoid warning from SmartSim - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, interface="lo", run_command="aprun", launcher="pbs", ) cont = Controller(launcher="local") - manifest = Manifest(orc) + manifest = Manifest(feature_store) with pytest.raises(SmartSimError): cont._launch("exp_name", "exp_path", manifest) -def test_bad_orc_checkpoint(): +def test_bad_feature_store_checkpoint(): checkpoint = "./bad-checkpoint" cont = Controller(launcher="local") with pytest.raises(FileNotFoundError): - cont.reload_saved_db(checkpoint) + cont.reload_saved_fs(checkpoint) class MockStep(Step): @@ -141,12 +143,12 @@ def get_launch_cmd(self): [ pytest.param(ens, id="Ensemble_running"), pytest.param(application, 
id="Application_running"), - pytest.param(orc, id="Orch_running"), + pytest.param(feature_store, id="Feature_store_running"), ], ) def test_duplicate_running_entity(test_dir, wlmutils, entity): """This test validates that users cannot reuse entity names - that are running in JobManager.jobs or JobManager.db_jobs + that are running in JobManager.jobs or JobManager.fs_jobs """ step_settings = RunSettings("echo") step = MockStep("mock-step", test_dir, step_settings) @@ -178,17 +180,17 @@ def test_restarting_entity(test_dir, wlmutils, entity): controller._launch_step(step, entity=entity) -def test_restarting_orch(test_dir, wlmutils): - """Validate restarting a completed Orchestrator job""" +def test_restarting_feature_store(test_dir, wlmutils): + """Validate restarting a completed FeatureStore job""" step_settings = RunSettings("echo") test_launcher = wlmutils.get_test_launcher() step = MockStep("mock-step", test_dir, step_settings) step.meta["status_dir"] = test_dir - orc.path = test_dir + feature_store.path = test_dir controller = Controller(test_launcher) - controller._jobs.add_job(orc.name, job_id="1234", entity=orc) - controller._jobs.move_to_completed(controller._jobs.db_jobs.get(orc.name)) - controller._launch_step(step, entity=orc) + controller._jobs.add_job(feature_store.name, job_id="1234", entity=feature_store) + controller._jobs.move_to_completed(controller._jobs.fs_jobs.get(feature_store.name)) + controller._launch_step(step, entity=feature_store) @pytest.mark.parametrize( diff --git a/tests/_legacy/test_dbnode.py b/tests/_legacy/test_dbnode.py index 04845344c..7111f5ce5 100644 --- a/tests/_legacy/test_dbnode.py +++ b/tests/_legacy/test_dbnode.py @@ -33,28 +33,28 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator -from smartsim.entity.dbnode import DBNode, LaunchedShardData +from smartsim.database import FeatureStore +from smartsim.entity.dbnode import FSNode, LaunchedShardData from smartsim.error.errors import SmartSimError 
# The tests in this file belong to the group_a group pytestmark = pytest.mark.group_a -def test_parse_db_host_error(): - orc = Orchestrator() - orc.entities[0].path = "not/a/path" - # Fail to obtain database hostname +def test_parse_fs_host_error(): + feature_store = FeatureStore() + feature_store.entities[0].path = "not/a/path" + # Fail to obtain feature store hostname with pytest.raises(SmartSimError): - orc.entities[0].host + feature_store.entities[0].host -def test_hosts(local_experiment, prepare_db, local_db): - db = prepare_db(local_db).orchestrator - orc = local_experiment.reconnect_orchestrator(db.checkpoint_file) +def test_hosts(local_experiment, prepare_fs, local_fs): + fs = prepare_fs(local_fs).featurestore + feature_store = local_experiment.reconnect_feature_store(fs.checkpoint_file) - hosts = orc.hosts - assert len(hosts) == orc.db_nodes == 1 + hosts = feature_store.hosts + assert len(hosts) == feature_store.fs_nodes == 1 def _random_shard_info(): @@ -81,7 +81,7 @@ def test_launched_shard_info_can_be_serialized(): @pytest.mark.parametrize("limit", [None, 1]) -def test_db_node_can_parse_launched_shard_info(limit): +def test_fs_node_can_parse_launched_shard_info(limit): rand_shards = [_random_shard_info() for _ in range(3)] with io.StringIO(textwrap.dedent("""\ This is some file like str @@ -90,7 +90,7 @@ def test_db_node_can_parse_launched_shard_info(limit): SMARTSIM_ORC_SHARD_INFO: {} ^^^^^^^^^^^^^^^^^^^^^^^ We should be able to parse the serialized - launched db info from this file if the line is + launched fs info from this file if the line is prefixed with this tag. Here are two more for good measure: @@ -99,28 +99,28 @@ def test_db_node_can_parse_launched_shard_info(limit): All other lines should be ignored. 
""").format(*(json.dumps(s.to_dict()) for s in rand_shards))) as stream: - parsed_shards = DBNode._parse_launched_shard_info_from_iterable(stream, limit) + parsed_shards = FSNode._parse_launched_shard_info_from_iterable(stream, limit) if limit is not None: rand_shards = rand_shards[:limit] assert rand_shards == parsed_shards def test_set_host(): - orc = Orchestrator() - orc.entities[0].set_hosts(["host"]) - assert orc.entities[0].host == "host" + feature_store = FeatureStore() + feature_store.entities[0].set_hosts(["host"]) + assert feature_store.entities[0].host == "host" @pytest.mark.parametrize("nodes, mpmd", [[3, False], [3, True], [1, False]]) -def test_db_id_and_name(mpmd, nodes, wlmutils): +def test_fs_id_and_name(mpmd, nodes, wlmutils): if nodes > 1 and wlmutils.get_test_launcher() not in pytest.wlm_options: - pytest.skip(reason="Clustered DB can only be checked on WLMs") - orc = Orchestrator( - db_identifier="test_db", - db_nodes=nodes, + pytest.skip(reason="Clustered fs can only be checked on WLMs") + feature_store = FeatureStore( + fs_identifier="test_fs", + fs_nodes=nodes, single_cmd=mpmd, launcher=wlmutils.get_test_launcher(), ) - for i, node in enumerate(orc.entities): - assert node.name == f"{orc.name}_{i}" - assert node.db_identifier == orc.db_identifier + for i, node in enumerate(feature_store.entities): + assert node.name == f"{feature_store.name}_{i}" + assert node.fs_identifier == feature_store.fs_identifier diff --git a/tests/_legacy/test_experiment.py b/tests/_legacy/test_experiment.py index 3b4c856e0..623fa782e 100644 --- a/tests/_legacy/test_experiment.py +++ b/tests/_legacy/test_experiment.py @@ -35,7 +35,7 @@ from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils import serialize -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Application from smartsim.error import SmartSimError from smartsim.error.errors import 
SSUnsupportedError @@ -252,21 +252,21 @@ def test_error_on_cobalt() -> None: exp = Experiment("cobalt_exp", launcher="cobalt") -def test_default_orch_path( +def test_default_feature_store_path( monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" ) -> None: - """Ensure the default file structure is created for Orchestrator""" + """Ensure the default file structure is created for FeatureStore""" - exp_name = "default-orch-path" + exp_name = "default-feature-store-path" exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) - db = exp.create_database( + db = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() ) exp.start(db) - orch_path = pathlib.Path(test_dir) / db.name - assert orch_path.exists() - assert db.path == str(orch_path) + feature_store_path = pathlib.Path(test_dir) / db.name + assert feature_store_path.exists() + assert db.path == str(feature_store_path) def test_default_application_path( @@ -307,24 +307,24 @@ def test_default_ensemble_path( assert member.path == str(ensemble_path / member.name) -def test_user_orch_path( +def test_user_feature_store_path( monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" ) -> None: - """Ensure a relative path is used to created Orchestrator folder""" + """Ensure a relative path is used to created FeatureStore folder""" - exp_name = "default-orch-path" + exp_name = "default-feature-store-path" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
- db = exp.create_database( + db = exp.create_feature_store( port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface(), path="./testing_folder1234", ) exp.start(db) - orch_path = pathlib.Path(osp.abspath("./testing_folder1234")) - assert orch_path.exists() - assert db.path == str(orch_path) - shutil.rmtree(orch_path) + feature_store_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert feature_store_path.exists() + assert db.path == str(feature_store_path) + shutil.rmtree(feature_store_path) def test_default_application_with_path( diff --git a/tests/_legacy/test_fixtures.py b/tests/_legacy/test_fixtures.py index ea753374e..15823e158 100644 --- a/tests/_legacy/test_fixtures.py +++ b/tests/_legacy/test_fixtures.py @@ -29,7 +29,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError @@ -37,20 +37,20 @@ pytestmark = pytest.mark.group_a -def test_db_fixtures(local_experiment, local_db, prepare_db): - db = prepare_db(local_db).orchestrator - local_experiment.reconnect_orchestrator(db.checkpoint_file) - assert db.is_active() - local_experiment.stop(db) +def test_db_fixtures(local_experiment, local_fs, prepare_fs): + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) + assert fs.is_active() + local_experiment.stop(fs) -def test_create_new_db_fixture_if_stopped(local_experiment, local_db, prepare_db): +def test_create_new_fs_fixture_if_stopped(local_experiment, local_fs, prepare_fs): # Run this twice to make sure that there is a stopped database - output = prepare_db(local_db) - local_experiment.reconnect_orchestrator(output.orchestrator.checkpoint_file) - local_experiment.stop(output.orchestrator) - - output = prepare_db(local_db) - assert output.new_db - 
local_experiment.reconnect_orchestrator(output.orchestrator.checkpoint_file) - assert output.orchestrator.is_active() + output = prepare_fs(local_fs) + local_experiment.reconnect_feature_store(output.featurestore.checkpoint_file) + local_experiment.stop(output.featurestore) + + output = prepare_fs(local_fs) + assert output.new_fs + local_experiment.reconnect_feature_store(output.featurestore.checkpoint_file) + assert output.featurestore.is_active() diff --git a/tests/_legacy/test_generator.py b/tests/_legacy/test_generator.py index b17db06fd..c3bfcad64 100644 --- a/tests/_legacy/test_generator.py +++ b/tests/_legacy/test_generator.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.generation import Generator -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.settings import RunSettings # The tests in this file belong to the group_a group @@ -123,21 +123,21 @@ def test_full_exp(fileutils, test_dir, wlmutils): script = fileutils.get_test_conf_path("sleep.py") application.attach_generator_files(to_copy=script) - orc = Orchestrator(wlmutils.get_test_port()) + feature_store = FeatureStore(wlmutils.get_test_port()) params = {"THERMO": [10, 20, 30], "STEPS": [10, 20, 30]} ensemble = exp.create_ensemble("test_ens", params=params, run_settings=rs) config = get_gen_file(fileutils, "in.atm") ensemble.attach_generator_files(to_configure=config) - exp.generate(orc, ensemble, application) + exp.generate(feature_store, ensemble, application) # test for ensemble assert osp.isdir(osp.join(test_dir, "test_ens/")) for i in range(9): assert osp.isdir(osp.join(test_dir, "test_ens/test_ens_" + str(i))) - # test for orc dir - assert osp.isdir(osp.join(test_dir, orc.name)) + # test for feature_store dir + assert osp.isdir(osp.join(test_dir, feature_store.name)) # test for application file assert osp.isdir(osp.join(test_dir, "application")) diff --git a/tests/_legacy/test_indirect.py b/tests/_legacy/test_indirect.py 
index 814302968..7766b5825 100644 --- a/tests/_legacy/test_indirect.py +++ b/tests/_legacy/test_indirect.py @@ -54,7 +54,7 @@ [ pytest.param("indirect.py", {"+name", "+command", "+entity_type", "+telemetry_dir", "+working_dir"}, id="no args"), pytest.param("indirect.py -c echo +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="cmd typo"), - pytest.param("indirect.py -t orchestrator +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), + pytest.param("indirect.py -t featurestore +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="etype typo"), pytest.param("indirect.py -d /foo/bar +entity_type ttt +command ccc +output_file ooo +working_dir www +error_file eee", {"+telemetry_dir"}, id="dir typo"), pytest.param("indirect.py +entity_type ttt +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+command"}, id="no cmd"), pytest.param("indirect.py +command ccc +telemetry_dir ddd +output_file ooo +working_dir www +error_file eee", {"+entity_type"}, id="no etype"), diff --git a/tests/_legacy/test_interrupt.py b/tests/_legacy/test_interrupt.py index 700f2dd4a..1b134a884 100644 --- a/tests/_legacy/test_interrupt.py +++ b/tests/_legacy/test_interrupt.py @@ -75,9 +75,9 @@ def test_interrupt_blocked_jobs(test_dir): time.sleep(2) # allow time for jobs to be stopped active_jobs = exp._control._jobs.jobs - active_db_jobs = exp._control._jobs.db_jobs + active_fs_jobs = exp._control._jobs.fs_jobs completed_jobs = exp._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(active_jobs) + len(active_fs_jobs) == 0 assert len(completed_jobs) == num_jobs @@ -120,7 +120,7 @@ def test_interrupt_multi_experiment_unblocked_jobs(test_dir): time.sleep(2) # allow time for jobs to be stopped for i, experiment in enumerate(experiments): active_jobs = 
experiment._control._jobs.jobs - active_db_jobs = experiment._control._jobs.db_jobs + active_fs_jobs = experiment._control._jobs.fs_jobs completed_jobs = experiment._control._jobs.completed - assert len(active_jobs) + len(active_db_jobs) == 0 + assert len(active_jobs) + len(active_fs_jobs) == 0 assert len(completed_jobs) == jobs_per_experiment[i] diff --git a/tests/_legacy/test_launch_errors.py b/tests/_legacy/test_launch_errors.py index e67115ce3..15dd89831 100644 --- a/tests/_legacy/test_launch_errors.py +++ b/tests/_legacy/test_launch_errors.py @@ -28,7 +28,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SSUnsupportedError from smartsim.settings import JsrunSettings, RunSettings from smartsim.status import SmartSimStatus @@ -61,18 +61,22 @@ def test_model_failure(fileutils, test_dir): assert all([stat == SmartSimStatus.STATUS_FAILED for stat in statuses]) -def test_orchestrator_relaunch(test_dir, wlmutils): - """Test when users try to launch second orchestrator""" - exp_name = "test-orc-on-relaunch" +def test_feature_store_relaunch(test_dir, wlmutils): + """Test when users try to launch second FeatureStore""" + exp_name = "test-feature-store-on-relaunch" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) - orc = Orchestrator(port=wlmutils.get_test_port(), db_identifier="orch_1") - orc.set_path(test_dir) - orc_1 = Orchestrator(port=wlmutils.get_test_port() + 1, db_identifier="orch_2") - orc_1.set_path(test_dir) + feature_store = FeatureStore( + port=wlmutils.get_test_port(), fs_identifier="feature_store_1" + ) + feature_store.set_path(test_dir) + feature_store_1 = FeatureStore( + port=wlmutils.get_test_port() + 1, fs_identifier="feature_store_2" + ) + feature_store_1.set_path(test_dir) try: - exp.start(orc) - exp.start(orc_1) + exp.start(feature_store) + exp.start(feature_store_1) finally: - exp.stop(orc) - exp.stop(orc_1) + 
exp.stop(feature_store) + exp.stop(feature_store_1) diff --git a/tests/_legacy/test_lsf_settings.py b/tests/_legacy/test_lsf_settings.py index fcb351648..64dbd001c 100644 --- a/tests/_legacy/test_lsf_settings.py +++ b/tests/_legacy/test_lsf_settings.py @@ -144,7 +144,7 @@ def test_jsrun_mpmd(): def test_catch_colo_mpmd(): settings = JsrunSettings("python") - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} settings_2 = JsrunSettings("python") with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/_legacy/test_manifest.py b/tests/_legacy/test_manifest.py index fccc1a7b2..4268c3761 100644 --- a/tests/_legacy/test_manifest.py +++ b/tests/_legacy/test_manifest.py @@ -40,8 +40,8 @@ from smartsim._core.control.manifest import ( _LaunchedManifestMetadata as LaunchedManifestMetadata, ) -from smartsim.database import Orchestrator -from smartsim.entity.dbobject import DBModel, DBScript +from smartsim.database import FeatureStore +from smartsim.entity.dbobject import FSModel, FSScript from smartsim.error import SmartSimError from smartsim.settings import RunSettings @@ -58,21 +58,21 @@ application_2 = exp.create_application("application_1", run_settings=rs) ensemble = exp.create_ensemble("ensemble", run_settings=rs, replicas=1) -orc = Orchestrator() -orc_1 = deepcopy(orc) -orc_1.name = "orc2" +feature_store = FeatureStore() +feature_store_1 = deepcopy(feature_store) +feature_store_1.name = "feature_store2" -db_script = DBScript("some-script", "def main():\n print('hello world')\n") -db_model = DBModel("some-model", "TORCH", b"some-model-bytes") +fs_script = FSScript("some-script", "def main():\n print('hello world')\n") +fs_model = FSModel("some-model", "TORCH", b"some-model-bytes") def test_separate(): - manifest = Manifest(application, ensemble, orc) + manifest = Manifest(application, ensemble, feature_store) assert manifest.applications[0] == application assert 
len(manifest.applications) == 1 assert manifest.ensembles[0] == ensemble assert len(manifest.ensembles) == 1 - assert manifest.dbs[0] == orc + assert manifest.fss[0] == feature_store def test_separate_type(): @@ -106,55 +106,55 @@ class Person: @pytest.mark.parametrize( - "patch, has_db_objects", + "patch, has_fs_objects", [ - pytest.param((), False, id="No DB Objects"), + pytest.param((), False, id="No FS Objects"), pytest.param( - (application, "_db_models", [db_model]), True, id="Application w/ DB Model" + (application, "_fs_models", [fs_model]), True, id="Application w/ FS Model" ), pytest.param( - (application, "_db_scripts", [db_script]), + (application, "_fs_scripts", [fs_script]), True, - id="Application w/ DB Script", + id="Application w/ FS Script", ), pytest.param( - (ensemble, "_db_models", [db_model]), True, id="Ensemble w/ DB Model" + (ensemble, "_fs_models", [fs_model]), True, id="Ensemble w/ fs Model" ), pytest.param( - (ensemble, "_db_scripts", [db_script]), True, id="Ensemble w/ DB Script" + (ensemble, "_fs_scripts", [fs_script]), True, id="Ensemble w/ fs Script" ), pytest.param( - (ensemble.entities[0], "_db_models", [db_model]), + (ensemble.entities[0], "_fs_models", [fs_model]), True, - id="Ensemble Member w/ DB Model", + id="Ensemble Member w/ fs Model", ), pytest.param( - (ensemble.entities[0], "_db_scripts", [db_script]), + (ensemble.entities[0], "_fs_scripts", [fs_script]), True, - id="Ensemble Member w/ DB Script", + id="Ensemble Member w/ fs Script", ), ], ) -def test_manifest_detects_db_objects(monkeypatch, patch, has_db_objects): +def test_manifest_detects_fs_objects(monkeypatch, patch, has_fs_objects): if patch: monkeypatch.setattr(*patch) - assert Manifest(application, ensemble).has_db_objects == has_db_objects + assert Manifest(application, ensemble).has_fs_objects == has_fs_objects def test_launched_manifest_transform_data(): applications = [(application, 1), (application_2, 2)] ensembles = [(ensemble, [(m, i) for i, m in 
enumerate(ensemble.entities)])] - dbs = [(orc, [(n, i) for i, n in enumerate(orc.entities)])] + fss = [(feature_store, [(n, i) for i, n in enumerate(feature_store.entities)])] launched = LaunchedManifest( metadata=LaunchedManifestMetadata("name", "path", "launcher", "run_id"), applications=applications, ensembles=ensembles, - databases=dbs, + featurestores=fss, ) transformed = launched.map(lambda x: str(x)) assert transformed.applications == tuple((m, str(i)) for m, i in applications) assert transformed.ensembles[0][1] == tuple((m, str(i)) for m, i in ensembles[0][1]) - assert transformed.databases[0][1] == tuple((n, str(i)) for n, i in dbs[0][1]) + assert transformed.featurestores[0][1] == tuple((n, str(i)) for n, i in fss[0][1]) def test_launched_manifest_builder_correctly_maps_data(): @@ -162,12 +162,14 @@ def test_launched_manifest_builder_correctly_maps_data(): lmb.add_application(application, 1) lmb.add_application(application_2, 1) lmb.add_ensemble(ensemble, [i for i in range(len(ensemble.entities))]) - lmb.add_database(orc, [i for i in range(len(orc.entities))]) + lmb.add_feature_store( + feature_store, [i for i in range(len(feature_store.entities))] + ) manifest = lmb.finalize() assert len(manifest.applications) == 2 assert len(manifest.ensembles) == 1 - assert len(manifest.databases) == 1 + assert len(manifest.featurestores) == 1 def test_launced_manifest_builder_raises_if_lens_do_not_match(): @@ -175,7 +177,7 @@ def test_launced_manifest_builder_raises_if_lens_do_not_match(): with pytest.raises(ValueError): lmb.add_ensemble(ensemble, list(range(123))) with pytest.raises(ValueError): - lmb.add_database(orc, list(range(123))) + lmb.add_feature_store(feature_store, list(range(123))) def test_launched_manifest_builer_raises_if_attaching_data_to_empty_collection( diff --git a/tests/_legacy/test_model.py b/tests/_legacy/test_model.py index 74888a52b..f32a27a07 100644 --- a/tests/_legacy/test_model.py +++ b/tests/_legacy/test_model.py @@ -71,7 +71,7 @@ def 
test_catch_colo_mpmd_application(): # make it colocated which should raise and error with pytest.raises(SSUnsupportedError): - application.colocate_db() + application.colocate_fs() def test_attach_batch_settings_to_application(): diff --git a/tests/_legacy/test_mpi_settings.py b/tests/_legacy/test_mpi_settings.py index 7d8db6e75..40c3f4ce0 100644 --- a/tests/_legacy/test_mpi_settings.py +++ b/tests/_legacy/test_mpi_settings.py @@ -173,7 +173,7 @@ def test_mpi_add_mpmd(): def test_catch_colo_mpmd(): settings = _BaseMPISettings(*default_mpi_args, **default_mpi_kwargs) - settings.colocated_db_settings = {"port": 6379, "cpus": 1} + settings.colocated_fs_settings = {"port": 6379, "cpus": 1} settings_2 = _BaseMPISettings(*default_mpi_args, **default_mpi_kwargs) with pytest.raises(SSUnsupportedError): settings.make_mpmd(settings_2) diff --git a/tests/_legacy/test_multidb.py b/tests/_legacy/test_multidb.py index 0cc89fed6..556110170 100644 --- a/tests/_legacy/test_multidb.py +++ b/tests/_legacy/test_multidb.py @@ -28,7 +28,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import SSDBIDConflictError from smartsim.log import get_logger @@ -40,7 +40,7 @@ logger = get_logger(__name__) -supported_dbs = ["uds", "tcp"] +supported_fss = ["uds", "tcp"] on_wlm = (pytest.test_launcher in pytest.wlm_options,) @@ -69,73 +69,76 @@ def check_not_failed(exp, *args): assert all(stat is not SmartSimStatus.STATUS_FAILED for stat in statuses) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_standard_then_colo_error( - fileutils, wlmutils, coloutils, db_type, test_dir +@pytest.mark.parametrize("fs_type", supported_fss) +def test_fs_identifier_standard_then_colo_error( + fileutils, wlmutils, coloutils, fs_type, test_dir ): - """Test that it is possible to create_database then 
colocate_db_uds/colocate_db_tcp - with unique db_identifiers""" + """Test that it is possible to create_feature_store then colocate_fs_uds/colocate_fs_tcp + with unique fs_identifiers""" # Set experiment name - exp_name = "test_db_identifier_standard_then_colo" + exp_name = "test_fs_identifier_standard_then_colo" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - test_script = fileutils.get_test_conf_path("smartredis/db_id_err.py") + test_script = fileutils.get_test_conf_path("smartredis/fs_id_err.py") # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database( + # create regular feature store + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_colo", + fs_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "testdb_colo" + assert feature_store.name == "testdb_colo" - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) assert ( - smartsim_model.run_settings.colocated_db_settings["db_identifier"] + smartsim_model.run_settings.colocated_fs_settings["fs_identifier"] == "testdb_colo" ) - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): - exp.start(orc) + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): + exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: exp.start(smartsim_model) assert ( - "has already been used. Pass in a unique name for db_identifier" + "has already been used. 
Pass in a unique name for fs_identifier" in ex.value.args[0] ) - check_not_failed(exp, orc) + check_not_failed(exp, feature_store) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_db_identifier_colo_then_standard( - fileutils, wlmutils, coloutils, db_type, test_dir +@pytest.mark.parametrize("fs_type", supported_fss) +def test_fs_identifier_colo_then_standard( + fileutils, wlmutils, coloutils, fs_type, test_dir ): - """Test colocate_db_uds/colocate_db_tcp then create_database with database + """Test colocate_fs_uds/colocate_fs_tcp then create_feature_store with feature store identifiers. """ # Set experiment name - exp_name = "test_db_identifier_colo_then_standard" + exp_name = "test_fs_identifier_colo_then_standard" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -154,50 +157,53 @@ def test_db_identifier_colo_then_standard( # Create the SmartSim Model smartsim_model = exp.create_application("colocated_model", colo_settings) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) assert ( - smartsim_model.run_settings.colocated_db_settings["db_identifier"] + smartsim_model.run_settings.colocated_fs_settings["fs_identifier"] == "testdb_colo" ) - # Create Database - orc = exp.create_database( + # Create feature store + feature_store = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="testdb_colo", + fs_identifier="testdb_colo", hosts=choose_host(wlmutils), ) - assert orc.name == "testdb_colo" + assert feature_store.name == "testdb_colo" - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): 
exp.start(smartsim_model, block=True) - exp.start(orc) + exp.start(feature_store) - check_not_failed(exp, orc, smartsim_model) + check_not_failed(exp, feature_store, smartsim_model) -def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): - """Test uniqueness of db_identifier several calls to create_database, with non unique names, +def test_fs_identifier_standard_twice_not_unique(wlmutils, test_dir): + """Test uniqueness of fs_identifier several calls to create_feature_store, with non unique names, checking error is raised before exp start is called""" # Set experiment name - exp_name = "test_db_identifier_multiple_create_database_not_unique" + exp_name = "test_fs_identifier_multiple_create_feature_store_not_unique" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -207,42 +213,45 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # CREATE DATABASE with db_identifier - orc = exp.create_database( + # CREATE feature store with fs_identifier + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="my_db", + fs_identifier="my_fs", hosts=choose_host(wlmutils), ) - assert orc.name == "my_db" + assert feature_store.name == "my_fs" - orc2 = exp.create_database( + feature_store2 = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="my_db", + fs_identifier="my_fs", hosts=choose_host(wlmutils, index=1), ) - assert orc2.name == "my_db" + assert feature_store2.name == "my_fs" - # CREATE DATABASE with db_identifier - with make_entity_context(exp, orc2), make_entity_context(exp, orc): - exp.start(orc) + # CREATE feature store with fs_identifier + with ( + make_entity_context(exp, feature_store2), + make_entity_context(exp, feature_store), + ): + exp.start(feature_store) with pytest.raises(SSDBIDConflictError) as ex: - 
exp.start(orc2) + exp.start(feature_store2) assert ( - "has already been used. Pass in a unique name for db_identifier" + "has already been used. Pass in a unique name for fs_identifier" in ex.value.args[0] ) - check_not_failed(exp, orc) + check_not_failed(exp, feature_store) -def test_db_identifier_create_standard_once(test_dir, wlmutils): - """One call to create database with a database identifier""" +def test_fs_identifier_create_standard_once(test_dir, wlmutils): + """One call to create feature store with a feature store identifier""" # Set experiment name - exp_name = "test_db_identifier_create_standard_once" + exp_name = "test_fs_identifier_create_standard_once" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -252,22 +261,22 @@ def test_db_identifier_create_standard_once(test_dir, wlmutils): # Create the SmartSim Experiment exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # Create the SmartSim database - db = exp.create_database( + # Create the SmartSim feature store + fs = exp.create_feature_store( port=test_port, - db_nodes=1, + fs_nodes=1, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - with make_entity_context(exp, db): - exp.start(db) + with make_entity_context(exp, fs): + exp.start(fs) - check_not_failed(exp, db) + check_not_failed(exp, fs) -def test_multidb_create_standard_twice(wlmutils, test_dir): - """Multiple calls to create database with unique db_identifiers""" +def test_multifs_create_standard_twice(wlmutils, test_dir): + """Multiple calls to create feature store with unique fs_identifiers""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -276,36 +285,36 @@ def test_db_identifier_standard_twice_not_unique(wlmutils, test_dir): # start a new Experiment for this section exp = Experiment( -
"test_multifs_create_standard_twice", exp_path=test_dir, launcher=test_launcher ) - # create and start an instance of the Orchestrator database - db = exp.create_database( + # create and start an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils, 1), ) - # create database with different db_id - db2 = exp.create_database( + # create feature store with different fs_id + fs2 = exp.create_feature_store( port=test_port + 1, interface=test_interface, - db_identifier="testdb_reg2", + fs_identifier="testdb_reg2", hosts=choose_host(wlmutils, 2), ) # launch - with make_entity_context(exp, db), make_entity_context(exp, db2): - exp.start(db, db2) + with make_entity_context(exp, fs), make_entity_context(exp, fs2): + exp.start(fs, fs2) - with make_entity_context(exp, db), make_entity_context(exp, db2): - exp.start(db, db2) + with make_entity_context(exp, fs), make_entity_context(exp, fs2): + exp.start(fs, fs2) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): - """create one model with colocated database with db_identifier""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_colo_once(fileutils, test_dir, wlmutils, coloutils, fs_type): + """create one model with colocated feature store with fs_identifier""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -315,7 +324,7 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # start a new Experiment for this section exp = Experiment( - "test_multidb_colo_once", launcher=test_launcher, exp_path=test_dir + "test_multifs_colo_once", launcher=test_launcher, exp_path=test_dir ) # create run settings @@ -326,20 +335,20 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): # Create the 
SmartSim Model smartsim_model = exp.create_application("smartsim_model", run_settings) - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) @@ -349,9 +358,9 @@ def test_multidb_colo_once(fileutils, test_dir, wlmutils, coloutils, db_type): check_not_failed(exp, smartsim_model) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db_type): - """Create regular database then colocate_db_tcp/uds with unique db_identifiers""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, fs_type): + """Create regular feature store then colocate_fs_tcp/uds with unique fs_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() @@ -362,43 +371,43 @@ def test_multidb_standard_then_colo(fileutils, test_dir, wlmutils, coloutils, db # start a new Experiment for this section exp = Experiment( - "test_multidb_standard_then_colo", exp_path=test_dir, launcher=test_launcher + "test_multifs_standard_then_colo", exp_path=test_dir, launcher=test_launcher ) - # create and generate an instance of the Orchestrator database - db = exp.create_database( + # create and generate an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port, interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - db_args = { + fs_args = { "port": test_port + 1, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with 
colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( fileutils, - db_type, + fs_type, exp, test_script, - db_args, + fs_args, on_wlm=on_wlm, ) - with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): - exp.start(db) + with make_entity_context(exp, fs), make_entity_context(exp, smartsim_model): + exp.start(fs) exp.start(smartsim_model, block=True) - check_not_failed(exp, smartsim_model, db) + check_not_failed(exp, smartsim_model, fs) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db_type): - """create regular database then colocate_db_tcp/uds with unique db_identifiers""" +@pytest.mark.parametrize("fs_type", supported_fss) +def test_multifs_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, fs_type): + """create regular feature store then colocate_fs_tcp/uds with unique fs_identifiers""" # Retrieve parameters from testing environment test_port = wlmutils.get_test_port() @@ -411,49 +420,49 @@ def test_multidb_colo_then_standard(fileutils, test_dir, wlmutils, coloutils, db # start a new Experiment exp = Experiment( - "test_multidb_colo_then_standard", exp_path=test_dir, launcher=test_launcher + "test_multifs_colo_then_standard", exp_path=test_dir, launcher=test_launcher ) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + "fs_identifier": "testdb_colo", } - # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) - # create and start an instance of the Orchestrator database - db = exp.create_database( + # create and start an instance of the FeatureStore feature store + fs = exp.create_feature_store( port=test_port + 1, 
interface=test_interface, - db_identifier="testdb_reg", + fs_identifier="testdb_reg", hosts=choose_host(wlmutils), ) - with make_entity_context(exp, db), make_entity_context(exp, smartsim_model): + with make_entity_context(exp, fs), make_entity_context(exp, smartsim_model): exp.start(smartsim_model, block=False) - exp.start(db) + exp.start(fs) exp.poll(smartsim_model) - check_not_failed(exp, db, smartsim_model) + check_not_failed(exp, fs, smartsim_model) @pytest.mark.skipif( pytest.test_launcher not in pytest.wlm_options, reason="Not testing WLM integrations", ) -@pytest.mark.parametrize("db_type", supported_dbs) -def test_launch_cluster_orc_single_dbid( - test_dir, coloutils, fileutils, wlmutils, db_type +@pytest.mark.parametrize("fs_type", supported_fss) +def test_launch_cluster_feature_store_single_fsid( + test_dir, coloutils, fileutils, wlmutils, fs_type ): - """test clustered 3-node orchestrator with single command with a database identifier""" + """test clustered 3-node FeatureStore with single command with a feature store identifier""" # TODO detect number of nodes in allocation and skip if not sufficent - exp_name = "test_launch_cluster_orc_single_dbid" + exp_name = "test_launch_cluster_feature_store_single_fsid" launcher = wlmutils.get_test_launcher() test_port = wlmutils.get_test_port() test_script = fileutils.get_test_conf_path("smartredis/multidbid.py") @@ -461,32 +470,35 @@ def test_launch_cluster_orc_single_dbid( # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() - orc: Orchestrator = exp.create_database( + feature_store: FeatureStore = exp.create_feature_store( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface=network_interface, single_cmd=True, hosts=wlmutils.get_test_hostlist(), - db_identifier="testdb_reg", + fs_identifier="testdb_reg", ) - db_args = { + fs_args = { "port": test_port, - "db_cpus": 1, + "fs_cpus": 1, "debug": True, - "db_identifier": "testdb_colo", + 
"fs_identifier": "testdb_colo", } - # Create model with colocated database + # Create model with colocated feature store smartsim_model = coloutils.setup_test_colo( - fileutils, db_type, exp, test_script, db_args, on_wlm=on_wlm + fileutils, fs_type, exp, test_script, fs_args, on_wlm=on_wlm ) - with make_entity_context(exp, orc), make_entity_context(exp, smartsim_model): - exp.start(orc, block=True) + with ( + make_entity_context(exp, feature_store), + make_entity_context(exp, smartsim_model), + ): + exp.start(feature_store, block=True) exp.start(smartsim_model, block=True) - job_dict = exp._control._jobs.get_db_host_addresses() - assert len(job_dict[orc.entities[0].db_identifier]) == 3 + job_dict = exp._control._jobs.get_fs_host_addresses() + assert len(job_dict[feature_store.entities[0].fs_identifier]) == 3 - check_not_failed(exp, orc, smartsim_model) + check_not_failed(exp, feature_store, smartsim_model) diff --git a/tests/_legacy/test_orc_config_settings.py b/tests/_legacy/test_orc_config_settings.py index 74d0c1af2..3f32da8db 100644 --- a/tests/_legacy/test_orc_config_settings.py +++ b/tests/_legacy/test_orc_config_settings.py @@ -27,7 +27,7 @@ import pytest -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SmartSimError try: @@ -41,41 +41,41 @@ pytestmark = pytest.mark.group_b -def test_config_methods(dbutils, prepare_db, local_db): +def test_config_methods(fsutils, prepare_fs, local_fs): """Test all configuration file edit methods on an active db""" - db = prepare_db(local_db).orchestrator + fs = prepare_fs(local_fs).featurestore # test the happy path and ensure all configuration file edit methods # successfully execute when given correct key-value pairs - configs = dbutils.get_db_configs() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) 
config_set_method(value) - # ensure SmartSimError is raised when Orchestrator.set_db_conf + # ensure SmartSimError is raised when FeatureStore.set_fs_conf # is given invalid CONFIG key-value pairs - ss_error_configs = dbutils.get_smartsim_error_db_configs() + ss_error_configs = fsutils.get_smartsim_error_fs_configs() for key, value_list in ss_error_configs.items(): for value in value_list: with pytest.raises(SmartSimError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) - # ensure TypeError is raised when Orchestrator.set_db_conf + # ensure TypeError is raised when FeatureStore.set_fs_conf # is given either a key or a value that is not a string - type_error_configs = dbutils.get_type_error_db_configs() + type_error_configs = fsutils.get_type_error_fs_configs() for key, value_list in type_error_configs.items(): for value in value_list: with pytest.raises(TypeError): - db.set_db_conf(key, value) + fs.set_fs_conf(key, value) -def test_config_methods_inactive(dbutils): +def test_config_methods_inactive(fsutils): """Ensure a SmartSimError is raised when trying to - set configurations on an inactive database + set configurations on an inactive feature store """ - db = Orchestrator() - configs = dbutils.get_db_configs() + fs = FeatureStore() + configs = fsutils.get_fs_configs() for setting, value in configs.items(): - config_set_method = dbutils.get_config_edit_method(db, setting) + config_set_method = fsutils.get_config_edit_method(fs, setting) with pytest.raises(SmartSimError): config_set_method(value) diff --git a/tests/_legacy/test_orchestrator.py b/tests/_legacy/test_orchestrator.py index 66fb894f7..5febb8d1b 100644 --- a/tests/_legacy/test_orchestrator.py +++ b/tests/_legacy/test_orchestrator.py @@ -31,7 +31,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError @@ -43,48 +43,48 
@@ import conftest -def test_orc_parameters() -> None: +def test_feature_store_parameters() -> None: threads_per_queue = 2 inter_op_threads = 2 intra_op_threads = 2 - db = Orchestrator( - db_nodes=1, + fs = FeatureStore( + fs_nodes=1, threads_per_queue=threads_per_queue, inter_op_threads=inter_op_threads, intra_op_threads=intra_op_threads, ) - assert db.queue_threads == threads_per_queue - assert db.inter_threads == inter_op_threads - assert db.intra_threads == intra_op_threads + assert fs.queue_threads == threads_per_queue + assert fs.inter_threads == inter_op_threads + assert fs.intra_threads == intra_op_threads - module_str = db._rai_module + module_str = fs._rai_module assert "THREADS_PER_QUEUE" in module_str assert "INTRA_OP_PARALLELISM" in module_str assert "INTER_OP_PARALLELISM" in module_str def test_is_not_active() -> None: - db = Orchestrator(db_nodes=1) - assert not db.is_active() + fs = FeatureStore(fs_nodes=1) + assert not fs.is_active() -def test_inactive_orc_get_address() -> None: - db = Orchestrator() +def test_inactive_feature_store_get_address() -> None: + fs = FeatureStore() with pytest.raises(SmartSimError): - db.get_address() + fs.get_address() -def test_orc_is_active_functions( +def test_feature_store_is_active_functions( local_experiment, - prepare_db, - local_db, + prepare_fs, + local_fs, ) -> None: - db = prepare_db(local_db).orchestrator - db = local_experiment.reconnect_orchestrator(db.checkpoint_file) - assert db.is_active() + fs = prepare_fs(local_fs).featurestore + fs = local_experiment.reconnect_feature_store(fs.checkpoint_file) + assert fs.is_active() - # check if the orchestrator can get the address - assert db.get_address() == [f"127.0.0.1:{db.ports[0]}"] + # check if the feature store can get the address + assert fs.get_address() == [f"127.0.0.1:{fs.ports[0]}"] def test_multiple_interfaces( @@ -101,126 +101,135 @@ def test_multiple_interfaces( net_if_addrs = ["lo", net_if_addrs[0]] port = wlmutils.get_test_port() - db = 
Orchestrator(port=port, interface=net_if_addrs) - db.set_path(test_dir) + fs = FeatureStore(port=port, interface=net_if_addrs) + fs.set_path(test_dir) - exp.start(db) + exp.start(fs) - # check if the orchestrator is active - assert db.is_active() + # check if the FeatureStore is active + assert fs.is_active() - # check if the orchestrator can get the address + # check if the feature store can get the address correct_address = [f"127.0.0.1:{port}"] - if not correct_address == db.get_address(): - exp.stop(db) + if not correct_address == fs.get_address(): + exp.stop(fs) assert False - exp.stop(db) + exp.stop(fs) -def test_catch_local_db_errors() -> None: - # local database with more than one node not allowed +def test_catch_local_feature_store_errors() -> None: + # local feature store with more than one node not allowed with pytest.raises(SSUnsupportedError): - db = Orchestrator(db_nodes=2) + fs = FeatureStore(fs_nodes=2) - # Run command for local orchestrator not allowed + # Run command for local FeatureStore not allowed with pytest.raises(SmartSimError): - db = Orchestrator(run_command="srun") + fs = FeatureStore(run_command="srun") - # Batch mode for local orchestrator is not allowed + # Batch mode for local FeatureStore is not allowed with pytest.raises(SmartSimError): - db = Orchestrator(batch=True) + fs = FeatureStore(batch=True) ##### PBS ###### def test_pbs_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="pbs", run_command="aprun", ) - orc.set_run_arg("account", "ACCOUNT") + feature_store.set_run_arg("account", "ACCOUNT") assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] + [ + fs.run_settings.run_args["account"] == "ACCOUNT" + for fs in feature_store.entities + ] ) - orc.set_run_arg("pes-per-numa-node", "5") + feature_store.set_run_arg("pes-per-numa-node", "5") assert 
all( - ["pes-per-numa-node" not in db.run_settings.run_args for db in orc.entities] + [ + "pes-per-numa-node" not in fs.run_settings.run_args + for fs in feature_store.entities + ] ) def test_pbs_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="pbs", run_command="aprun", ) with pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") + feature_store.set_batch_arg("account", "ACCOUNT") - orc2 = Orchestrator( + feature_store2 = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface="lo", launcher="pbs", run_command="aprun", ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" - orc2.set_batch_arg("N", "another_name") - assert "N" not in orc2.batch_settings.batch_args + feature_store2.set_batch_arg("account", "ACCOUNT") + assert feature_store2.batch_settings.batch_args["account"] == "ACCOUNT" + feature_store2.set_batch_arg("N", "another_name") + assert "N" not in feature_store2.batch_settings.batch_args ##### Slurm ###### def test_slurm_set_run_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="slurm", run_command="srun", ) - orc.set_run_arg("account", "ACCOUNT") + feature_store.set_run_arg("account", "ACCOUNT") assert all( - [db.run_settings.run_args["account"] == "ACCOUNT" for db in orc.entities] + [ + fs.run_settings.run_args["account"] == "ACCOUNT" + for fs in feature_store.entities + ] ) def test_slurm_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, interface="lo", launcher="slurm", run_command="srun", ) with 
pytest.raises(SmartSimError): - orc.set_batch_arg("account", "ACCOUNT") + feature_store.set_batch_arg("account", "ACCOUNT") - orc2 = Orchestrator( + feature_store2 = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, interface="lo", launcher="slurm", run_command="srun", ) - orc2.set_batch_arg("account", "ACCOUNT") - assert orc2.batch_settings.batch_args["account"] == "ACCOUNT" + feature_store2.set_batch_arg("account", "ACCOUNT") + assert feature_store2.batch_settings.batch_args["account"] == "ACCOUNT" @pytest.mark.parametrize( @@ -230,98 +239,100 @@ def test_slurm_set_batch_arg(wlmutils: t.Type["conftest.WLMUtils"]) -> None: pytest.param(False, id="Multiple `srun`s"), ], ) -def test_orc_results_in_correct_number_of_shards(single_cmd: bool) -> None: +def test_feature_store_results_in_correct_number_of_shards(single_cmd: bool) -> None: num_shards = 5 - orc = Orchestrator( + feature_store = FeatureStore( port=12345, launcher="slurm", run_command="srun", - db_nodes=num_shards, + fs_nodes=num_shards, batch=False, single_cmd=single_cmd, ) if single_cmd: - assert len(orc.entities) == 1 - (node,) = orc.entities + assert len(feature_store.entities) == 1 + (node,) = feature_store.entities assert len(node.run_settings.mpmd) == num_shards - 1 else: - assert len(orc.entities) == num_shards - assert all(node.run_settings.mpmd == [] for node in orc.entities) + assert len(feature_store.entities) == num_shards + assert all(node.run_settings.mpmd == [] for node in feature_store.entities) assert ( - orc.num_shards == orc.db_nodes == sum(node.num_shards for node in orc.entities) + feature_store.num_shards + == feature_store.fs_nodes + == sum(node.num_shards for node in feature_store.entities) ) ###### LSF ###### -def test_catch_orc_errors_lsf(wlmutils: t.Type["conftest.WLMUtils"]) -> None: +def test_catch_feature_store_errors_lsf(wlmutils: t.Type["conftest.WLMUtils"]) -> None: with pytest.raises(SSUnsupportedError): - orc = Orchestrator( + feature_store = 
FeatureStore( wlmutils.get_test_port(), - db_nodes=2, - db_per_host=2, + fs_nodes=2, + fs_per_host=2, batch=False, launcher="lsf", run_command="jsrun", ) - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=False, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) with pytest.raises(SmartSimError): - orc.set_batch_arg("P", "MYPROJECT") + feature_store.set_batch_arg("P", "MYPROJECT") def test_lsf_set_run_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) - orc.set_run_arg("l", "gpu-gpu") - assert all(["l" not in db.run_settings.run_args for db in orc.entities]) + feature_store.set_run_arg("l", "gpu-gpu") + assert all(["l" not in fs.run_settings.run_args for fs in feature_store.entities]) def test_lsf_set_batch_args(wlmutils: t.Type["conftest.WLMUtils"]) -> None: - orc = Orchestrator( + feature_store = FeatureStore( wlmutils.get_test_port(), - db_nodes=3, + fs_nodes=3, batch=True, hosts=["batch", "host1", "host2"], launcher="lsf", run_command="jsrun", ) - assert orc.batch_settings.batch_args["m"] == '"batch host1 host2"' - orc.set_batch_arg("D", "102400000") - assert orc.batch_settings.batch_args["D"] == "102400000" + assert feature_store.batch_settings.batch_args["m"] == '"batch host1 host2"' + feature_store.set_batch_arg("D", "102400000") + assert feature_store.batch_settings.batch_args["D"] == "102400000" def test_orc_telemetry(test_dir: str, wlmutils: t.Type["conftest.WLMUtils"]) -> None: - """Ensure the default behavior for an orchestrator is to disable telemetry""" - db = Orchestrator(port=wlmutils.get_test_port()) - db.set_path(test_dir) + """Ensure the default behavior for a feature store is to disable telemetry""" + fs = FeatureStore(port=wlmutils.get_test_port()) + 
fs.set_path(test_dir) # default is disabled - assert not db.telemetry.is_enabled + assert not fs.telemetry.is_enabled # ensure updating value works as expected - db.telemetry.enable() - assert db.telemetry.is_enabled + fs.telemetry.enable() + assert fs.telemetry.is_enabled # toggle back - db.telemetry.disable() - assert not db.telemetry.is_enabled + fs.telemetry.disable() + assert not fs.telemetry.is_enabled # toggle one more time - db.telemetry.enable() - assert db.telemetry.is_enabled + fs.telemetry.enable() + assert fs.telemetry.is_enabled diff --git a/tests/_legacy/test_output_files.py b/tests/_legacy/test_output_files.py index 65f080804..3b786548f 100644 --- a/tests/_legacy/test_output_files.py +++ b/tests/_legacy/test_output_files.py @@ -33,7 +33,7 @@ from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller, _AnonymousBatchJob from smartsim._core.launcher.step import Step -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from smartsim.entity.ensemble import Ensemble from smartsim.entity.model import Application from smartsim.settings.base import RunSettings @@ -50,7 +50,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) application = Application("test_application", params={}, path="", run_settings=rs) batch_application = Application( "batch_test_application", @@ -137,7 +139,7 @@ def test_get_output_files_with_create_job_step(test_dir): @pytest.mark.parametrize( "entity", - [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")], + [pytest.param(ens, id="ensemble"), pytest.param(feature_store, id="featurestore")], ) def test_get_output_files_with_create_batch_job_step(entity, 
test_dir): """Testing output files through _create_batch_job_step""" diff --git a/tests/_legacy/test_preview.py b/tests/_legacy/test_preview.py index 79dcd1206..9ce46c315 100644 --- a/tests/_legacy/test_preview.py +++ b/tests/_legacy/test_preview.py @@ -40,7 +40,7 @@ from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller from smartsim._core.control.job import Job -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity.entity import SmartSimEntity from smartsim.error.errors import PreviewFormatError from smartsim.settings import QsubBatchSettings, RunSettings @@ -66,41 +66,41 @@ def preview_object(test_dir) -> t.Dict[str, Job]: """ rs = RunSettings(exe="echo", exe_args="ifname=lo") s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) - o = Orchestrator() + o = FeatureStore() o.entity = s - s.db_identifier = "test_db_id" + s.fs_identifier = "test_fs_id" s.ports = [1235] s.num_shards = 1 job = Job("faux-name", "faux-step-id", s, "slurm", True) - active_dbjobs: t.Dict[str, Job] = {"mock_job": job} - return active_dbjobs + active_fsjobs: t.Dict[str, Job] = {"mock_job": job} + return active_fsjobs @pytest.fixture -def preview_object_multidb(test_dir) -> t.Dict[str, Job]: +def preview_object_multifs(test_dir) -> t.Dict[str, Job]: """ - Bare bones orch + Bare bones feature store """ rs = RunSettings(exe="echo", exe_args="ifname=lo") s = SmartSimEntity(name="faux-name", path=test_dir, run_settings=rs) - o = Orchestrator() + o = FeatureStore() o.entity = s - s.db_identifier = "testdb_reg" + s.fs_identifier = "testfs_reg" s.ports = [8750] s.num_shards = 1 job = Job("faux-name", "faux-step-id", s, "slurm", True) rs2 = RunSettings(exe="echo", exe_args="ifname=lo") s2 = SmartSimEntity(name="faux-name_2", path=test_dir, run_settings=rs) - o2 = Orchestrator() + o2 = FeatureStore() o2.entity = s2 - s2.db_identifier = "testdb_reg2" + s2.fs_identifier = "testfs_reg2" 
s2.ports = [8752] s2.num_shards = 1 job2 = Job("faux-name_2", "faux-step-id_2", s2, "slurm", True) - active_dbjobs: t.Dict[str, Job] = {"mock_job": job, "mock_job2": job2} - return active_dbjobs + active_fsjobs: t.Dict[str, Job] = {"mock_job": job, "mock_job2": job2} + return active_fsjobs def add_batch_resources(wlmutils, batch_settings): @@ -140,14 +140,14 @@ def test_get_ifname_filter(): assert output == expected_output -def test_get_dbtype_filter(): - """Test get_dbtype filter to extract database backend from config""" +def test_get_fstype_filter(): + """Test get_fstype filter to extract database backend from config""" - template_str = "{{ config | get_dbtype }}" + template_str = "{{ config | get_fstype }}" template_dict = {"ts": template_str} loader = jinja2.DictLoader(template_dict) env = jinja2.Environment(loader=loader, autoescape=True) - env.filters["get_dbtype"] = previewrenderer.get_dbtype + env.filters["get_fstype"] = previewrenderer.get_fstype t = env.get_template("ts") output = t.render(config=CONFIG.database_cli) @@ -215,44 +215,44 @@ def test_experiment_preview_properties(test_dir, wlmutils): assert exp.launcher == summary_dict["Launcher"] -def test_orchestrator_preview_render(test_dir, wlmutils, choose_host): - """Test correct preview output properties for Orchestrator preview""" +def test_feature_store_preview_render(test_dir, wlmutils, choose_host): + """Test correct preview output properties for FeatureStore preview""" # Prepare entities test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - exp_name = "test_orchestrator_preview_properties" + exp_name = "test_feature_store_preview_properties" exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) # create regular database - orc = exp.create_database( + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), ) - preview_manifest = Manifest(orc) + 
preview_manifest = Manifest(feature_store) # Execute method for template rendering output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") # Evaluate output - assert "Database Identifier" in output + assert "Feature Store Identifier" in output assert "Shards" in output assert "TCP/IP Port(s)" in output assert "Network Interface" in output assert "Type" in output assert "Executable" in output - db_path = _utils.get_db_path() - if db_path: - db_type, _ = db_path.name.split("-", 1) + fs_path = _utils.get_db_path() + if fs_path: + fs_type, _ = fs_path.name.split("-", 1) - assert orc.db_identifier in output - assert str(orc.num_shards) in output - assert orc._interfaces[0] in output - assert db_type in output + assert feature_store.fs_identifier in output + assert str(feature_store.num_shards) in output + assert feature_store._interfaces[0] in output + assert fs_type in output assert CONFIG.database_exe in output - assert orc.run_command in output - assert str(orc.db_nodes) in output + assert feature_store.run_command in output + assert str(feature_store.fs_nodes) in output def test_preview_to_file(test_dir, wlmutils): @@ -419,8 +419,8 @@ def test_model_key_prefixing(test_dir, wlmutils): test_launcher = wlmutils.get_test_launcher() exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - db = exp.create_database(port=6780, interface="lo") - exp.generate(db, overwrite=True) + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) rs1 = exp.create_run_settings("echo", ["hello", "world"]) model = exp.create_application("model_test", run_settings=rs1) @@ -428,7 +428,7 @@ def test_model_key_prefixing(test_dir, wlmutils): model.enable_key_prefixing() exp.generate(model, overwrite=True) - preview_manifest = Manifest(db, model) + preview_manifest = Manifest(fs, model) # Execute preview method output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") @@ -522,8 +522,8 @@ def 
test_ensemble_preview_client_configuration(test_dir, wlmutils): "test-preview-ensemble-clientconfig", exp_path=test_dir, launcher=test_launcher ) # Create Orchestrator - db = exp.create_database(port=6780, interface="lo") - exp.generate(db, overwrite=True) + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) rs1 = exp.create_run_settings("echo", ["hello", "world"]) # Create ensemble ensemble = exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) @@ -538,36 +538,36 @@ def test_ensemble_preview_client_configuration(test_dir, wlmutils): ml_model.register_incoming_entity(sim) exp.generate(ml_model, overwrite=True) - preview_manifest = Manifest(db, ml_model, ensemble) + preview_manifest = Manifest(fs, ml_model, ensemble) # Call preview renderer for testing output output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") # Evaluate output assert "Client Configuration" in output - assert "Database Identifier" in output - assert "Database Backend" in output + assert "Feature Store Identifier" in output + assert "Feature Store Backend" in output assert "Type" in output -def test_ensemble_preview_client_configuration_multidb(test_dir, wlmutils): +def test_ensemble_preview_client_configuration_multifs(test_dir, wlmutils): """ Test preview of client configuration and key prefixing in Ensemble preview - with multiple databases + with multiple feature stores """ # Prepare entities test_launcher = wlmutils.get_test_launcher() exp = Experiment( - "test-preview-multidb-clinet-config", exp_path=test_dir, launcher=test_launcher + "test-preview-multifs-clinet-config", exp_path=test_dir, launcher=test_launcher ) - # Create Orchestrator - db1_dbid = "db_1" - db1 = exp.create_database(port=6780, interface="lo", db_identifier=db1_dbid) - exp.generate(db1, overwrite=True) - # Create another Orchestrator - db2_dbid = "db_2" - db2 = exp.create_database(port=6784, interface="lo", db_identifier=db2_dbid) - 
exp.generate(db2, overwrite=True) + # Create feature store + fs1_fsid = "fs_1" + fs1 = exp.create_feature_store(port=6780, interface="lo", fs_identifier=fs1_fsid) + exp.generate(fs1, overwrite=True) + # Create another feature store + fs2_fsid = "fs_2" + fs2 = exp.create_feature_store(port=6784, interface="lo", fs_identifier=fs2_fsid) + exp.generate(fs2, overwrite=True) rs1 = exp.create_run_settings("echo", ["hello", "world"]) # Create ensemble @@ -581,20 +581,20 @@ def test_ensemble_preview_client_configuration_multidb(test_dir, wlmutils): for sim in ensemble.entities: ml_model.register_incoming_entity(sim) exp.generate(ml_model, overwrite=True) - preview_manifest = Manifest(db1, db2, ml_model, ensemble) + preview_manifest = Manifest(fs1, fs2, ml_model, ensemble) # Call preview renderer for testing output output = previewrenderer.render(exp, preview_manifest, verbosity_level="debug") # Evaluate output assert "Client Configuration" in output - assert "Database Identifier" in output - assert "Database Backend" in output + assert "Feature Store Identifier" in output + assert "Feature Store Backend" in output assert "TCP/IP Port(s)" in output assert "Type" in output - assert db1_dbid in output - assert db2_dbid in output + assert fs1_fsid in output + assert fs2_fsid in output def test_ensemble_preview_attached_files(fileutils, test_dir, wlmutils): @@ -651,12 +651,12 @@ def test_ensemble_preview_attached_files(fileutils, test_dir, wlmutils): assert "generator_files/to_symlink_dir" in link -def test_preview_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): +def test_preview_colocated_fs_model_ensemble(fileutils, test_dir, wlmutils, mlutils): """ - Test preview of DBModel on colocated ensembles + Test preview of FSModel on colocated ensembles """ - exp_name = "test-preview-colocated-db-model-ensemble" + exp_name = "test-preview-colocated-fs-model-ensemble" test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() 
test_port = wlmutils.get_test_port() @@ -695,10 +695,10 @@ def test_preview_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlut outputs="Identity", ) - # Colocate a database with the first ensemble members + # Colocate a feature store with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML models to each ensemble member to make sure they # do not conflict with other ML models @@ -717,10 +717,10 @@ def test_preview_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlut # Add another ensemble member colo_ensemble.add_model(colo_model) - # Colocate a database with the new ensemble member - colo_model.colocate_db_tcp( + # Colocate a feature store with the new ensemble member + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble) - 1, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -766,12 +766,12 @@ def test_preview_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlut assert model_outputs in output -def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlutils): +def test_preview_colocated_fs_script_ensemble(fileutils, test_dir, wlmutils, mlutils): """ - Test preview of DB Scripts on colocated DB from ensemble + Test preview of FS Scripts on colocated FS from ensemble """ - exp_name = "test-preview-colocated-db-script" + exp_name = "test-preview-colocated-fs-script" test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() @@ -780,7 +780,7 @@ def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlu test_num_gpus = mlutils.get_test_num_gpus() if pytest.test_device == "GPU" else 1 expected_torch_script = "torchscript.py" - test_script = fileutils.get_test_conf_path("run_dbscript_smartredis.py") + test_script = 
fileutils.get_test_conf_path("run_fsscript_smartredis.py") torch_script = fileutils.get_test_conf_path(expected_torch_script) # Create SmartSim Experiment @@ -798,13 +798,13 @@ def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlu # Create a SmartSim model colo_model = exp.create_application("colocated_model", colo_settings) - # Colocate a db with each ensemble entity and add a script + # Colocate a fs with each ensemble entity and add a script # to each entity via file for i, entity in enumerate(colo_ensemble): entity.disable_key_prefixing() - entity.colocate_db_tcp( + entity.colocate_fs_tcp( port=test_port + i, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -817,10 +817,10 @@ def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlu first_device=0, ) - # Colocate a db with the non-ensemble Model - colo_model.colocate_db_tcp( + # Colocate a fs with the non-ensemble Model + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble), - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -850,9 +850,9 @@ def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlu ) # Assert we have added one model to the ensemble - assert len(colo_ensemble._db_scripts) == 1 + assert len(colo_ensemble._fs_scripts) == 1 # Assert we have added both models to each entity - assert all([len(entity._db_scripts) == 2 for entity in colo_ensemble]) + assert all([len(entity._fs_scripts) == 2 for entity in colo_ensemble]) exp.generate(colo_ensemble) @@ -874,7 +874,7 @@ def test_preview_colocated_db_script_ensemble(fileutils, test_dir, wlmutils, mlu def test_preview_active_infrastructure(wlmutils, test_dir, preview_object): - """Test active infrastructure without other orchestrators""" + """Test active infrastructure without other feature stores""" # Prepare entities test_launcher = wlmutils.get_test_launcher() @@ -883,11 +883,11 @@ def test_preview_active_infrastructure(wlmutils, test_dir, 
preview_object): # Execute method for template rendering output = previewrenderer.render( - exp, active_dbjobs=preview_object, verbosity_level="debug" + exp, active_fsjobs=preview_object, verbosity_level="debug" ) assert "Active Infrastructure" in output - assert "Database Identifier" in output + assert "Feature Store Identifier" in output assert "Shards" in output assert "Network Interface" in output assert "Type" in output @@ -899,48 +899,48 @@ def test_preview_orch_active_infrastructure( ): """ Test correct preview output properties for active infrastructure preview - with other orchestrators + with other feature stores """ # Prepare entities test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - exp_name = "test_orchestrator_active_infrastructure_preview" + exp_name = "test_feature_store_active_infrastructure_preview" exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - orc2 = exp.create_database( + feature_store2 = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), - db_identifier="orc_2", + fs_identifier="fs_2", ) - orc3 = exp.create_database( + feature_store3 = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), - db_identifier="orc_3", + fs_identifier="fs_3", ) - preview_manifest = Manifest(orc2, orc3) + preview_manifest = Manifest(feature_store2, feature_store3) # Execute method for template rendering output = previewrenderer.render( - exp, preview_manifest, active_dbjobs=preview_object, verbosity_level="debug" + exp, preview_manifest, active_fsjobs=preview_object, verbosity_level="debug" ) assert "Active Infrastructure" in output - assert "Database Identifier" in output + assert "Feature Store Identifier" in output assert "Shards" in output assert "Network Interface" in output assert "Type" in output assert "TCP/IP" in output -def 
test_preview_multidb_active_infrastructure( +def test_preview_multifs_active_infrastructure( wlmutils, test_dir, choose_host, preview_object_multidb ): - """multiple started databases active infrastructure""" + """multiple started feature stores active infrastructure""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -949,32 +949,32 @@ def test_preview_multidb_active_infrastructure( # start a new Experiment for this section exp = Experiment( - "test_preview_multidb_active_infrastructure", + "test_preview_multifs_active_infrastructure", exp_path=test_dir, launcher=test_launcher, ) # Execute method for template rendering output = previewrenderer.render( - exp, active_dbjobs=preview_object_multidb, verbosity_level="debug" + exp, active_fsjobs=preview_object_multidb, verbosity_level="debug" ) assert "Active Infrastructure" in output - assert "Database Identifier" in output + assert "Feature Store Identifier" in output assert "Shards" in output assert "Network Interface" in output assert "Type" in output assert "TCP/IP" in output - assert "testdb_reg" in output - assert "testdb_reg2" in output - assert "Ochestrators" not in output + assert "testfs_reg" in output + assert "testfs_reg2" in output + assert "Feature Stores" not in output -def test_preview_active_infrastructure_orchestrator_error( +def test_preview_active_infrastructure_feature_store_error( wlmutils, test_dir, choose_host, monkeypatch: pytest.MonkeyPatch ): - """Demo error when trying to preview a started orchestrator""" + """Demo error when trying to preview a started feature store""" # Prepare entities test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() @@ -983,56 +983,56 @@ def test_preview_active_infrastructure_orchestrator_error( exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) monkeypatch.setattr( - smartsim.database.orchestrator.Orchestrator, "is_active", lambda x: True + 
smartsim.database.orchestrator.FeatureStore, "is_active", lambda x: True ) - orc = exp.create_database( + orc = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), - db_identifier="orc_1", + fs_identifier="orc_1", ) # Retrieve any active jobs - active_dbjobs = exp._control.active_orchestrator_jobs + active_fsjobs = exp._control.active_feature_store_jobs preview_manifest = Manifest(orc) # Execute method for template rendering output = previewrenderer.render( - exp, preview_manifest, active_dbjobs=active_dbjobs, verbosity_level="debug" + exp, preview_manifest, active_fsjobs=active_fsjobs, verbosity_level="debug" ) assert "WARNING: Cannot preview orc_1, because it is already started" in output -def test_active_orchestrator_jobs_property( +def test_active_feature_store_jobs_property( wlmutils, test_dir, preview_object, ): - """Ensure db_jobs remaines unchanged after deletion - of active_orchestrator_jobs property stays intact when retrieving db_jobs""" + """Ensure fs_jobs remains unchanged after deletion + of active_feature_store_jobs property stays intact when retrieving fs_jobs""" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() # start a new Experiment for this section exp = Experiment( - "test-active_orchestrator_jobs-property", + "test-active_feature_store_jobs-property", exp_path=test_dir, launcher=test_launcher, ) controller = Controller() - controller._jobs.db_jobs = preview_object + controller._jobs.fs_jobs = preview_object # Modify the returned job collection - active_orchestrator_jobs = exp._control.active_orchestrator_jobs - active_orchestrator_jobs["test"] = "test_value" + active_feature_store_jobs = exp._control.active_feature_store_jobs + active_feature_store_jobs["test"] = "test_value" # Verify original collection is not also modified - assert not exp._control.active_orchestrator_jobs.get("test", 
None) def test_verbosity_info_ensemble(test_dir, wlmutils): @@ -1067,14 +1067,14 @@ def test_verbosity_info_ensemble(test_dir, wlmutils): assert "echo_ensemble_1" not in output -def test_verbosity_info_colocated_db_model_ensemble( +def test_verbosity_info_colocated_fs_model_ensemble( fileutils, test_dir, wlmutils, mlutils ): - """Test preview of DBModel on colocated ensembles, first adding the DBModel to the - ensemble, then colocating DB. + """Test preview of FSModel on colocated ensembles, first adding the FSModel to the + ensemble, then colocating FS. """ - exp_name = "test-colocated-db-model-ensemble-reordered" + exp_name = "test-colocated-fs-model-ensemble-reordered" test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() @@ -1113,10 +1113,10 @@ def test_verbosity_info_colocated_db_model_ensemble( outputs="Identity", ) - # Colocate a database with the first ensemble members + # Colocate a feature store with the first ensemble members for i, entity in enumerate(colo_ensemble): - entity.colocate_db_tcp( - port=test_port + i, db_cpus=1, debug=True, ifname=test_interface + entity.colocate_fs_tcp( + port=test_port + i, fs_cpus=1, debug=True, ifname=test_interface ) # Add ML models to each ensemble member to make sure they # do not conflict with other ML models @@ -1135,10 +1135,10 @@ def test_verbosity_info_colocated_db_model_ensemble( # Add another ensemble member colo_ensemble.add_model(colo_model) - # Colocate a database with the new ensemble member - colo_model.colocate_db_tcp( + # Colocate a feature store with the new ensemble member + colo_model.colocate_fs_tcp( port=test_port + len(colo_ensemble) - 1, - db_cpus=1, + fs_cpus=1, debug=True, ifname=test_interface, ) @@ -1169,21 +1169,21 @@ def test_verbosity_info_colocated_db_model_ensemble( assert "Devices Per Node" not in output -def test_verbosity_info_orchestrator(test_dir, wlmutils, choose_host): - """Test correct preview output 
properties for Orchestrator preview""" +def test_verbosity_info_feature_store(test_dir, wlmutils, choose_host): + """Test correct preview output properties for feature store preview""" # Prepare entities test_launcher = wlmutils.get_test_launcher() test_interface = wlmutils.get_test_interface() test_port = wlmutils.get_test_port() - exp_name = "test_orchestrator_preview_properties" + exp_name = "test_feature_store_preview_properties" exp = Experiment(exp_name, exp_path=test_dir, launcher=test_launcher) - # create regular database - orc = exp.create_database( + # create regular feature store + feature_store = exp.create_feature_store( port=test_port, interface=test_interface, hosts=choose_host(wlmutils), ) - preview_manifest = Manifest(orc) + preview_manifest = Manifest(feature_store) # Execute method for template rendering output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") @@ -1200,9 +1200,9 @@ def test_verbosity_info_ensemble(test_dir, wlmutils): # Prepare entities test_launcher = wlmutils.get_test_launcher() exp = Experiment("key_prefix_test", exp_path=test_dir, launcher=test_launcher) - # Create Orchestrator - db = exp.create_database(port=6780, interface="lo") - exp.generate(db, overwrite=True) + # Create feature store + fs = exp.create_feature_store(port=6780, interface="lo") + exp.generate(fs, overwrite=True) rs1 = exp.create_run_settings("echo", ["hello", "world"]) # Create ensemble ensemble = exp.create_ensemble("fd_simulation", run_settings=rs1, replicas=2) @@ -1217,7 +1217,7 @@ def test_verbosity_info_ensemble(test_dir, wlmutils): ml_model.register_incoming_entity(sim) exp.generate(ml_model, overwrite=True) - preview_manifest = Manifest(db, ml_model, ensemble) + preview_manifest = Manifest(fs, ml_model, ensemble) # Call preview renderer for testing output output = previewrenderer.render(exp, preview_manifest, verbosity_level="info") @@ -1268,8 +1268,8 @@ def test_check_verbosity_level(): exp.preview(verbosity_level="info") -def 
test_preview_colocated_db_singular_model(wlmutils, test_dir): - """Test preview behavior when a colocated db is only added to +def test_preview_colocated_fs_singular_model(wlmutils, test_dir): + """Test preview behavior when a colocated fs is only added to one model. The expected behviour is that both models are colocated """ @@ -1282,7 +1282,7 @@ def test_preview_colocated_db_singular_model(wlmutils, test_dir): model_1 = exp.create_application("model_1", run_settings=rs) model_2 = exp.create_application("model_2", run_settings=rs) - model_1.colocate_db() + model_1.colocate_fs() exp.generate(model_1, model_2, overwrite=True) @@ -1296,7 +1296,7 @@ def test_preview_colocated_db_singular_model(wlmutils, test_dir): assert "Client Configuration" in output -def test_preview_db_script(wlmutils, test_dir): +def test_preview_fs_script(wlmutils, test_dir): """ Test preview of model instance with a torch script. """ @@ -1310,7 +1310,7 @@ def test_preview_db_script(wlmutils, test_dir): # Initialize a Model object model_instance = exp.create_application("model_name", model_settings) - model_instance.colocate_db_tcp() + model_instance.colocate_fs_tcp() # TorchScript string torch_script_str = "def negate(x):\n\treturn torch.neg(x)\n" diff --git a/tests/_legacy/test_reconnect_orchestrator.py b/tests/_legacy/test_reconnect_orchestrator.py index 6ce93c6f9..889876f00 100644 --- a/tests/_legacy/test_reconnect_orchestrator.py +++ b/tests/_legacy/test_reconnect_orchestrator.py @@ -30,7 +30,7 @@ import pytest from smartsim import Experiment -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.status import SmartSimStatus # The tests in this file belong to the group_b group @@ -39,22 +39,22 @@ first_dir = "" -# TODO ensure database is shutdown +# TODO ensure feature store is shutdown # use https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test -def test_local_orchestrator(test_dir, wlmutils): - """Test 
launching orchestrator locally""" +def test_local_feature_store(test_dir, wlmutils): + """Test launching feature store locally""" global first_dir - exp_name = "test-orc-launch-local" + exp_name = "test-feature-store-launch-local" exp = Experiment(exp_name, launcher="local", exp_path=test_dir) first_dir = test_dir - orc = Orchestrator(port=wlmutils.get_test_port()) - orc.set_path(osp.join(test_dir, "orchestrator")) + feature_store = FeatureStore(port=wlmutils.get_test_port()) + feature_store.set_path(osp.join(test_dir, "feature_store")) - exp.start(orc) - statuses = exp.get_status(orc) + exp.start(feature_store) + statuses = exp.get_status(feature_store) assert [stat != SmartSimStatus.STATUS_FAILED for stat in statuses] # simulate user shutting down main thread @@ -62,22 +62,23 @@ def test_local_orchestrator(test_dir, wlmutils): exp._control._launcher.task_manager.actively_monitoring = False -def test_reconnect_local_orc(test_dir): - """Test reconnecting to orchestrator from first experiment""" +def test_reconnect_local_feature_store(test_dir): + """Test reconnecting to feature store from first experiment""" global first_dir # start new experiment - exp_name = "test-orc-local-reconnect-2nd" + exp_name = "test-feature-store-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) - checkpoint = osp.join(first_dir, "orchestrator", "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + checkpoint = osp.join(first_dir, "feature_store", "smartsim_db.dat") + + reloaded_feature_store = exp_2.reconnect_feature_store(checkpoint) # let statuses update once time.sleep(5) - statuses = exp_2.get_status(reloaded_orc) + statuses = exp_2.get_status(reloaded_feature_store) for stat in statuses: if stat == SmartSimStatus.STATUS_FAILED: - exp_2.stop(reloaded_orc) + exp_2.stop(reloaded_feature_store) assert False - exp_2.stop(reloaded_orc) + exp_2.stop(reloaded_feature_store) diff --git a/tests/_legacy/test_serialize.py 
b/tests/_legacy/test_serialize.py index a8c9cf1d9..eb56d7554 100644 --- a/tests/_legacy/test_serialize.py +++ b/tests/_legacy/test_serialize.py @@ -36,7 +36,7 @@ from smartsim._core._cli import utils from smartsim._core.control.manifest import LaunchedManifestBuilder from smartsim._core.utils import serialize -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore _CFG_TM_ENABLED_ATTR = "telemetry_enabled" @@ -144,10 +144,12 @@ def test_started_entities_are_serialized(test_dir, manifest_json): exp.stop(hello_world_application, spam_eggs_application, hello_ensemble) -def test_serialzed_database_does_not_break_if_using_a_non_standard_install(monkeypatch): - monkeypatch.setattr(utils, "get_db_path", lambda: None) - db = Orchestrator() - dict_ = serialize._dictify_db(db, []) +def test_serialzed_feature_store_does_not_break_if_using_a_non_standard_install( + monkeypatch, +): + monkeypatch.setattr(utils, "get_fs_path", lambda: None) + fs = FeatureStore() + dict_ = serialize._dictify_fs(fs, []) assert dict_["type"] == "Unknown" diff --git a/tests/_legacy/test_slurm_parser.py b/tests/_legacy/test_slurm_parser.py index b5f7cf32a..a49d9b198 100644 --- a/tests/_legacy/test_slurm_parser.py +++ b/tests/_legacy/test_slurm_parser.py @@ -231,12 +231,12 @@ def test_parse_sacct_step_id_2(): "extern|119225.extern|\n" "m1-119225.0|119225.0|\n" "m2-119225.1|119225.1|\n" - "orchestrator_0-119225.2|119225.2|\n" + "featurestore_0-119225.2|119225.2|\n" "n1-119225.3|119225.3|" ) step_id = "119225.2" parsed_step_id = slurmParser.parse_step_id_from_sacct( - output, "orchestrator_0-119225.2" + output, "featurestore_0-119225.2" ) assert step_id == parsed_step_id diff --git a/tests/_legacy/test_slurm_settings.py b/tests/_legacy/test_slurm_settings.py index d9d820244..aa915cded 100644 --- a/tests/_legacy/test_slurm_settings.py +++ b/tests/_legacy/test_slurm_settings.py @@ -79,7 +79,7 @@ def test_update_env(): def test_catch_colo_mpmd(): 
srun = SrunSettings("python") - srun.colocated_db_settings = {"port": 6379, "cpus": 1} + srun.colocated_fs_settings = {"port": 6379, "cpus": 1} srun_2 = SrunSettings("python") # should catch the user trying to make rs mpmd that already are colocated diff --git a/tests/_legacy/test_smartredis.py b/tests/_legacy/test_smartredis.py index 2b7d78918..cc9bc8823 100644 --- a/tests/_legacy/test_smartredis.py +++ b/tests/_legacy/test_smartredis.py @@ -29,7 +29,7 @@ from smartsim import Experiment from smartsim._core.utils import installed_redisai_backends -from smartsim.database import Orchestrator +from smartsim.database import FeatureStore from smartsim.entity import Application, Ensemble from smartsim.status import SmartSimStatus @@ -60,15 +60,15 @@ ) -def test_exchange(local_experiment, local_db, prepare_db, fileutils): +def test_exchange(local_experiment, local_fs, prepare_fs, fileutils): """Run two processes, each process puts a tensor on - the DB, then accesses the other process's tensor. + the FS, then accesses the other process's tensor. Finally, the tensor is used to run a application. 
""" - db = prepare_db(local_db).orchestrator - # create and start a database - local_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(local_fs).featurestore + # create and start a feature store + local_experiment.reconnect_feature_store(fs.checkpoint_file) rs = local_experiment.create_run_settings("python", "producer.py --exchange") params = {"mult": [1, -10]} @@ -95,16 +95,16 @@ def test_exchange(local_experiment, local_db, prepare_db, fileutils): assert all([stat == SmartSimStatus.STATUS_COMPLETED for stat in statuses]) -def test_consumer(local_experiment, local_db, prepare_db, fileutils): +def test_consumer(local_experiment, local_fs, prepare_fs, fileutils): """Run three processes, each one of the first two processes - puts a tensor on the DB; the third process accesses the + puts a tensor on the FS; the third process accesses the tensors put by the two producers. Finally, the tensor is used to run a application by each producer and the consumer accesses the two results. 
""" - db = prepare_db(local_db).orchestrator - local_experiment.reconnect_orchestrator(db.checkpoint_file) + fs = prepare_fs(local_fs).featurestore + local_experiment.reconnect_feature_store(fs.checkpoint_file) rs_prod = local_experiment.create_run_settings("python", "producer.py") rs_consumer = local_experiment.create_run_settings("python", "consumer.py") diff --git a/tests/_legacy/test_symlinking.py b/tests/_legacy/test_symlinking.py index 622b960b2..11219a81b 100644 --- a/tests/_legacy/test_symlinking.py +++ b/tests/_legacy/test_symlinking.py @@ -32,7 +32,7 @@ from smartsim import Experiment from smartsim._core.config import CONFIG from smartsim._core.control.controller import Controller, _AnonymousBatchJob -from smartsim.database.orchestrator import Orchestrator +from smartsim.database.orchestrator import FeatureStore from smartsim.entity.ensemble import Ensemble from smartsim.entity.model import Application from smartsim.settings.base import RunSettings @@ -49,7 +49,9 @@ batch_rs = SrunSettings("echo", ["spam", "eggs"]) ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) -orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +feature_store = FeatureStore( + fs_nodes=3, batch=True, launcher="slurm", run_command="srun" +) application = Application("test_application", params={}, path="", run_settings=rs) batch_application = Application( "batch_test_application", @@ -96,7 +98,7 @@ def symlink_with_create_job_step(test_dir, entity): "entity", [ pytest.param(ens, id="ensemble"), - pytest.param(orc, id="orchestrator"), + pytest.param(feature_store, id="featurestore"), pytest.param(anon_batch_application, id="application"), ], ) @@ -235,15 +237,15 @@ def test_non_batch_application_symlinks(test_dir): _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) -def test_non_batch_orchestrator_symlinks(test_dir): - exp = Experiment("test-non-batch-orc", exp_path=test_dir) +def 
test_non_batch_feature_store_symlinks(test_dir): + exp = Experiment("test-non-batch-feature-store", exp_path=test_dir) - db = exp.create_database(interface="lo") + db = exp.create_feature_store(interface="lo") exp.generate(db) exp.start(db, block=True) exp.stop(db) - for i in range(db.db_nodes): + for i in range(db.fs_nodes): _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) diff --git a/tests/_legacy/test_telemetry_monitor.py b/tests/_legacy/test_telemetry_monitor.py index e0b122820..02a89d3e0 100644 --- a/tests/_legacy/test_telemetry_monitor.py +++ b/tests/_legacy/test_telemetry_monitor.py @@ -298,8 +298,8 @@ def test_load_manifest(fileutils: FileUtils, test_dir: str, config: cfg.Config): assert len(manifest.runs[0].applications) == 1 assert len(manifest.runs[2].applications) == 8 # 8 applications in ensemble - assert len(manifest.runs[0].orchestrators) == 0 - assert len(manifest.runs[1].orchestrators) == 3 # 3 shards in db + assert len(manifest.runs[0].featurestores) == 0 + assert len(manifest.runs[1].featurestores) == 3 # 3 shards in fs def test_load_manifest_colo_application(fileutils: FileUtils): @@ -334,9 +334,9 @@ def test_load_manifest_serial_applications(fileutils: FileUtils): assert len(manifest.runs[0].applications) == 5 -def test_load_manifest_db_and_applications(fileutils: FileUtils): +def test_load_manifest_fs_and_applications(fileutils: FileUtils): """Ensure that the runtime manifest loads correctly when containing applications & - orchestrator across 2 separate runs""" + feature store across 2 separate runs""" # NOTE: for regeneration, this manifest can use `test_telemetry_colo` sample_manifest_path = fileutils.get_test_conf_path("telemetry/db_and_model.json") sample_manifest = pathlib.Path(sample_manifest_path) @@ -348,19 +348,19 @@ def test_load_manifest_db_and_applications(fileutils: FileUtils): assert manifest.launcher == "Slurm" assert 
len(manifest.runs) == 2 - assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].featurestores) == 1 assert len(manifest.runs[1].applications) == 1 # verify collector paths from manifest are deserialized to collector config - assert manifest.runs[0].orchestrators[0].collectors["client"] - assert manifest.runs[0].orchestrators[0].collectors["memory"] + assert manifest.runs[0].featurestores[0].collectors["client"] + assert manifest.runs[0].featurestores[0].collectors["memory"] # verify collector paths missing from manifest are empty - assert not manifest.runs[0].orchestrators[0].collectors["client_count"] + assert not manifest.runs[0].featurestores[0].collectors["client_count"] -def test_load_manifest_db_and_applications_1run(fileutils: FileUtils): +def test_load_manifest_fs_and_applications_1run(fileutils: FileUtils): """Ensure that the runtime manifest loads correctly when containing applications & - orchestrator in a single run""" + featurestore in a single run""" # NOTE: for regeneration, this manifest can use `test_telemetry_colo` sample_manifest_path = fileutils.get_test_conf_path( "telemetry/db_and_model_1run.json" @@ -374,21 +374,33 @@ def test_load_manifest_db_and_applications_1run(fileutils: FileUtils): assert manifest.launcher == "Slurm" assert len(manifest.runs) == 1 - assert len(manifest.runs[0].orchestrators) == 1 + assert len(manifest.runs[0].featurestores) == 1 assert len(manifest.runs[0].applications) == 1 @pytest.mark.parametrize( - ["task_id", "step_id", "etype", "exp_isorch", "exp_ismanaged"], + ["task_id", "step_id", "etype", "exp_isfeature_store", "exp_ismanaged"], [ - pytest.param("123", "", "application", False, False, id="unmanaged, non-orch"), - pytest.param("456", "123", "ensemble", False, True, id="managed, non-orch"), - pytest.param("789", "987", "orchestrator", True, True, id="managed, orch"), - pytest.param("987", "", "orchestrator", True, False, id="unmanaged, orch"), + pytest.param( + "123", "", "application", 
False, False, id="unmanaged, non-feature_store" + ), + pytest.param( + "456", "123", "ensemble", False, True, id="managed, non-feature_store" + ), + pytest.param( + "789", "987", "featurestore", True, True, id="managed, feature_store" + ), + pytest.param( + "987", "", "featurestore", True, False, id="unmanaged, feature_store" + ), ], ) def test_persistable_computed_properties( - task_id: str, step_id: str, etype: str, exp_isorch: bool, exp_ismanaged: bool + task_id: str, + step_id: str, + etype: str, + exp_isfeature_store: bool, + exp_ismanaged: bool, ): name = f"test-{etype}-{uuid.uuid4()}" timestamp = get_ts_ms() @@ -407,7 +419,7 @@ def test_persistable_computed_properties( persistable = persistables[0] if persistables else None assert persistable.is_managed == exp_ismanaged - assert persistable.is_db == exp_isorch + assert persistable.is_fs == exp_isfeature_store def test_deserialize_ensemble(fileutils: FileUtils): @@ -459,17 +471,17 @@ def test_shutdown_conditions__has_monitored_job(test_dir: str): telmon._action_handler = mani_handler assert not telmon._can_shutdown() - assert not bool(mani_handler.job_manager.db_jobs) + assert not bool(mani_handler.job_manager.fs_jobs) assert bool(mani_handler.job_manager.jobs) -def test_shutdown_conditions__has_db(test_dir: str): - """Show that an event handler w/a monitored db cannot shutdown""" +def test_shutdown_conditions__has_fs(test_dir: str): + """Show that an event handler w/a monitored fs cannot shutdown""" job_entity1 = JobEntity() job_entity1.name = "xyz" job_entity1.step_id = "123" job_entity1.task_id = "" - job_entity1.type = "orchestrator" # <---- make entity appear as db + job_entity1.type = "featurestore" # <---- make entity appear as fs mani_handler = ManifestEventHandler("xyz") ## TODO: see next comment and combine an add_job method on manieventhandler @@ -486,7 +498,7 @@ def test_shutdown_conditions__has_db(test_dir: str): telmon._action_handler = mani_handler # replace w/mock handler assert not 
telmon._can_shutdown() - assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_db]) + assert bool([j for j in mani_handler._tracked_jobs.values() if j.is_fs]) assert not bool(mani_handler.job_manager.jobs) @@ -554,10 +566,10 @@ def is_alive(self) -> bool: ], ) @pytest.mark.asyncio -async def test_auto_shutdown__has_db( +async def test_auto_shutdown__has_fs( test_dir: str, cooldown_ms: int, task_duration_ms: int ): - """Ensure that the cooldown timer is respected with a running db""" + """Ensure that the cooldown timer is respected with a running fs""" class FauxObserver: """Mock for the watchdog file system event listener""" @@ -575,10 +587,10 @@ def is_alive(self) -> bool: return True entity = JobEntity() - entity.name = "db_0" + entity.name = "fs_0" entity.step_id = "123" entity.task_id = "" - entity.type = "orchestrator" + entity.type = "featurestore" entity.telemetry_on = True entity.status_dir = test_dir @@ -612,8 +624,8 @@ def is_alive(self) -> bool: def test_telemetry_single_application(fileutils, test_dir, wlmutils, config): - """Test that it is possible to create_database then colocate_db_uds/colocate_db_tcp - with unique db_identifiers""" + """Test that it is possible to create_feature_store then colocate_fs_uds/colocate_fs_tcp + with unique fs_identifiers""" # Set experiment name exp_name = "telemetry_single_application" @@ -780,15 +792,15 @@ def test_telemetry_serial_applications_nonblocking( assert len(stop_events) == 5 -def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config): +def test_telemetry_fs_only_with_generate(test_dir, wlmutils, monkeypatch, config): """ - Test telemetry with only a database running + Test telemetry with only a feature store running """ with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_with_generate" + exp_name = "telemetry_fs_with_generate" # Retrieve parameters from testing environment test_launcher = 
wlmutils.get_test_launcher() @@ -798,14 +810,16 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) - exp.generate(orc) + # create regular feature store + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) + exp.generate(feature_store) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: - exp.start(orc, block=True) + exp.start(feature_store, block=True) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) @@ -815,24 +829,24 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch, config assert len(start_events) == 1 assert len(stop_events) <= 1 finally: - exp.stop(orc) + exp.stop(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, config): +def test_telemetry_fs_only_without_generate(test_dir, wlmutils, monkeypatch, config): """ - Test telemetry with only a non-generated database running + Test telemetry with only a non-generated feature store running """ with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_only_without_generate" + exp_name = "telemetry_fs_only_without_generate" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -842,12 +856,14 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con # Create SmartSim Experiment exp = 
Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) + # create regular feature store + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: - exp.start(orc) + exp.start(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) @@ -857,27 +873,27 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con assert len(start_events) == 1 assert len(stop_events) == 0 finally: - exp.stop(orc) + exp.stop(feature_store) snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED stop_events = list(telemetry_output_path.rglob("stop.json")) assert len(stop_events) == 1 -def test_telemetry_db_and_application( +def test_telemetry_fs_and_application( fileutils, test_dir, wlmutils, monkeypatch, config ): """ - Test telemetry with only a database and a application running + Test telemetry with only a feature store and a application running """ with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) # Set experiment name - exp_name = "telemetry_db_and_application" + exp_name = "telemetry_fs_and_application" # Retrieve parameters from testing environment test_launcher = wlmutils.get_test_launcher() @@ -888,11 +904,13 @@ def test_telemetry_db_and_application( # Create SmartSim Experiment exp = Experiment(exp_name, launcher=test_launcher, exp_path=test_dir) - # create regular database - orc = exp.create_database(port=test_port, interface=test_interface) - exp.generate(orc) + # create regular feature store + feature_store = exp.create_feature_store( + port=test_port, interface=test_interface + ) + exp.generate(feature_store) try: - 
exp.start(orc) + exp.start(feature_store) # create run settings app_settings = exp.create_run_settings(sys.executable, test_script) @@ -904,12 +922,12 @@ def test_telemetry_db_and_application( exp.generate(smartsim_application) exp.start(smartsim_application, block=True) finally: - exp.stop(orc) + exp.stop(feature_store) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir snooze_blocking(telemetry_output_path, max_delay=10, post_data_delay=1) - assert exp.get_status(orc)[0] == SmartSimStatus.STATUS_CANCELLED + assert exp.get_status(feature_store)[0] == SmartSimStatus.STATUS_CANCELLED assert ( exp.get_status(smartsim_application)[0] == SmartSimStatus.STATUS_COMPLETED ) @@ -1005,7 +1023,7 @@ def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch, c start_events = list(telemetry_output_path.rglob("start.json")) stop_events = list(telemetry_output_path.rglob("stop.json")) - # the colodb does NOT show up as a unique entity in the telemetry + # the colofs does NOT show up as a unique entity in the telemetry assert len(start_events) == 1 assert len(stop_events) == 1 @@ -1207,13 +1225,13 @@ def test_multistart_experiment( rs_m.set_tasks(1) application = exp.create_application("my-application", run_settings=rs_m) - db = exp.create_database( - db_nodes=1, + fs = exp.create_feature_store( + fs_nodes=1, port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface(), ) - exp.generate(db, ens, application, overwrite=True) + exp.generate(fs, ens, application, overwrite=True) with monkeypatch.context() as ctx: ctx.setattr(cfg.Config, "telemetry_frequency", 1) @@ -1224,20 +1242,20 @@ def test_multistart_experiment( # track PID to see that telmon cooldown avoids restarting process tm_pid = exp._control._telemetry_monitor.pid - exp.start(db, block=False) + exp.start(fs, block=False) # check that same TM proc is active assert tm_pid == exp._control._telemetry_monitor.pid try: exp.start(ens, block=True, summary=True) finally: - 
exp.stop(db) + exp.stop(fs) assert tm_pid == exp._control._telemetry_monitor.pid - time.sleep(3) # time for telmon to write db stop event + time.sleep(3) # time for telmon to write fs stop event telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir - db_start_events = list(telemetry_output_path.rglob("database/**/start.json")) - assert len(db_start_events) == 1 + fs_start_events = list(telemetry_output_path.rglob("database/**/start.json")) + assert len(fs_start_events) == 1 m_start_events = list(telemetry_output_path.rglob("model/**/start.json")) assert len(m_start_events) == 1 @@ -1311,7 +1329,7 @@ def _faux_updates(_self: WLMLauncher, _names: t.List[str]) -> t.List[StepInfo]: job_entity.step_id = "faux-step-id" job_entity.task_id = 1234 job_entity.status_dir = test_dir - job_entity.type = "orchestrator" + job_entity.type = "featurestore" job = Job(job_entity.name, job_entity.step_id, job_entity, "slurm", True) diff --git a/tests/test_configs/telemetry/colocatedmodel.json b/tests/test_configs/telemetry/colocatedmodel.json index 8ecec1c76..77cf910fa 100644 --- a/tests/test_configs/telemetry/colocatedmodel.json +++ b/tests/test_configs/telemetry/colocatedmodel.json @@ -33,7 +33,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "unix_socket": "/tmp/redis.socket", "socket_permissions": 755, @@ -41,13 +41,13 @@ "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "", + "fs_identifier": "", "rai_args": { "threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - "extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [] @@ -62,7 +62,7 @@ "err_file": "/tmp/my-exp/colocated_model/colocated_model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] diff --git a/tests/test_configs/telemetry/db_and_model.json b/tests/test_configs/telemetry/db_and_model.json index 62656a30a..3eebd6fbf 100644 --- 
a/tests/test_configs/telemetry/db_and_model.json +++ b/tests/test_configs/telemetry/db_and_model.json @@ -13,16 +13,16 @@ "run_id": "2ca19ad", "timestamp": 1699038647234488933, "application": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.4", "port": 6780, "cluster": false, @@ -33,7 +33,7 @@ "client_count_file": null, "memory_file": "/path/to/some/mem.log", "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/featurestore/featurestore_0", "step_id": "4139111.27", "task_id": "1452", "managed": true @@ -71,7 +71,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", "step_id": "4139111.28", @@ -82,7 +82,7 @@ "err_file": "/tmp/my-exp/perroquet/perroquet.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] diff --git a/tests/test_configs/telemetry/db_and_model_1run.json b/tests/test_configs/telemetry/db_and_model_1run.json index cbce37799..ec6be51f5 100644 --- a/tests/test_configs/telemetry/db_and_model_1run.json +++ b/tests/test_configs/telemetry/db_and_model_1run.json @@ -36,7 +36,7 @@ "Configure": [], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/4b5507a/model/perroquet", "step_id": "4139111.28", @@ -47,16 +47,16 @@ "err_file": "/tmp/my-exp/perroquet/perroquet.err" } ], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": 
"orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.4", "port": 6780, "cluster": false, @@ -64,7 +64,7 @@ "out_file": "/path/to/some/file.out", "err_file": "/path/to/some/file.err", "telemetry_metadata": { - "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/orchestrator/orchestrator_0", + "status_dir": "/tmp/my-exp/.smartsim/telemetry/telemetry_db_and_model/2ca19ad/database/featurestore/featurestore_0", "step_id": "4139111.27", "task_id": "1452", "managed": true diff --git a/tests/test_configs/telemetry/ensembles.json b/tests/test_configs/telemetry/ensembles.json index 4f340e7e0..e8c4cfc32 100644 --- a/tests/test_configs/telemetry/ensembles.json +++ b/tests/test_configs/telemetry/ensembles.json @@ -13,7 +13,7 @@ "run_id": "d041b90", "timestamp": 1698679830384608928, "application": [], - "orchestrator": [], + "featurestore": [], "ensemble": [ { "name": "my-ens", diff --git a/tests/test_configs/telemetry/serialmodels.json b/tests/test_configs/telemetry/serialmodels.json index 77dddcc1e..53c0d9cb8 100644 --- a/tests/test_configs/telemetry/serialmodels.json +++ b/tests/test_configs/telemetry/serialmodels.json @@ -179,7 +179,7 @@ "err_file": "/tmp/my-exp/perroquet_4/perroquet_4.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] } ] diff --git a/tests/test_configs/telemetry/telemetry.json b/tests/test_configs/telemetry/telemetry.json index 348bffd49..084cc1866 100644 --- a/tests/test_configs/telemetry/telemetry.json +++ b/tests/test_configs/telemetry/telemetry.json @@ -6,7 +6,7 @@ }, "runs": [ { - "run_id": "d999ad89-020f-4e6a-b834-dbd88658ce84", + "run_id": "d999ad89-020f-4e6a-b834-fsd88658ce84", "timestamp": 1697824072792854287, "application": [ { @@ -33,20 +33,20 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "port": 5757, "ifname": "lo", "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "COLO", + "fs_identifier": "COLO", "rai_args": { 
"threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - "extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [ @@ -59,7 +59,7 @@ ] }, "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-dbd88658ce84/model/my-model", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d999ad89-020f-4e6a-b834-fsd88658ce84/model/my-model", "step_id": "4121050.30", "task_id": "25230", "managed": true @@ -68,61 +68,61 @@ "err_file": "/path/to/my-exp/my-model/my-model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] }, { "run_id": "fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa", "timestamp": 1697824102122439975, "application": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_1", + "name": "featurestore_1", "hostname": "10.128.0.70", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_1-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true } }, { - "name": "orchestrator_2", + "name": "featurestore_2", "hostname": "10.128.0.71", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": 
"/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_2-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true } }, { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.69", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_0-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/fd3cd1a8-cb8f-4f61-b847-73a8eb0881fa/database/featurestore/featurestore", "step_id": "4121050.31+2", "task_id": "25241", "managed": true @@ -137,7 +137,7 @@ "run_id": "d65ae1df-cb5e-45d9-ab09-6fa641755997", "timestamp": 1697824127962219505, "application": [], - "orchestrator": [], + "featurestore": [], "ensemble": [ { "name": "my-ens", @@ -186,7 +186,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_0", "step_id": "4121050.32", @@ -225,7 +225,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_1", "step_id": "4121050.33", @@ -264,7 +264,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_2", "step_id": "4121050.34", @@ -303,7 +303,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_3", "step_id": "4121050.35", @@ -342,7 +342,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_4", "step_id": "4121050.36", @@ -381,7 +381,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_5", "step_id": "4121050.37", @@ -420,7 +420,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_6", "step_id": "4121050.38", @@ -459,7 +459,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/d65ae1df-cb5e-45d9-ab09-6fa641755997/ensemble/my-ens/my-ens_7", "step_id": "4121050.39", @@ -501,20 +501,20 @@ "Configure": [], "Copy": [] }, - "colocated_db": { + "colocated_fs": { "settings": { "port": 5757, "ifname": "lo", "cpus": 1, "custom_pinning": "0", "debug": false, - "db_identifier": "COLO", + "fs_identifier": "COLO", "rai_args": { "threads_per_queue": null, "inter_op_parallelism": null, "intra_op_parallelism": null }, - 
"extra_db_args": {} + "extra_fs_args": {} }, "scripts": [], "models": [ @@ -536,61 +536,61 @@ "err_file": "/path/to/my-exp/my-model/my-model.err" } ], - "orchestrator": [], + "featurestore": [], "ensemble": [] }, { "run_id": "b33a5d27-6822-4795-8e0e-cfea18551fa4", "timestamp": 1697835261956135240, "application": [], - "orchestrator": [ + "featurestore": [ { - "name": "orchestrator", + "name": "featurestore", "type": "redis", "interface": [ "ipogif0" ], "shards": [ { - "name": "orchestrator_0", + "name": "featurestore_0", "hostname": "10.128.0.2", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_0-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_0-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true } }, { - "name": "orchestrator_2", + "name": "featurestore_2", "hostname": "10.128.0.4", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_2-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_2-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": 
"/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true } }, { - "name": "orchestrator_1", + "name": "featurestore_1", "hostname": "10.128.0.3", "port": 2424, "cluster": true, - "conf_file": "nodes-orchestrator_1-2424.conf", - "out_file": "/path/to/my-exp/orchestrator/orchestrator.out", - "err_file": "/path/to/my-exp/orchestrator/orchestrator.err", + "conf_file": "nodes-featurestore_1-2424.conf", + "out_file": "/path/to/my-exp/featurestore/featurestore.out", + "err_file": "/path/to/my-exp/featurestore/featurestore.err", "telemetry_metadata": { - "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/orchestrator/orchestrator", + "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/b33a5d27-6822-4795-8e0e-cfea18551fa4/database/featurestore/featurestore", "step_id": "4121904.1+2", "task_id": "28289", "managed": true @@ -605,7 +605,7 @@ "run_id": "45772df2-fd80-43fd-adf0-d5e319870182", "timestamp": 1697835287798613875, "application": [], - "orchestrator": [], + "featurestore": [], "ensemble": [ { "name": "my-ens", @@ -654,7 +654,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_0", "step_id": "4121904.2", @@ -693,7 +693,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_1", "step_id": "4121904.3", @@ -732,7 +732,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_2", "step_id": "4121904.4", @@ -771,7 +771,7 @@ ], "Copy": [] }, - 
"colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_3", "step_id": "4121904.5", @@ -810,7 +810,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_4", "step_id": "4121904.6", @@ -849,7 +849,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_5", "step_id": "4121904.7", @@ -888,7 +888,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_6", "step_id": "4121904.8", @@ -927,7 +927,7 @@ ], "Copy": [] }, - "colocated_db": {}, + "colocated_fs": {}, "telemetry_metadata": { "status_dir": "/path/to/my-exp/.smartsim/telemetry/my-exp/45772df2-fd80-43fd-adf0-d5e319870182/ensemble/my-ens/my-ens_7", "step_id": "4121904.9",