From 41d52f9aa3f5356a16b18b5f1f960176898e4202 Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Tue, 19 Jan 2021 13:50:11 +0100 Subject: [PATCH 1/5] Add better support for custom & plugin algos + few fixes --- dataikuapi/dss/ml.py | 100 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 15 deletions(-) diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py index 18309f0e..ebd68d5e 100644 --- a/dataikuapi/dss/ml.py +++ b/dataikuapi/dss/ml.py @@ -238,6 +238,16 @@ def use_feature(self, feature_name): def get_algorithm_settings(self, algorithm_name): raise NotImplementedError() + def _get_custom_algorithm_settings(self, algorithm_name): + # returns the first algorithm with this name + for algo in self.mltask_settings["modeling"]["custom_mllib"]: + if algorithm_name == algo["name"]: + return algo + for algo in self.mltask_settings["modeling"]["custom_python"]: + if algorithm_name == algo["name"]: + return algo + raise ValueError("Unknown algorithm: {}".format(algorithm_name)) + def get_diagnostics_settings(self): """ Gets the diagnostics settings for a mltask. This returns a reference to the @@ -307,7 +317,7 @@ def disable_all_algorithms(self): custom_mllib["enabled"] = False for custom_python in self.mltask_settings["modeling"]["custom_python"]: custom_python["enabled"] = False - for plugin in self.mltask_settings["modeling"]["plugin_python"].values(): + for plugin in self.mltask_settings["modeling"].get("plugin_python", {}).values(): plugin["enabled"] = False def get_all_possible_algorithm_names(self): @@ -315,23 +325,30 @@ def get_all_possible_algorithm_names(self): Returns the list of possible algorithm names, i.e. the list of valid identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings` - This does not include Custom Python models, Custom MLLib models, plugin models. This includes all possible algorithms, regardless of the prediction kind (regression/classification) or engine, so some algorithms may be irrelevant :returns: the list of algorithm names as a list of strings :rtype: list of string """ - return list(self.__class__.algorithm_remap.keys()) + return list(self.__class__.algorithm_remap.keys()) + self._get_custom_algorithm_names() + + def _get_custom_algorithm_names(self): + """ + Returns the list of names of defined custom models (python & mllib) + + :returns: the list of custom models names + :rtype: list of string + """ + return [algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]\ + + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]] def get_enabled_algorithm_names(self): """ :returns: the list of enabled algorithm names as a list of strings :rtype: list of string """ - algos = self.__class__.algorithm_remap - algo_names = [algo_name for algo_name in algos.keys() if self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]] - return algo_names + return [algo_name for algo_name in self.get_all_possible_algorithm_names() if self.get_algorithm_settings(algo_name).get("enabled", False)] def get_enabled_algorithm_settings(self): """ @@ -356,6 +373,32 @@ def set_metric(self, metric=None, custom_metric=None, custom_metric_greater_is_b self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricGIB"] = custom_metric_greater_is_better self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricNeedsProba"] = custom_metric_use_probas + def add_custom_python_model(self, name="Custom Python Model", code=""): + """ + Adds a new custom python model + + :param str name: name of the custom model + :param str code: code of the custom model + """ + self.mltask_settings["modeling"]["custom_python"].append({ + "name": name, + "code": code, + "enabled": True + }) + + def add_custom_mllib_model(self, name="Custom MLlib Model", code=""): + """ + Adds a new custom mllib model + + :param str name: name of the custom model + :param str code: code of the custom model + """ + self.mltask_settings["modeling"]["custom_mllib"].append({ + "name": name, + "initializationCode": code, + "enabled": True + }) + def save(self): """Saves back these settings to the ML Task""" @@ -1310,7 +1353,6 @@ def __init__(self, raw_settings, hyperparameter_search_params): self.cache_node_ids = self._register_simple_parameter("cache_node_ids") self.checkpoint_interval = self._register_single_value_hyperparameter("checkpoint_interval", accepted_types=[int]) - self.impurity = self._register_single_category_hyperparameter("impurity", accepted_values=["gini", "entropy", "variance"]) # TODO: distinguish between regression and classif self.max_bins = self._register_single_value_hyperparameter("max_bins", accepted_types=[int]) self.max_memory_mb = self._register_simple_parameter("max_memory_mb") self.min_info_gain = self._register_single_value_hyperparameter("min_info_gain", accepted_types=[int, float]) @@ -1395,20 +1437,41 @@ def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings) def get_prediction_type(self): return self.mltask_settings['predictionType'] + def get_all_possible_algorithm_names(self): + """ + Returns the list of possible algorithm names, i.e. the list of valid + identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings` + + This includes all possible algorithms, regardless of the prediction kind (regression/classification) + or engine, so some algorithms may be irrelevant + + :returns: the list of algorithm names as a list of strings + :rtype: list of string + """ + return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names() + + def _get_plugin_algorithm_names(self): + return self.mltask_settings["modeling"]["plugin_python"].keys() + + def _get_plugin_algorithm_settings(self, algorithm_name): + if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]: + return self.mltask_settings["modeling"]["plugin_python"][algorithm_name] + raise ValueError("Unknown algorithm: {}".format(algorithm_name)) + def get_enabled_algorithm_names(self): """ :returns: the list of enabled algorithm names as a list of strings :rtype: list of string """ - algos = self.__class__.algorithm_remap + algo_names = super(DSSPredictionMLTaskSettings, self).get_enabled_algorithm_names() + # Hide either "XGBOOST_CLASSIFICATION" or "XGBOOST_REGRESSION" which point to the same key "xgboost" if self.mltask_settings["predictionType"] == "REGRESSION": - excluded_name = {"XGBOOST_CLASSIFICATION"} + excluded_names = {"XGBOOST_CLASSIFICATION"} else: - excluded_name = {"XGBOOST_REGRESSION"} - algo_names = [algo_name for algo_name in algos.keys() if (self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"] - and algo_name not in excluded_name)] - return algo_names + excluded_names = {"XGBOOST_REGRESSION"} + + return [algo_name for algo_name in algo_names if algo_name not in excluded_names] def get_algorithm_settings(self, algorithm_name): """ @@ -1442,6 +1505,10 @@ def get_algorithm_settings(self, algorithm_name): # Subsequent calls get the same object self.mltask_settings["modeling"][algorithm_name.lower()] = algorithm_settings return self.mltask_settings["modeling"][algorithm_name.lower()] + elif algorithm_name in self._get_custom_algorithm_names(): + return self._get_custom_algorithm_settings(algorithm_name) + elif algorithm_name in self._get_plugin_algorithm_names(): + return self._get_plugin_algorithm_settings(algorithm_name) else: raise ValueError("Unknown algorithm: {}".format(algorithm_name)) @@ -1590,8 +1657,11 @@ def get_algorithm_settings(self, algorithm_name): """ if algorithm_name in self.__class__.algorithm_remap: algorithm_name = self.__class__.algorithm_remap[algorithm_name] - - return self.mltask_settings["modeling"][algorithm_name.lower()] + return self.mltask_settings["modeling"][algorithm_name.lower()] + elif algorithm_name in self._get_custom_algorithm_names(): + return self._get_custom_algorithm_settings(algorithm_name) + else: + raise ValueError("Unknown algorithm: {}".format(algorithm_name)) class DSSTrainedModelDetails(object): From 97396a420544b7af94897e6c9119aab4950a1d03 Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Tue, 19 Jan 2021 13:50:56 +0100 Subject: [PATCH 2/5] Add support of wait_guess_complete for clustering tasks --- dataikuapi/dss/analysis.py | 15 ++++++++++++--- dataikuapi/dss/dataset.py | 13 +++++++++---- dataikuapi/dss/project.py | 16 ++++++++++++---- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/dataikuapi/dss/analysis.py b/dataikuapi/dss/analysis.py index e08e85c2..ae10f702 100644 --- a/dataikuapi/dss/analysis.py +++ b/dataikuapi/dss/analysis.py @@ -188,8 +188,9 @@ def create_prediction_ml_task(self, return mltask def create_clustering_ml_task(self, - ml_backend_type = "PY_MEMORY", - guess_policy = "KMEANS"): + ml_backend_type="PY_MEMORY", + guess_policy="KMEANS", + wait_guess_complete=True): """Creates a new clustering task in a new visual analysis lab @@ -205,6 +206,10 @@ def create_clustering_ml_task(self, :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION + :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms. + You should wait for the guessing to be completed by calling + ``wait_guess_complete`` on the returned object before doing anything + else (in particular calling ``train`` or ``get_settings``) """ obj = { @@ -214,7 +219,11 @@ def create_clustering_ml_task(self, } ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj) - return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"]) + mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"]) + + if wait_guess_complete: + mltask.wait_guess_complete() + return mltask def list_ml_tasks(self): """ diff --git a/dataikuapi/dss/dataset.py b/dataikuapi/dss/dataset.py index 574ca125..91cfb3da 100644 --- a/dataikuapi/dss/dataset.py +++ b/dataikuapi/dss/dataset.py @@ -385,8 +385,9 @@ def create_prediction_ml_task(self, target_variable, guess_policy = guess_policy, prediction_type = prediction_type, wait_guess_complete = wait_guess_complete) def create_clustering_ml_task(self, input_dataset, - ml_backend_type = "PY_MEMORY", - guess_policy = "KMEANS"): + ml_backend_type="PY_MEMORY", + guess_policy="KMEANS", + wait_guess_complete=True): """Creates a new clustering task in a new visual analysis lab for a dataset. @@ -400,9 +401,13 @@ def create_clustering_ml_task(self, input_dataset, :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION + :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms. + You should wait for the guessing to be completed by calling + ``wait_guess_complete`` on the returned object before doing anything + else (in particular calling ``train`` or ``get_settings``) """ - return self.project.create_clustering_ml_task(self.dataset_name, - ml_backend_type = ml_backend_type, guess_policy = guess_policy) + return self.project.create_clustering_ml_task(self.dataset_name, ml_backend_type=ml_backend_type, guess_policy=guess_policy, + wait_guess_complete=wait_guess_complete) def create_analysis(self): """ diff --git a/dataikuapi/dss/project.py b/dataikuapi/dss/project.py index 7422b829..6557e8b6 100644 --- a/dataikuapi/dss/project.py +++ b/dataikuapi/dss/project.py @@ -532,9 +532,9 @@ def create_prediction_ml_task(self, input_dataset, target_variable, return ret def create_clustering_ml_task(self, input_dataset, - ml_backend_type = "PY_MEMORY", - guess_policy = "KMEANS"): - + ml_backend_type = "PY_MEMORY", + guess_policy = "KMEANS", + wait_guess_complete=True): """Creates a new clustering task in a new visual analysis lab for a dataset. @@ -549,6 +549,10 @@ def create_clustering_ml_task(self, input_dataset, :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION + :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms. + You should wait for the guessing to be completed by calling + ``wait_guess_complete`` on the returned object before doing anything + else (in particular calling ``train`` or ``get_settings``) """ obj = { @@ -559,7 +563,11 @@ def create_clustering_ml_task(self, input_dataset, } ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj) - return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"]) + mltask = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"]) + + if wait_guess_complete: + mltask.wait_guess_complete() + return mltask def list_ml_tasks(self): """ From b38b38f11b5fb9ee1c949ca7f488600266282084 Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Tue, 19 Jan 2021 15:42:14 +0100 Subject: [PATCH 3/5] Better naming in comments Co-authored-by: Adrien Lavoillotte --- dataikuapi/dss/ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py index ebd68d5e..71cbd00d 100644 --- a/dataikuapi/dss/ml.py +++ b/dataikuapi/dss/ml.py @@ -335,7 +335,7 @@ def get_all_possible_algorithm_names(self): def _get_custom_algorithm_names(self): """ - Returns the list of names of defined custom models (python & mllib) + Returns the list of names of defined custom models (Python & MLlib backends) :returns: the list of custom models names :rtype: list of string @@ -388,7 +388,7 @@ def add_custom_python_model(self, name="Custom Python Model", code=""): def add_custom_mllib_model(self, name="Custom MLlib Model", code=""): """ - Adds a new custom mllib model + Adds a new custom MLlib model :param str name: name of the custom model :param str code: code of the custom model From dfd56aac61eace63b7f93c0891f94cbb0dcca17f Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Tue, 19 Jan 2021 17:41:16 +0100 Subject: [PATCH 4/5] User parentheses over backslash for line continuation --- dataikuapi/dss/ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py index 71cbd00d..95083001 100644 --- a/dataikuapi/dss/ml.py +++ b/dataikuapi/dss/ml.py @@ -340,8 +340,8 @@ def _get_custom_algorithm_names(self): :returns: the list of custom models names :rtype: list of string """ - return [algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]\ - + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]] + return ([algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]] + + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]]) def get_enabled_algorithm_names(self): """ From 3d75fcf85dea6f0525e4bd6ebdda8ea596f3a963 Mon Sep 17 00:00:00 2001 From: nicolasservel Date: Fri, 22 Jan 2021 11:39:06 +0100 Subject: [PATCH 5/5] Fix listing algos in py3 & remove typo Co-authored-by: Samuel O. Ronsin --- dataikuapi/dss/ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py index 95083001..6088bdea 100644 --- a/dataikuapi/dss/ml.py +++ b/dataikuapi/dss/ml.py @@ -386,7 +386,7 @@ def add_custom_python_model(self, name="Custom Python Model", code=""): "enabled": True }) - def add_custom_mllib_model(self, name="Custom MLlib Model", code=""): + def add_custom_mllib_model(self, name="Custom MLlib Model", code=""): """ Adds a new custom MLlib model @@ -1451,7 +1451,7 @@ def get_all_possible_algorithm_names(self): return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names() def _get_plugin_algorithm_names(self): - return self.mltask_settings["modeling"]["plugin_python"].keys() + return list(self.mltask_settings["modeling"]["plugin_python"].keys()) def _get_plugin_algorithm_settings(self, algorithm_name): if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]: