From 41d52f9aa3f5356a16b18b5f1f960176898e4202 Mon Sep 17 00:00:00 2001
From: nicolasservel <nico.servel@gmail.com>
Date: Tue, 19 Jan 2021 13:50:11 +0100
Subject: [PATCH 1/5] Add better support for custom & plugin algos + few fixes

---
 dataikuapi/dss/ml.py | 100 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 85 insertions(+), 15 deletions(-)

diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
index 18309f0e..ebd68d5e 100644
--- a/dataikuapi/dss/ml.py
+++ b/dataikuapi/dss/ml.py
@@ -238,6 +238,16 @@ def use_feature(self, feature_name):
     def get_algorithm_settings(self, algorithm_name):
         raise NotImplementedError()
 
+    def _get_custom_algorithm_settings(self, algorithm_name):
+        # Custom MLlib models are scanned before custom Python ones, and the
+        # first model carrying the requested name wins
+        modeling = self.mltask_settings["modeling"]
+        for kind in ("custom_mllib", "custom_python"):
+            for candidate in modeling[kind]:
+                if algorithm_name == candidate["name"]:
+                    return candidate
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_diagnostics_settings(self):
         """
         Gets the diagnostics settings for a mltask. This returns a reference to the
@@ -307,7 +317,7 @@ def disable_all_algorithms(self):
             custom_mllib["enabled"] = False
         for custom_python in self.mltask_settings["modeling"]["custom_python"]:
             custom_python["enabled"] = False
-        for plugin in self.mltask_settings["modeling"]["plugin_python"].values():
+        for plugin in self.mltask_settings["modeling"].get("plugin_python", {}).values():
             plugin["enabled"] = False
 
     def get_all_possible_algorithm_names(self):
@@ -315,23 +325,30 @@ def get_all_possible_algorithm_names(self):
         Returns the list of possible algorithm names, i.e. the list of valid
         identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
 
-        This does not include Custom Python models, Custom MLLib models, plugin models.
         This includes all possible algorithms, regardless of the prediction kind (regression/classification)
         or engine, so some algorithms may be irrelevant
 
         :returns: the list of algorithm names as a list of strings
         :rtype: list of string
         """
-        return list(self.__class__.algorithm_remap.keys())
+        return list(self.__class__.algorithm_remap.keys()) + self._get_custom_algorithm_names()
+
+    def _get_custom_algorithm_names(self):
+        """
+        Returns the list of names of defined custom models (python & mllib)
+
+        :returns: the list of custom models names
+        :rtype: list of string
+        """
+        return [algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]\
+               + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]]
 
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
-        algo_names = [algo_name for algo_name in algos.keys() if self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]]
-        return algo_names
+        return [algo_name for algo_name in self.get_all_possible_algorithm_names() if self.get_algorithm_settings(algo_name).get("enabled", False)]
 
     def get_enabled_algorithm_settings(self):
         """
@@ -356,6 +373,32 @@ def set_metric(self, metric=None, custom_metric=None, custom_metric_greater_is_b
         self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricGIB"] = custom_metric_greater_is_better
         self.mltask_settings["modeling"]["metrics"]["customEvaluationMetricNeedsProba"] = custom_metric_use_probas
 
+    def add_custom_python_model(self, name="Custom Python Model", code=""):
+        """
+        Adds a new, enabled custom Python model to these settings
+
+        :param str name: name of the custom model, also used as its algorithm name
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_python"].append({
+            "name": name,
+            "code": code,
+            "enabled": True
+        })
+
+    def add_custom_mllib_model(self, name="Custom  MLlib Model", code=""):
+        """
+        Adds a new custom mllib model
+
+        :param str name: name of the custom model
+        :param str code: code of the custom model
+        """
+        self.mltask_settings["modeling"]["custom_mllib"].append({
+            "name": name,
+            "initializationCode": code,
+            "enabled": True
+        })
+
     def save(self):
         """Saves back these settings to the ML Task"""
 
@@ -1310,7 +1353,6 @@ def __init__(self, raw_settings, hyperparameter_search_params):
 
         self.cache_node_ids = self._register_simple_parameter("cache_node_ids")
         self.checkpoint_interval = self._register_single_value_hyperparameter("checkpoint_interval", accepted_types=[int])
-        self.impurity = self._register_single_category_hyperparameter("impurity", accepted_values=["gini", "entropy", "variance"])  # TODO: distinguish between regression and classif
         self.max_bins = self._register_single_value_hyperparameter("max_bins", accepted_types=[int])
         self.max_memory_mb = self._register_simple_parameter("max_memory_mb")
         self.min_info_gain = self._register_single_value_hyperparameter("min_info_gain", accepted_types=[int, float])
@@ -1395,20 +1437,41 @@ def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)
     def get_prediction_type(self):
         return self.mltask_settings['predictionType']
 
+    def get_all_possible_algorithm_names(self):
+        """
+        Returns the list of possible algorithm names, i.e. the list of valid
+        identifiers for :meth:`set_algorithm_enabled` and :meth:`get_algorithm_settings`
+
+        This is the parent class's list followed by the names of the plugin algorithms
+        found in these settings; it is not filtered by prediction kind or engine
+
+        :returns: the list of algorithm names as a list of strings
+        :rtype: list of string
+        """
+        return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names()
+
+    def _get_plugin_algorithm_names(self):
+        return self.mltask_settings["modeling"]["plugin_python"].keys()
+
+    def _get_plugin_algorithm_settings(self, algorithm_name):
+        if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]:
+            return self.mltask_settings["modeling"]["plugin_python"][algorithm_name]
+        raise ValueError("Unknown algorithm: {}".format(algorithm_name))
+
     def get_enabled_algorithm_names(self):
         """
         :returns: the list of enabled algorithm names as a list of strings
         :rtype: list of string
         """
-        algos = self.__class__.algorithm_remap
+        algo_names = super(DSSPredictionMLTaskSettings, self).get_enabled_algorithm_names()
+
         # Hide either "XGBOOST_CLASSIFICATION" or "XGBOOST_REGRESSION" which point to the same key "xgboost"
         if self.mltask_settings["predictionType"] == "REGRESSION":
-            excluded_name = {"XGBOOST_CLASSIFICATION"}
+            excluded_names = {"XGBOOST_CLASSIFICATION"}
         else:
-            excluded_name = {"XGBOOST_REGRESSION"}
-        algo_names = [algo_name for algo_name in algos.keys() if (self.mltask_settings["modeling"][algos[algo_name].algorithm_name.lower()]["enabled"]
-                                                                  and algo_name not in excluded_name)]
-        return algo_names
+            excluded_names = {"XGBOOST_REGRESSION"}
+
+        return [algo_name for algo_name in algo_names if algo_name not in excluded_names]
 
     def get_algorithm_settings(self, algorithm_name):
         """
@@ -1442,6 +1505,10 @@ def get_algorithm_settings(self, algorithm_name):
                 # Subsequent calls get the same object
                 self.mltask_settings["modeling"][algorithm_name.lower()] = algorithm_settings
             return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        elif algorithm_name in self._get_plugin_algorithm_names():
+            return self._get_plugin_algorithm_settings(algorithm_name)
         else:
             raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
@@ -1590,8 +1657,11 @@ def get_algorithm_settings(self, algorithm_name):
         """
         if algorithm_name in self.__class__.algorithm_remap:
             algorithm_name = self.__class__.algorithm_remap[algorithm_name]
-
-        return self.mltask_settings["modeling"][algorithm_name.lower()]
+            return self.mltask_settings["modeling"][algorithm_name.lower()]
+        elif algorithm_name in self._get_custom_algorithm_names():
+            return self._get_custom_algorithm_settings(algorithm_name)
+        else:
+            raise ValueError("Unknown algorithm: {}".format(algorithm_name))
 
 
 class DSSTrainedModelDetails(object):

From 97396a420544b7af94897e6c9119aab4950a1d03 Mon Sep 17 00:00:00 2001
From: nicolasservel <nico.servel@gmail.com>
Date: Tue, 19 Jan 2021 13:50:56 +0100
Subject: [PATCH 2/5] Add support of wait_guess_complete for clustering tasks

---
 dataikuapi/dss/analysis.py | 15 ++++++++++++---
 dataikuapi/dss/dataset.py  | 13 +++++++++----
 dataikuapi/dss/project.py  | 16 ++++++++++++----
 3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/dataikuapi/dss/analysis.py b/dataikuapi/dss/analysis.py
index e08e85c2..ae10f702 100644
--- a/dataikuapi/dss/analysis.py
+++ b/dataikuapi/dss/analysis.py
@@ -188,8 +188,9 @@ def create_prediction_ml_task(self,
         return mltask
 
     def create_clustering_ml_task(self,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
 
 
         """Creates a new clustering task in a new visual analysis lab
@@ -205,6 +206,10 @@ def create_clustering_ml_task(self,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -214,7 +219,11 @@ def create_clustering_ml_task(self,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=obj)
-        return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """
diff --git a/dataikuapi/dss/dataset.py b/dataikuapi/dss/dataset.py
index 574ca125..91cfb3da 100644
--- a/dataikuapi/dss/dataset.py
+++ b/dataikuapi/dss/dataset.py
@@ -385,8 +385,9 @@ def create_prediction_ml_task(self, target_variable,
              guess_policy = guess_policy, prediction_type = prediction_type, wait_guess_complete = wait_guess_complete)
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
+                                  ml_backend_type="PY_MEMORY",
+                                  guess_policy="KMEANS",
+                                  wait_guess_complete=True):
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
 
@@ -400,9 +401,13 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
-        return self.project.create_clustering_ml_task(self.dataset_name, 
-            ml_backend_type = ml_backend_type, guess_policy = guess_policy)
+        return self.project.create_clustering_ml_task(self.dataset_name, ml_backend_type=ml_backend_type, guess_policy=guess_policy,
+                                                      wait_guess_complete=wait_guess_complete)
 
     def create_analysis(self):
         """
diff --git a/dataikuapi/dss/project.py b/dataikuapi/dss/project.py
index 7422b829..6557e8b6 100644
--- a/dataikuapi/dss/project.py
+++ b/dataikuapi/dss/project.py
@@ -532,9 +532,9 @@ def create_prediction_ml_task(self, input_dataset, target_variable,
         return ret
 
     def create_clustering_ml_task(self, input_dataset,
-                                   ml_backend_type = "PY_MEMORY",
-                                   guess_policy = "KMEANS"):
-
+                                  ml_backend_type = "PY_MEMORY",
+                                  guess_policy = "KMEANS",
+                                  wait_guess_complete=True):
 
         """Creates a new clustering task in a new visual analysis lab
         for a dataset.
@@ -549,6 +549,10 @@ def create_clustering_ml_task(self, input_dataset,
 
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters.  Valid values are: KMEANS and ANOMALY_DETECTION
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+                                            You should wait for the guessing to be completed by calling
+                                            ``wait_guess_complete`` on the returned object before doing anything
+                                            else (in particular calling ``train`` or ``get_settings``)
         """
 
         obj = {
@@ -559,7 +563,11 @@ def create_clustering_ml_task(self, input_dataset,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj)
-        return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+        mltask = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+
+        if wait_guess_complete:
+            mltask.wait_guess_complete()
+        return mltask
 
     def list_ml_tasks(self):
         """

From b38b38f11b5fb9ee1c949ca7f488600266282084 Mon Sep 17 00:00:00 2001
From: nicolasservel <nico.servel@gmail.com>
Date: Tue, 19 Jan 2021 15:42:14 +0100
Subject: [PATCH 3/5] Better naming in comments

Co-authored-by: Adrien Lavoillotte <adrien.lavoillotte@dataiku.com>
---
 dataikuapi/dss/ml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
index ebd68d5e..71cbd00d 100644
--- a/dataikuapi/dss/ml.py
+++ b/dataikuapi/dss/ml.py
@@ -335,7 +335,7 @@ def get_all_possible_algorithm_names(self):
 
     def _get_custom_algorithm_names(self):
         """
-        Returns the list of names of defined custom models (python & mllib)
+        Returns the list of names of defined custom models (Python & MLlib backends)
 
         :returns: the list of custom models names
         :rtype: list of string
@@ -388,7 +388,7 @@ def add_custom_python_model(self, name="Custom Python Model", code=""):
 
     def add_custom_mllib_model(self, name="Custom  MLlib Model", code=""):
         """
-        Adds a new custom mllib model
+        Adds a new custom MLlib model
 
         :param str name: name of the custom model
         :param str code: code of the custom model

From dfd56aac61eace63b7f93c0891f94cbb0dcca17f Mon Sep 17 00:00:00 2001
From: nicolasservel <nico.servel@gmail.com>
Date: Tue, 19 Jan 2021 17:41:16 +0100
Subject: [PATCH 4/5] Use parentheses over backslash for line continuation

---
 dataikuapi/dss/ml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
index 71cbd00d..95083001 100644
--- a/dataikuapi/dss/ml.py
+++ b/dataikuapi/dss/ml.py
@@ -340,8 +340,8 @@ def _get_custom_algorithm_names(self):
         :returns: the list of custom models names
         :rtype: list of string
         """
-        return [algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]\
-               + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]]
+        return ([algo["name"] for algo in self.mltask_settings["modeling"]["custom_mllib"]]
+                + [algo["name"] for algo in self.mltask_settings["modeling"]["custom_python"]])
 
     def get_enabled_algorithm_names(self):
         """

From 3d75fcf85dea6f0525e4bd6ebdda8ea596f3a963 Mon Sep 17 00:00:00 2001
From: nicolasservel <nico.servel@gmail.com>
Date: Fri, 22 Jan 2021 11:39:06 +0100
Subject: [PATCH 5/5] Fix listing algos in py3 & remove typo

Co-authored-by: Samuel O. Ronsin <samuel.ronsin@dataiku.com>
---
 dataikuapi/dss/ml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataikuapi/dss/ml.py b/dataikuapi/dss/ml.py
index 95083001..6088bdea 100644
--- a/dataikuapi/dss/ml.py
+++ b/dataikuapi/dss/ml.py
@@ -386,7 +386,7 @@ def add_custom_python_model(self, name="Custom Python Model", code=""):
             "enabled": True
         })
 
-    def add_custom_mllib_model(self, name="Custom  MLlib Model", code=""):
+    def add_custom_mllib_model(self, name="Custom MLlib Model", code=""):
         """
         Adds a new custom MLlib model
 
@@ -1451,7 +1451,7 @@ def get_all_possible_algorithm_names(self):
         return super(DSSPredictionMLTaskSettings, self).get_all_possible_algorithm_names() + self._get_plugin_algorithm_names()
 
     def _get_plugin_algorithm_names(self):
-        return self.mltask_settings["modeling"]["plugin_python"].keys()
+        return list(self.mltask_settings["modeling"].get("plugin_python", {}))  # "plugin_python" may be absent
 
     def _get_plugin_algorithm_settings(self, algorithm_name):
         if algorithm_name in self.mltask_settings["modeling"]["plugin_python"]: