10 changes: 10 additions & 0 deletions src/dss/config.py
@@ -28,6 +28,16 @@ def format_images_message(images_dict: dict) -> str:
RECOMMENDED_IMAGES_MESSAGE = format_images_message(NOTEBOOK_IMAGES_ALIASES)
DEFAULT_NOTEBOOK_IMAGE = "kubeflownotebookswg/jupyter-scipy:v1.8.0"

SUPPORTED_GPUS = ["nvidia"]
GPU_DEPLOYMENT_LABEL = {"nvidia": "nvidia.com/gpu"}
NODE_LABELS = {
"nvidia": [
"nvidia.com/gpu.present",
"nvidia.com/gpu.deploy.container-toolkit",
"nvidia.com/gpu.deploy.device-plugin",
]
}


class DeploymentState(Enum):
ACTIVE = "Active"
89 changes: 61 additions & 28 deletions src/dss/create_notebook.py
Contributor

I don't think the gpu requests are working as expected. Try this:

dss initialize --kubeconfig ~/.kube/config 
dss create gpu-omitted --image=kubeflownotebookswg/jupyter-pytorch-cuda-full:latest --kubeconfig ~/.kube/config
dss create gpu-selected --image=kubeflownotebookswg/jupyter-pytorch-cuda-full:latest --kubeconfig ~/.kube/config

Then in each notebook server:

  • create a terminal and do nvidia-smi. The GPU should be visible only in one notebook server, but it is visible in both
  • create a notebook and run import torch; torch.cuda.is_available() (see the sketch after this comment). The GPU should be available only in one notebook server, but it is available in both
  • create a notebook and run this tutorial. Both will run on the GPU. After doing this, run nvidia-smi on the host system and we'll see two processes both using the GPU, like:
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla T4                       On  | 00000000:00:1E.0 Off |                    0 |
| N/A   37C    P0              36W /  70W |    280MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     70671      C   /opt/conda/bin/python                       158MiB |
|    0   N/A  N/A     96171      C   /opt/conda/bin/python                       118MiB |
+---------------------------------------------------------------------------------------+
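
For reference, a minimal notebook cell for the torch check described above could look like this (just a sketch; it assumes torch is installed in the notebook image, which is the case for the jupyter-pytorch-cuda images):

```python
import torch

# In the notebook created without --gpu this is expected to be False;
# in the GPU-enabled notebook it should be True.
print(torch.cuda.is_available())

# Number of GPUs the notebook can see (expected 0 vs. 1 here).
print(torch.cuda.device_count())
```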

Contributor

One thing that is working correctly is the GPU requests. My machine has one gpu, and if I do:

dss create x ... --gpu=nvidia
dss create y ... --gpu=nvidia

notebook y sits pending with the FailedScheduling warning: 1 Insufficient nvidia.com/gpu
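
To double-check this from outside the CLI, a small lightkube sketch (assuming DSS_NAMESPACE resolves to the dss namespace used throughout this PR) can list the notebook pods and their phases; the second notebook should show up as Pending:

```python
from lightkube import Client
from lightkube.resources.core_v1 import Pod

from dss.config import DSS_NAMESPACE

client = Client()
# Each notebook deployment creates a pod in the DSS namespace;
# the one that cannot get a GPU stays in the Pending phase.
for pod in client.list(Pod, namespace=DSS_NAMESPACE):
    print(pod.metadata.name, pod.status.phase)
```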

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Dict, Optional

from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler
from lightkube import Client
@@ -10,7 +11,9 @@
DSS_CLI_MANAGER_LABELS,
DSS_NAMESPACE,
FIELD_MANAGER,
GPU_DEPLOYMENT_LABEL,
MANIFEST_TEMPLATES_LOCATION,
NODE_LABELS,
NOTEBOOK_IMAGES_ALIASES,
NOTEBOOK_PVC_NAME,
RECOMMENDED_IMAGES_MESSAGE,
@@ -23,22 +26,36 @@
does_notebook_exist,
get_mlflow_tracking_uri,
get_service_url,
node_has_gpu_labels,
wait_for_deployment_ready,
)

# Set up logger
logger = setup_logger("logs/dss.log")


def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
def create_notebook(
name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None
) -> None:
"""
Creates a Notebook server on the Kubernetes cluster.
Creates a Notebook server on the Kubernetes cluster with optional GPU support.

Args:
name (str): The name of the notebook server.
image (str): The image used for the notebook server.
lightkube_client (Client): The Kubernetes client.
image (str): The Docker image used for the notebook server.
Contributor

Maybe leave this as 'image' or 'OCI image' instead of 'Docker', just so we don't exclude rocks?

lightkube_client (Client): The Kubernetes client used for server creation.
gpu (Optional[str]): Specifies the GPU type for the notebook if required.

Raises:
RuntimeError: If there is a failure in notebook creation or GPU label checking.
"""
if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]):
logger.error(f"Failed to create notebook with {gpu} GPU acceleration.\n")
logger.info(
"You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http://<data-science-stack-docs>/setup-gpu for more information on the setup." # noqa E501
)
raise RuntimeError()

if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(
lightkube_client
):
@@ -59,14 +76,12 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
logger.info(f"To connect to the existing notebook, go to {url}.")
raise RuntimeError()

manifests_file = Path(
Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2"
manifests_file = (
Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2"
)

image_full_name = _get_notebook_image_name(image)
config = _get_notebook_config(image_full_name, name)
config = _get_notebook_config(image_full_name, name, gpu)

# Initialize KubernetesResourceHandler
k8s_resource_handler = KubernetesResourceHandler(
field_manager=FIELD_MANAGER,
labels=DSS_CLI_MANAGER_LABELS,
@@ -78,51 +93,69 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:

try:
k8s_resource_handler.apply()

wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name)

wait_for_deployment_ready(
lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name, timeout_seconds=None
)
logger.info(f"Success: Notebook {name} created successfully.")
if gpu:
logger.info(f"{gpu.title()} GPU attached to notebook.")
Contributor

Minor suggestion: title() won't be correct in all cases (e.g. Amd). I'd stick to the enumerated values we allow for gpu.

Suggested change
logger.info(f"{gpu.title()} GPU attached to notebook.")
logger.info(f"{gpu} GPU attached to notebook.")

except ApiError as err:
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Failed to create Notebook with error code {err.status.code}.")
logger.info(" Check the debug logs for more details.")
raise RuntimeError()
except TimeoutError as err:
logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.warn(" Some resources might be left in the cluster.")
logger.info(" Check the status with `dss list`.")
raise RuntimeError()
except ImagePullBackOffError as err:
logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.error(f"Image {image_full_name} does not exist or is not accessible.")
logger.debug(f"Failed to create notebook {name}: {err}.", exc_info=True)
logger.error(
f"Failed to create notebook {name}: Image {image_full_name} does not exist or is not accessible.\n" # noqa E501
)
logger.info(
"Note: You might want to use some of these recommended images:\n"
f"{RECOMMENDED_IMAGES_MESSAGE}"
+ RECOMMENDED_IMAGES_MESSAGE
)
raise RuntimeError()
# Assumes that the notebook server is exposed by a service of the same name.

url = get_service_url(name, DSS_NAMESPACE, lightkube_client)
if url:
logger.info(f"Access the notebook at {url}.")


def _get_notebook_config(image: str, name: str) -> dict:
mlflow_tracking_uri = get_mlflow_tracking_uri()
def _get_notebook_config(image: str, name: str, gpu: Optional[str] = None) -> Dict[str, str]:
"""
Generates the configuration dictionary for creating a notebook deployment.

Args:
image (str): The Docker image to use for the notebook.
name (str): The name of the notebook.
gpu (Optional[str]): GPU label for the notebook deployment, if GPU support is enabled.

Returns:
Dict[str, str]: A dictionary with configuration values for the notebook deployment.
"""
# Base configuration for the notebook
context = {
"mlflow_tracking_uri": mlflow_tracking_uri,
"mlflow_tracking_uri": get_mlflow_tracking_uri(),
"notebook_name": name,
"namespace": DSS_NAMESPACE,
"notebook_image": image,
"pvc_name": NOTEBOOK_PVC_NAME,
}

# Conditionally add GPU configuration if specified
if gpu:
context["gpu"] = GPU_DEPLOYMENT_LABEL[gpu]

return context


def _get_notebook_image_name(image: str) -> str:
"""
Returns the image's full name if the input is a key in `NOTEBOOK_IMAGES_ALIASES`
else it returns the input.
Resolves the full Docker image name from an alias or returns the image if not an alias.

Args:
image (str): An alias for a notebook image or a full Docker image name.

Returns:
str: The resolved full Docker image name.
"""
return NOTEBOOK_IMAGES_ALIASES.get(image, image)
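
For context, the new gpu parameter is exercised like this (a sketch mirroring the CLI path; the image name is taken from the review comments above, and the cluster is assumed to already be initialized with dss initialize):

```python
from lightkube import Client

from dss.create_notebook import create_notebook

client = Client()
# Hypothetical invocation mirroring `dss create my-notebook --gpu=nvidia`.
create_notebook(
    name="my-notebook",
    image="kubeflownotebookswg/jupyter-pytorch-cuda-full:latest",
    lightkube_client=client,
    gpu="nvidia",
)
```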
21 changes: 18 additions & 3 deletions src/dss/main.py
@@ -1,6 +1,6 @@
import click

from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE
from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE, SUPPORTED_GPUS
from dss.create_notebook import create_notebook
from dss.initialize import initialize
from dss.list import list_notebooks
@@ -65,7 +65,15 @@ def initialize_command(kubeconfig: str) -> None:
"--kubeconfig",
help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501
)
def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
@click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.")
Contributor

Why do we need --no-gpu? Can the absence of --gpu imply no GPU, so we can get rid of this flag?

Member Author

It's the UX team's recommendation to provide this --no-gpu flag.

Contributor

I wonder if there's a miscommunication here. If I understand correctly, these two commands are both the same?

dss create my-notebook
dss create my-notebook --no-gpu

Am I missing something about the CLI?
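
For what it's worth, under the resolution used further down in this diff (gpu=None if no_gpu else gpu), the first two invocations do resolve to the same thing — a tiny sketch of the effective value passed to create_notebook:

```python
from typing import Optional


def resolve_gpu(no_gpu: bool, gpu: Optional[str]) -> Optional[str]:
    # Mirrors the call in create_notebook_command: gpu=None if no_gpu else gpu
    return None if no_gpu else gpu


print(resolve_gpu(no_gpu=False, gpu=None))      # dss create my-notebook              -> None
print(resolve_gpu(no_gpu=True, gpu=None))       # dss create my-notebook --no-gpu     -> None
print(resolve_gpu(no_gpu=False, gpu="nvidia"))  # dss create my-notebook --gpu=nvidia -> nvidia
```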

@click.option(
"--gpu",
type=click.Choice(SUPPORTED_GPUS),
Contributor

Does this need to be case sensitive?

Suggested change
type=click.Choice(SUPPORTED_GPUS),
type=click.Choice(SUPPORTED_GPUS, case_sensitive=False),

help="Specify the type of GPU acceleration, e.g., 'nvidia'.",
)
def create_notebook_command(
name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str
) -> None:
"""Create a Jupyter notebook in DSS and connect it to MLflow. This command also
outputs the URL to access the notebook on success.

@@ -78,11 +86,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
" For more information on using a specific image, see dss create --help."
)

# Check mutual exclusivity
if no_gpu and gpu:
logger.error("You cannot specify both --no-gpu and --gpu options.")
raise click.UsageError("Options --no-gpu and --gpu are mutually exclusive.")
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

create_notebook(name=name, image=image, lightkube_client=lightkube_client)
create_notebook(
name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu
)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
@@ -95,6 +109,7 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
Examples
dss create my-notebook --image=pytorch
dss create my-notebook --image={DEFAULT_NOTEBOOK_IMAGE}
dss create my-notebook --image=charmedkubeflow/jupyter-pytorch-cuda-full:1.8.0 --gpu=nvidia

\b\n{RECOMMENDED_IMAGES_MESSAGE}
"""
5 changes: 5 additions & 0 deletions src/dss/manifest_templates/notebook_deployment.yaml.j2
@@ -23,6 +23,11 @@ spec:
- env:
- name: MLFLOW_TRACKING_URI
value: {{ mlflow_tracking_uri }}
{% if gpu %}
resources:
limits:
{{ gpu }}: 1
{% endif %}
Comment on lines +26 to +30
Contributor

Do we support only a single GPU per notebook? If I have two GPUs, should we support using both?

image: {{ notebook_image }}
imagePullPolicy: IfNotPresent
name: {{ notebook_name }}
62 changes: 46 additions & 16 deletions src/dss/utils.py
@@ -1,6 +1,6 @@
import os
import time
from typing import Optional
from typing import List, Optional

from lightkube import ApiError, Client, KubeConfig
from lightkube.resources.apps_v1 import Deployment
@@ -32,58 +32,88 @@ def __init__(self, msg: str, *args):
self.msg = str(msg)


def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool:
Contributor

This asserts that all nodes have the provided labels, not that a given node has gpu labels.

Alternatively, you could include the GPU labels here in the function body and drop the labels input.

Suggested change
def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool:
def all_nodes_have_labels(lightkube_client: Client, labels: List[str]) -> bool:

"""
Check if at least one node in the Kubernetes cluster has all the specified labels.

Args:
lightkube_client (Client): The Kubernetes client.
labels (List[str]): A list of label keys that must be present on the node.

Returns:
bool: True if at least one node has all the specified labels, False otherwise.
"""
nodes = lightkube_client.list(Node)
for node in nodes:
node_labels = node.metadata.labels
if all(label in node_labels for label in labels):
return True
return False


def wait_for_deployment_ready(
client: Client,
namespace: str,
deployment_name: str,
timeout_seconds: int = 180,
timeout_seconds: Optional[int] = 180,
interval_seconds: int = 10,
) -> None:
"""
Waits for a Kubernetes deployment to be ready.
Waits for a Kubernetes deployment to be ready. Can wait indefinitely if timeout_seconds is None.

Args:
client (Client): The Kubernetes client.
namespace (str): The namespace of the deployment.
deployment_name (str): The name of the deployment.
timeout_seconds (int): Timeout in seconds. Defaults to 600.
timeout_seconds (Optional[int]): Timeout in seconds, or None for no timeout.
Defaults to 180.
interval_seconds (int): Interval between checks in seconds. Defaults to 10.

Returns:
None
Raises:
ImagePullBackOffError: If there is an issue pulling the deployment image.
TimeoutError: If the timeout is reached before the deployment is ready.
"""
logger.info(
f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..."
)
start_time = time.time()
while True:
deployment: Deployment = client.get(Deployment, namespace=namespace, name=deployment_name)
if deployment.status.availableReplicas == deployment.spec.replicas:
if deployment.status and deployment.status.availableReplicas == deployment.spec.replicas:
logger.info(f"Deployment {deployment_name} in namespace {namespace} is ready")
break
elif time.time() - start_time >= timeout_seconds:
# Surround the following block with try-except?
# ----Block-start----
pod: Pod = list(
try:
pods = list(
client.list(
Pod,
namespace=namespace,
labels={"canonical.com/dss-notebook": deployment_name},
)
)[0]
reason = pod.status.containerStatuses[0].state.waiting.reason
)
except ApiError as e:
if e.response.status_code == 404:
pods = []

if pods:
reason = (
pods[0].status.containerStatuses[0].state.waiting.reason
if pods[0].status.containerStatuses
and pods[0].status.containerStatuses[0].state.waiting
else "Unknown"
)
if reason in ["ImagePullBackOff", "ErrImagePull"]:
raise ImagePullBackOffError(
f"Failed to create Deployment {deployment_name} with {reason}"
)
# ----Block-end----

if timeout_seconds is not None and time.time() - start_time >= timeout_seconds:
raise TimeoutError(
f"Timeout waiting for deployment {deployment_name} in namespace {namespace} to be ready" # noqa E501
)
else:
time.sleep(interval_seconds)
logger.info(
f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..."
logger.debug(
f"Still waiting for deployment {deployment_name} in namespace {namespace} to be ready..." # noqa E501
)


32 changes: 32 additions & 0 deletions tests/integration/test_dss.py
@@ -75,6 +75,38 @@ def test_initialize_creates_dss(cleanup_after_initialize) -> None:
assert "notebooks" in kubectl_result.stdout


def test_create_notebook_gpu_failure(cleanup_after_initialize) -> None:
"""
Tests that `dss create` fails to create a notebook on a machine without a GPU
(it is expected to run on a GH runner without a GPU).

Must be run after `dss initialize`
"""

result = subprocess.run(
[
DSS_NAMESPACE,
"create",
NOTEBOOK_NAME,
"--image",
NOTEBOOK_IMAGE,
"--kubeconfig",
KUBECONFIG,
"--gpu=nvidia",
],
capture_output=True,
text=True,
timeout=60 * 4,
)

# The command should fail because no node in the cluster has the GPU labels
assert result.returncode == 1
assert (
"You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster." # noqa E501
in result.stderr
)


def test_create_notebook(cleanup_after_initialize) -> None:
"""
Tests that `dss create` successfully creates a notebook as expected.