diff --git a/src/dss/config.py b/src/dss/config.py index 700494d9..73df1e6b 100644 --- a/src/dss/config.py +++ b/src/dss/config.py @@ -28,6 +28,16 @@ def format_images_message(images_dict: dict) -> str: RECOMMENDED_IMAGES_MESSAGE = format_images_message(NOTEBOOK_IMAGES_ALIASES) DEFAULT_NOTEBOOK_IMAGE = "kubeflownotebookswg/jupyter-scipy:v1.8.0" +SUPPORTED_GPUS = ["nvidia"] +GPU_DEPLOYMENT_LABEL = {"nvidia": "nvidia.com/gpu"} +NODE_LABELS = { + "nvidia": [ + "nvidia.com/gpu.present", + "nvidia.com/gpu.deploy.container-toolkit", + "nvidia.com/gpu.deploy.device-plugin", + ] +} + class DeploymentState(Enum): ACTIVE = "Active" diff --git a/src/dss/create_notebook.py b/src/dss/create_notebook.py index 759e5112..06ee20e8 100644 --- a/src/dss/create_notebook.py +++ b/src/dss/create_notebook.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Dict, Optional from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler from lightkube import Client @@ -10,7 +11,9 @@ DSS_CLI_MANAGER_LABELS, DSS_NAMESPACE, FIELD_MANAGER, + GPU_DEPLOYMENT_LABEL, MANIFEST_TEMPLATES_LOCATION, + NODE_LABELS, NOTEBOOK_IMAGES_ALIASES, NOTEBOOK_PVC_NAME, RECOMMENDED_IMAGES_MESSAGE, @@ -23,6 +26,7 @@ does_notebook_exist, get_mlflow_tracking_uri, get_service_url, + node_has_gpu_labels, wait_for_deployment_ready, ) @@ -30,15 +34,28 @@ logger = setup_logger("logs/dss.log") -def create_notebook(name: str, image: str, lightkube_client: Client) -> None: +def create_notebook( + name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None +) -> None: """ - Creates a Notebook server on the Kubernetes cluster. + Creates a Notebook server on the Kubernetes cluster with optional GPU support. Args: name (str): The name of the notebook server. - image (str): The image used for the notebook server. - lightkube_client (Client): The Kubernetes client. + image (str): The Docker image used for the notebook server. 
+ lightkube_client (Client): The Kubernetes client used for server creation. + gpu (Optional[str]): Specifies the GPU type for the notebook if required. + + Raises: + RuntimeError: If there is a failure in notebook creation or GPU label checking. """ + if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]): + logger.error(f"Failed to create notebook with {gpu} GPU acceleration.\n") + logger.info( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup." # noqa E501 + ) + raise RuntimeError() + if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist( lightkube_client ): @@ -59,14 +76,12 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None: logger.info(f"To connect to the existing notebook, go to {url}.") raise RuntimeError() - manifests_file = Path( - Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2" + manifests_file = ( + Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2" ) - image_full_name = _get_notebook_image_name(image) - config = _get_notebook_config(image_full_name, name) + config = _get_notebook_config(image_full_name, name, gpu) - # Initialize KubernetesResourceHandler k8s_resource_handler = KubernetesResourceHandler( field_manager=FIELD_MANAGER, labels=DSS_CLI_MANAGER_LABELS, @@ -78,51 +93,69 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None: try: k8s_resource_handler.apply() - - wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name) - + wait_for_deployment_ready( + lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name, timeout_seconds=None + ) logger.info(f"Success: Notebook {name} created successfully.") + if gpu: + logger.info(f"{gpu.title()} GPU attached to notebook.") except ApiError as err: 
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True) logger.error(f"Failed to create Notebook with error code {err.status.code}.") logger.info(" Check the debug logs for more details.") raise RuntimeError() - except TimeoutError as err: - logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True) - logger.error(f"Timed out while trying to create Notebook {name}.") - logger.warn(" Some resources might be left in the cluster.") - logger.info(" Check the status with `dss list`.") - raise RuntimeError() except ImagePullBackOffError as err: - logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True) - logger.error(f"Timed out while trying to create Notebook {name}.") - logger.error(f"Image {image_full_name} does not exist or is not accessible.") + logger.debug(f"Failed to create notebook {name}: {err}.", exc_info=True) + logger.error( + f"Failed to create notebook {name}: Image {image_full_name} does not exist or is not accessible.\n" # noqa E501 + ) logger.info( "Note: You might want to use some of these recommended images:\n" - f"{RECOMMENDED_IMAGES_MESSAGE}" + + RECOMMENDED_IMAGES_MESSAGE ) raise RuntimeError() - # Assumes that the notebook server is exposed by a service of the same name. + url = get_service_url(name, DSS_NAMESPACE, lightkube_client) if url: logger.info(f"Access the notebook at {url}.") -def _get_notebook_config(image: str, name: str) -> dict: - mlflow_tracking_uri = get_mlflow_tracking_uri() +def _get_notebook_config(image: str, name: str, gpu: Optional[str] = None) -> Dict[str, str]: + """ + Generates the configuration dictionary for creating a notebook deployment. + + Args: + image (str): The Docker image to use for the notebook. + name (str): The name of the notebook. + gpu (Optional[str]): GPU label for the notebook deployment, if GPU support is enabled. + + Returns: + Dict[str, str]: A dictionary with configuration values for the notebook deployment. 
+ """ + # Base configuration for the notebook context = { - "mlflow_tracking_uri": mlflow_tracking_uri, + "mlflow_tracking_uri": get_mlflow_tracking_uri(), "notebook_name": name, "namespace": DSS_NAMESPACE, "notebook_image": image, "pvc_name": NOTEBOOK_PVC_NAME, } + + # Conditionally add GPU configuration if specified + if gpu: + context["gpu"] = GPU_DEPLOYMENT_LABEL[gpu] + return context def _get_notebook_image_name(image: str) -> str: """ - Returns the image's full name if the input is a key in `NOTEBOOK_IMAGES_ALIASES` - else it returns the input. + Resolves the full Docker image name from an alias or returns the image if not an alias. + + Args: + image (str): An alias for a notebook image or a full Docker image name. + + Returns: + str: The resolved full Docker image name. """ return NOTEBOOK_IMAGES_ALIASES.get(image, image) diff --git a/src/dss/main.py b/src/dss/main.py index e0d28fe6..67cce633 100644 --- a/src/dss/main.py +++ b/src/dss/main.py @@ -1,6 +1,6 @@ import click -from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE +from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE, SUPPORTED_GPUS from dss.create_notebook import create_notebook from dss.initialize import initialize from dss.list import list_notebooks @@ -65,7 +65,15 @@ def initialize_command(kubeconfig: str) -> None: "--kubeconfig", help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501 ) -def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: +@click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.") +@click.option( + "--gpu", + type=click.Choice(SUPPORTED_GPUS), + help="Specify the type of GPU acceleration, e.g., 'nvidia'.", +) +def create_notebook_command( + name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str +) -> None: """Create a Jupyter notebook in DSS and connect it to MLflow. 
This command also outputs the URL to access the notebook on success. @@ -78,11 +86,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: " For more information on using a specific image, see dss create --help." ) + # Check mutual exclusivity + if no_gpu and gpu: + logger.error("You cannot specify both --no-gpu and --gpu options.") + raise click.UsageError("Options --no-gpu and --gpu are mutually exclusive.") try: kubeconfig = get_default_kubeconfig(kubeconfig) lightkube_client = get_lightkube_client(kubeconfig) - create_notebook(name=name, image=image, lightkube_client=lightkube_client) + create_notebook( + name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu + ) except RuntimeError: click.get_current_context().exit(1) except Exception as e: @@ -95,6 +109,7 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: Examples dss create my-notebook --image=pytorch dss create my-notebook --image={DEFAULT_NOTEBOOK_IMAGE} + dss create my-notebook --image=charmedkubeflow/jupyter-pytorch-cuda-full:1.8.0 --gpu=nvidia \b\n{RECOMMENDED_IMAGES_MESSAGE} """ diff --git a/src/dss/manifest_templates/notebook_deployment.yaml.j2 b/src/dss/manifest_templates/notebook_deployment.yaml.j2 index 4eefc0ad..df12c4c9 100644 --- a/src/dss/manifest_templates/notebook_deployment.yaml.j2 +++ b/src/dss/manifest_templates/notebook_deployment.yaml.j2 @@ -23,6 +23,11 @@ spec: - env: - name: MLFLOW_TRACKING_URI value: {{ mlflow_tracking_uri }} + {% if gpu %} + resources: + limits: + {{ gpu }}: 1 + {% endif %} image: {{ notebook_image }} imagePullPolicy: IfNotPresent name: {{ notebook_name }} diff --git a/src/dss/utils.py b/src/dss/utils.py index dea5aa0a..ee67787f 100644 --- a/src/dss/utils.py +++ b/src/dss/utils.py @@ -1,6 +1,6 @@ import os import time -from typing import Optional +from typing import List, Optional from lightkube import ApiError, Client, KubeConfig from lightkube.resources.apps_v1 import Deployment 
@@ -32,25 +32,46 @@ def __init__(self, msg: str, *args): self.msg = str(msg) +def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool: + """ + Check if at least one node in the Kubernetes cluster has all the specified labels. + + Args: + lightkube_client (Client): The Kubernetes client. + labels (List[str]): A list of label keys that must be present on the node. + + Returns: + bool: True if at least one node has all the specified labels, False otherwise. + """ + nodes = lightkube_client.list(Node) + for node in nodes: + node_labels = node.metadata.labels + if all(label in node_labels for label in labels): + return True + return False + + def wait_for_deployment_ready( client: Client, namespace: str, deployment_name: str, - timeout_seconds: int = 180, + timeout_seconds: Optional[int] = 180, interval_seconds: int = 10, ) -> None: """ - Waits for a Kubernetes deployment to be ready. + Waits for a Kubernetes deployment to be ready. Can wait indefinitely if timeout_seconds is None. Args: client (Client): The Kubernetes client. namespace (str): The namespace of the deployment. deployment_name (str): The name of the deployment. - timeout_seconds (int): Timeout in seconds. Defaults to 600. + timeout_seconds (Optional[int]): Timeout in seconds, or None for no timeout. + Defaults to 180. interval_seconds (int): Interval between checks in seconds. Defaults to 10. - Returns: - None + Raises: + ImagePullBackOffError: If there is an issue pulling the deployment image. + TimeoutError: If the timeout is reached before the deployment is ready. """ logger.info( f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." 
@@ -58,32 +79,41 @@ def wait_for_deployment_ready( start_time = time.time() while True: deployment: Deployment = client.get(Deployment, namespace=namespace, name=deployment_name) - if deployment.status.availableReplicas == deployment.spec.replicas: + if deployment.status and deployment.status.availableReplicas == deployment.spec.replicas: logger.info(f"Deployment {deployment_name} in namespace {namespace} is ready") break - elif time.time() - start_time >= timeout_seconds: - # Surround the following block with try-except? - # ----Block-start---- - pod: Pod = list( + try: + pods = list( client.list( Pod, namespace=namespace, labels={"canonical.com/dss-notebook": deployment_name}, ) - )[0] - reason = pod.status.containerStatuses[0].state.waiting.reason + ) + except ApiError as e: + if e.response.status_code == 404: + pods = [] + + if pods: + reason = ( + pods[0].status.containerStatuses[0].state.waiting.reason + if pods[0].status.containerStatuses + and pods[0].status.containerStatuses[0].state.waiting + else "Unknown" + ) if reason in ["ImagePullBackOff", "ErrImagePull"]: raise ImagePullBackOffError( f"Failed to create Deployment {deployment_name} with {reason}" ) - # ----Block-end---- + + if timeout_seconds is not None and time.time() - start_time >= timeout_seconds: raise TimeoutError( f"Timeout waiting for deployment {deployment_name} in namespace {namespace} to be ready" # noqa E501 ) else: time.sleep(interval_seconds) - logger.info( - f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." + logger.debug( + f"Still waiting for deployment {deployment_name} in namespace {namespace} to be ready..." 
# noqa E501 ) diff --git a/tests/integration/test_dss.py b/tests/integration/test_dss.py index 8ae575c6..2bee471a 100644 --- a/tests/integration/test_dss.py +++ b/tests/integration/test_dss.py @@ -75,6 +75,38 @@ def test_initialize_creates_dss(cleanup_after_initialize) -> None: assert "notebooks" in kubectl_result.stdout +def test_create_notebook_gpu_failure(cleanup_after_initialize) -> None: + """ + Tests that `dss create` fails to create a notebook on a machine without a GPU + (it is expected to be run on a GH runner without a GPU). + + Must be run after `dss initialize` + """ + + result = subprocess.run( + [ + DSS_NAMESPACE, + "create", + NOTEBOOK_NAME, + "--image", + NOTEBOOK_IMAGE, + "--kubeconfig", + KUBECONFIG, + "--gpu=nvidia", + ], + capture_output=True, + text=True, + timeout=60 * 4, + ) + + # Check that the command failed as expected + assert result.returncode == 1 + assert ( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster." # noqa E501 + in result.stderr + ) + + def test_create_notebook(cleanup_after_initialize) -> None: """ Tests that `dss create` successfully creates a notebook as expected. diff --git a/tests/unit/test_create_notebook.py b/tests/unit/test_create_notebook.py index 324c20eb..fbaf9c37 100644 --- a/tests/unit/test_create_notebook.py +++ b/tests/unit/test_create_notebook.py @@ -8,6 +8,13 @@ from dss.utils import ImagePullBackOffError +@pytest.fixture +def mock_client() -> MagicMock: + """Mock Kubernetes Client.""" + with patch("dss.list.Client") as mock: + yield mock.return_value + + @pytest.fixture def mock_get_service_url() -> MagicMock: """ @@ -48,6 +55,7 @@ def test_create_notebook_success( mock_get_service_url: MagicMock, mock_resource_handler: MagicMock, mock_logger: MagicMock, + mock_client: MagicMock, ) -> None: """ Test case to verify successful create_notebook call.
@@ -56,9 +64,6 @@ def test_create_notebook_success( notebook_image = "test-image" notebook_url = "http://somewhere.com:1234/notebook/namespace/name/lab" - # Mock the behavior of Client - mock_client_instance = MagicMock() - mock_get_service_url.return_value = notebook_url # Mock the behavior of KubernetesResourceHandler @@ -69,14 +74,15 @@ def test_create_notebook_success( "dss.create_notebook.does_notebook_exist", return_value=False ), patch("dss.create_notebook.wait_for_deployment_ready") as mock_wait_for_deployment_ready: # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) + create_notebook(name=notebook_name, image=notebook_image, lightkube_client=mock_client) # Assertions mock_resource_handler_instance.apply.assert_called_once() mock_wait_for_deployment_ready.assert_called_once_with( - mock_client_instance, namespace=DSS_NAMESPACE, deployment_name=notebook_name + mock_client, + namespace=DSS_NAMESPACE, + deployment_name=notebook_name, + timeout_seconds=None, ) mock_logger.info.assert_called_with(f"Access the notebook at {notebook_url}.") @@ -96,8 +102,8 @@ def test_create_notebook_failure_pvc_does_not_exist( with patch("dss.create_notebook.does_dss_pvc_exist", return_value=False), patch( "dss.create_notebook.does_notebook_exist", return_value=False ): + # Call the function to test with pytest.raises(RuntimeError): - # Call the function to test create_notebook( name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance ) @@ -130,8 +136,8 @@ def test_create_notebook_failure_mlflow_does_not_exist( with patch("dss.create_notebook.does_mlflow_deployment_exist", return_value=False), patch( "dss.create_notebook.does_notebook_exist", return_value=False ): + # Call the function to test with pytest.raises(RuntimeError): - # Call the function to test create_notebook( name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance ) @@ -167,8 +173,8 @@ def 
test_create_notebook_failure_notebook_exists( with patch("dss.create_notebook.does_dss_pvc_exist", return_value=True), patch( "dss.create_notebook.does_notebook_exist", return_value=True ): + # Call the function to test with pytest.raises(RuntimeError): - # Call the function to test create_notebook( name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance ) @@ -205,8 +211,8 @@ def test_create_notebook_failure_api( with patch("dss.create_notebook.does_dss_pvc_exist", return_value=True), patch( "dss.create_notebook.does_notebook_exist", return_value=False ): + # Call the function to test with pytest.raises(RuntimeError): - # Call the function to test create_notebook( name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance ) @@ -221,42 +227,6 @@ def test_create_notebook_failure_api( mock_logger.info.assert_called_with(" Check the debug logs for more details.") -def test_create_notebook_failure_time_out( - mock_logger: MagicMock, mock_wait_for_deployment_ready: MagicMock -) -> None: - """ - Test case to verify behavior when a TimeoutError is raised. 
- """ - notebook_name = "test-notebook" - notebook_image = "test-image" - exception_message = "test-exception-message" - - # Mock the behavior of Client - mock_client_instance = MagicMock() - - # Mock the behavior of wait_for_deployment_ready - mock_wait_for_deployment_ready.side_effect = TimeoutError(exception_message) - - with patch("dss.create_notebook.does_dss_pvc_exist", return_value=True), patch( - "dss.create_notebook.does_notebook_exist", return_value=False - ): - with pytest.raises(RuntimeError): - # Call the function to test - create_notebook( - name=notebook_name, image=notebook_image, lightkube_client=mock_client_instance - ) - - # Assertions - mock_logger.debug.assert_called_with( - f"Failed to create Notebook {notebook_name}: {exception_message}", exc_info=True - ) - mock_logger.error.assert_called_with( - f"Timed out while trying to create Notebook {notebook_name}." - ) - mock_logger.warn.assert_called_with(" Some resources might be left in the cluster.") - mock_logger.info.assert_called_with(" Check the status with `dss list`.") - - def test_create_notebook_failure_image_pull( mock_logger: MagicMock, mock_wait_for_deployment_ready: MagicMock ) -> None: @@ -284,14 +254,10 @@ def test_create_notebook_failure_image_pull( # Assertions mock_logger.debug.assert_called_with( - f"Timed out while trying to create Notebook {notebook_name}: {exception_message}.", - exc_info=True, - ) - mock_logger.error.assert_any_call( - f"Timed out while trying to create Notebook {notebook_name}." + f"Failed to create notebook {notebook_name}: {exception_message}.", exc_info=True ) mock_logger.error.assert_called_with( - f"Image {notebook_image} does not exist or is not accessible." 
+ f"Failed to create notebook {notebook_name}: Image {notebook_image} does not exist or is not accessible.\n" # noqa E501 ) mock_logger.info.assert_called_with( "Note: You might want to use some of these recommended images:\n" @@ -316,3 +282,68 @@ def test_get_notebook_config() -> None: with patch("dss.create_notebook.get_mlflow_tracking_uri", return_value=mlflow_tracking_uri): actual_context = _get_notebook_config(notebook_image, notebook_name) assert actual_context == expected_context + + +def test_create_notebook_success_with_gpu( + mock_get_service_url: MagicMock, + mock_resource_handler: MagicMock, + mock_logger: MagicMock, + mock_client: MagicMock, +) -> None: + """ + Test case to verify successful notebook creation with GPU support. + """ + notebook_name = "gpu-notebook" + notebook_image = "gpu-image" + notebook_url = "http://somewhere.com:1234/notebook/namespace/name/lab" + gpu_type = "nvidia" + + # Set up mocks + mock_get_service_url.return_value = notebook_url + mock_resource_handler_instance = MagicMock() + mock_resource_handler.return_value = mock_resource_handler_instance + with patch("dss.create_notebook.node_has_gpu_labels", return_value=True), patch( + "dss.create_notebook.does_dss_pvc_exist", return_value=True + ), patch("dss.create_notebook.does_notebook_exist", return_value=False), patch( + "dss.create_notebook.wait_for_deployment_ready" + ) as mock_wait_for_deployment_ready: + # Act + create_notebook( + name=notebook_name, image=notebook_image, lightkube_client=mock_client, gpu=gpu_type + ) + + # Assert + mock_resource_handler_instance.apply.assert_called_once() + mock_wait_for_deployment_ready.assert_called_once() + mock_logger.info.assert_called_with(f"Access the notebook at {notebook_url}.") + mock_logger.info.assert_any_call("Nvidia GPU attached to notebook.") + + +def test_create_notebook_failure_no_gpu_labels( + mock_logger: MagicMock, mock_client: MagicMock +) -> None: + """ + Test case to verify failure in notebook creation due to 
missing GPU labels on the cluster. + """ + notebook_name = "gpu-notebook" + notebook_image = "gpu-image" + gpu_type = "nvidia" + + # Set up mocks + with patch("dss.create_notebook.node_has_gpu_labels", return_value=False), patch( + "dss.create_notebook.does_dss_pvc_exist", return_value=True + ), patch("dss.create_notebook.does_notebook_exist", return_value=False): + # Act & Assert + with pytest.raises(RuntimeError): + create_notebook( + name=notebook_name, + image=notebook_image, + lightkube_client=mock_client, + gpu=gpu_type, + ) + mock_logger.error.assert_called_with( + f"Failed to create notebook with {gpu_type} GPU acceleration.\n" + ) + mock_logger.info.assert_called_with( + "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http:///setup-gpu for more information on the setup." # noqa E501 + )