-
Notifications
You must be signed in to change notification settings - Fork 10
Add --gpu option to the create command for NVIDIA support
#94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5411de5
bbf0d42
d312ea7
14a892a
f1fb614
8159992
d72f8e5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,4 +1,5 @@ | ||||||
| from pathlib import Path | ||||||
| from typing import Dict, Optional | ||||||
|
|
||||||
| from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler | ||||||
| from lightkube import Client | ||||||
|
|
@@ -10,7 +11,9 @@ | |||||
| DSS_CLI_MANAGER_LABELS, | ||||||
| DSS_NAMESPACE, | ||||||
| FIELD_MANAGER, | ||||||
| GPU_DEPLOYMENT_LABEL, | ||||||
| MANIFEST_TEMPLATES_LOCATION, | ||||||
| NODE_LABELS, | ||||||
| NOTEBOOK_IMAGES_ALIASES, | ||||||
| NOTEBOOK_PVC_NAME, | ||||||
| RECOMMENDED_IMAGES_MESSAGE, | ||||||
|
|
@@ -23,22 +26,36 @@ | |||||
| does_notebook_exist, | ||||||
| get_mlflow_tracking_uri, | ||||||
| get_service_url, | ||||||
| node_has_gpu_labels, | ||||||
| wait_for_deployment_ready, | ||||||
| ) | ||||||
|
|
||||||
| # Set up logger | ||||||
| logger = setup_logger("logs/dss.log") | ||||||
|
|
||||||
|
|
||||||
| def create_notebook(name: str, image: str, lightkube_client: Client) -> None: | ||||||
| def create_notebook( | ||||||
| name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None | ||||||
| ) -> None: | ||||||
| """ | ||||||
| Creates a Notebook server on the Kubernetes cluster. | ||||||
| Creates a Notebook server on the Kubernetes cluster with optional GPU support. | ||||||
|
|
||||||
| Args: | ||||||
| name (str): The name of the notebook server. | ||||||
| image (str): The image used for the notebook server. | ||||||
| lightkube_client (Client): The Kubernetes client. | ||||||
| image (str): The Docker image used for the notebook server. | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe leave as |
||||||
| lightkube_client (Client): The Kubernetes client used for server creation. | ||||||
| gpu (Optional[str]): Specifies the GPU type for the notebook if required. | ||||||
|
|
||||||
| Raises: | ||||||
| RuntimeError: If there is a failure in notebook creation or GPU label checking. | ||||||
| """ | ||||||
| if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]): | ||||||
| logger.error(f"Failed to create notebook with {gpu} GPU acceleration.\n") | ||||||
| logger.info( | ||||||
| "You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http://<data-science-stack-docs>/setup-gpu for more information on the setup." # noqa E501 | ||||||
| ) | ||||||
| raise RuntimeError() | ||||||
|
|
||||||
| if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist( | ||||||
| lightkube_client | ||||||
| ): | ||||||
|
|
@@ -59,14 +76,12 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None: | |||||
| logger.info(f"To connect to the existing notebook, go to {url}.") | ||||||
| raise RuntimeError() | ||||||
|
|
||||||
| manifests_file = Path( | ||||||
| Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2" | ||||||
| manifests_file = ( | ||||||
| Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2" | ||||||
| ) | ||||||
|
|
||||||
| image_full_name = _get_notebook_image_name(image) | ||||||
| config = _get_notebook_config(image_full_name, name) | ||||||
| config = _get_notebook_config(image_full_name, name, gpu) | ||||||
|
|
||||||
| # Initialize KubernetesResourceHandler | ||||||
| k8s_resource_handler = KubernetesResourceHandler( | ||||||
| field_manager=FIELD_MANAGER, | ||||||
| labels=DSS_CLI_MANAGER_LABELS, | ||||||
|
|
@@ -78,51 +93,69 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None: | |||||
|
|
||||||
| try: | ||||||
| k8s_resource_handler.apply() | ||||||
|
|
||||||
| wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name) | ||||||
|
|
||||||
| wait_for_deployment_ready( | ||||||
| lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name, timeout_seconds=None | ||||||
| ) | ||||||
| logger.info(f"Success: Notebook {name} created successfully.") | ||||||
| if gpu: | ||||||
| logger.info(f"{gpu.title()} GPU attached to notebook.") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Minor suggestion.
Suggested change
|
||||||
| except ApiError as err: | ||||||
| logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True) | ||||||
| logger.error(f"Failed to create Notebook with error code {err.status.code}.") | ||||||
| logger.info(" Check the debug logs for more details.") | ||||||
| raise RuntimeError() | ||||||
| except TimeoutError as err: | ||||||
| logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True) | ||||||
| logger.error(f"Timed out while trying to create Notebook {name}.") | ||||||
| logger.warn(" Some resources might be left in the cluster.") | ||||||
| logger.info(" Check the status with `dss list`.") | ||||||
| raise RuntimeError() | ||||||
| except ImagePullBackOffError as err: | ||||||
| logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True) | ||||||
| logger.error(f"Timed out while trying to create Notebook {name}.") | ||||||
| logger.error(f"Image {image_full_name} does not exist or is not accessible.") | ||||||
| logger.debug(f"Failed to create notebook {name}: {err}.", exc_info=True) | ||||||
| logger.error( | ||||||
| f"Failed to create notebook {name}: Image {image_full_name} does not exist or is not accessible.\n" # noqa E501 | ||||||
| ) | ||||||
| logger.info( | ||||||
| "Note: You might want to use some of these recommended images:\n" | ||||||
| f"{RECOMMENDED_IMAGES_MESSAGE}" | ||||||
| + RECOMMENDED_IMAGES_MESSAGE | ||||||
| ) | ||||||
| raise RuntimeError() | ||||||
| # Assumes that the notebook server is exposed by a service of the same name. | ||||||
|
|
||||||
| url = get_service_url(name, DSS_NAMESPACE, lightkube_client) | ||||||
| if url: | ||||||
| logger.info(f"Access the notebook at {url}.") | ||||||
|
|
||||||
|
|
||||||
| def _get_notebook_config(image: str, name: str) -> dict: | ||||||
| mlflow_tracking_uri = get_mlflow_tracking_uri() | ||||||
| def _get_notebook_config(image: str, name: str, gpu: Optional[str] = None) -> Dict[str, str]: | ||||||
| """ | ||||||
| Generates the configuration dictionary for creating a notebook deployment. | ||||||
|
|
||||||
| Args: | ||||||
| image (str): The Docker image to use for the notebook. | ||||||
| name (str): The name of the notebook. | ||||||
| gpu (Optional[str]): GPU label for the notebook deployment, if GPU support is enabled. | ||||||
|
|
||||||
| Returns: | ||||||
| Dict[str, str]: A dictionary with configuration values for the notebook deployment. | ||||||
| """ | ||||||
| # Base configuration for the notebook | ||||||
| context = { | ||||||
| "mlflow_tracking_uri": mlflow_tracking_uri, | ||||||
| "mlflow_tracking_uri": get_mlflow_tracking_uri(), | ||||||
| "notebook_name": name, | ||||||
| "namespace": DSS_NAMESPACE, | ||||||
| "notebook_image": image, | ||||||
| "pvc_name": NOTEBOOK_PVC_NAME, | ||||||
| } | ||||||
|
|
||||||
| # Conditionally add GPU configuration if specified | ||||||
| if gpu: | ||||||
| context["gpu"] = GPU_DEPLOYMENT_LABEL[gpu] | ||||||
|
|
||||||
| return context | ||||||
|
|
||||||
|
|
||||||
| def _get_notebook_image_name(image: str) -> str: | ||||||
| """ | ||||||
| Returns the image's full name if the input is a key in `NOTEBOOK_IMAGES_ALIASES` | ||||||
| else it returns the input. | ||||||
| Resolves the full Docker image name from an alias or returns the image if not an alias. | ||||||
|
|
||||||
| Args: | ||||||
| image (str): An alias for a notebook image or a full Docker image name. | ||||||
|
|
||||||
| Returns: | ||||||
| str: The resolved full Docker image name. | ||||||
| """ | ||||||
| return NOTEBOOK_IMAGES_ALIASES.get(image, image) | ||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,6 +1,6 @@ | ||||||
| import click | ||||||
|
|
||||||
| from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE | ||||||
| from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE, SUPPORTED_GPUS | ||||||
| from dss.create_notebook import create_notebook | ||||||
| from dss.initialize import initialize | ||||||
| from dss.list import list_notebooks | ||||||
|
|
@@ -65,7 +65,15 @@ def initialize_command(kubeconfig: str) -> None: | |||||
| "--kubeconfig", | ||||||
| help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501 | ||||||
| ) | ||||||
| def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: | ||||||
| @click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need a
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Its recommendation of ux team to provide this
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if there's a miscommunication here. If I understand correctly, these two commands are both the same? Am I missing something about the CLI? |
||||||
| @click.option( | ||||||
| "--gpu", | ||||||
| type=click.Choice(SUPPORTED_GPUS), | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this need to be case sensitive?
Suggested change
|
||||||
| help="Specify the type of GPU acceleration, e.g., 'nvidia'.", | ||||||
| ) | ||||||
| def create_notebook_command( | ||||||
| name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str | ||||||
| ) -> None: | ||||||
| """Create a Jupyter notebook in DSS and connect it to MLflow. This command also | ||||||
| outputs the URL to access the notebook on success. | ||||||
|
|
||||||
|
|
@@ -78,11 +86,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: | |||||
| " For more information on using a specific image, see dss create --help." | ||||||
| ) | ||||||
|
|
||||||
| # Check mutual exclusivity | ||||||
| if no_gpu and gpu: | ||||||
| logger.error("You cannot specify both --no-gpu and --gpu options.") | ||||||
| raise click.UsageError("Options --no-gpu and --gpu are mutually exclusive.") | ||||||
| try: | ||||||
| kubeconfig = get_default_kubeconfig(kubeconfig) | ||||||
| lightkube_client = get_lightkube_client(kubeconfig) | ||||||
|
|
||||||
| create_notebook(name=name, image=image, lightkube_client=lightkube_client) | ||||||
| create_notebook( | ||||||
| name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu | ||||||
| ) | ||||||
| except RuntimeError: | ||||||
| click.get_current_context().exit(1) | ||||||
| except Exception as e: | ||||||
|
|
@@ -95,6 +109,7 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None: | |||||
| Examples | ||||||
| dss create my-notebook --image=pytorch | ||||||
| dss create my-notebook --image={DEFAULT_NOTEBOOK_IMAGE} | ||||||
| dss create my-notebook --image=charmedkubeflow/jupyter-pytorch-cuda-full:1.8.0 --gpu=nvidia | ||||||
|
|
||||||
| \b\n{RECOMMENDED_IMAGES_MESSAGE} | ||||||
| """ | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,11 @@ spec: | |
| - env: | ||
| - name: MLFLOW_TRACKING_URI | ||
| value: {{ mlflow_tracking_uri }} | ||
| {% if gpu %} | ||
| resources: | ||
| limits: | ||
| {{ gpu }}: 1 | ||
| {% endif %} | ||
|
Comment on lines
+26
to
+30
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we support only a single GPU per notebook? If I have two GPUs, should we support using both? |
||
| image: {{ notebook_image }} | ||
| imagePullPolicy: IfNotPresent | ||
| name: {{ notebook_name }} | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,6 +1,6 @@ | ||||||
| import os | ||||||
| import time | ||||||
| from typing import Optional | ||||||
| from typing import List, Optional | ||||||
|
|
||||||
| from lightkube import ApiError, Client, KubeConfig | ||||||
| from lightkube.resources.apps_v1 import Deployment | ||||||
|
|
@@ -32,58 +32,88 @@ def __init__(self, msg: str, *args): | |||||
| self.msg = str(msg) | ||||||
|
|
||||||
|
|
||||||
| def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool: | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This asserts that all nodes have the provided labels, not that a given node has gpu labels. Alternatively you could include the gpu labels here in the function code drop the
Suggested change
|
||||||
| """ | ||||||
| Check if at least one node in the Kubernetes cluster has all the specified labels. | ||||||
|
|
||||||
| Args: | ||||||
| lightkube_client (Client): The Kubernetes client. | ||||||
| labels (List[str]): A list of label keys that must be present on the node. | ||||||
|
|
||||||
| Returns: | ||||||
| bool: True if at least one node has all the specified labels, False otherwise. | ||||||
| """ | ||||||
| nodes = lightkube_client.list(Node) | ||||||
| for node in nodes: | ||||||
| node_labels = node.metadata.labels | ||||||
| if all(label in node_labels for label in labels): | ||||||
| return True | ||||||
| return False | ||||||
|
|
||||||
|
|
||||||
| def wait_for_deployment_ready( | ||||||
| client: Client, | ||||||
| namespace: str, | ||||||
| deployment_name: str, | ||||||
| timeout_seconds: int = 180, | ||||||
| timeout_seconds: Optional[int] = 180, | ||||||
| interval_seconds: int = 10, | ||||||
| ) -> None: | ||||||
| """ | ||||||
| Waits for a Kubernetes deployment to be ready. | ||||||
| Waits for a Kubernetes deployment to be ready. Can wait indefinitely if timeout_seconds is None. | ||||||
|
|
||||||
| Args: | ||||||
| client (Client): The Kubernetes client. | ||||||
| namespace (str): The namespace of the deployment. | ||||||
| deployment_name (str): The name of the deployment. | ||||||
| timeout_seconds (int): Timeout in seconds. Defaults to 600. | ||||||
| timeout_seconds (Optional[int]): Timeout in seconds, or None for no timeout. | ||||||
| Defaults to 180. | ||||||
| interval_seconds (int): Interval between checks in seconds. Defaults to 10. | ||||||
|
|
||||||
| Returns: | ||||||
| None | ||||||
| Raises: | ||||||
| ImagePullBackOffError: If there is an issue pulling the deployment image. | ||||||
| TimeoutError: If the timeout is reached before the deployment is ready. | ||||||
| """ | ||||||
| logger.info( | ||||||
| f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." | ||||||
| ) | ||||||
| start_time = time.time() | ||||||
| while True: | ||||||
| deployment: Deployment = client.get(Deployment, namespace=namespace, name=deployment_name) | ||||||
| if deployment.status.availableReplicas == deployment.spec.replicas: | ||||||
| if deployment.status and deployment.status.availableReplicas == deployment.spec.replicas: | ||||||
| logger.info(f"Deployment {deployment_name} in namespace {namespace} is ready") | ||||||
| break | ||||||
| elif time.time() - start_time >= timeout_seconds: | ||||||
| # Surround the following block with try-except? | ||||||
| # ----Block-start---- | ||||||
| pod: Pod = list( | ||||||
| try: | ||||||
| pods = list( | ||||||
| client.list( | ||||||
| Pod, | ||||||
| namespace=namespace, | ||||||
| labels={"canonical.com/dss-notebook": deployment_name}, | ||||||
| ) | ||||||
| )[0] | ||||||
| reason = pod.status.containerStatuses[0].state.waiting.reason | ||||||
| ) | ||||||
| except ApiError as e: | ||||||
| if e.response.status_code == 404: | ||||||
| pods = [] | ||||||
|
|
||||||
| if pods: | ||||||
| reason = ( | ||||||
| pods[0].status.containerStatuses[0].state.waiting.reason | ||||||
| if pods[0].status.containerStatuses | ||||||
| and pods[0].status.containerStatuses[0].state.waiting | ||||||
| else "Unknown" | ||||||
| ) | ||||||
| if reason in ["ImagePullBackOff", "ErrImagePull"]: | ||||||
| raise ImagePullBackOffError( | ||||||
| f"Failed to create Deployment {deployment_name} with {reason}" | ||||||
| ) | ||||||
| # ----Block-end---- | ||||||
|
|
||||||
| if timeout_seconds is not None and time.time() - start_time >= timeout_seconds: | ||||||
| raise TimeoutError( | ||||||
| f"Timeout waiting for deployment {deployment_name} in namespace {namespace} to be ready" # noqa E501 | ||||||
| ) | ||||||
| else: | ||||||
| time.sleep(interval_seconds) | ||||||
| logger.info( | ||||||
| f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..." | ||||||
| logger.debug( | ||||||
| f"Still waiting for deployment {deployment_name} in namespace {namespace} to be ready..." # noqa E501 | ||||||
| ) | ||||||
|
|
||||||
|
|
||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think the gpu requests are working as expected. Try this:
Then in each notebook server:
nvidia-smi. The GPU should be visible only in one notebook server, but it is visible in bothimport torch; torch.cuda.is_available(). The GPU should be available only in one notebook server, but it is available in bothnvidia-smion the host system and we'll see two processes both using the GPU, like:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One thing that is working correctly is the GPU requests. My machine has one gpu, and if I do:
notebook
ysits pending with the FailedScheduling warning: 1 Insufficient nvidia.com/gpu