10 changes: 10 additions & 0 deletions src/dss/config.py
@@ -28,6 +28,16 @@ def format_images_message(images_dict: dict) -> str:
RECOMMENDED_IMAGES_MESSAGE = format_images_message(NOTEBOOK_IMAGES_ALIASES)
DEFAULT_NOTEBOOK_IMAGE = "kubeflownotebookswg/jupyter-scipy:v1.8.0"

SUPPORTED_GPUS = ["nvidia"]
GPU_DEPLOYMENT_LABEL = {"nvidia": "nvidia.com/gpu"}
NODE_LABELS = {
"nvidia": [
"nvidia.com/gpu.present",
"nvidia.com/gpu.deploy.container-toolkit",
"nvidia.com/gpu.deploy.device-plugin",
]
}


class DeploymentState(Enum):
ACTIVE = "Active"
89 changes: 61 additions & 28 deletions src/dss/create_notebook.py
Contributor

I don't think the gpu requests are working as expected. Try this:

dss initialize --kubeconfig ~/.kube/config 
dss create gpu-omitted --image=kubeflownotebookswg/jupyter-pytorch-cuda-full:latest --kubeconfig ~/.kube/config
dss create gpu-selected --image=kubeflownotebookswg/jupyter-pytorch-cuda-full:latest --kubeconfig ~/.kube/config

Then in each notebook server:

  • create a terminal and do nvidia-smi. The GPU should be visible only in one notebook server, but it is visible in both
  • create a notebook and run import torch; torch.cuda.is_available() (see the sketch after this comment). The GPU should be available only in one notebook server, but it is available in both
  • create a notebook and run this tutorial. Both will run on the GPU. After doing this, run nvidia-smi on the host system and we'll see two processes both using the GPU, like:
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  Tesla T4                       On  | 00000000:00:1E.0 Off |                    0 |
| N/A   37C    P0              36W /  70W |    280MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A     70671      C   /opt/conda/bin/python                       158MiB |
|    0   N/A  N/A     96171      C   /opt/conda/bin/python                       118MiB |
+---------------------------------------------------------------------------------------+
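
For reference, a minimal notebook cell for the torch check described above could look like this (just a sketch; it assumes torch is installed in the notebook image, which is the case for the jupyter-pytorch-cuda images):

```python
import torch

# In the notebook created without --gpu this is expected to be False;
# in the GPU-enabled notebook it should be True.
print(torch.cuda.is_available())

# Number of GPUs the notebook can see (expected 0 vs. 1 here).
print(torch.cuda.device_count())
```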

Contributor

One thing that is working correctly is the GPU requests. My machine has one gpu, and if I do:

dss create x ... --gpu=nvidia
dss create y ... --gpu=nvidia

notebook y sits pending with the FailedScheduling warning: 1 Insufficient nvidia.com/gpu
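
To double-check this from outside the CLI, a small lightkube sketch (assuming DSS_NAMESPACE resolves to the dss namespace used throughout this PR) can list the notebook pods and their phases; the second notebook should show up as Pending:

```python
from lightkube import Client
from lightkube.resources.core_v1 import Pod

from dss.config import DSS_NAMESPACE

client = Client()
# Each notebook deployment creates a pod in the DSS namespace;
# the one that cannot get a GPU stays in the Pending phase.
for pod in client.list(Pod, namespace=DSS_NAMESPACE):
    print(pod.metadata.name, pod.status.phase)
```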

@@ -1,4 +1,5 @@
from pathlib import Path
from typing import Dict, Optional

from charmed_kubeflow_chisme.kubernetes import KubernetesResourceHandler
from lightkube import Client
@@ -10,7 +11,9 @@
DSS_CLI_MANAGER_LABELS,
DSS_NAMESPACE,
FIELD_MANAGER,
GPU_DEPLOYMENT_LABEL,
MANIFEST_TEMPLATES_LOCATION,
NODE_LABELS,
NOTEBOOK_IMAGES_ALIASES,
NOTEBOOK_PVC_NAME,
RECOMMENDED_IMAGES_MESSAGE,
@@ -23,22 +26,36 @@
does_notebook_exist,
get_mlflow_tracking_uri,
get_service_url,
node_has_gpu_labels,
wait_for_deployment_ready,
)

# Set up logger
logger = setup_logger("logs/dss.log")


def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
def create_notebook(
name: str, image: str, lightkube_client: Client, gpu: Optional[str] = None
) -> None:
"""
Creates a Notebook server on the Kubernetes cluster.
Creates a Notebook server on the Kubernetes cluster with optional GPU support.

Args:
name (str): The name of the notebook server.
image (str): The image used for the notebook server.
lightkube_client (Client): The Kubernetes client.
image (str): The Docker image used for the notebook server.
Contributor

Maybe leave this as 'image' or 'OCI image' instead of 'Docker', just so we don't exclude rocks?

lightkube_client (Client): The Kubernetes client used for server creation.
gpu (Optional[str]): Specifies the GPU type for the notebook if required.

Raises:
RuntimeError: If there is a failure in notebook creation or GPU label checking.
"""
if gpu and not node_has_gpu_labels(lightkube_client, NODE_LABELS[gpu]):
logger.error(f"Failed to create notebook with {gpu} GPU acceleration.\n")
logger.info(
"You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster. Please refer to this guide http://<data-science-stack-docs>/setup-gpu for more information on the setup." # noqa E501
)
raise RuntimeError()

if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(
lightkube_client
):
@@ -59,14 +76,12 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
logger.info(f"To connect to the existing notebook, go to {url}.")
raise RuntimeError()

manifests_file = Path(
Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2"
manifests_file = (
Path(__file__).parent / MANIFEST_TEMPLATES_LOCATION / "notebook_deployment.yaml.j2"
)

image_full_name = _get_notebook_image_name(image)
config = _get_notebook_config(image_full_name, name)
config = _get_notebook_config(image_full_name, name, gpu)

# Initialize KubernetesResourceHandler
k8s_resource_handler = KubernetesResourceHandler(
field_manager=FIELD_MANAGER,
labels=DSS_CLI_MANAGER_LABELS,
@@ -78,51 +93,69 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:

try:
k8s_resource_handler.apply()

wait_for_deployment_ready(lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name)

wait_for_deployment_ready(
lightkube_client, namespace=DSS_NAMESPACE, deployment_name=name, timeout_seconds=None
)
logger.info(f"Success: Notebook {name} created successfully.")
if gpu:
logger.info(f"{gpu.title()} GPU attached to notebook.")
Contributor

Minor suggestion: title() won't be correct in all cases (e.g. Amd). I'd stick to the enumerated values we allow for gpu.

Suggested change
logger.info(f"{gpu.title()} GPU attached to notebook.")
logger.info(f"{gpu} GPU attached to notebook.")

except ApiError as err:
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Failed to create Notebook with error code {err.status.code}.")
logger.info(" Check the debug logs for more details.")
raise RuntimeError()
except TimeoutError as err:
logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.warn(" Some resources might be left in the cluster.")
logger.info(" Check the status with `dss list`.")
raise RuntimeError()
except ImagePullBackOffError as err:
logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.error(f"Image {image_full_name} does not exist or is not accessible.")
logger.debug(f"Failed to create notebook {name}: {err}.", exc_info=True)
logger.error(
f"Failed to create notebook {name}: Image {image_full_name} does not exist or is not accessible.\n" # noqa E501
)
logger.info(
"Note: You might want to use some of these recommended images:\n"
f"{RECOMMENDED_IMAGES_MESSAGE}"
+ RECOMMENDED_IMAGES_MESSAGE
)
raise RuntimeError()
# Assumes that the notebook server is exposed by a service of the same name.

url = get_service_url(name, DSS_NAMESPACE, lightkube_client)
if url:
logger.info(f"Access the notebook at {url}.")


def _get_notebook_config(image: str, name: str) -> dict:
mlflow_tracking_uri = get_mlflow_tracking_uri()
def _get_notebook_config(image: str, name: str, gpu: Optional[str] = None) -> Dict[str, str]:
"""
Generates the configuration dictionary for creating a notebook deployment.

Args:
image (str): The Docker image to use for the notebook.
name (str): The name of the notebook.
gpu (Optional[str]): GPU label for the notebook deployment, if GPU support is enabled.

Returns:
Dict[str, str]: A dictionary with configuration values for the notebook deployment.
"""
# Base configuration for the notebook
context = {
"mlflow_tracking_uri": mlflow_tracking_uri,
"mlflow_tracking_uri": get_mlflow_tracking_uri(),
"notebook_name": name,
"namespace": DSS_NAMESPACE,
"notebook_image": image,
"pvc_name": NOTEBOOK_PVC_NAME,
}

# Conditionally add GPU configuration if specified
if gpu:
context["gpu"] = GPU_DEPLOYMENT_LABEL[gpu]

return context


def _get_notebook_image_name(image: str) -> str:
"""
Returns the image's full name if the input is a key in `NOTEBOOK_IMAGES_ALIASES`
else it returns the input.
Resolves the full Docker image name from an alias or returns the image if not an alias.

Args:
image (str): An alias for a notebook image or a full Docker image name.

Returns:
str: The resolved full Docker image name.
"""
return NOTEBOOK_IMAGES_ALIASES.get(image, image)
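
For context, the new gpu parameter is exercised like this (a sketch mirroring the CLI path; the image name is taken from the review comments above, and the cluster is assumed to already be initialized with dss initialize):

```python
from lightkube import Client

from dss.create_notebook import create_notebook

client = Client()
# Hypothetical invocation mirroring `dss create my-notebook --gpu=nvidia`.
create_notebook(
    name="my-notebook",
    image="kubeflownotebookswg/jupyter-pytorch-cuda-full:latest",
    lightkube_client=client,
    gpu="nvidia",
)
```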
21 changes: 18 additions & 3 deletions src/dss/main.py
@@ -1,6 +1,6 @@
import click

from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE
from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE, SUPPORTED_GPUS
from dss.create_notebook import create_notebook
from dss.initialize import initialize
from dss.list import list_notebooks
@@ -65,7 +65,15 @@ def initialize_command(kubeconfig: str) -> None:
"--kubeconfig",
help=f"Path to a Kubernetes config file. Defaults to the value of the KUBECONFIG environment variable, else to '{KUBECONFIG_DEFAULT}'.", # noqa E501
)
def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
@click.option("--no-gpu", is_flag=True, help="Create a notebook without GPU support.")
Contributor

Why do we need --no-gpu? Can the absence of --gpu imply no GPU, so we can get rid of this flag?

Member Author

It's the UX team's recommendation to provide this --no-gpu flag.

Contributor

I wonder if there's a miscommunication here. If I understand correctly, these two commands are both the same?

dss create my-notebook
dss create my-notebook --no-gpu

Am I missing something about the CLI?
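
For what it's worth, under the resolution used further down in this diff (gpu=None if no_gpu else gpu), the first two invocations do resolve to the same thing — a tiny sketch of the effective value passed to create_notebook:

```python
from typing import Optional


def resolve_gpu(no_gpu: bool, gpu: Optional[str]) -> Optional[str]:
    # Mirrors the call in create_notebook_command: gpu=None if no_gpu else gpu
    return None if no_gpu else gpu


print(resolve_gpu(no_gpu=False, gpu=None))      # dss create my-notebook              -> None
print(resolve_gpu(no_gpu=True, gpu=None))       # dss create my-notebook --no-gpu     -> None
print(resolve_gpu(no_gpu=False, gpu="nvidia"))  # dss create my-notebook --gpu=nvidia -> nvidia
```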

@click.option(
"--gpu",
type=click.Choice(SUPPORTED_GPUS),
Contributor

Does this need to be case sensitive?

Suggested change
type=click.Choice(SUPPORTED_GPUS),
type=click.Choice(SUPPORTED_GPUS, case_sensitive=False),

help="Specify the type of GPU acceleration, e.g., 'nvidia'.",
)
def create_notebook_command(
name: str, image: str, kubeconfig: str, no_gpu: bool, gpu: str
) -> None:
"""Create a Jupyter notebook in DSS and connect it to MLflow. This command also
outputs the URL to access the notebook on success.

@@ -78,11 +86,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
" For more information on using a specific image, see dss create --help."
)

# Check mutual exclusivity
if no_gpu and gpu:
logger.error("You cannot specify both --no-gpu and --gpu options.")
raise click.UsageError("Options --no-gpu and --gpu are mutually exclusive.")
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

create_notebook(name=name, image=image, lightkube_client=lightkube_client)
create_notebook(
name=name, image=image, lightkube_client=lightkube_client, gpu=None if no_gpu else gpu
)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
@@ -95,6 +109,7 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
Examples
dss create my-notebook --image=pytorch
dss create my-notebook --image={DEFAULT_NOTEBOOK_IMAGE}
dss create my-notebook --image=charmedkubeflow/jupyter-pytorch-cuda-full:1.8.0 --gpu=nvidia

\b\n{RECOMMENDED_IMAGES_MESSAGE}
"""
5 changes: 5 additions & 0 deletions src/dss/manifest_templates/notebook_deployment.yaml.j2
@@ -23,6 +23,11 @@ spec:
- env:
- name: MLFLOW_TRACKING_URI
value: {{ mlflow_tracking_uri }}
{% if gpu %}
resources:
limits:
{{ gpu }}: 1
{% endif %}
Comment on lines +26 to +30
Contributor

Do we support only a single GPU per notebook? If I have two GPUs, should we support using both?

image: {{ notebook_image }}
imagePullPolicy: IfNotPresent
name: {{ notebook_name }}
62 changes: 46 additions & 16 deletions src/dss/utils.py
@@ -1,6 +1,6 @@
import os
import time
from typing import Optional
from typing import List, Optional

from lightkube import ApiError, Client, KubeConfig
from lightkube.resources.apps_v1 import Deployment
@@ -32,58 +32,88 @@ def __init__(self, msg: str, *args):
self.msg = str(msg)


def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool:
Contributor

This asserts that all nodes have the provided labels, not that a given node has gpu labels.

Alternatively, you could include the GPU labels here in the function body and drop the labels input.

Suggested change
def node_has_gpu_labels(lightkube_client: Client, labels: List[str]) -> bool:
def all_nodes_have_labels(lightkube_client: Client, labels: List[str]) -> bool:

"""
Check if at least one node in the Kubernetes cluster has all the specified labels.

Args:
lightkube_client (Client): The Kubernetes client.
labels (List[str]): A list of label keys that must be present on the node.

Returns:
bool: True if at least one node has all the specified labels, False otherwise.
"""
nodes = lightkube_client.list(Node)
for node in nodes:
node_labels = node.metadata.labels
if all(label in node_labels for label in labels):
return True
return False


def wait_for_deployment_ready(
client: Client,
namespace: str,
deployment_name: str,
timeout_seconds: int = 180,
timeout_seconds: Optional[int] = 180,
interval_seconds: int = 10,
) -> None:
"""
Waits for a Kubernetes deployment to be ready.
Waits for a Kubernetes deployment to be ready. Can wait indefinitely if timeout_seconds is None.

Args:
client (Client): The Kubernetes client.
namespace (str): The namespace of the deployment.
deployment_name (str): The name of the deployment.
timeout_seconds (int): Timeout in seconds. Defaults to 600.
timeout_seconds (Optional[int]): Timeout in seconds, or None for no timeout.
Defaults to 180.
interval_seconds (int): Interval between checks in seconds. Defaults to 10.

Returns:
None
Raises:
ImagePullBackOffError: If there is an issue pulling the deployment image.
TimeoutError: If the timeout is reached before the deployment is ready.
"""
logger.info(
f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..."
)
start_time = time.time()
while True:
deployment: Deployment = client.get(Deployment, namespace=namespace, name=deployment_name)
if deployment.status.availableReplicas == deployment.spec.replicas:
if deployment.status and deployment.status.availableReplicas == deployment.spec.replicas:
logger.info(f"Deployment {deployment_name} in namespace {namespace} is ready")
break
elif time.time() - start_time >= timeout_seconds:
# Surround the following block with try-except?
# ----Block-start----
pod: Pod = list(
try:
pods = list(
client.list(
Pod,
namespace=namespace,
labels={"canonical.com/dss-notebook": deployment_name},
)
)[0]
reason = pod.status.containerStatuses[0].state.waiting.reason
)
except ApiError as e:
if e.response.status_code == 404:
pods = []

if pods:
reason = (
pods[0].status.containerStatuses[0].state.waiting.reason
if pods[0].status.containerStatuses
and pods[0].status.containerStatuses[0].state.waiting
else "Unknown"
)
if reason in ["ImagePullBackOff", "ErrImagePull"]:
raise ImagePullBackOffError(
f"Failed to create Deployment {deployment_name} with {reason}"
)
# ----Block-end----

if timeout_seconds is not None and time.time() - start_time >= timeout_seconds:
raise TimeoutError(
f"Timeout waiting for deployment {deployment_name} in namespace {namespace} to be ready" # noqa E501
)
else:
time.sleep(interval_seconds)
logger.info(
f"Waiting for deployment {deployment_name} in namespace {namespace} to be ready..."
logger.debug(
f"Still waiting for deployment {deployment_name} in namespace {namespace} to be ready..." # noqa E501
)


32 changes: 32 additions & 0 deletions tests/integration/test_dss.py
@@ -75,6 +75,38 @@ def test_initialize_creates_dss(cleanup_after_initialize) -> None:
assert "notebooks" in kubectl_result.stdout


def test_create_notebook_gpu_failure(cleanup_after_initialize) -> None:
"""
Tests that `dss create` fails to create a notebook on a machine without a GPU
(it is expected to run on a GH runner without a GPU).

Must be run after `dss initialize`
"""

result = subprocess.run(
[
DSS_NAMESPACE,
"create",
NOTEBOOK_NAME,
"--image",
NOTEBOOK_IMAGE,
"--kubeconfig",
KUBECONFIG,
"--gpu=nvidia",
],
capture_output=True,
text=True,
timeout=60 * 4,
)

# The command should fail because no node in the cluster has the GPU labels
assert result.returncode == 1
assert (
"You are trying to setup notebook backed by GPU but the GPU devices were not properly set up in the Kubernetes cluster." # noqa E501
in result.stderr
)


def test_create_notebook(cleanup_after_initialize) -> None:
"""
Tests that `dss create` successfully creates a notebook as expected.