From 1a9561dc9ca287db3dcb402ce4e36fafcd12099a Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 09:54:15 +0300 Subject: [PATCH 01/15] oke readme files --- cloud-service-providers/oci/oke/README.md | 18 ++++ .../oci/oke/prerequisites/README.md | 66 +++++++++++++ .../oci/oke/setup/README.md | 99 +++++++++++++++++++ 3 files changed, 183 insertions(+) create mode 100644 cloud-service-providers/oci/oke/README.md create mode 100644 cloud-service-providers/oci/oke/prerequisites/README.md create mode 100644 cloud-service-providers/oci/oke/setup/README.md diff --git a/cloud-service-providers/oci/oke/README.md b/cloud-service-providers/oci/oke/README.md new file mode 100644 index 00000000..f635b00a --- /dev/null +++ b/cloud-service-providers/oci/oke/README.md @@ -0,0 +1,18 @@ +# NIM on Oracle Cloud Infrastructure (OCI) OKE + +To deploy NIM on Oracle Cloud Infrastructure (OCI) successfully, it’s crucial to choose the correct GPU shapes and ensure that the appropriate NVIDIA drivers are installed. + +When you select a GPU shape for a managed node pool or self-managed node in OKE, you must also select a compatible Oracle Linux GPU image that has the CUDA libraries pre-installed. The names of compatible images include 'GPU'. OCI offers Oracle Linux (OEL) providing the possibility to use pre-installed GPU drivers. This simplifies the deployment process for NIM. + + +## Prerequisites + +Please follow [Pre-rquirement instruction](./prerequisites/README.md) to get ready for OKE creation. + +## Create OKE + +Please follow [Create OKE instruction](./setup/README.md) to create OKE. + +## Deploy NIM + +Please follow [Deploy NIM instruction](../../../helm/README.md) to deploy NIM. 
diff --git a/cloud-service-providers/oci/oke/prerequisites/README.md b/cloud-service-providers/oci/oke/prerequisites/README.md new file mode 100644 index 00000000..3c222714 --- /dev/null +++ b/cloud-service-providers/oci/oke/prerequisites/README.md @@ -0,0 +1,66 @@ +### OKE Prerequisites + +This list summarizes the key prerequisites you need to set up before deploying an OKE cluster on OCI. + +- **OCI Account and Tenancy**: + - Ensure you have an OCI account with the necessary permissions. + - Set up a compartment for your Kubernetes cluster. + +- **Networking**: + - Create a Virtual Cloud Network (VCN) with appropriate subnets. + - Ensure internet gateway, NAT gateway, and service gateway are configured. + - Set up route tables and security lists for network traffic. + +- **IAM Policies**: + - Define IAM policies to allow OKE service to manage resources in your compartment. + - Grant required permissions to users or groups managing the Kubernetes cluster. + +- **Service Limits**: + - Verify that your tenancy has sufficient service limits for compute instances, block storage, and other required resources. + +- **CLI and SDK Tools**: + - Install and configure the OCI CLI for managing OKE. + - Optionally, set up OCI SDKs for automating tasks. + +- **Kubernetes Version**: + - Decide on the Kubernetes version to deploy, ensuring compatibility with your applications and OCI features. + +- **API Endpoint**: + - Choose between the public or private endpoint for the Kubernetes API server, based on your security requirements. 
+ +For more details, please reference this [link.](https://docs.oracle.com/en-us/iaas/Content/ContEng/Concepts/contengprerequisites.htm) + + +## Install OCI CLI + +``` +bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/scripts/install/install.sh)" +``` + +For more details, please reference this [link.](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm) + +## Install kubectl + +``` +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256" +echo "$(cat kubectl.sha256) kubectl" | sha256sum --check +sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl +kubectl version --client +``` + +For more details, please reference this [link.](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) + +## Install Helm + +``` +curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 get_helm.sh +./get_helm.sh +``` + +For more details, please reference this [link.](https://helm.sh/docs/intro/install/) + +## Next step + +![Continue to OKE creation](../setup/README.md) \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md new file mode 100644 index 00000000..3130d5ca --- /dev/null +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -0,0 +1,99 @@ +# Setup OCI Kubernetes Engine (OKE) + +The key to creating Oracle Kubernetes Engine (OKE) for NIM is to create a proper GPU node pool. The following steps will guide you through the process. + +## Connect to OCI + +1. Log in to your Oracle Cloud Infrastructure (OCI) Console. +2. Select the appropriate compartment where you want to create the OKE cluster. 
+ +## Identify GPU needed for NIM + +- Refer to the NIM documentation to identify the GPU you [need](https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html). Here is also a list of available OKE node shapes. + + +## Find the Region with the Desired GPU + +1. Go to the OCI Console and navigate to the "Shape Availability" section to find the region that supports the desired GPU shape. +2. Alternatively, use the OCI CLI to search for GPU availability: + + ```bash + oci compute shape list --all + ``` + + Cross-reference with the [OCI Regions](https://www.oracle.com/cloud/data-regions.html) to select the best region. + +## Request Quota + +Ensure you have the necessary service limits (quota) for the GPU shapes. If needed, request an increase via the OCI Console: + +1. Navigate to **Governance and Administration** > **Limits, Quotas, and Usage**. +2. Select **Request Service Limit Increase** for the relevant GPU shapes. + +## Create OKE + +1. In the OCI Console, navigate to **Developer Services** > **Kubernetes Clusters** > **OKE Clusters**. +2. Click **Create Cluster** and select **Start with Quick Create**. +3. Configure the following: + - **Name**: Provide a name for your cluster. + - **Compartment**: Select the appropriate compartment. + - **Kubernetes Version**: Choose the latest stable version. + - **Shape**: Choose a shape with the desired GPU (e.g., `BM.GPU.A100.1`, `BM.GPU.A10.1`). +4. Under **Node Pool Configuration**: + - **Node Pool Name**: Name your node pool. + - **Shape**: Select the GPU shape identified earlier. + - **Node Count**: Start with 1 node (adjust as needed). + - **Node Subnet**: Select a subnet within your VCN. +5. Click **Create Cluster** to start the provisioning process. + +## Create GPU nodepool + +1. After the cluster is created, navigate to the **Node Pools** section. +2. Click **Add Node Pool** and configure: + - **Name**: Provide a name for the node pool. + - **Node Shape**: Select the desired GPU-enabled shape. 
+ - **Node Count**: Set the number of nodes (adjust according to your needs). + - **Additional Configuration**: Customize as needed (e.g., OS disk size, SSH keys). +3. Click **Create Node Pool**. + +## Connect to OKE + +1. Install the OCI CLI if you haven't already. +2. Retrieve the OKE cluster credentials: + + ```bash + oci ce cluster create-kubeconfig --cluster-id --file $HOME/.kube/config --region --token-version 2.0.0 --kube-endpoint PUBLIC_ENDPOINT + ``` + +3. Verify the connection to your OKE cluster: + + ```bash + kubectl get nodes + ``` + +## Install GPU Operator (Only if necessary) + +**Note:** If you're using an OCI GPU shape that comes with the drivers pre-installed (such as those having 'GPU' in their names, for example the ones in the `BM.GPU.A100` series), you can skip this section. The GPU drivers are already installed and configured. + +If your chosen shape does not include the GPU drivers, follow the steps below to install the NVIDIA GPU Operator. + +1. Add the NVIDIA Helm repository: + + ```bash + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --pass-credentials + helm repo update + ``` + +2. Install the GPU Operator in your OKE cluster: + + ```bash + helm install --create-namespace --namespace gpu-operator nvidia/gpu-operator --wait --generate-name + ``` + +3. Monitor the deployment to ensure everything is set up correctly: + + ```bash + kubectl get pods -n gpu-operator + ``` + +Official instructions for the NVIDIA GPU Operator can be found [here](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html). From 008b68adcc9c75a1cf21aeb1e0fe38a66948333c Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 09:56:02 +0300 Subject: [PATCH 02/15] . 
--- cloud-service-providers/oci/oke/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/README.md b/cloud-service-providers/oci/oke/README.md index f635b00a..3b0c1b3e 100644 --- a/cloud-service-providers/oci/oke/README.md +++ b/cloud-service-providers/oci/oke/README.md @@ -7,7 +7,7 @@ When you select a GPU shape for a managed node pool or self-managed node in OKE, ## Prerequisites -Please follow [Pre-rquirement instruction](./prerequisites/README.md) to get ready for OKE creation. +Please follow [Prerequisite instructions](./prerequisites/README.md) to get ready for OKE creation. ## Create OKE From be43d3cd1d0286d018881e7311e7cef4e269c576 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 09:59:27 +0300 Subject: [PATCH 03/15] . --- cloud-service-providers/oci/oke/prerequisites/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/prerequisites/README.md b/cloud-service-providers/oci/oke/prerequisites/README.md index 3c222714..47e5183a 100644 --- a/cloud-service-providers/oci/oke/prerequisites/README.md +++ b/cloud-service-providers/oci/oke/prerequisites/README.md @@ -63,4 +63,4 @@ For more details, please reference this [link.](https://helm.sh/docs/intro/insta ## Next step -![Continue to OKE creation](../setup/README.md) \ No newline at end of file +[Continue to OKE creation](../setup/README.md) \ No newline at end of file From 60f7418c993c21cd2f9cc58e2b0b38893609d5df Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:02:29 +0300 Subject: [PATCH 04/15] . 
--- cloud-service-providers/oci/oke/setup/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 3130d5ca..b5351fb5 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -9,7 +9,7 @@ The key to creating Oracle Kubernetes Engine (OKE) for NIM is to create a proper ## Identify GPU needed for NIM -- Refer to the NIM documentation to identify the GPU you [need](https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html). Here is also a list of available OKE node shapes. +- Refer to the NIM documentation to identify the NVIDIA GPU you [need](https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html). Here is also a list of available [OKE NVIDIA GPU node shapes](https://docs.oracle.com/en-us/iaas/Content/Compute/References/computeshapes.htm#vm-gpu). ## Find the Region with the Desired GPU From be9169e6e683a0d6c51ef299522e0b4e43eba1cf Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:11:15 +0300 Subject: [PATCH 05/15] . --- cloud-service-providers/oci/oke/setup/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index b5351fb5..73c1ee9d 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -12,10 +12,9 @@ The key to creating Oracle Kubernetes Engine (OKE) for NIM is to create a proper - Refer to the NIM documentation to identify the NVIDIA GPU you [need](https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html). Here is also a list of available [OKE NVIDIA GPU node shapes](https://docs.oracle.com/en-us/iaas/Content/Compute/References/computeshapes.htm#vm-gpu). 
-## Find the Region with the Desired GPU +## Confirm the GPU availability in your region -1. Go to the OCI Console and navigate to the "Shape Availability" section to find the region that supports the desired GPU shape. -2. Alternatively, use the OCI CLI to search for GPU availability: +Use the OCI CLI to search for GPU availability: ```bash oci compute shape list --all From 414801a2c1a86cf314f717b9133e9b1b8538de5b Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:11:53 +0300 Subject: [PATCH 06/15] . --- cloud-service-providers/oci/oke/setup/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 73c1ee9d..29297bb4 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -17,7 +17,7 @@ The key to creating Oracle Kubernetes Engine (OKE) for NIM is to create a proper Use the OCI CLI to search for GPU availability: ```bash - oci compute shape list --all + oci compute shape list --region --compartment-id --all --query 'data[*].shape' --output json | jq -r '.[]' | grep -i 'gpu' ``` Cross-reference with the [OCI Regions](https://www.oracle.com/cloud/data-regions.html) to select the best region. From 1ccd947a764f4acc9d7a7163219cce11b5141f28 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:19:11 +0300 Subject: [PATCH 07/15] . --- cloud-service-providers/oci/oke/setup/README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 29297bb4..1abb6b96 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -45,12 +45,16 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need - **Node Subnet**: Select a subnet within your VCN. 5. 
Click **Create Cluster** to start the provisioning process. -## Create GPU nodepool +## Create GPU nodepool on existing OKE cluster -1. After the cluster is created, navigate to the **Node Pools** section. +1. For an existing OKE cluster, navigate to the **Node Pools** section. 2. Click **Add Node Pool** and configure: - **Name**: Provide a name for the node pool. + - **Compartment**: Select the appropriate compartment. + - **Version**: the Kubernetes version of the nodes - defaults to current cluster version. + - **Node Placement Configuration** - select Availability Domain and Worker node subnet. - **Node Shape**: Select the desired GPU-enabled shape. + - **Node Image**: is automatically populated with an OEL GPU image which you can change to a different version. - **Node Count**: Set the number of nodes (adjust according to your needs). - **Additional Configuration**: Customize as needed (e.g., OS disk size, SSH keys). 3. Click **Create Node Pool**. ## Connect to OKE 1. Install the OCI CLI if you haven't already. -2. Retrieve the OKE cluster credentials: +2. Retrieve the OKE cluster credentials using the Access Cluster button in the console Cluster details page: ```bash oci ce cluster create-kubeconfig --cluster-id --file $HOME/.kube/config --region --token-version 2.0.0 --kube-endpoint PUBLIC_ENDPOINT From 5d57eb29030d609ed299fb2f22aaa2692ee420fa Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:21:21 +0300 Subject: [PATCH 08/15] . 
--- .../oci/oke/setup/README.md | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 1abb6b96..848ef58f 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -73,30 +73,3 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need ```bash kubectl get nodes ``` - -## Install GPU Operator (Only if necessary) - -**Note:** If you're using an OCI GPU shape that comes with the drivers pre-installed (such as those having 'GPU' in their names, for example the ones in the `BM.GPU.A100` series), you can skip this section. The GPU drivers are already installed and configured. - -If your chosen shape does not include the GPU drivers, follow the steps below to install the NVIDIA GPU Operator. - -1. Add the NVIDIA Helm repository: - - ```bash - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --pass-credentials - helm repo update - ``` - -2. Install the GPU Operator in your OKE cluster: - - ```bash - helm install --create-namespace --namespace gpu-operator nvidia/gpu-operator --wait --generate-name - ``` - -3. Monitor the deployment to ensure everything is set up correctly: - - ```bash - kubectl get pods -n gpu-operator - ``` - -Official instructions for the NVIDIA GPU Operator can be found [here](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html). 
From 1ccb9d890f4b3af41396a7f9f5b42545301b4046 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:28:17 +0300 Subject: [PATCH 09/15] cloudinit --- cloud-service-providers/oci/oke/setup/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 848ef58f..28e7d781 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -56,7 +56,13 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need - **Node Shape**: Select the desired GPU-enabled shape. - **Node Image**: is automatically populated with an OEL GPU image which you can change to a different version. - **Node Count**: Set the number of nodes (adjust according to your needs). - - **Additional Configuration**: Customize as needed (e.g., OS disk size, SSH keys). + - **Boot volume**: Specify a larger size than the default 50GB size, for example 300 GB. To complement this change also go to the next point on custo cloudinit.sh. + - **Show advanced options** -> **Initialization script** -> **Paste Cloud-init Script** and paste: + ``` + #!/bin/bash + /usr/libexec/oci-growfs -y + + ``` 3. Click **Create Node Pool**. ## Connect to OKE From af97fcc04cc1a4bd97af4783bfddac0708ed5b78 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:32:15 +0300 Subject: [PATCH 10/15] . --- cloud-service-providers/oci/oke/setup/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 28e7d781..355265c0 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -61,7 +61,6 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need ``` #!/bin/bash /usr/libexec/oci-growfs -y - ``` 3. Click **Create Node Pool**. 
From f315dfc5a9097356ee2461725435aefd753850e4 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 10:45:23 +0300 Subject: [PATCH 11/15] . --- cloud-service-providers/oci/oke/setup/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 355265c0..3398cae5 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -60,6 +60,8 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need - **Show advanced options** -> **Initialization script** -> **Paste Cloud-init Script** and paste: ``` #!/bin/bash + curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh + bash /var/run/oke-init.sh /usr/libexec/oci-growfs -y ``` 3. Click **Create Node Pool**. From 637c4a295943489a28f62498e08669a4b79bdf8f Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 12:02:28 +0300 Subject: [PATCH 12/15] oke setup --- cloud-service-providers/oci/oke/setup/README.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md index 3398cae5..f0cdfbb1 100644 --- a/cloud-service-providers/oci/oke/setup/README.md +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -31,19 +31,22 @@ Ensure you have the necessary service limits (quota) for the GPU shapes. If need ## Create OKE +To easily create the OKE cluster and NVIDIA GPU nodepool, you can use Quick Create to initially set up the cluster with a default node pool that includes a single, simple VM node. After the cluster is created, you can add a new node pool with GPU shapes that require a larger boot volume size. This way, you can ensure the GPUs have the necessary storage while manually configuring the nodes as needed. 
+ 1. In the OCI Console, navigate to **Developer Services** > **Kubernetes Clusters** > **OKE Clusters**. -2. Click **Create Cluster** and select **Start with Quick Create**. +2. Click **Create Cluster** and select **Quick Create**. 3. Configure the following: - **Name**: Provide a name for your cluster. - **Compartment**: Select the appropriate compartment. - **Kubernetes Version**: Choose the latest stable version. - - **Shape**: Choose a shape with the desired GPU (e.g., `BM.GPU.A100.1`, `BM.GPU.A10.1`). -4. Under **Node Pool Configuration**: - - **Node Pool Name**: Name your node pool. - - **Shape**: Select the GPU shape identified earlier. + - **Kubernetes API endpoint**: Private or public. + - **Node type**: Managed. + - **Kubernetes worker nodes**: Private or public. +4. Under **Shape and image**: + - **Shape**: You can leave the default simple VM.Standard.x. - **Node Count**: Start with 1 node (adjust as needed). - - **Node Subnet**: Select a subnet within your VCN. -5. Click **Create Cluster** to start the provisioning process. + - **Add an SSH key**(optional): In order to have access to nodes. +5. Click **Create Cluster** to start the provisioning process. This will provision a simple cluster, to which you can subsequently add a GPU nodepool. 
## Create GPU nodepool on existing OKE cluster From 9cda673f62e37a97874eab54441e3283b113f3aa Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 13:04:46 +0300 Subject: [PATCH 13/15] tf readme --- .../oci/oke/terraform/README.md | 142 +++++ .../oci/oke/terraform/common.tf | 14 + .../oci/oke/terraform/datasources.tf | 49 ++ .../oci/oke/terraform/helm-deployments.tf | 192 ++++++ .../terraform/helm-module/helm-deployement.tf | 226 +++++++ .../oke/terraform/helm-module/variables.tf | 118 ++++ .../cert-manager-values.yaml.tpl | 2 + .../jupyterhub-values.yaml.tpl | 38 ++ .../nginx-values.yaml.tpl | 13 + .../helm-values-templates/nim-values.yaml.tpl | 16 + .../oci/oke/terraform/main.tf | 166 +++++ .../oci/oke/terraform/provider.tf | 24 + .../oci/oke/terraform/schema.yaml | 519 ++++++++++++++++ .../terraform/terrafom.auto.tfvars.example | 32 + .../oci/oke/terraform/tls.tf | 12 + .../oci/oke/terraform/variables.tf | 565 ++++++++++++++++++ 16 files changed, 2128 insertions(+) create mode 100644 cloud-service-providers/oci/oke/terraform/README.md create mode 100644 cloud-service-providers/oci/oke/terraform/common.tf create mode 100644 cloud-service-providers/oci/oke/terraform/datasources.tf create mode 100644 cloud-service-providers/oci/oke/terraform/helm-deployments.tf create mode 100644 cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf create mode 100644 cloud-service-providers/oci/oke/terraform/helm-module/variables.tf create mode 100644 cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl create mode 100644 cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl create mode 100644 cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl create mode 100644 cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl create mode 100644 cloud-service-providers/oci/oke/terraform/main.tf create mode 100644 
cloud-service-providers/oci/oke/terraform/provider.tf create mode 100644 cloud-service-providers/oci/oke/terraform/schema.yaml create mode 100644 cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example create mode 100644 cloud-service-providers/oci/oke/terraform/tls.tf create mode 100644 cloud-service-providers/oci/oke/terraform/variables.tf diff --git a/cloud-service-providers/oci/oke/terraform/README.md b/cloud-service-providers/oci/oke/terraform/README.md new file mode 100644 index 00000000..8934d3c0 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/README.md @@ -0,0 +1,142 @@ +# orm-stack-oke-helm-deployment-nim + +## Getting started + +This stack deploys an OKE cluster with two nodepools: +- one nodepool with flexible shapes +- one nodepool with GPU shapes + +And several supporting applications using helm: +- nginx +- cert-manager +- jupyterhub + +With the scope of demonstrating [NVIDIA NIM LLM](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html) self-hosted model capabilities. + +**Note:** For helm deployments it's necessary to create bastion and operator host (with the associated policy for the operator to manage the cluster), **or** configure a cluster with public API endpoint. + +In case the bastion and operator hosts are not created, it is a prerequisite to have the following tools already installed and configured: +- bash +- helm +- jq +- kubectl +- oci-cli + +[ +![Deploy to Oracle Cloud] +(https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg) +] +(https://cloud.oracle.com/resourcemanager/stacks/create +?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) + +## Helm Deployments + +### Nginx + +[Nginx](https://kubernetes.github.io/ingress-nginx/deploy/) is deployed and configured as default ingress controller. 
+ +### Cert-manager + +[Cert-manager](https://cert-manager.io/docs/) is deployed to handle the configuration of TLS certificate for the configured ingress resources. Currently it's using the [staging Let's Encrypt endpoint](https://letsencrypt.org/docs/staging-environment/). + +### Jupyterhub + +[Jupyterhub](https://jupyterhub.readthedocs.io/en/stable/) will be accessible to the address: [https://jupyter.a.b.c.d.nip.io](https://jupyter.a.b.c.d.nip.io), where a.b.c.d is the public IP address of the load balancer associated with the NGINX ingress controller. + +JupyterHub is using a dummy authentication scheme (user/password) and the access is secured using the variables: + +``` +jupyter_admin_user +jupyter_admin_password +``` + +It also supports the option to automatically clone a git repo when user is connecting and making it available under `examples` directory. + +### NIM + +The LLM is deployed using [NIM](https://docs.nvidia.com/nim/index.html). + +Parameters: +- `nim_image_repository` and `nim_image_tag` - used to specify the container image location +- `NGC_API_KEY` - required to authenticate with NGC services + +Models with large context length require GPUs with lots of memory. In case of Mistral, with a context length of 32k, the deployment on A10 instances, fails with the default container settings. + +To work around this issue, we can limit the context length using the `--max-model-len` argument for the vLLM. The underlying inference engine used by NIM. + +In case of Mistral models, create a file `nim_user_values_override.yaml` file with the content below and provide it as input during ORM stack variable configuration. + +## How to deploy? + +1. 
Deploy directly to OCI using the below button: + +[ +![Deploy to Oracle Cloud] +(https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg) +] +(https://cloud.oracle.com/resourcemanager/stacks/create +?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) + + +2. Deploy via ORM +- Create a new stack +- Upload the TF configuration files +- Configure the variables +- Apply + +3. Local deployment + +- Create a file called `terraform.auto.tfvars` with the required values. + +``` +# ORM injected values + +region = "uk-london-1" +tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaaiyavtwbz4kyu7g7b6wglllccbflmjx2lzk5nwpbme44mv54xu7dq" +compartment_ocid = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# OKE Terraform module values +create_iam_resources = false +create_iam_tag_namespace = false +ssh_public_key = "" + +## NodePool with non-GPU shape is created by default with size 1 +simple_np_flex_shape = { "instanceShape" = "VM.Standard.E4.Flex", "ocpus" = 2, "memory" = 16 } + +## NodePool with GPU shape is created by default with size 0 +gpu_np_size = 1 +gpu_np_shape = "VM.GPU.A10.1" + +## OKE Deployment values +cluster_name = "oke" +vcn_name = "oke-vcn" +compartment_id = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# Jupyter Hub deployment values +jupyter_admin_user = "oracle-ai" +jupyter_admin_password = "" +playbooks_repo = "https://github.com/robo-cap/llm-jupyter-notebooks.git" + +# NIM Deployment values +nim_image_repository = "nvcr.io/nim/meta/llama3-8b-instruct" +nim_image_tag = "latest" +NGC_API_KEY = "" +``` + +- Execute the commands + +``` +terraform init +terraform plan +terraform apply +``` + +After the deployment is successful, get the Jupyter URL from the Terraform output and run it in the browser. +Log in with the user/password that you previously set. +Open and run the **NVIDIA_NIM_model_interaction.ipynb** notebook. 
+ +## Known Issues + +If `terraform destroy` fails, manually remove the LoadBalancer resource configured for the Nginx Ingress Controller. + +After `terraform destroy`, the block volumes corresponding to the PVCs used by the applications in the cluster won't be removed. You have to manually remove them. \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/common.tf b/cloud-service-providers/oci/oke/terraform/common.tf new file mode 100644 index 00000000..a9a3dd33 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/common.tf @@ -0,0 +1,14 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + state_id = coalesce(var.state_id, random_string.state_id.id) +} + +resource "random_string" "state_id" { + length = 6 + lower = true + numeric = false + special = false + upper = false +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/datasources.tf b/cloud-service-providers/oci/oke/terraform/datasources.tf new file mode 100644 index 00000000..25a52e60 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/datasources.tf @@ -0,0 +1,49 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +data "oci_identity_tenancy" "tenant_details" { + + tenancy_id = var.tenancy_ocid +} + +data "oci_identity_regions" "home_region" { + + filter { + name = "key" + values = [data.oci_identity_tenancy.tenant_details.home_region_key] + } +} + +data "oci_identity_availability_domains" "ads" { + + compartment_id = var.tenancy_ocid +} + +data "oci_core_shapes" "gpu_shapes" { + for_each = { for entry in data.oci_identity_availability_domains.ads.availability_domains : entry.name => entry.id } + + compartment_id = var.compartment_id + availability_domain = each.key + + filter { + name = "name" + values = [var.gpu_np_shape] + } +} + +data "oci_load_balancer_load_balancers" "lbs" { + + compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + + filter { + name = "freeform_tags.state_id" + values = [local.state_id] + } + + filter { + name = "freeform_tags.application" + values = ["nginx"] + } + + depends_on = [module.nginx] +} diff --git a/cloud-service-providers/oci/oke/terraform/helm-deployments.tf b/cloud-service-providers/oci/oke/terraform/helm-deployments.tf new file mode 100644 index 00000000..cce93e9f --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-deployments.tf @@ -0,0 +1,192 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + deploy_from_operator = var.create_operator_and_bastion + deploy_from_local = alltrue([!local.deploy_from_operator, var.control_plane_is_public]) +} + +data "oci_containerengine_cluster_kube_config" "kube_config" { + count = local.deploy_from_local ? 1 : 0 + + cluster_id = module.oke.cluster_id + endpoint = "PUBLIC_ENDPOINT" +} + +module "nginx" { + count = var.deploy_nginx ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "ingress-nginx" + helm_chart_name = "ingress-nginx" + namespace = "nginx" + helm_repository_url = "https://kubernetes.github.io/ingress-nginx" + + pre_deployment_commands = [] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/nginx-values.yaml.tpl", + { + min_bw = 100, + max_bw = 100, + pub_lb_nsg_id = module.oke.pub_lb_nsg_id + state_id = local.state_id + } + ) + helm_user_values_override = try(base64decode(var.nginx_user_values_override), var.nginx_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + depends_on = [module.oke] +} + + +module "cert-manager" { + count = var.deploy_cert_manager ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "cert-manager" + helm_chart_name = "cert-manager" + namespace = "cert-manager" + helm_repository_url = "https://charts.jetstack.io" + + pre_deployment_commands = [] + post_deployment_commands = [ + "cat <<'EOF' | kubectl apply -f -", + "apiVersion: cert-manager.io/v1", + "kind: ClusterIssuer", + "metadata:", + " name: le-clusterissuer", + "spec:", + " acme:", + " # You must replace this email address with your own.", + " # Let's Encrypt will use this to contact you about expiring", + " # certificates, and issues related to your account.", + " email: user@oracle.com", + " server: https://acme-staging-v02.api.letsencrypt.org/directory", + " privateKeySecretRef:", + " # Secret resource that will be used to store the account's private key.", + " name: le-clusterissuer-secret", + " # Add a single challenge solver, HTTP01 using nginx", + " solvers:", + " - http01:", + " ingress:", + " ingressClassName: nginx", + "EOF" + ] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/cert-manager-values.yaml.tpl", + {} + ) + helm_user_values_override = try(base64decode(var.cert_manager_user_values_override), var.cert_manager_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke] +} + +module "jupyterhub" { + count = var.deploy_jupyterhub ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "jupyterhub" + helm_chart_name = "jupyterhub" + namespace = "default" + helm_repository_url = "https://hub.jupyter.org/helm-chart/" + + pre_deployment_commands = ["export PUBLIC_IP=$(kubectl get svc -A -l app.kubernetes.io/name=ingress-nginx -o json | jq -r '.items[] | select(.spec.type == \"LoadBalancer\") | .status.loadBalancer.ingress[].ip')"] + deployment_extra_args = [ + "--set ingress.hosts[0]=jupyter.$${PUBLIC_IP}.nip.io", + "--set ingress.tls[0].hosts[0]=jupyter.$${PUBLIC_IP}.nip.io", + "--set ingress.tls[0].secretName=jupyter-tls" + ] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/jupyterhub-values.yaml.tpl", + { + admin_user = var.jupyter_admin_user + admin_password = var.jupyter_admin_password + playbooks_repo = var.jupyter_playbooks_repo + } + ) + helm_user_values_override = try(base64decode(var.jupyterhub_user_values_override), var.jupyterhub_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke, module.nginx] +} + +module "nim" { + count = var.deploy_nim ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "llm" + helm_chart_name = "nim-llm" + namespace = "default" + helm_repository_url = "https://robo-cap.github.io/helm-charts/" + + pre_deployment_commands = [ + "export PUBLIC_IP=$(kubectl get svc -A -l app.kubernetes.io/name=ingress-nginx -o json | jq -r '.items[] | select(.spec.type == \"LoadBalancer\") | .status.loadBalancer.ingress[].ip')", + "kubectl get secret -n default nvcr-${local.state_id} || kubectl create secret docker-registry -n default nvcr-${local.state_id} --docker-server=nvcr.io --docker-username='${var.nvcr_username}' --docker-password='%{if length(var.nvcr_password) > 0}${var.nvcr_password}%{else}${var.NGC_API_KEY}%{endif}'", + "kubectl get secret -n default ngcapi-${local.state_id} || kubectl create secret generic -n default ngcapi-${local.state_id} --from-literal=NGC_CLI_API_KEY=${var.NGC_API_KEY}", + ] + deployment_extra_args = [ + "--set service.name=llm", + "--timeout 10m0s" + ] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/nim-values.yaml.tpl", + { + nvcr_secret = "nvcr-${local.state_id}", + ngcapi_secret = "ngcapi-${local.state_id}", + nim_image_repository = var.nim_image_repository + nim_image_tag = var.nim_image_tag + NGC_API_KEY = var.NGC_API_KEY + } + ) + helm_user_values_override = try(base64decode(var.nim_user_values_override), var.nim_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke, module.nginx] +} \ No newline at end of file diff --git 
a/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf b/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf new file mode 100644 index 00000000..644018fe --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf @@ -0,0 +1,226 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + operator_helm_values_path = coalesce(var.operator_helm_values_path, "/home/${var.operator_user}/tf-helm-values") + operator_helm_charts_path = coalesce(var.operator_helm_charts_path, "/home/${var.operator_user}/tf-helm-charts") + operator_helm_chart_path = "${local.operator_helm_charts_path}/${var.namespace}-${var.deployment_name}-${basename(var.helm_chart_path)}" + + helm_values_override_user_file = "${var.namespace}-${var.deployment_name}-user-values-override.yaml" + helm_values_override_template_file = "${var.namespace}-${var.deployment_name}-template-values-override.yaml" + + operator_helm_values_override_user_file_path = join("/", [local.operator_helm_values_path, local.helm_values_override_user_file]) + operator_helm_values_override_template_file_path = join("/", [local.operator_helm_values_path, local.helm_values_override_template_file]) + + local_helm_values_override_user_file_path = join("/", [path.root, "generated", local.helm_values_override_user_file]) + local_helm_values_override_template_file_path = join("/", [path.root, "generated", local.helm_values_override_template_file]) + + local_kubeconfig_path = "${path.root}/generated/kubeconfig-${var.namespace}-${var.deployment_name}" +} + +resource "null_resource" "copy_chart_top_operator" { + count = var.deploy_from_operator && var.helm_chart_path != "" ? 
1 : 0 + + triggers = { + helm_chart_path = var.helm_chart_path + } + + connection { + bastion_host = var.bastion_host + bastion_user = var.bastion_user + bastion_private_key = var.ssh_private_key + host = var.operator_host + user = var.operator_user + private_key = var.ssh_private_key + timeout = "40m" + type = "ssh" + } + + provisioner "remote-exec" { + inline = [ + "rm -rf ${local.operator_helm_chart_path}", + "mkdir -p ${local.operator_helm_charts_path}" + ] + } + + provisioner "file" { + source = var.helm_chart_path + destination = local.operator_helm_chart_path + } +} + +resource "null_resource" "helm_deployment_via_operator" { + count = var.deploy_from_operator ? 1 : 0 + + triggers = { + manifest_md5 = try(md5("${var.helm_template_values_override}-${var.helm_user_values_override}"), null) + deployment_name = var.deployment_name + namespace = var.namespace + bastion_host = var.bastion_host + bastion_user = var.bastion_user + ssh_private_key = var.ssh_private_key + operator_host = var.operator_host + operator_user = var.operator_user + } + + connection { + bastion_host = self.triggers.bastion_host + bastion_user = self.triggers.bastion_user + bastion_private_key = self.triggers.ssh_private_key + host = self.triggers.operator_host + user = self.triggers.operator_user + private_key = self.triggers.ssh_private_key + timeout = "40m" + type = "ssh" + } + + provisioner "remote-exec" { + inline = ["mkdir -p ${local.operator_helm_values_path}"] + } + + provisioner "file" { + content = var.helm_template_values_override + destination = local.operator_helm_values_override_template_file_path + } + + provisioner "file" { + content = var.helm_user_values_override + destination = local.operator_helm_values_override_user_file_path + } + + provisioner "remote-exec" { + inline = concat( + var.pre_deployment_commands, + [ + "if [ -s \"${local.operator_helm_values_override_user_file_path}\" ]; then", + join(" ", concat([ + "helm upgrade --install ${var.deployment_name}", + "%{if 
var.helm_chart_path != ""}${local.operator_helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif}", + "--namespace ${var.namespace} --create-namespace --wait", + "-f ${local.operator_helm_values_override_template_file_path}", + "-f ${local.operator_helm_values_override_user_file_path}" + ], var.deployment_extra_args)), + "else", + join(" ", concat([ + "helm upgrade --install ${var.deployment_name}", + "%{if var.helm_chart_path != ""}${local.operator_helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif}", + "--namespace ${var.namespace} --create-namespace --wait", + "-f ${local.operator_helm_values_override_template_file_path}" + ], var.deployment_extra_args)), + "fi" + ], + var.post_deployment_commands + ) + + } + + provisioner "remote-exec" { + when = destroy + inline = ["helm uninstall ${self.triggers.deployment_name} --namespace ${self.triggers.namespace} --wait"] + on_failure = continue + } + + lifecycle { + ignore_changes = [ + triggers["bastion_host"], + triggers["bastion_user"], + triggers["ssh_private_key"], + triggers["operator_host"], + triggers["operator_user"] + ] + } + + depends_on = [null_resource.copy_chart_top_operator] +} + + +resource "local_file" "helm_template_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.helm_template_values_override + filename = local.local_helm_values_override_template_file_path +} + + +resource "local_file" "helm_user_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.helm_user_values_override + filename = local.local_helm_values_override_user_file_path +} + +resource "local_file" "cluster_kube_config_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.kube_config + filename = local.local_kubeconfig_path +} + +resource "null_resource" "helm_deployment_from_local" { + count = var.deploy_from_local ? 
1 : 0 + + triggers = { + manifest_md5 = try(md5("${var.helm_template_values_override}-${var.helm_user_values_override}"), null) + deployment_name = var.deployment_name + namespace = var.namespace + kube_config = var.kube_config + } + + provisioner "local-exec" { + working_dir = path.root + command = <<-EOT + export KUBECONFIG=${local.local_kubeconfig_path} + ${join("\n", var.pre_deployment_commands)} + if [ -s "${local.local_helm_values_override_user_file_path}" ]; then + echo "" + echo "Terraform generated values:" + cat "${local.local_helm_values_override_template_file_path}" + echo "" + echo "User provided values:" + cat "${local.local_helm_values_override_user_file_path}" + echo "" + helm upgrade --install ${var.deployment_name} \ + %{if var.helm_chart_path != ""}${var.helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif} \ + --namespace ${var.namespace} \ + --create-namespace --wait \ + -f ${local.local_helm_values_override_template_file_path} \ + -f ${local.local_helm_values_override_user_file_path} ${join(" ", var.deployment_extra_args)} + else + echo "" + echo "Terraform generated values:" + cat "${local.local_helm_values_override_template_file_path}" + echo "" + helm upgrade --install ${var.deployment_name} \ + %{if var.helm_chart_path != ""}${var.helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif} \ + --namespace ${var.namespace} \ + --create-namespace --wait \ + -f ${local.local_helm_values_override_template_file_path} ${join(" ", var.deployment_extra_args)} + fi + ${join("\n", var.post_deployment_commands)} + EOT + } + + # This provisioner is not executed when the resource is commented out: https://github.com/hashicorp/terraform/issues/25073 + provisioner "local-exec" { + when = destroy + environment = { + kube_config = self.triggers.kube_config + } + working_dir = path.root + command = <<-EOT + mkdir -p ./generated; \ + echo "$kube_config" > 
./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy; \ + export KUBECONFIG=./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy; \ + helm uninstall ${self.triggers.deployment_name} --namespace ${self.triggers.namespace} --wait; \ + rm ./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy + EOT + on_failure = continue + } + lifecycle { + ignore_changes = [ + triggers["local_kubeconfig_path"] + ] + } + + depends_on = [local_file.cluster_kube_config_file, local_file.helm_template_file, local_file.helm_user_file] +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf b/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf new file mode 100644 index 00000000..664d8e34 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf @@ -0,0 +1,118 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +variable "deploy_from_local" { + type = bool + default = true + description = "Wether to attempt deployment of the helm charts using local-exec." +} + +variable "deploy_from_operator" { + type = bool + default = false + description = "Wether to attempt deployment of the helm charts using remote-exec." +} + +variable "deployment_name" { + type = string + default = false + description = "The name of the helm deployment." +} + +variable "namespace" { + type = string + default = "default" + description = "The kubernetes namespace to target for the helm deployment." +} + +variable "helm_chart_name" { + type = string + default = "" + description = "The name of the helm chart. Used together with `helm_repository_url` when helm_chart_path=''." 
+} + +variable "helm_chart_path" { + type = string + default = "" + description = "The path of the helm chart. If not empty will override the `helm_repository_url` and `helm_chart_name` values." +} + +variable "helm_repository_url" { + type = string + default = "" + description = "The helm chart repository url." +} + +variable "operator_helm_values_path" { + type = string + default = "" + description = "The directory on the operator host where to push the values-override for the helm chart." +} + +variable "operator_helm_charts_path" { + type = string + default = "" + description = "The directory on the operator host where to push the helm-charts when `helm_chart_path` is not empty." +} + +variable "helm_template_values_override" { + type = string + description = "The values-override file content populated using terraform templates." +} + +variable "helm_user_values_override" { + type = string + description = "The values-override file provided by the user as variable." +} +variable "pre_deployment_commands" { + type = list(string) + default = [] + description = "List of commands to be executed before attempting the helm deployment." +} +variable "post_deployment_commands" { + type = list(string) + default = [] + description = "List of commands to be executed after the helm deployment." +} + +variable "deployment_extra_args" { + type = list(string) + default = [] + description = "List of arguments to be appended to the helm upgrade --install command." +} + +variable "kube_config" { + type = string + default = "" + description = "The Kubeconfig file content to use for helm deployments using local-exec." +} + +variable "bastion_host" { + type = string + default = null + description = "The IP address of the bastion host." +} + +variable "bastion_user" { + type = string + default = "opc" + description = "The user to be used for SSH connection to the bastion host." 
+} + +variable "ssh_private_key" { + type = string + default = null + description = "The SSH private key to be used for connection to operator/bastion hosts." +} + +variable "operator_host" { + type = string + default = null + description = "The IP address of the operator host." +} + +variable "operator_user" { + type = string + default = "opc" + description = "The user to be used for SSH connection to the operator host." +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl new file mode 100644 index 00000000..cae8d95b --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl @@ -0,0 +1,2 @@ +crds: + enabled: true \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl new file mode 100644 index 00000000..7be38b31 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl @@ -0,0 +1,38 @@ +--- +singleuser: + defaultUrl: "/lab" + extraEnv: + JUPYTERHUB_SINGLEUSER_APP: "jupyter_server.serverapp.ServerApp" + %{ if playbooks_repo != "" } + lifecycleHooks: + postStart: + exec: + command: + [ + "/bin/sh", + "-c", + "gitpuller ${playbooks_repo} main examples" + ] + %{ endif } + cloudMetadata: + blockWithIptables: false + +hub: + config: + Authenticator: + admin_users: + - ${admin_user} + DummyAuthenticator: + password: '${admin_password}' + JupyterHub: + authenticator_class: dummy + +ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: "le-clusterissuer" + +proxy: + service: + type: ClusterIP \ No newline at end of file diff --git 
a/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl new file mode 100644 index 00000000..6b83ac47 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl @@ -0,0 +1,13 @@ +controller: + service: + targetPorts: + http: http + https: https + annotations: + oci.oraclecloud.com/load-balancer-type: "lb" + service.beta.kubernetes.io/oci-load-balancer-shape: "flexible" + service.beta.kubernetes.io/oci-load-balancer-shape-flex-min: "${min_bw}" + service.beta.kubernetes.io/oci-load-balancer-shape-flex-max: "${max_bw}" + service.beta.kubernetes.io/oci-load-balancer-security-list-management-mode: "None" + oci.oraclecloud.com/oci-network-security-groups: "${pub_lb_nsg_id}" + oci.oraclecloud.com/initial-freeform-tags-override: '{"state_id": "${state_id}", "application": "nginx", "role": "service_lb"}' \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl new file mode 100644 index 00000000..54f99e28 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl @@ -0,0 +1,16 @@ + +imagePullSecrets: +- name: ${nvcr_secret} + +model: + ngcAPISecret: ${ngcapi_secret} + +image: + repository: ${nim_image_repository} + tag: ${nim_image_tag} + +resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/main.tf b/cloud-service-providers/oci/oke/terraform/main.tf new file mode 100644 index 00000000..c964af30 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/main.tf @@ -0,0 +1,166 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + ads_supporting_gpu_shape = [for key, value in data.oci_core_shapes.gpu_shapes : key if length(value.shapes) > 0] + gpu_np_placement_ads = ( + length(var.gpu_np_placement_ads) > 0 ? + [for entry in var.gpu_np_placement_ads : length(tostring(entry)) > 1 ? substr(tostring(entry), -1, -1) : tonumber(entry)] : + [for entry in local.ads_supporting_gpu_shape : substr(tostring(entry), -1, -1)] + ) +} + +# This module is using a modified version of the oke-terraform-module to address the compatibility with Terraform versions < 1.3.0 +# The OKE module documentation is available here: https://oracle-terraform-modules.github.io/terraform-oci-oke/ +module "oke" { + source = "git::https://github.com/robo-cap/terraform-oci-oke.git?ref=v5.1.8-ai" + + # source = "oracle-terraform-modules/oke/oci" + # version = "5.1.8" + + providers = { + oci.home = oci.home + } + + state_id = local.state_id + # IAM + tenancy_id = var.tenancy_ocid + compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + network_compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + create_iam_resources = var.create_iam_resources + create_iam_worker_policy = var.create_iam_worker_policy + create_iam_autoscaler_policy = var.create_iam_autoscaler_policy + create_iam_operator_policy = var.create_iam_operator_policy + create_iam_kms_policy = var.create_iam_kms_policy + create_iam_tag_namespace = var.create_iam_tag_namespace + create_iam_defined_tags = var.create_iam_defined_tags + # it's recommended to create the following tag namespace and tag keys outside of the oke module + # tag namespace: oke + # tag keys: state_id, role, pool, cluster_autoscaler + use_defined_tags = var.use_defined_tags + tag_namespace = var.tag_namespace + freeform_tags = var.freeform_tags + + defined_tags = var.defined_tags + + # Common + ssh_private_key = 
tls_private_key.stack_key.private_key_openssh + ssh_public_key = local.bundled_ssh_public_keys + + # Bastion variables + create_bastion = var.create_operator_and_bastion + bastion_allowed_cidrs = var.bastion_allowed_cidrs + bastion_image_os = var.bastion_image_os + bastion_image_os_version = var.bastion_image_os_version + bastion_image_type = var.bastion_image_type + bastion_image_id = var.bastion_image_id + bastion_user = var.bastion_user + + # Operator variables + create_operator = var.create_operator_and_bastion + create_operator_policy_to_manage_cluster = var.create_operator_policy_to_manage_cluster + operator_image_os = var.operator_image_os + operator_image_os_version = var.operator_image_os_version + operator_image_type = var.operator_image_type + operator_image_id = var.operator_image_id + operator_install_kubectl_from_repo = var.operator_install_kubectl_from_repo + operator_user = var.operator_user + + # Network variables + create_vcn = var.create_vcn + lockdown_default_seclist = true # *true/false + vcn_id = var.vcn_id # Ignored if create_vcn = true + vcn_cidrs = [var.cidr_vcn] # Ignored if create_vcn = false + vcn_name = var.vcn_name # Ignored if create_vcn = false + ig_route_table_id = var.ig_route_table_id + nat_route_table_id = var.nat_route_table_id + + + subnets = { + bastion = { cidr = var.cidr_bastion_subnet } + operator = { cidr = var.cidr_operator_subnet } + cp = { cidr = var.cidr_cp_subnet } + int_lb = { cidr = var.cidr_int_lb_subnet } + pub_lb = { cidr = var.cidr_pub_lb_subnet } + workers = { cidr = var.cidr_workers_subnet } + pods = { cidr = var.cidr_pods_subnet } + } + + nat_gateway_route_rules = [ + # { + # destination = "192.168.0.0/16" + # destination_type = "CIDR_BLOCK" + # network_entity_id = "drg" + # description = "Terraformed - 192/16 to DRG" + # }, + ] + + # Cluster variables + create_cluster = var.create_cluster // *true/false + cluster_name = var.cluster_name + cluster_type = var.cluster_type + cni_type = var.cni_type // 
*flannel/npn + kubernetes_version = var.kubernetes_version + pods_cidr = var.pods_cidr + services_cidr = var.services_cidr + control_plane_is_public = var.control_plane_is_public + assign_public_ip_to_control_plane = var.control_plane_is_public + load_balancers = var.load_balancers + preferred_load_balancer = var.preferred_load_balancer + control_plane_allowed_cidrs = var.control_plane_allowed_cidrs + allow_rules_public_lb = { + "Allow TCP ingress to public load balancers for SSL traffic from anywhere" : { + protocol = 6, port = 443, source = "0.0.0.0/0", source_type = "CIDR_BLOCK", + }, + "Allow TCP ingress to public load balancers for HTTP traffic from anywhere" : { + protocol = 6, port = 80, source = "0.0.0.0/0", source_type = "CIDR_BLOCK", + } + } + + worker_pools = { + simple-np = { + description = "Worker nodes for the OKE cluster.", + size = var.simple_np_size + os = "Oracle Linux", + os_version = "8", + image_type = "oke", + image_id = "ocid1.image...", + shape = lookup(var.simple_np_flex_shape, "instanceShape", "VM.Standard.E4.Flex"), + ocpus = lookup(var.simple_np_flex_shape, "ocpus", 2), + memory = lookup(var.simple_np_flex_shape, "memory", 16) + boot_volume_size = var.simple_np_boot_volume_size + }, + gpu-np = { + description = "Worker nodes with GPU for the OKE cluster.", + size = var.gpu_np_size, + os = "Oracle Linux", + os_version = "8", + image_type = "oke", + image_id = "ocid1.image...", + shape = var.gpu_np_shape, + boot_volume_size = var.gpu_np_boot_volume_size + placement_ads = local.gpu_np_placement_ads + } + } + + output_detail = true +} + +output "bastion" { + value = "%{if var.create_operator_and_bastion}${module.oke.bastion_public_ip}%{else}bastion host not created.%{endif}" +} + +output "operator" { + value = "%{if var.create_operator_and_bastion}${module.oke.operator_private_ip}%{else}operator host not created.%{endif}" +} + +output "ssh_to_operator" { + value = "%{if 
var.create_operator_and_bastion}${module.oke.ssh_to_operator}%{else}bastion and operator hosts not created.%{endif}" +} + +output "jupyter_hub_url" { + value = (var.deploy_nginx && var.deploy_jupyterhub && length(coalesce(data.oci_load_balancer_load_balancers.lbs.load_balancers, [])) > 0 ? + "https://jupyter.${data.oci_load_balancer_load_balancers.lbs.load_balancers[0].ip_addresses[0]}.nip.io" : + "" + ) +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/provider.tf b/cloud-service-providers/oci/oke/terraform/provider.tf new file mode 100644 index 00000000..ce9697e7 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/provider.tf @@ -0,0 +1,24 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +provider "oci" { + alias = "home" + region = lookup(data.oci_identity_regions.home_region.regions[0], "name") +} + +provider "oci" { + region = var.region +} + +terraform { + required_version = ">= 1.3.0" + + required_providers { + + oci = { + configuration_aliases = [oci.home] + source = "oracle/oci" + version = ">= 4.119.0" + } + } +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/schema.yaml b/cloud-service-providers/oci/oke/terraform/schema.yaml new file mode 100644 index 00000000..c6c3e396 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/schema.yaml @@ -0,0 +1,519 @@ +title: OKE AI Stack deployment +description: Terraform Stack for OKE deployment with Nginx, Jupyter Notebook and NIM. +informationalText: Terraform Stack for OKE deployment with Nginx, Jupyter Notebook and NIM. +schemaVersion: 1.1.0 +version: "20190304" + +# URL of Logo Icon used on Application Information tab. Logo must be 130x130 pixels. 
+# (Optional) +logoUrl: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/laivzo5VNcM9uZf9O6ftZb4-QTPOcszBFwtfu7AGOtbINpfSDKqbJAnSNIRDjtX6/n/orasenatdpltintegration03/b/default-bucket/o/oracle_icon.jpg + + +locale: "en" +variableGroups: + - title: "Hidden RMS variables" + visible: false + variables: + - tenancy_ocid + - compartment_ocid + - region + - user_ocid + - current_user_ocid + + - title: "Hidden stack variables" + visible: false + variables: + - create_iam_resources + - create_iam_tag_namespace + - create_iam_defined_tags + - use_defined_tags + - tag_namespace + - bastion_allowed_cidrs + - bastion_image_os + - bastion_image_os_version + - bastion_image_type + - bastion_image_id + - bastion_user + - operator_image_os + - operator_image_os_version + - operator_image_type + - operator_image_id + - operator_user + - control_plane_allowed_cidrs + - create_cluster + - operator_allowed_cidrs + - state_id + - nvcr_username + - nvcr_password + - operator_install_kubectl_from_repo + - cluster_type + - load_balancers + - preferred_load_balancer + - pods_cidr + - services_cidr + - create_iam_autoscaler_policy + - create_iam_kms_policy + - create_iam_operator_policy + - create_iam_worker_policy + - defined_tags + - freeform_tags + + + - title: "General configuration" + visible: true + variables: + - compartment_id + - cluster_name + - show_advanced_oke + - kubernetes_version + - cni_type + + - title: "Networking configuration" + visible: true + variables: + - create_vcn + - vcn_name + - vcn_id + - ig_route_table_id + - nat_route_table_id + - cidr_vcn + - cidr_bastion_subnet + - cidr_operator_subnet + - cidr_cp_subnet + - cidr_int_lb_subnet + - cidr_pub_lb_subnet + - cidr_workers_subnet + - cidr_pods_subnet + + - title: "Kubernetes nodepools configuration" + visible: true + variables: + - simple_np_size + - simple_np_flex_shape + - simple_np_boot_volume_size + - gpu_np_size + - gpu_np_shape + - gpu_np_boot_volume_size + + - title: "Access to the Kubernetes cluster" 
+ visible: true + variables: + - create_operator_and_bastion + - create_operator_policy_to_manage_cluster + - control_plane_is_public + - ssh_public_key + + + - title: "Helm Chart deployments" + visible: true + variables: + - deploy_nginx + - nginx_user_values_override + - deploy_cert_manager + - cert_manager_user_values_override + - deploy_jupyterhub + - jupyterhub_user_values_override + - jupyter_admin_user + - jupyter_admin_password + - jupyter_playbooks_repo + - deploy_nim + - nim_user_values_override + - nim_image_repository + - nim_image_tag + - NGC_API_KEY + +variables: + create_iam_resources: + type: bool + default: false + visible: false + + create_iam_tag_namespace: + type: bool + default: false + visible: false + + create_iam_defined_tags: + type: bool + default: false + visible: false + + use_defined_tags: + type: bool + default: false + visible: false + + compartment_id: + type: oci:identity:compartment:id + title: Deployment compartment target + description: Please select the compartment where the resources will be created + required: true + + cluster_name: + type: string + minLength: 3 + maxLength: 40 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: Cluster Name + description: The name of the OKE cluster. + default: oke + required: true + + show_advanced_oke: + title: Show advanced OKE settings + description: Expand options to set advanced OKE settings + type: boolean + default: false + + kubernetes_version: + type: enum + title: Kubernetes version + description: The version of the Kubernetes cluster. + default: v1.30.1 + enum: + - v1.30.1 + - v1.29.1 + - v1.28.2 + allowMultiple: false + required: true + visible: ${show_advanced_oke} + + cni_type: + type: enum + title: Kubernetes cluster networking type + description: The networking to be used with the OKE cluster. 
+ default: flannel + enum: + - flannel + - npn + allowMultiple: false + required: true + visible: ${show_advanced_oke} + + create_vcn: + title: Create new VCN + description: Create new VCN for the OKE cluster. + type: boolean + default: true + + vcn_name: + type: string + minLength: 3 + maxLength: 40 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: VCN Name + description: The name of VCN. + default: oke-vcn + required: true + visible: ${create_vcn} + + vcn_id: + title: Select VCN for the OKE cluster + description: Select the existing VCN for the OKE cluster. + type: oci:core:vcn:id + dependsOn: + compartmentId: ${compartment_id} + visible: + not: + - ${create_vcn} + + ig_route_table_id: + title: Public subnet Route Table OCID + description: OCID of the route table for public subnets from the selected VCN. + type: string + visible: + not: + - ${create_vcn} + default: "" + + nat_route_table_id: + title: Private subnet Route Table OCID + description: OCID of the route table for private subnets from the selected VCN. + type: string + visible: + not: + - ${create_vcn} + default: "" + + cidr_vcn: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: VCN CIDR Block + description: The CIDR block to use with the new VCN. + default: 10.0.0.0/16 + required: true + visible: ${create_vcn} + + cidr_bastion_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: Bastion subnet CIDR + description: The CIDR block used for the bastion subnet. + default: 10.0.0.0/29 + required: true + + cidr_operator_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: Operator subnet CIDR + description: The CIDR block used for the operator subnet. 
+ default: 10.0.0.64/29 + required: true + + cidr_cp_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: OKE Control Plane subnet CIDR + description: The CIDR block used for the OKE Control Plane subnet. + default: 10.0.0.8/29 + required: true + + cidr_int_lb_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Internal LBs subnet CIDR + description: The CIDR block used for the OKE Internal Load Balancers subnet. + default: 10.0.0.32/27 + required: true + + cidr_pub_lb_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Public LBs subnet CIDR + description: The CIDR block used for the OKE Public Load Balancers subnet. + default: 10.0.128.0/27 + required: true + + cidr_workers_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Worker nodes subnet CIDR + description: The CIDR block used for the OKE Workers subnet. + default: 10.0.144.0/20 + required: true + + cidr_pods_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Pods subnet CIDR + description: The CIDR block used for the OKE Pods subnet (will be used only if networking type is NPN). + default: 10.0.64.0/18 + required: true + visible: + eq: + - ${cni_type} + - npn + + control_plane_is_public: + title: Create public OKE API? + description: Associate a public IP address with the OKE control plane endpoint? This is required for Helm deployments when bastion and operator hosts are not created. 
+ type: boolean + default: true + + ssh_public_key: + title: SSH public key + description: Public SSH key to be included in the ~/.ssh/authorized_keys file for the bastion, operator and worker nodes. + # renders variable as an SSH key control + type: oci:core:ssh:publickey + required: false + + simple_np_flex_shape: + title: Generic Nodepool shape + type: oci:core:instanceshapewithflex:name + dependsOn: + compartmentId: ${compartment_id} + required: true + + simple_np_size: + title: Generic Nodepool size + type: integer + minimum: 1 + required: true + default: 1 + + simple_np_boot_volume_size: + title: Generic Nodepool boot volume size + description: The size of the boot volume for the nodes in the Generic nodepool. + type: integer + minimum: 50 + required: true + default: 50 + + gpu_np_shape: + title: GPU Nodepool shape + type: enum + enum: + - VM.GPU.A10.1 + - VM.GPU.A10.2 + - VM.GPU2.1 + - VM.GPU3.1 + - VM.GPU3.2 + - VM.GPU3.4 + - BM.GPU2.2 + - BM.GPU3.8 + - BM.GPU4.8 + - BM.GPU.A10.4 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + default: VM.GPU.A10.1 + + gpu_np_size: + title: GPU Nodepool size + type: integer + minimum: 0 + required: true + default: 1 + + gpu_np_boot_volume_size: + title: GPU Nodepool boot volume size + description: The size of the boot volume for the nodes in the GPU nodepool. + type: integer + minimum: 50 + required: true + default: 100 + + create_operator_and_bastion: + title: Create bastion and operator hosts + description: Bastion and operator hosts are required for successful helm chart deployment when the OKE cluster endpoint is private. + type: boolean + default: true + + create_operator_policy_to_manage_cluster: + type: boolean + default: true + title: Create operator IAM policy + description: Create minimal IAM policy to allow the operator host to manage the OKE cluster. The policy is required for successful helm chart deployment via the operator host. 
+ visible: ${create_operator_and_bastion} + + deploy_nginx: + type: boolean + default: true + title: Helm | Deploy Nginx ingress controller + description: Nginx ingress controller is used to expose the OKE services to the user. + visible: true + + nginx_user_values_override: + type: file + title: Helm | Nginx Ingress Controller helm chart values override + description: Override the values for the Nginx Ingress Controller Helm chart . + visible: ${deploy_nginx} + + deploy_cert_manager: + type: boolean + default: true + title: Helm | Deploy Cert-Manager + description: Cert-manager is used to generate TLS certificates for the ingress resources. + visible: true + + cert_manager_user_values_override: + type: file + title: Helm | Cert-Manager helm chart values override + description: Override the values for the Cert-Manager chart . + visible: ${deploy_cert_manager} + + deploy_jupyterhub: + type: boolean + default: true + title: Helm | Deploy JupyterHub + description: JupyterHub provides a web accessible Python environment where demos can be easily executed. + visible: true + + jupyterhub_user_values_override: + type: file + title: Helm | JupyterHub helm chart values override + description: Override the values for the JupyterHub Helm chart . + visible: ${deploy_jupyterhub} + + jupyter_admin_user: + type: string + minLength: 3 + maxLength: 16 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: Helm | JupyterHub - Admin User + description: The admin user to connect to the Jupyter Notebooks. + default: oracle-ai + required: true + visible: ${deploy_jupyterhub} + + jupyter_admin_password: + type: password + pattern: "^[\\S]{6,16}$" + title: Helm | JupyterHub - Admin Password + description: "The admin password to connect to the Jupyter Notebooks. 
(min: 6, max: 16 characters)" + confirmation: true + required: true + visible: ${deploy_jupyterhub} + + jupyter_playbooks_repo: + type: string + title: Helm | JupyterHub - Playbooks Git Repo + description: "Git Repository with Jupyter plabooks examples." + default: "https://github.com/ionut-sturzu/nim_notebooks.git" + visible: ${deploy_jupyterhub} + + deploy_nim: + type: boolean + default: true + title: Helm | Deploy nVidia NIM LLM + description: NIM is NVIDIA's set of accelerated inference microservices that allow organizations to run AI models on NVIDIA GPUs anywhere. + visible: true + + nim_user_values_override: + type: file + title: Helm | NIM helm chart values override + description: Override the values for the NIM Helm chart . + visible: ${deploy_nim} + + nim_image_repository: + type: string + title: Helm | NIM - Container image repo + description: The NIM container image repository. + default: "nvcr.io/nim/meta/llama3-8b-instruct" + visible: ${deploy_nim} + required: true + + nim_image_tag: + type: string + title: Helm | NIM - Container image tag + description: The NIM container image tag. + default: "latest" + visible: ${deploy_nim} + required: true + + NGC_API_KEY: + type: password + title: Helm | NIM - NGC API KEY + description: API key to authenticate to the NGC service. + visible: ${deploy_nim} + +outputs: + bastion: + title: Bastion public IP + description: The bastion host public IP address. + type: copyableString + + operator: + title: Operator public IP + description: The operator host public IP address. + sensitive: true + type: copyableString + visible: false + + ssh_to_operator: + title: Command to connect to operator host. + description: The ssh command to connect to the operator host via the bastion. 
+ sensitive: true + type: copyableString + + jupyter_hub_url: + title: URL to access JupyterHub + type: link + displayText: URL to access JupyterHub + +outputGroups: +- title: "Access details" + outputs: + - ${bastion} + - ${operator} + - ${ssh_to_operator} + - ${jupyter_hub_url} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example b/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example new file mode 100644 index 00000000..eead931e --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example @@ -0,0 +1,32 @@ +# ORM injected values + +region = "uk-london-1" +tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaaiyavtwbz4kyu7g7b6wglllccbflmjx2lzk5nwpbme44mv54xu7dq" +compartment_ocid = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# OKE Terraform module values +create_iam_resources = false +create_iam_tag_namespace = false +ssh_public_key = "" + +## NodePool with non-GPU shape is created by default with size 1 +simple_np_flex_shape = { "instanceShape" = "VM.Standard.E4.Flex", "ocpus" = 2, "memory" = 12 } + +## NodePool with GPU shape is created by default with size 0 +gpu_np_size = 1 +gpu_np_shape = "VM.GPU.A10.1" + +## OKE Deployment values +cluster_name = "oke" +vcn_name = "oke-vcn" +compartment_id = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# Jupyter Hub deployment values +jupyter_admin_user = "oracle-ai" +jupyter_admin_password = "" +jupyter_playbooks_repo = "https://github.com/ionut-sturzu/nim_notebooks.git" + +# NIM Deployment values +nim_image_repository = "nvcr.io/nim/meta/llama3-8b-instruct" +nim_image_tag = "latest" +NGC_API_KEY = "" \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/tls.tf b/cloud-service-providers/oci/oke/terraform/tls.tf new file mode 100644 index 00000000..2200cf43 --- /dev/null +++ 
b/cloud-service-providers/oci/oke/terraform/tls.tf @@ -0,0 +1,12 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + user_public_ssh_key = chomp(var.ssh_public_key) + bundled_ssh_public_keys = "${local.user_public_ssh_key}\n${chomp(tls_private_key.stack_key.public_key_openssh)}" +} + +resource "tls_private_key" "stack_key" { + algorithm = "RSA" + rsa_bits = 4096 +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/variables.tf b/cloud-service-providers/oci/oke/terraform/variables.tf new file mode 100644 index 00000000..555fd416 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/variables.tf @@ -0,0 +1,565 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +### ORM Variables +variable "compartment_ocid" { + type = string + default = null + description = "A compartment OCID automatically populated by Resource Manager." +} + +variable "current_user_ocid" { + type = string + default = null + description = "A user OCID automatically populated by Resource Manager." +} + +variable "region" { + type = string + default = null + description = "The OCI region where OKE resources will be created." +} + +variable "tenancy_ocid" { + type = string + default = null + description = "The tenancy id of the OCI Cloud Account in which to create the resources." +} + + +### OKE Module - Common Variables +variable "create_operator_and_bastion" { + type = bool + default = true + description = "Whether to create bastion and operator host." +} + +variable "ssh_public_key" { + type = string + description = "The contents of the SSH public key file. Used to allow login for workers/bastion/operator. 
This public key " +} + +variable "state_id" { + type = string + default = null + description = "Optional Terraform state_id used to identify the resources of this deployment." +} + + +### OKE Module - Bastion Variables +variable "bastion_allowed_cidrs" { + type = list(string) + default = ["0.0.0.0/0"] + description = "A list of CIDR blocks to allow SSH access to the bastion host." +} + +variable "bastion_image_os" { + type = string + default = "Oracle Linux" + description = "Image ID for created bastion instance." +} + +variable "bastion_image_os_version" { + type = string + default = "8" + description = "Bastion image operating system version when bastion_image_type = 'platform'." +} + +variable "bastion_image_type" { + type = string + default = "platform" + description = "Whether to use a platform or custom image for the created bastion instance. When custom is set, the bastion_image_id must be specified." + + validation { + condition = contains(["platform", "custom"], var.bastion_image_type) + error_message = "The bastion_image_type can be only `platform` or `custom`." + } +} + +variable "bastion_image_id" { + type = string + default = null + description = "Image ID for created bastion instance." +} + +variable "bastion_user" { + type = string + default = "opc" + description = "User for SSH access through bastion host." +} + + +### OKE Module - Operator Variables +variable "create_operator_policy_to_manage_cluster" { + default = true + description = "Whether to create minimal IAM policy to allow the operator host to manage the cluster." + type = bool +} + +variable "operator_allowed_cidrs" { + type = list(string) + default = ["0.0.0.0/0"] + description = "List with allowed CIDR blocks to connect to the operator host." +} + +variable "operator_image_os" { + type = string + default = "Oracle Linux" + description = "Operator image operating system name when operator_image_type = 'platform'." 
+} + +variable "operator_image_os_version" { + type = string + default = "8" + description = "Operator image operating system version when operator_image_type = 'platform'." +} + +variable "operator_image_type" { + type = string + default = "platform" + description = "Whether to use a platform or custom image for the created operator instance. When custom is set, the operator_image_id must be specified." + + validation { + condition = contains(["platform", "custom"], var.operator_image_type) + error_message = "The operator_image_type can be only `platform` or `custom`." + } +} + +variable "operator_install_kubectl_from_repo" { + default = false + description = "Whether to install kubectl on the created operator host from olcne repo." + type = bool +} + +variable "operator_image_id" { + type = string + default = null + description = "Image ID for created operator instance." +} + +variable "operator_user" { + type = string + default = "opc" + description = "User for SSH access to operator host." +} + + +### OKE Module - Cluster Variables +variable "create_cluster" { + type = bool + default = true + description = "Whether to create the OKE cluster and dependent resources." +} + +variable "cluster_name" { + type = string + default = "oke" + description = "The name of oke cluster." +} + +variable "cluster_type" { + default = "enhanced" + description = "The cluster type. See Working with Enhanced Clusters and Basic Clusters for more information." + type = string + validation { + condition = contains(["basic", "enhanced"], lower(var.cluster_type)) + error_message = "Accepted values are 'basic' or 'enhanced'." + } +} + +variable "cni_type" { + type = string + default = "flannel" + description = "The CNI for the cluster: 'flannel' or 'npn'. See Pod Networking." + + validation { + condition = contains(["flannel", "npn"], var.cni_type) + error_message = "The cni_type can be only `flannel` or `npn`." 
+ } +} + +variable "control_plane_is_public" { + type = bool + default = true + description = "Whether the Kubernetes control plane endpoint should be allocated a public IP address to enable access over public internet." +} + +variable "control_plane_allowed_cidrs" { + type = list(string) + default = ["0.0.0.0/0"] + description = "The list of CIDR blocks from which the control plane can be accessed." +} + +variable "create_vcn" { + type = bool + default = true + description = "Whether to create a Virtual Cloud Network." +} + +variable "kubernetes_version" { + type = string + default = "v1.30.1" + description = "The version of kubernetes to use when provisioning OKE." +} + +variable "load_balancers" { + default = "both" + description = "The type of subnets to create for load balancers." + type = string + validation { + condition = contains(["public", "internal", "both"], var.load_balancers) + error_message = "Accepted values are public, internal or both." + } +} + +variable "preferred_load_balancer" { + default = "public" + description = "The preferred load balancer subnets that OKE will automatically choose when creating a load balancer. Valid values are 'public' or 'internal'. If 'public' is chosen, the value for load_balancers must be either 'public' or 'both'. If 'private' is chosen, the value for load_balancers must be either 'internal' or 'both'. NOTE: Service annotations for internal load balancers must still be specified regardless of this setting. See Load Balancer Annotations for more information." + type = string + validation { + condition = contains(["public", "internal"], var.preferred_load_balancer) + error_message = "Accepted values are public or internal." + } +} + +variable "pods_cidr" { + default = "10.244.0.0/16" + description = "The CIDR range used for IP addresses by the pods. A /16 CIDR is generally sufficient. This CIDR should not overlap with any subnet range in the VCN (it can also be outside the VCN CIDR range). 
Ignored when cni_type = 'npn'." + type = string +} + +variable "services_cidr" { + default = "10.96.0.0/16" + description = "The CIDR range used within the cluster by Kubernetes services (ClusterIPs). This CIDR should not overlap with the VCN CIDR range." + type = string +} + + +### OKE Module - IAM Variables +variable "compartment_id" { + type = string + default = null + description = "The compartment id where resources will be created." +} + +variable "create_iam_resources" { + type = bool + default = false + description = "Whether to create IAM dynamic groups, policies, and defined tags." +} + +variable "create_iam_defined_tags" { + type = bool + default = false + description = "Whether to create defined tag keys." +} + +variable "create_iam_autoscaler_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for Cluster Autoscaler management. Depends on configuration of associated component when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_autoscaler_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_kms_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for cluster autoscaler. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_kms_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_operator_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for operator access to the OKE control plane. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." 
+ type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_operator_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_worker_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for self-managed worker nodes. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_worker_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_tag_namespace" { + type = bool + default = false + description = "Whether to create defined tag namespace." +} + +variable "defined_tags" { + default = { + bastion = {} + cluster = {} + iam = {} + network = {} + operator = {} + persistent_volume = {} + service_lb = {} + workers = {} + } + description = "Defined tags to be applied to created resources. Must already exist in the tenancy." + type = any +} + +variable "freeform_tags" { + default = { + bastion = {} + cluster = {} + iam = {} + network = {} + operator = {} + persistent_volume = {} + service_lb = {} + workers = {} + } + description = "Freeform tags to be applied to created resources." + type = any +} + +variable "tag_namespace" { + type = string + default = "oke" + description = "The tag namespace to use if use_defined_tags=true." +} + +variable "use_defined_tags" { + type = bool + default = false + description = "Whether to set defined tags on the created resources. By default only free-form tags are used." +} + + +### OKE Module - Network Variables +variable "cidr_vcn" { + type = string + default = "10.0.0.0/16" + description = "The IPv4 CIDR block the VCN will use." +} + +variable "cidr_bastion_subnet" { + type = string + default = "10.0.0.0/29" + description = "The IPv4 CIDR block to be used for the bastion subnet." 
+} + +variable "cidr_operator_subnet" { + type = string + default = "10.0.0.64/29" + description = "The IPv4 CIDR block to be used for the operator subnet." +} + +variable "cidr_cp_subnet" { + type = string + default = "10.0.0.8/29" + description = "The IPv4 CIDR block to be used for the OKE control plane endpoint." +} + +variable "cidr_int_lb_subnet" { + type = string + default = "10.0.0.32/27" + description = "The IPv4 CIDR block to be used for the private load balancer subnet." +} + +variable "cidr_pub_lb_subnet" { + type = string + default = "10.0.128.0/27" + description = "The IPv4 CIDR block to be used for the public load balancer subnet." +} + +variable "cidr_workers_subnet" { + type = string + default = "10.0.144.0/20" + description = "The IPv4 CIDR block to be used for the kubernetes workers subnet." +} + +variable "cidr_pods_subnet" { + type = string + default = "10.0.64.0/18" + description = "The IPv4 CIDR block to be used for the pods subnet." +} + +variable "vcn_id" { + type = string + default = null + description = "Optional ID of existing VCN. Takes priority over vcn_name filter. Ignored when `create_vcn = true`." +} + +variable "ig_route_table_id" { + default = null + description = "Optional ID of existing public subnets route table in VCN." + type = string +} + +variable "nat_route_table_id" { + default = null + description = "Optional ID of existing private subnets route table in VCN." + type = string +} + +variable "vcn_name" { + type = string + default = null + description = "Display name for the created VCN. Defaults to 'oke' suffixed with the generated Terraform 'state_id' value." +} + + +### OKE Module - Worker NodePool Variables +variable "gpu_np_size" { + type = number + default = 0 + description = "The size of the nodepool with GPU shapes." +} + +variable "gpu_np_boot_volume_size" { + type = number + default = 100 + description = "The size of the boot volume for the nodes in the GPU nodepool." 
+} + +variable "gpu_np_shape" { + type = string + default = "VM.GPU.A10.1" + description = "The compute shape to use for the GPUs nodepool." +} + + +variable "gpu_np_placement_ads" { + type = list(any) + default = [] + description = "List with the ADs where to attempt the placement of the GPU worker nodes." +} + +variable "simple_np_flex_shape" { + type = map(any) + default = { + "instanceShape" = "VM.Standard.E4.Flex" + "ocpus" = 2 + "memory" = 16 + } + description = "The compute shape and configuration to use for the non-GPU kubernetes nodepool." +} + +variable "simple_np_size" { + type = number + default = 1 + description = "The size of the non-GPU kubernetes nodepool." +} + +variable "simple_np_boot_volume_size" { + type = number + default = 50 + description = "The boot volume size for the nodes in the non-GPU kubernetes nodepool." +} + + +### Helm chart deployments +variable "deploy_nginx" { + type = bool + default = true + description = "Controls the deployment of the nginx helm chart." +} + +variable "nginx_user_values_override" { + type = string + default = "" + description = "User provided values to override the Nginx helm chart defaults and those generated by Terraform using the templates." +} + +variable "deploy_cert_manager" { + type = bool + default = true + description = "Controls the deployment of the cert-manager helm chart." +} + +variable "cert_manager_user_values_override" { + type = string + default = "" + description = "User provided values to override the Cert-Manager helm chart defaults and those generated by Terraform using the templates." +} + +variable "deploy_jupyterhub" { + type = bool + default = true + description = "Controls the deployment of the jupyterhub helm chart." +} + +variable "jupyterhub_user_values_override" { + type = string + default = "" + description = "User provided values to override the JupyterHub helm chart defaults and those generated by Terraform using the templates." 
+} + +variable "deploy_nim" { + type = bool + default = true + description = "Controls the deployment of the NIM helm chart." +} + +variable "nim_user_values_override" { + type = string + default = "" + description = "User provided values to override the NIM helm chart defaults and those generated by Terraform using the templates." +} + +### JupyterHub Values +variable "jupyter_admin_user" { + type = string + description = "JupyterHub administrative user name." + default = "oracle-ai" +} + +variable "jupyter_admin_password" { + type = string + description = "JupyterHub administrative user password." +} + +variable "jupyter_playbooks_repo" { + type = string + default = "https://github.com/robo-cap/llm-jupyter-notebooks.git" + description = "Link for the Git repository that will be automatically imported in the home directory of the JupyterHub container." +} + +### NIM Values +variable "nvcr_username" { + type = string + default = "$oauthtoken" + description = "User to be used to pull the NIM container image." +} + +variable "nvcr_password" { + type = string + description = "Password to be used to pull NIM container image. If no password is set, NGC_API_KEY is used instead." + default = "" +} + +variable "nim_image_repository" { + type = string + default = "nvcr.io/nim/meta/llama3-8b-instruct" + description = "The NIM container image repository." +} + +variable "nim_image_tag" { + type = string + default = "latest" + description = "The NIM container image tag." +} + +variable "NGC_API_KEY" { + type = string + description = "NGC API KEY. 
https://org.ngc.nvidia.com/setup/api-key" + default = "" +} \ No newline at end of file From 1df5b210e39f0449078f69621d202b0531383437 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 13:06:29 +0300 Subject: [PATCH 14/15] , --- cloud-service-providers/oci/oke/terraform/README.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cloud-service-providers/oci/oke/terraform/README.md b/cloud-service-providers/oci/oke/terraform/README.md index 8934d3c0..42286b5c 100644 --- a/cloud-service-providers/oci/oke/terraform/README.md +++ b/cloud-service-providers/oci/oke/terraform/README.md @@ -22,12 +22,8 @@ In case the bastion and operator hosts are not created, is a prerequisite to hav - kubectl - oci-cli -[ -![Deploy to Oracle Cloud] -(https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg) -] -(https://cloud.oracle.com/resourcemanager/stacks/create -?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) +[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) + ## Helm Deployments @@ -115,7 +111,7 @@ compartment_id = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6 # Jupyter Hub deployment values jupyter_admin_user = "oracle-ai" jupyter_admin_password = "" -playbooks_repo = "https://github.com/robo-cap/llm-jupyter-notebooks.git" +playbooks_repo = "https://github.com/ionut-sturzu/nim_notebooks.git" # NIM Deployment values nim_image_repository = "nvcr.io/nim/meta/llama3-8b-instruct" From 2b2c652f4e23c97442aa7eaecc9b9d5024a1a1b7 Mon Sep 17 00:00:00 2001 From: Adina Date: Fri, 30 Aug 2024 13:08:18 +0300 Subject: [PATCH 15/15] add deploy to oci button --- cloud-service-providers/oci/oke/terraform/README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff 
--git a/cloud-service-providers/oci/oke/terraform/README.md b/cloud-service-providers/oci/oke/terraform/README.md index 42286b5c..a86f4211 100644 --- a/cloud-service-providers/oci/oke/terraform/README.md +++ b/cloud-service-providers/oci/oke/terraform/README.md @@ -66,12 +66,7 @@ In case of Mistral models, create a file `nim_user_values_override.yaml` file wi 1. Deploy directly to OCI using the below button: -[ -![Deploy to Oracle Cloud] -(https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg) -] -(https://cloud.oracle.com/resourcemanager/stacks/create -?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) +[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/ionut-sturzu/nim_on_oke/archive/refs/heads/main.zip) 2. Deploy via ORM