diff --git a/cloud-service-providers/oci/oke/README.md b/cloud-service-providers/oci/oke/README.md new file mode 100644 index 00000000..3b0c1b3e --- /dev/null +++ b/cloud-service-providers/oci/oke/README.md @@ -0,0 +1,18 @@ +# NIM on Oracle Cloud Infrastructure (OCI) OKE + +To deploy NIM on Oracle Cloud Infrastructure (OCI) successfully, it’s crucial to choose the correct GPU shapes and ensure that the appropriate NVIDIA drivers are installed. + +When you select a GPU shape for a managed node pool or self-managed node in OKE, you must also select a compatible Oracle Linux GPU image that has the CUDA libraries pre-installed. The names of compatible images include 'GPU'. OCI offers Oracle Linux (OEL) providing the possibility to use pre-installed GPU drivers. This simplifies the deployment process for NIM. + + +## Prerequisites + +Please follow [Prerequisite instructions](./prerequisites/README.md) to get ready for OKE creation. + +## Create OKE + +Please follow [Create OKE instruction](./setup/README.md) to create OKE. + +## Deploy NIM + +Please follow [Deploy NIM instruction](../../../helm/README.md) to deploy NIM. diff --git a/cloud-service-providers/oci/oke/prerequisites/README.md b/cloud-service-providers/oci/oke/prerequisites/README.md new file mode 100644 index 00000000..47e5183a --- /dev/null +++ b/cloud-service-providers/oci/oke/prerequisites/README.md @@ -0,0 +1,66 @@ +### OKE Prerequisites + +This list summarizes the key prerequisites you need to set up before deploying an OKE cluster on OCI. + +- **OCI Account and Tenancy**: + - Ensure you have an OCI account with the necessary permissions. + - Set up a compartment for your Kubernetes cluster. + +- **Networking**: + - Create a Virtual Cloud Network (VCN) with appropriate subnets. + - Ensure internet gateway, NAT gateway, and service gateway are configured. + - Set up route tables and security lists for network traffic. 
+ +- **IAM Policies**: + - Define IAM policies to allow OKE service to manage resources in your compartment. + - Grant required permissions to users or groups managing the Kubernetes cluster. + +- **Service Limits**: + - Verify that your tenancy has sufficient service limits for compute instances, block storage, and other required resources. + +- **CLI and SDK Tools**: + - Install and configure the OCI CLI for managing OKE. + - Optionally, set up OCI SDKs for automating tasks. + +- **Kubernetes Version**: + - Decide on the Kubernetes version to deploy, ensuring compatibility with your applications and OCI features. + +- **API Endpoint**: + - Choose between the public or private endpoint for the Kubernetes API server, based on your security requirements. + +For more details, please reference this [link.](https://docs.oracle.com/en-us/iaas/Content/ContEng/Concepts/contengprerequisites.htm) + + +## Install OCI CLI + +``` +bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/scripts/install/install.sh)" +``` + +For more details, please reference this [link.](https://docs.oracle.com/en-us/iaas/Content/API/SDKDocs/cliinstall.htm) + +## Install kubectl + +``` +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" +curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl.sha256" +echo "$(cat kubectl.sha256) kubectl" | sha256sum --check +sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl +kubectl version --client +``` + +For more details, please reference this [link.](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) + +## Install Helm + +``` +curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 get_helm.sh +./get_helm.sh +``` + +For more details, please reference this [link.](https://helm.sh/docs/intro/install/) + +## Next step + +[Continue to OKE 
creation](../setup/README.md) \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/setup/README.md b/cloud-service-providers/oci/oke/setup/README.md new file mode 100644 index 00000000..f0cdfbb1 --- /dev/null +++ b/cloud-service-providers/oci/oke/setup/README.md @@ -0,0 +1,85 @@ +# Setup OCI Kubernetes Engine (OKE) + +The key to creating Oracle Kubernetes Engine (OKE) for NIM is to create a proper GPU node pool. The following steps will guide you through the process. + +## Connect to OCI + +1. Log in to your Oracle Cloud Infrastructure (OCI) Console. +2. Select the appropriate compartment where you want to create the OKE cluster. + +## Identify GPU needed for NIM + +- Refer to the NIM documentation to identify the NVIDIA GPU you [need](https://docs.nvidia.com/nim/large-language-models/latest/support-matrix.html). Here is also a list of available [OKE NVIDIA GPU node shapes](https://docs.oracle.com/en-us/iaas/Content/Compute/References/computeshapes.htm#vm-gpu). + + +## Confirm the GPU availability in + +Use the OCI CLI to search for GPU availability: + + ```bash + oci compute shape list --region --compartment-id --all --query 'data[*].shape' --output json | jq -r '.[]' | grep -i 'gpu' + ``` + + Cross-reference with the [OCI Regions](https://www.oracle.com/cloud/data-regions.html) to select the best region. + +## Request Quota + +Ensure you have the necessary service limits (quota) for the GPU shapes. If needed, request an increase via the OCI Console: + +1. Navigate to **Governance and Administration** > **Limits, Quotas, and Usage**. +2. Select **Request Service Limit Increase** for the relevant GPU shapes. + +## Create OKE + +To easily create the OKE cluster and NVIDIA GPU nodepool, you can use Quick Create to initially set up the cluster with a default node pool that includes a single, simple VM node. After the cluster is created, you can add a new node pool with GPU shapes that require a larger boot volume size. 
This way, you can ensure the GPUs have the necessary storage while manually configuring the nodes as needed. + +1. In the OCI Console, navigate to **Developer Services** > **Kubernetes Clusters** > **OKE Clusters**. +2. Click **Create Cluster** and select **Quick Create**. +3. Configure the following: + - **Name**: Provide a name for your cluster. + - **Compartment**: Select the appropriate compartment. + - **Kubernetes Version**: Choose the latest stable version. + - **Kubernetes API endpoint**: Private or public. + - **Node type**: Managed. + - **Kubernetes worker nodes**: Private or public. +4. Under **Shape and image**: + - **Shape**: You can leave the default simple VM.Standard.x. + - **Node Count**: Start with 1 node (adjust as needed). + - **Add an SSH key**(optional): In order to have access to nodes. +5. Click **Create Cluster** to start the provisioning process. This will provision a simple cluster, to which you can subsequently add a GPU nodepool. + +## Create GPU nodepool on existing OKE cluster + +1. For an existing OKE cluster, navigate to the **Node Pools** section. +2. Click **Add Node Pool** and configure: + - **Name**: Provide a name for the node pool. + - **Compartment**: Select the appropriate compartment. + - **Version**: the Kubernetes version of the nodes - defaults to current cluster version. + - **Node Placement Configuration** - select Availability Domain and Worker node subnet. + - **Node Shape**: Select the desired GPU-enabled shape. + - **Node Image**: is automatically populated with an OEL GPU image which you can change to a different version. + - **Node Count**: Set the number of nodes (adjust according to your needs). + - **Boot volume**: Specify a larger size than the default 50GB size, for example 300 GB. To complement this change also go to the next point on custo cloudinit.sh. 
+ - **Show advanced options** -> **Initialization script** -> **Paste Cloud-init Script** and paste: + ``` + #!/bin/bash + curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh + bash /var/run/oke-init.sh + /usr/libexec/oci-growfs -y + ``` +3. Click **Create Node Pool**. + +## Connect to OKE + +1. Install the OCI CLI if you haven't already. +2. Retrieve the OKE cluster credentials using the Access Cluster buton in the console Cluster details page: + + ```bash + oci ce cluster create-kubeconfig --cluster-id --file $HOME/.kube/config --region --token-version 2.0.0 --kube-endpoint PUBLIC_ENDPOINT + ``` + +3. Verify the connection to your OKE cluster: + + ```bash + kubectl get nodes + ``` diff --git a/cloud-service-providers/oci/oke/terraform/README.md b/cloud-service-providers/oci/oke/terraform/README.md new file mode 100644 index 00000000..a86f4211 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/README.md @@ -0,0 +1,133 @@ +# orm-stack-oke-helm-deployment-nim + +## Getting started + +This stack deploys an OKE cluster with two nodepools: +- one nodepool with flexible shapes +- one nodepool with GPU shapes + +And several supporting applications using helm: +- nginx +- cert-manager +- jupyterhub + +With the scope of demonstrating [nVidia NIM LLM](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html) self-hosted model capabilities. + +**Note:** For helm deployments it's necessary to create bastion and operator host (with the associated policy for the operator to manage the clsuter), **or** configure a cluster with public API endpoint. 
If the bastion and operator hosts are not created, it is a prerequisite to have the following tools already installed and configured:
To work around this issue, we can limit the context length using the `--max-model-len` argument for vLLM, the underlying inference engine used by NIM.
+NGC_API_KEY = "" +``` + +- Execute the commands + +``` +terraform init +terraform plan +terraform apply +``` + +After the deployment is successful, get the Jupyter URL from the Terraform output and run it in the browser. +Log in with the user/password that you previously set. +Open and run the **NVIDIA_NIM_model_interaction.ipynb** notebook. + +## Known Issues + +If `terraform destroy` fails, manually remove the LoadBalancer resource configured for the Nginx Ingress Controller. + +After `terrafrom destroy`, the block volumes corresponding to the PVCs used by the applications in the cluster won't be removed. You have to manually remove them. \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/common.tf b/cloud-service-providers/oci/oke/terraform/common.tf new file mode 100644 index 00000000..a9a3dd33 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/common.tf @@ -0,0 +1,14 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + state_id = coalesce(var.state_id, random_string.state_id.id) +} + +resource "random_string" "state_id" { + length = 6 + lower = true + numeric = false + special = false + upper = false +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/datasources.tf b/cloud-service-providers/oci/oke/terraform/datasources.tf new file mode 100644 index 00000000..25a52e60 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/datasources.tf @@ -0,0 +1,49 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +data "oci_identity_tenancy" "tenant_details" { + + tenancy_id = var.tenancy_ocid +} + +data "oci_identity_regions" "home_region" { + + filter { + name = "key" + values = [data.oci_identity_tenancy.tenant_details.home_region_key] + } +} + +data "oci_identity_availability_domains" "ads" { + + compartment_id = var.tenancy_ocid +} + +data "oci_core_shapes" "gpu_shapes" { + for_each = { for entry in data.oci_identity_availability_domains.ads.availability_domains : entry.name => entry.id } + + compartment_id = var.compartment_id + availability_domain = each.key + + filter { + name = "name" + values = [var.gpu_np_shape] + } +} + +data "oci_load_balancer_load_balancers" "lbs" { + + compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + + filter { + name = "freeform_tags.state_id" + values = [local.state_id] + } + + filter { + name = "freeform_tags.application" + values = ["nginx"] + } + + depends_on = [module.nginx] +} diff --git a/cloud-service-providers/oci/oke/terraform/helm-deployments.tf b/cloud-service-providers/oci/oke/terraform/helm-deployments.tf new file mode 100644 index 00000000..cce93e9f --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-deployments.tf @@ -0,0 +1,192 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + deploy_from_operator = var.create_operator_and_bastion + deploy_from_local = alltrue([!local.deploy_from_operator, var.control_plane_is_public]) +} + +data "oci_containerengine_cluster_kube_config" "kube_config" { + count = local.deploy_from_local ? 1 : 0 + + cluster_id = module.oke.cluster_id + endpoint = "PUBLIC_ENDPOINT" +} + +module "nginx" { + count = var.deploy_nginx ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "ingress-nginx" + helm_chart_name = "ingress-nginx" + namespace = "nginx" + helm_repository_url = "https://kubernetes.github.io/ingress-nginx" + + pre_deployment_commands = [] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/nginx-values.yaml.tpl", + { + min_bw = 100, + max_bw = 100, + pub_lb_nsg_id = module.oke.pub_lb_nsg_id + state_id = local.state_id + } + ) + helm_user_values_override = try(base64decode(var.nginx_user_values_override), var.nginx_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + depends_on = [module.oke] +} + + +module "cert-manager" { + count = var.deploy_cert_manager ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "cert-manager" + helm_chart_name = "cert-manager" + namespace = "cert-manager" + helm_repository_url = "https://charts.jetstack.io" + + pre_deployment_commands = [] + post_deployment_commands = [ + "cat <<'EOF' | kubectl apply -f -", + "apiVersion: cert-manager.io/v1", + "kind: ClusterIssuer", + "metadata:", + " name: le-clusterissuer", + "spec:", + " acme:", + " # You must replace this email address with your own.", + " # Let's Encrypt will use this to contact you about expiring", + " # certificates, and issues related to your account.", + " email: user@oracle.om", + " server: https://acme-staging-v02.api.letsencrypt.org/directory", + " privateKeySecretRef:", + " # Secret resource that will be used to store the account's private key.", + " name: le-clusterissuer-secret", + " # Add a single challenge solver, HTTP01 using nginx", + " solvers:", + " - http01:", + " ingress:", + " ingressClassName: nginx", + "EOF" + ] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/cert-manager-values.yaml.tpl", + {} + ) + helm_user_values_override = try(base64decode(var.cert_manager_user_values_override), var.cert_manager_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke] +} + +module "jupyterhub" { + count = var.deploy_jupyterhub ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "jupyterhub" + helm_chart_name = "jupyterhub" + namespace = "default" + helm_repository_url = "https://hub.jupyter.org/helm-chart/" + + pre_deployment_commands = ["export PUBLIC_IP=$(kubectl get svc -A -l app.kubernetes.io/name=ingress-nginx -o json | jq -r '.items[] | select(.spec.type == \"LoadBalancer\") | .status.loadBalancer.ingress[].ip')"] + deployment_extra_args = [ + "--set ingress.hosts[0]=jupyter.$${PUBLIC_IP}.nip.io", + "--set ingress.tls[0].hosts[0]=jupyter.$${PUBLIC_IP}.nip.io", + "--set ingress.tls[0].secretName=jupyter-tls" + ] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/jupyterhub-values.yaml.tpl", + { + admin_user = var.jupyter_admin_user + admin_password = var.jupyter_admin_password + playbooks_repo = var.jupyter_playbooks_repo + } + ) + helm_user_values_override = try(base64decode(var.jupyterhub_user_values_override), var.jupyterhub_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke, module.nginx] +} + +module "nim" { + count = var.deploy_nim ? 
1 : 0 + source = "./helm-module" + + bastion_host = module.oke.bastion_public_ip + bastion_user = var.bastion_user + operator_host = module.oke.operator_private_ip + operator_user = var.bastion_user + ssh_private_key = tls_private_key.stack_key.private_key_openssh + + deploy_from_operator = local.deploy_from_operator + deploy_from_local = local.deploy_from_local + + deployment_name = "llm" + helm_chart_name = "nim-llm" + namespace = "default" + helm_repository_url = "https://robo-cap.github.io/helm-charts/" + + pre_deployment_commands = [ + "export PUBLIC_IP=$(kubectl get svc -A -l app.kubernetes.io/name=ingress-nginx -o json | jq -r '.items[] | select(.spec.type == \"LoadBalancer\") | .status.loadBalancer.ingress[].ip')", + "kubectl get secret -n default nvcr-${local.state_id} || kubectl create secret docker-registry -n default nvcr-${local.state_id} --docker-server=nvcr.io --docker-username='${var.nvcr_username}' --docker-password='%{if length(var.nvcr_password) > 0}${var.nvcr_password}%{else}${var.NGC_API_KEY}%{endif}'", + "kubectl get secret -n default ngcapi-${local.state_id} || kubectl create secret generic -n default ngcapi-${local.state_id} --from-literal=NGC_CLI_API_KEY=${var.NGC_API_KEY}", + ] + deployment_extra_args = [ + "--set service.name=llm", + "--timeout 10m0s" + ] + post_deployment_commands = [] + + helm_template_values_override = templatefile( + "${path.root}/helm-values-templates/nim-values.yaml.tpl", + { + nvcr_secret = "nvcr-${local.state_id}", + ngcapi_secret = "ngcapi-${local.state_id}", + nim_image_repository = var.nim_image_repository + nim_image_tag = var.nim_image_tag + NGC_API_KEY = var.NGC_API_KEY + } + ) + helm_user_values_override = try(base64decode(var.nim_user_values_override), var.nim_user_values_override) + + kube_config = one(data.oci_containerengine_cluster_kube_config.kube_config.*.content) + + depends_on = [module.oke, module.nginx] +} \ No newline at end of file diff --git 
a/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf b/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf new file mode 100644 index 00000000..644018fe --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-module/helm-deployement.tf @@ -0,0 +1,226 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + operator_helm_values_path = coalesce(var.operator_helm_values_path, "/home/${var.operator_user}/tf-helm-values") + operator_helm_charts_path = coalesce(var.operator_helm_charts_path, "/home/${var.operator_user}/tf-helm-charts") + operator_helm_chart_path = "${local.operator_helm_charts_path}/${var.namespace}-${var.deployment_name}-${basename(var.helm_chart_path)}" + + helm_values_override_user_file = "${var.namespace}-${var.deployment_name}-user-values-override.yaml" + helm_values_override_template_file = "${var.namespace}-${var.deployment_name}-template-values-override.yaml" + + operator_helm_values_override_user_file_path = join("/", [local.operator_helm_values_path, local.helm_values_override_user_file]) + operator_helm_values_override_template_file_path = join("/", [local.operator_helm_values_path, local.helm_values_override_template_file]) + + local_helm_values_override_user_file_path = join("/", [path.root, "generated", local.helm_values_override_user_file]) + local_helm_values_override_template_file_path = join("/", [path.root, "generated", local.helm_values_override_template_file]) + + local_kubeconfig_path = "${path.root}/generated/kubeconfig-${var.namespace}-${var.deployment_name}" +} + +resource "null_resource" "copy_chart_top_operator" { + count = var.deploy_from_operator && var.helm_chart_path != "" ? 
1 : 0 + + triggers = { + helm_chart_path = var.helm_chart_path + } + + connection { + bastion_host = var.bastion_host + bastion_user = var.bastion_user + bastion_private_key = var.ssh_private_key + host = var.operator_host + user = var.operator_user + private_key = var.ssh_private_key + timeout = "40m" + type = "ssh" + } + + provisioner "remote-exec" { + inline = [ + "rm -rf ${local.operator_helm_chart_path}", + "mkdir -p ${local.operator_helm_charts_path}" + ] + } + + provisioner "file" { + source = var.helm_chart_path + destination = local.operator_helm_chart_path + } +} + +resource "null_resource" "helm_deployment_via_operator" { + count = var.deploy_from_operator ? 1 : 0 + + triggers = { + manifest_md5 = try(md5("${var.helm_template_values_override}-${var.helm_user_values_override}"), null) + deployment_name = var.deployment_name + namespace = var.namespace + bastion_host = var.bastion_host + bastion_user = var.bastion_user + ssh_private_key = var.ssh_private_key + operator_host = var.operator_host + operator_user = var.operator_user + } + + connection { + bastion_host = self.triggers.bastion_host + bastion_user = self.triggers.bastion_user + bastion_private_key = self.triggers.ssh_private_key + host = self.triggers.operator_host + user = self.triggers.operator_user + private_key = self.triggers.ssh_private_key + timeout = "40m" + type = "ssh" + } + + provisioner "remote-exec" { + inline = ["mkdir -p ${local.operator_helm_values_path}"] + } + + provisioner "file" { + content = var.helm_template_values_override + destination = local.operator_helm_values_override_template_file_path + } + + provisioner "file" { + content = var.helm_user_values_override + destination = local.operator_helm_values_override_user_file_path + } + + provisioner "remote-exec" { + inline = concat( + var.pre_deployment_commands, + [ + "if [ -s \"${local.operator_helm_values_override_user_file_path}\" ]; then", + join(" ", concat([ + "helm upgrade --install ${var.deployment_name}", + "%{if 
var.helm_chart_path != ""}${local.operator_helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif}", + "--namespace ${var.namespace} --create-namespace --wait", + "-f ${local.operator_helm_values_override_template_file_path}", + "-f ${local.operator_helm_values_override_user_file_path}" + ], var.deployment_extra_args)), + "else", + join(" ", concat([ + "helm upgrade --install ${var.deployment_name}", + "%{if var.helm_chart_path != ""}${local.operator_helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif}", + "--namespace ${var.namespace} --create-namespace --wait", + "-f ${local.operator_helm_values_override_template_file_path}" + ], var.deployment_extra_args)), + "fi" + ], + var.post_deployment_commands + ) + + } + + provisioner "remote-exec" { + when = destroy + inline = ["helm uninstall ${self.triggers.deployment_name} --namespace ${self.triggers.namespace} --wait"] + on_failure = continue + } + + lifecycle { + ignore_changes = [ + triggers["bastion_host"], + triggers["bastion_user"], + triggers["ssh_private_key"], + triggers["operator_host"], + triggers["operator_user"] + ] + } + + depends_on = [null_resource.copy_chart_top_operator] +} + + +resource "local_file" "helm_template_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.helm_template_values_override + filename = local.local_helm_values_override_template_file_path +} + + +resource "local_file" "helm_user_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.helm_user_values_override + filename = local.local_helm_values_override_user_file_path +} + +resource "local_file" "cluster_kube_config_file" { + count = var.deploy_from_local ? 1 : 0 + + content = var.kube_config + filename = local.local_kubeconfig_path +} + +resource "null_resource" "helm_deployment_from_local" { + count = var.deploy_from_local ? 
1 : 0 + + triggers = { + manifest_md5 = try(md5("${var.helm_template_values_override}-${var.helm_user_values_override}"), null) + deployment_name = var.deployment_name + namespace = var.namespace + kube_config = var.kube_config + } + + provisioner "local-exec" { + working_dir = path.root + command = <<-EOT + export KUBECONFIG=${local.local_kubeconfig_path} + ${join("\n", var.pre_deployment_commands)} + if [ -s "${local.local_helm_values_override_user_file_path}" ]; then + echo "" + echo "Terraform generated values:" + cat "${local.local_helm_values_override_template_file_path}" + echo "" + echo "User provided values:" + cat "${local.local_helm_values_override_user_file_path}" + echo "" + helm upgrade --install ${var.deployment_name} \ + %{if var.helm_chart_path != ""}${var.helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif} \ + --namespace ${var.namespace} \ + --create-namespace --wait \ + -f ${local.local_helm_values_override_template_file_path} \ + -f ${local.local_helm_values_override_user_file_path} ${join(" ", var.deployment_extra_args)} + else + echo "" + echo "Terraform generated values:" + cat "${local.local_helm_values_override_template_file_path}" + echo "" + helm upgrade --install ${var.deployment_name} \ + %{if var.helm_chart_path != ""}${var.helm_chart_path}%{else}${var.helm_chart_name} --repo ${var.helm_repository_url}%{endif} \ + --namespace ${var.namespace} \ + --create-namespace --wait \ + -f ${local.local_helm_values_override_template_file_path} ${join(" ", var.deployment_extra_args)} + fi + ${join("\n", var.post_deployment_commands)} + EOT + } + + # This provisioner is not executed when the resource is commented out: https://github.com/hashicorp/terraform/issues/25073 + provisioner "local-exec" { + when = destroy + environment = { + kube_config = self.triggers.kube_config + } + working_dir = path.root + command = <<-EOT + mkdir -p ./generated; \ + echo "$kube_config" > 
./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy; \ + export KUBECONFIG=./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy; \ + helm uninstall ${self.triggers.deployment_name} --namespace ${self.triggers.namespace} --wait; \ + rm ./generated/kubeconfig-${self.triggers.namespace}-${self.triggers.deployment_name}-on-destroy + EOT + on_failure = continue + } + lifecycle { + ignore_changes = [ + triggers["local_kubeconfig_path"] + ] + } + + depends_on = [local_file.cluster_kube_config_file, local_file.helm_template_file, local_file.helm_user_file] +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf b/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf new file mode 100644 index 00000000..664d8e34 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-module/variables.tf @@ -0,0 +1,118 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +variable "deploy_from_local" { + type = bool + default = true + description = "Wether to attempt deployment of the helm charts using local-exec." +} + +variable "deploy_from_operator" { + type = bool + default = false + description = "Wether to attempt deployment of the helm charts using remote-exec." +} + +variable "deployment_name" { + type = string + default = false + description = "The name of the helm deployment." +} + +variable "namespace" { + type = string + default = "default" + description = "The kubernetes namespace to target for the helm deployment." +} + +variable "helm_chart_name" { + type = string + default = "" + description = "The name of the helm chart. Used together with `helm_repository_url` when helm_chart_path=''." 
+} + +variable "helm_chart_path" { + type = string + default = "" + description = "The path of the helm chart. If not empty will override the `helm_repository_url` and `helm_chart_name` values." +} + +variable "helm_repository_url" { + type = string + default = "" + description = "The helm chart repository url." +} + +variable "operator_helm_values_path" { + type = string + default = "" + description = "The directory on the operator host where to push the values-override for the helm chart." +} + +variable "operator_helm_charts_path" { + type = string + default = "" + description = "The directory on the operator host where to push the helm-charts when `helm_chart_path` is not empty." +} + +variable "helm_template_values_override" { + type = string + description = "The values-override file content populated using terraform templates." +} + +variable "helm_user_values_override" { + type = string + description = "The values-override file provided by the user as variable." +} +variable "pre_deployment_commands" { + type = list(string) + default = [] + description = "List of commands to be executed before attempting the helm deployment." +} +variable "post_deployment_commands" { + type = list(string) + default = [] + description = "List of commands to be executed after the helm deployment." +} + +variable "deployment_extra_args" { + type = list(string) + default = [] + description = "List of arguments to be appended to the helm upgrade --install command." +} + +variable "kube_config" { + type = string + default = "" + description = "The Kubeconfig file content to use for helm deployments using local-exec." +} + +variable "bastion_host" { + type = string + default = null + description = "The IP address of the bastion host." +} + +variable "bastion_user" { + type = string + default = "opc" + description = "The user to be used for SSH connection to the bastion host." 
+} + +variable "ssh_private_key" { + type = string + default = null + description = "The SSH private key to be used for connection to operator/bastion hosts." +} + +variable "operator_host" { + type = string + default = null + description = "The IP address of the operator host." +} + +variable "operator_user" { + type = string + default = "opc" + description = "The user to be used for SSH connection to the operator host." +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl new file mode 100644 index 00000000..cae8d95b --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/cert-manager-values.yaml.tpl @@ -0,0 +1,2 @@ +crds: + enabled: true \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl new file mode 100644 index 00000000..7be38b31 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/jupyterhub-values.yaml.tpl @@ -0,0 +1,38 @@ +--- +singleuser: + defaultUrl: "/lab" + extraEnv: + JUPYTERHUB_SINGLEUSER_APP: "jupyter_server.serverapp.ServerApp" + %{ if playbooks_repo != "" } + lifecycleHooks: + postStart: + exec: + command: + [ + "/bin/sh", + "-c", + "gitpuller ${playbooks_repo} main examples" + ] + %{ endif } + cloudMetadata: + blockWithIptables: false + +hub: + config: + Authenticator: + admin_users: + - ${admin_user} + DummyAuthenticator: + password: '${admin_password}' + JupyterHub: + authenticator_class: dummy + +ingress: + enabled: true + ingressClassName: nginx + annotations: + cert-manager.io/cluster-issuer: "le-clusterissuer" + +proxy: + service: + type: ClusterIP \ No newline at end of file diff --git 
a/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl new file mode 100644 index 00000000..6b83ac47 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nginx-values.yaml.tpl @@ -0,0 +1,13 @@ +controller: + service: + targetPorts: + http: http + https: https + annotations: + oci.oraclecloud.com/load-balancer-type: "lb" + service.beta.kubernetes.io/oci-load-balancer-shape: "flexible" + service.beta.kubernetes.io/oci-load-balancer-shape-flex-min: "${min_bw}" + service.beta.kubernetes.io/oci-load-balancer-shape-flex-max: "${max_bw}" + service.beta.kubernetes.io/oci-load-balancer-security-list-management-mode: "None" + oci.oraclecloud.com/oci-network-security-groups: "${pub_lb_nsg_id}" + oci.oraclecloud.com/initial-freeform-tags-override: '{"state_id": "${state_id}", "application": "nginx", "role": "service_lb"}' \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl new file mode 100644 index 00000000..54f99e28 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/helm-values-templates/nim-values.yaml.tpl @@ -0,0 +1,16 @@ + +imagePullSecrets: +- name: ${nvcr_secret} + +model: + ngcAPISecret: ${ngcapi_secret} + +image: + repository: ${nim_image_repository} + tag: ${nim_image_tag} + +resources: + requests: + nvidia.com/gpu: 1 + limits: + nvidia.com/gpu: 1 \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/main.tf b/cloud-service-providers/oci/oke/terraform/main.tf new file mode 100644 index 00000000..c964af30 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/main.tf @@ -0,0 +1,166 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. 
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + ads_supporting_gpu_shape = [for key, value in data.oci_core_shapes.gpu_shapes : key if length(value.shapes) > 0] + gpu_np_placement_ads = ( + length(var.gpu_np_placement_ads) > 0 ? + [for entry in var.gpu_np_placement_ads : length(tostring(entry)) > 1 ? substr(tostring(entry), -1, -1) : tonumber(entry)] : + [for entry in local.ads_supporting_gpu_shape : substr(tostring(entry), -1, -1)] + ) +} + +# This module is using a modified version of the oke-terraform-module to address the compatibility with Terraform versions < 1.3.0 +# The OKE module documentation is available here: https://oracle-terraform-modules.github.io/terraform-oci-oke/ +module "oke" { + source = "git::https://github.com/robo-cap/terraform-oci-oke.git?ref=v5.1.8-ai" + + # source = "oracle-terraform-modules/oke/oci" + # version = "5.1.8" + + providers = { + oci.home = oci.home + } + + state_id = local.state_id + # IAM + tenancy_id = var.tenancy_ocid + compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + network_compartment_id = coalesce(var.compartment_id, var.compartment_ocid) + create_iam_resources = var.create_iam_resources + create_iam_worker_policy = var.create_iam_worker_policy + create_iam_autoscaler_policy = var.create_iam_autoscaler_policy + create_iam_operator_policy = var.create_iam_operator_policy + create_iam_kms_policy = var.create_iam_kms_policy + create_iam_tag_namespace = var.create_iam_tag_namespace + create_iam_defined_tags = var.create_iam_defined_tags + # it's recommended to create the following tag namespace and tag keys outside of the oke module + # tag namespace: oke + # tag keys: state_id, role, pool, cluster_autoscaler + use_defined_tags = var.use_defined_tags + tag_namespace = var.tag_namespace + freeform_tags = var.freeform_tags + + defined_tags = var.defined_tags + + # Common + ssh_private_key = 
tls_private_key.stack_key.private_key_openssh + ssh_public_key = local.bundled_ssh_public_keys + + # Bastion variables + create_bastion = var.create_operator_and_bastion + bastion_allowed_cidrs = var.bastion_allowed_cidrs + bastion_image_os = var.bastion_image_os + bastion_image_os_version = var.bastion_image_os_version + bastion_image_type = var.bastion_image_type + bastion_image_id = var.bastion_image_id + bastion_user = var.bastion_user + + # Operator variables + create_operator = var.create_operator_and_bastion + create_operator_policy_to_manage_cluster = var.create_operator_policy_to_manage_cluster + operator_image_os = var.operator_image_os + operator_image_os_version = var.operator_image_os_version + operator_image_type = var.operator_image_type + operator_image_id = var.operator_image_id + operator_install_kubectl_from_repo = var.operator_install_kubectl_from_repo + operator_user = var.operator_user + + # Network variables + create_vcn = var.create_vcn + lockdown_default_seclist = true # *true/false + vcn_id = var.vcn_id # Ignored if create_vcn = true + vcn_cidrs = [var.cidr_vcn] # Ignored if create_vcn = false + vcn_name = var.vcn_name # Ignored if create_vcn = false + ig_route_table_id = var.ig_route_table_id + nat_route_table_id = var.nat_route_table_id + + + subnets = { + bastion = { cidr = var.cidr_bastion_subnet } + operator = { cidr = var.cidr_operator_subnet } + cp = { cidr = var.cidr_cp_subnet } + int_lb = { cidr = var.cidr_int_lb_subnet } + pub_lb = { cidr = var.cidr_pub_lb_subnet } + workers = { cidr = var.cidr_workers_subnet } + pods = { cidr = var.cidr_pods_subnet } + } + + nat_gateway_route_rules = [ + # { + # destination = "192.168.0.0/16" + # destination_type = "CIDR_BLOCK" + # network_entity_id = "drg" + # description = "Terraformed - 192/16 to DRG" + # }, + ] + + # Cluster variables + create_cluster = var.create_cluster // *true/false + cluster_name = var.cluster_name + cluster_type = var.cluster_type + cni_type = var.cni_type // 
*flannel/npn + kubernetes_version = var.kubernetes_version + pods_cidr = var.pods_cidr + services_cidr = var.services_cidr + control_plane_is_public = var.control_plane_is_public + assign_public_ip_to_control_plane = var.control_plane_is_public + load_balancers = var.load_balancers + preferred_load_balancer = var.preferred_load_balancer + control_plane_allowed_cidrs = var.control_plane_allowed_cidrs + allow_rules_public_lb = { + "Allow TCP ingress to public load balancers for SSL traffic from anywhere" : { + protocol = 6, port = 443, source = "0.0.0.0/0", source_type = "CIDR_BLOCK", + }, + "Allow TCP ingress to public load balancers for HTTP traffic from anywhere" : { + protocol = 6, port = 80, source = "0.0.0.0/0", source_type = "CIDR_BLOCK", + } + } + + worker_pools = { + simple-np = { + description = "Worker nodes for the OKE cluster.", + size = var.simple_np_size + os = "Oracle Linux", + os_version = "8", + image_type = "oke", + image_id = "ocid1.image...", + shape = lookup(var.simple_np_flex_shape, "instanceShape", "VM.Standard.E4.Flex"), + ocpus = lookup(var.simple_np_flex_shape, "ocpus", 2), + memory = lookup(var.simple_np_flex_shape, "memory", 16) + boot_volume_size = var.simple_np_boot_volume_size + }, + gpu-np = { + description = "Worker nodes with GPU for the OKE cluster.", + size = var.gpu_np_size, + os = "Oracle Linux", + os_version = "8", + image_type = "oke", + image_id = "ocid1.image...", + shape = var.gpu_np_shape, + boot_volume_size = var.gpu_np_boot_volume_size + placement_ads = local.gpu_np_placement_ads + } + } + + output_detail = true +} + +output "bastion" { + value = "%{if var.create_operator_and_bastion}${module.oke.bastion_public_ip}%{else}bastion host not created.%{endif}" +} + +output "operator" { + value = "%{if var.create_operator_and_bastion}${module.oke.operator_private_ip}%{else}operator host not created.%{endif}" +} + +output "ssh_to_operator" { + value = "%{if 
var.create_operator_and_bastion}${module.oke.ssh_to_operator}%{else}bastion and operator hosts not created.%{endif}" +} + +output "jupyter_hub_url" { + value = (var.deploy_nginx && var.deploy_jupyterhub && length(coalesce(data.oci_load_balancer_load_balancers.lbs.load_balancers, [])) > 0 ? + "https://jupyter.${data.oci_load_balancer_load_balancers.lbs.load_balancers[0].ip_addresses[0]}.nip.io" : + "" + ) +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/provider.tf b/cloud-service-providers/oci/oke/terraform/provider.tf new file mode 100644 index 00000000..ce9697e7 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/provider.tf @@ -0,0 +1,24 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +provider "oci" { + alias = "home" + region = lookup(data.oci_identity_regions.home_region.regions[0], "name") +} + +provider "oci" { + region = var.region +} + +terraform { + required_version = ">= 1.3.0" + + required_providers { + + oci = { + configuration_aliases = [oci.home] + source = "oracle/oci" + version = ">= 4.119.0" + } + } +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/schema.yaml b/cloud-service-providers/oci/oke/terraform/schema.yaml new file mode 100644 index 00000000..c6c3e396 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/schema.yaml @@ -0,0 +1,519 @@ +title: OKE AI Stack deployment +description: Terraform Stack for OKE deployment with Nginx, Jupyter Notebook and NIM. +informationalText: Terraform Stack for OKE deployment with Nginx, Jupyter Notebook and NIM. +schemaVersion: 1.1.0 +version: "20190304" + +# URL of Logo Icon used on Application Information tab. Logo must be 130x130 pixels. 
+# (Optional) +logoUrl: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/laivzo5VNcM9uZf9O6ftZb4-QTPOcszBFwtfu7AGOtbINpfSDKqbJAnSNIRDjtX6/n/orasenatdpltintegration03/b/default-bucket/o/oracle_icon.jpg + + +locale: "en" +variableGroups: + - title: "Hidden RMS variables" + visible: false + variables: + - tenancy_ocid + - compartment_ocid + - region + - user_ocid + - current_user_ocid + + - title: "Hidden stack variables" + visible: false + variables: + - create_iam_resources + - create_iam_tag_namespace + - create_iam_defined_tags + - use_defined_tags + - tag_namespace + - bastion_allowed_cidrs + - bastion_image_os + - bastion_image_os_version + - bastion_image_type + - bastion_image_id + - bastion_user + - operator_image_os + - operator_image_os_version + - operator_image_type + - operator_image_id + - operator_user + - control_plane_allowed_cidrs + - create_cluster + - operator_allowed_cidrs + - state_id + - nvcr_username + - nvcr_password + - operator_install_kubectl_from_repo + - cluster_type + - load_balancers + - preferred_load_balancer + - pods_cidr + - services_cidr + - create_iam_autoscaler_policy + - create_iam_kms_policy + - create_iam_operator_policy + - create_iam_worker_policy + - defined_tags + - freeform_tags + + + - title: "General configuration" + visible: true + variables: + - compartment_id + - cluster_name + - show_advanced_oke + - kubernetes_version + - cni_type + + - title: "Networking configuration" + visible: true + variables: + - create_vcn + - vcn_name + - vcn_id + - ig_route_table_id + - nat_route_table_id + - cidr_vcn + - cidr_bastion_subnet + - cidr_operator_subnet + - cidr_cp_subnet + - cidr_int_lb_subnet + - cidr_pub_lb_subnet + - cidr_workers_subnet + - cidr_pods_subnet + + - title: "Kubernetes nodepools configuration" + visible: true + variables: + - simple_np_size + - simple_np_flex_shape + - simple_np_boot_volume_size + - gpu_np_size + - gpu_np_shape + - gpu_np_boot_volume_size + + - title: "Access to the Kubernetes cluster" 
+ visible: true + variables: + - create_operator_and_bastion + - create_operator_policy_to_manage_cluster + - control_plane_is_public + - ssh_public_key + + + - title: "Helm Chart deployments" + visible: true + variables: + - deploy_nginx + - nginx_user_values_override + - deploy_cert_manager + - cert_manager_user_values_override + - deploy_jupyterhub + - jupyterhub_user_values_override + - jupyter_admin_user + - jupyter_admin_password + - jupyter_playbooks_repo + - deploy_nim + - nim_user_values_override + - nim_image_repository + - nim_image_tag + - NGC_API_KEY + +variables: + create_iam_resources: + type: bool + default: false + visible: false + + create_iam_tag_namespace: + type: bool + default: false + visible: false + + create_iam_defined_tags: + type: bool + default: false + visible: false + + use_defined_tags: + type: bool + default: false + visible: false + + compartment_id: + type: oci:identity:compartment:id + title: Deployment compartment target + description: Please select the compartment where the resources will be created + required: true + + cluster_name: + type: string + minLength: 3 + maxLength: 40 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: Cluster Name + description: The name of the OKE cluster. + default: oke + required: true + + show_advanced_oke: + title: Show advanced OKE settings + description: Expand options to set advanced OKE settings + type: boolean + default: false + + kubernetes_version: + type: enum + title: Kubernetes version + description: The version of the Kubernetes cluster. + default: v1.30.1 + enum: + - v1.30.1 + - v1.29.1 + - v1.28.2 + allowMultiple: false + required: true + visible: ${show_advanced_oke} + + cni_type: + type: enum + title: Kubernetes cluster networking type + description: The networking to be used with the OKE cluster. 
+ default: flannel + enum: + - flannel + - npn + allowMultiple: false + required: true + visible: ${show_advanced_oke} + + create_vcn: + title: Create new VCN + description: Create new VCN for the OKE cluster. + type: boolean + default: true + + vcn_name: + type: string + minLength: 3 + maxLength: 40 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: VCN Name + description: The name of VCN. + default: oke-vcn + required: true + visible: ${create_vcn} + + vcn_id: + title: Select VCN for the OKE cluster + description: Select the existing VCN for the OKE cluster. + type: oci:core:vcn:id + dependsOn: + compartmentId: ${compartment_id} + visible: + not: + - ${create_vcn} + + ig_route_table_id: + title: Public subnet Route Table OCID + description: OCID of the route table for public subnets from the selected VCN. + type: string + visible: + not: + - ${create_vcn} + default: "" + + nat_route_table_id: + title: Private subnet Route Table OCID + description: OCID of the route table for private subnets from the selected VCN. + type: string + visible: + not: + - ${create_vcn} + default: "" + + cidr_vcn: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: VCN CIDR Block + description: The CIDR block to use with the new VCN. + default: 10.0.0.0/16 + required: true + visible: ${create_vcn} + + cidr_bastion_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: Bastion subnet CIDR + description: The CIDR block used for the bastion subnet. + default: 10.0.0.0/29 + required: true + + cidr_operator_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: Operator subnet CIDR + description: The CIDR block used for the operator subnet. 
+ default: 10.0.0.64/29 + required: true + + cidr_cp_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-9])$" + title: OKE Control Plane subnet CIDR + description: The CIDR block used for the OKE Control Plane subnet. + default: 10.0.0.8/29 + required: true + + cidr_int_lb_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Internal LBs subnet CIDR + description: The CIDR block used for the OKE Internal Load Balancers subnet. + default: 10.0.0.32/27 + required: true + + cidr_pub_lb_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Public LBs subnet CIDR + description: The CIDR block used for the OKE Public Load Balancers subnet. + default: 10.0.128.0/27 + required: true + + cidr_workers_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Worker nodes subnet CIDR + description: The CIDR block used for the OKE Workers subnet. + default: 10.0.144.0/20 + required: true + + cidr_pods_subnet: + type: string + pattern: "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\/(?:[1][6-9])|(?:2[0-8])$" + title: OKE Pods subnet CIDR + description: The CIDR block used for the OKE Pods subnet (will be used only if networking type is NPN). + default: 10.0.64.0/18 + required: true + visible: + eq: + - ${cni_type} + - npn + + control_plane_is_public: + title: Create public OKE API? + description: Associate a public IP address with the OKE control plane endpoint? This is required for Helm deployments when bastion and operator hosts are not created. 
+ type: boolean + default: true + + ssh_public_key: + title: SSH public key + description: Public SSH key to be included in the ~/.ssh/authorized_keys file for the bastion, operator and worker nodes. + # renders variable as an SSH key control + type: oci:core:ssh:publickey + required: false + + simple_np_flex_shape: + title: Generic Nodepool shape + type: oci:core:instanceshapewithflex:name + dependsOn: + compartmentId: ${compartment_id} + required: true + + simple_np_size: + title: Generic Nodepool size + type: integer + minimum: 1 + required: true + default: 1 + + simple_np_boot_volume_size: + title: Generic Nodepool boot volume size + description: The size of the boot volume for the nodes in the Generic nodepool. + type: integer + minimum: 50 + required: true + default: 50 + + gpu_np_shape: + title: GPU Nodepool shape + type: enum + enum: + - VM.GPU.A10.1 + - VM.GPU.A10.2 + - VM.GPU2.1 + - VM.GPU3.1 + - VM.GPU3.2 + - VM.GPU3.4 + - BM.GPU2.2 + - BM.GPU3.8 + - BM.GPU4.8 + - BM.GPU.A10.4 + - BM.GPU.A100-v2.8 + - BM.GPU.H100.8 + default: VM.GPU.A10.1 + + gpu_np_size: + title: GPU Nodepool size + type: integer + minimum: 0 + required: true + default: 1 + + gpu_np_boot_volume_size: + title: GPU Nodepool boot volume size + description: The size of the boot volume for the nodes in the GPU nodepool. + type: integer + minimum: 50 + required: true + default: 100 + + create_operator_and_bastion: + title: Create bastion and operator hosts + description: Bastion and operator hosts are required for successful helm chart deployment when the OKE cluster endpoint is private. + type: boolean + default: true + + create_operator_policy_to_manage_cluster: + type: boolean + default: true + title: Create operator IAM policy + description: Create minimal IAM policy to allow the operator host to manage the OKE cluster. The policy is required for successful helm chart deployment via the operator host. 
+ visible: ${create_operator_and_bastion} + + deploy_nginx: + type: boolean + default: true + title: Helm | Deploy Nginx ingress controller + description: Nginx ingress controller is used to expose the OKE services to the user. + visible: true + + nginx_user_values_override: + type: file + title: Helm | Nginx Ingress Controller helm chart values override + description: Override the values for the Nginx Ingress Controller Helm chart . + visible: ${deploy_nginx} + + deploy_cert_manager: + type: boolean + default: true + title: Helm | Deploy Cert-Manager + description: Cert-manager is used to generate TLS certificates for the ingress resources. + visible: true + + cert_manager_user_values_override: + type: file + title: Helm | Cert-Manager helm chart values override + description: Override the values for the Cert-Manager chart . + visible: ${deploy_cert_manager} + + deploy_jupyterhub: + type: boolean + default: true + title: Helm | Deploy JupyterHub + description: JupyterHub provides a web accessible Python environment where demos can be easily executed. + visible: true + + jupyterhub_user_values_override: + type: file + title: Helm | JupyterHub helm chart values override + description: Override the values for the JupyterHub Helm chart . + visible: ${deploy_jupyterhub} + + jupyter_admin_user: + type: string + minLength: 3 + maxLength: 16 + pattern: "^[a-zA-Z0-9][a-zA-Z0-9-]*?[a-zA-Z0-9]$" + title: Helm | JupyterHub - Admin User + description: The admin user to connect to the Jupyter Notebooks. + default: oracle-ai + required: true + visible: ${deploy_jupyterhub} + + jupyter_admin_password: + type: password + pattern: "^[\\S]{6,16}$" + title: Helm | JupyterHub - Admin Password + description: "The admin password to connect to the Jupyter Notebooks. 
(min: 6, max: 16 characters)" + confirmation: true + required: true + visible: ${deploy_jupyterhub} + + jupyter_playbooks_repo: + type: string + title: Helm | JupyterHub - Playbooks Git Repo + description: "Git Repository with Jupyter plabooks examples." + default: "https://github.com/ionut-sturzu/nim_notebooks.git" + visible: ${deploy_jupyterhub} + + deploy_nim: + type: boolean + default: true + title: Helm | Deploy nVidia NIM LLM + description: NIM is NVIDIA's set of accelerated inference microservices that allow organizations to run AI models on NVIDIA GPUs anywhere. + visible: true + + nim_user_values_override: + type: file + title: Helm | NIM helm chart values override + description: Override the values for the NIM Helm chart . + visible: ${deploy_nim} + + nim_image_repository: + type: string + title: Helm | NIM - Container image repo + description: The NIM container image repository. + default: "nvcr.io/nim/meta/llama3-8b-instruct" + visible: ${deploy_nim} + required: true + + nim_image_tag: + type: string + title: Helm | NIM - Container image tag + description: The NIM container image tag. + default: "latest" + visible: ${deploy_nim} + required: true + + NGC_API_KEY: + type: password + title: Helm | NIM - NGC API KEY + description: API key to authenticate to the NGC service. + visible: ${deploy_nim} + +outputs: + bastion: + title: Bastion public IP + description: The bastion host public IP address. + type: copyableString + + operator: + title: Operator public IP + description: The operator host public IP address. + sensitive: true + type: copyableString + visible: false + + ssh_to_operator: + title: Command to connect to operator host. + description: The ssh command to connect to the operator host via the bastion. 
+ sensitive: true + type: copyableString + + jupyter_hub_url: + title: URL to access JupyterHub + type: link + displayText: URL to access JupyterHub + +outputGroups: +- title: "Access details" + outputs: + - ${bastion} + - ${operator} + - ${ssh_to_operator} + - ${jupyter_hub_url} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example b/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example new file mode 100644 index 00000000..eead931e --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/terrafom.auto.tfvars.example @@ -0,0 +1,32 @@ +# ORM injected values + +region = "uk-london-1" +tenancy_ocid = "ocid1.tenancy.oc1..aaaaaaaaiyavtwbz4kyu7g7b6wglllccbflmjx2lzk5nwpbme44mv54xu7dq" +compartment_ocid = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# OKE Terraform module values +create_iam_resources = false +create_iam_tag_namespace = false +ssh_public_key = "" + +## NodePool with non-GPU shape is created by default with size 1 +simple_np_flex_shape = { "instanceShape" = "VM.Standard.E4.Flex", "ocpus" = 2, "memory" = 12 } + +## NodePool with GPU shape is created by default with size 0 +gpu_np_size = 1 +gpu_np_shape = "VM.GPU.A10.1" + +## OKE Deployment values +cluster_name = "oke" +vcn_name = "oke-vcn" +compartment_id = "ocid1.compartment.oc1..aaaaaaaaqi3if6t4n24qyabx5pjzlw6xovcbgugcmatavjvapyq3jfb4diqq" + +# Jupyter Hub deployment values +jupyter_admin_user = "oracle-ai" +jupyter_admin_password = "" +jupyter_playbooks_repo = "https://github.com/ionut-sturzu/nim_notebooks.git" + +# NIM Deployment values +nim_image_repository = "nvcr.io/nim/meta/llama3-8b-instruct" +nim_image_tag = "latest" +NGC_API_KEY = "" \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/tls.tf b/cloud-service-providers/oci/oke/terraform/tls.tf new file mode 100644 index 00000000..2200cf43 --- /dev/null +++ 
b/cloud-service-providers/oci/oke/terraform/tls.tf @@ -0,0 +1,12 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +locals { + user_public_ssh_key = chomp(var.ssh_public_key) + bundled_ssh_public_keys = "${local.user_public_ssh_key}\n${chomp(tls_private_key.stack_key.public_key_openssh)}" +} + +resource "tls_private_key" "stack_key" { + algorithm = "RSA" + rsa_bits = 4096 +} \ No newline at end of file diff --git a/cloud-service-providers/oci/oke/terraform/variables.tf b/cloud-service-providers/oci/oke/terraform/variables.tf new file mode 100644 index 00000000..555fd416 --- /dev/null +++ b/cloud-service-providers/oci/oke/terraform/variables.tf @@ -0,0 +1,565 @@ +# Copyright (c) 2022, 2024 Oracle Corporation and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl + +### ORM Variables +variable "compartment_ocid" { + type = string + default = null + description = "A compartment OCID automatically populated by Resource Manager." +} + +variable "current_user_ocid" { + type = string + default = null + description = "A user OCID automatically populated by Resource Manager." +} + +variable "region" { + type = string + default = null + description = "The OCI region where OKE resources will be created." +} + +variable "tenancy_ocid" { + type = string + default = null + description = "The tenancy id of the OCI Cloud Account in which to create the resources." +} + + +### OKE Module - Common Variables +variable "create_operator_and_bastion" { + type = bool + default = true + description = "Whether to create bastion and operator host." +} + +variable "ssh_public_key" { + type = string + description = "The contents of the SSH public key file. Used to allow login for workers/bastion/operator. 
This public key is bundled with a stack-generated SSH key before being added to the hosts' authorized_keys."
+} + +variable "operator_image_os_version" { + type = string + default = "8" + description = "Operator image operating system version when operator_image_type = 'platform'." +} + +variable "operator_image_type" { + type = string + default = "platform" + description = "Whether to use a platform or custom image for the created operator instance. When custom is set, the operator_image_id must be specified." + + validation { + condition = contains(["platform", "custom"], var.operator_image_type) + error_message = "The operator_image_type can be only `platform` or `custom`." + } +} + +variable "operator_install_kubectl_from_repo" { + default = false + description = "Whether to install kubectl on the created operator host from olcne repo." + type = bool +} + +variable "operator_image_id" { + type = string + default = null + description = "Image ID for created operator instance." +} + +variable "operator_user" { + type = string + default = "opc" + description = "User for SSH access to operator host." +} + + +### OKE Module - Cluster Variables +variable "create_cluster" { + type = bool + default = true + description = "Whether to create the OKE cluster and dependent resources." +} + +variable "cluster_name" { + type = string + default = "oke" + description = "The name of oke cluster." +} + +variable "cluster_type" { + default = "enhanced" + description = "The cluster type. See Working with Enhanced Clusters and Basic Clusters for more information." + type = string + validation { + condition = contains(["basic", "enhanced"], lower(var.cluster_type)) + error_message = "Accepted values are 'basic' or 'enhanced'." + } +} + +variable "cni_type" { + type = string + default = "flannel" + description = "The CNI for the cluster: 'flannel' or 'npn'. See Pod Networking." + + validation { + condition = contains(["flannel", "npn"], var.cni_type) + error_message = "The cni_type can be only `flannel` or `npn`." 
+ } +} + +variable "control_plane_is_public" { + type = bool + default = true + description = "Whether the Kubernetes control plane endpoint should be allocated a public IP address to enable access over public internet." +} + +variable "control_plane_allowed_cidrs" { + type = list(string) + default = ["0.0.0.0/0"] + description = "The list of CIDR blocks from which the control plane can be accessed." +} + +variable "create_vcn" { + type = bool + default = true + description = "Whether to create a Virtual Cloud Network." +} + +variable "kubernetes_version" { + type = string + default = "v1.30.1" + description = "The version of kubernetes to use when provisioning OKE." +} + +variable "load_balancers" { + default = "both" + description = "The type of subnets to create for load balancers." + type = string + validation { + condition = contains(["public", "internal", "both"], var.load_balancers) + error_message = "Accepted values are public, internal or both." + } +} + +variable "preferred_load_balancer" { + default = "public" + description = "The preferred load balancer subnets that OKE will automatically choose when creating a load balancer. Valid values are 'public' or 'internal'. If 'public' is chosen, the value for load_balancers must be either 'public' or 'both'. If 'private' is chosen, the value for load_balancers must be either 'internal' or 'both'. NOTE: Service annotations for internal load balancers must still be specified regardless of this setting. See Load Balancer Annotations for more information." + type = string + validation { + condition = contains(["public", "internal"], var.preferred_load_balancer) + error_message = "Accepted values are public or internal." + } +} + +variable "pods_cidr" { + default = "10.244.0.0/16" + description = "The CIDR range used for IP addresses by the pods. A /16 CIDR is generally sufficient. This CIDR should not overlap with any subnet range in the VCN (it can also be outside the VCN CIDR range). 
Ignored when cni_type = 'npn'." + type = string +} + +variable "services_cidr" { + default = "10.96.0.0/16" + description = "The CIDR range used within the cluster by Kubernetes services (ClusterIPs). This CIDR should not overlap with the VCN CIDR range." + type = string +} + + +### OKE Module - IAM Variables +variable "compartment_id" { + type = string + default = null + description = "The compartment id where resources will be created." +} + +variable "create_iam_resources" { + type = bool + default = false + description = "Whether to create IAM dynamic groups, policies, and defined tags." +} + +variable "create_iam_defined_tags" { + type = bool + default = false + description = "Whether to create defined tag keys." +} + +variable "create_iam_autoscaler_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for Cluster Autoscaler management. Depends on configuration of associated component when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_autoscaler_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_kms_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for cluster autoscaler. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_kms_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_operator_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for operator access to the OKE control plane. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." 
+ type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_operator_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_worker_policy" { + default = "auto" + description = "Whether to create an IAM dynamic group and policy rules for self-managed worker nodes. Depends on configuration of associated components when set to 'auto'. Ignored when 'create_iam_resources' is false." + type = string + validation { + condition = contains(["never", "auto", "always"], var.create_iam_worker_policy) + error_message = "Accepted values are never, auto, or always" + } +} + +variable "create_iam_tag_namespace" { + type = bool + default = false + description = "Whether to create defined tag namespace." +} + +variable "defined_tags" { + default = { + bastion = {} + cluster = {} + iam = {} + network = {} + operator = {} + persistent_volume = {} + service_lb = {} + workers = {} + } + description = "Defined tags to be applied to created resources. Must already exist in the tenancy." + type = any +} + +variable "freeform_tags" { + default = { + bastion = {} + cluster = {} + iam = {} + network = {} + operator = {} + persistent_volume = {} + service_lb = {} + workers = {} + } + description = "Freeform tags to be applied to created resources." + type = any +} + +variable "tag_namespace" { + type = string + default = "oke" + description = "The tag namespace to use if use_defined_tags=true." +} + +variable "use_defined_tags" { + type = bool + default = false + description = "Wether to set defined tags on the creted resources. By default only free-form tags are used." +} + + +### OKE Module - Network Variables +variable "cidr_vcn" { + type = string + default = "10.0.0.0/16" + description = "The IPv4 CIDR block the VCN will use." +} + +variable "cidr_bastion_subnet" { + type = string + default = "10.0.0.0/29" + description = "The IPv4 CIDR block to be used for the bastion subnet." 
+} + +variable "cidr_operator_subnet" { + type = string + default = "10.0.0.64/29" + description = "The IPv4 CIDR block to be used for the operator subnet." +} + +variable "cidr_cp_subnet" { + type = string + default = "10.0.0.8/29" + description = "The IPv4 CIDR block to be used for the OKE control plane endpoint." +} + +variable "cidr_int_lb_subnet" { + type = string + default = "10.0.0.32/27" + description = "The IPv4 CIDR block to be used for the private load balancer subnet." +} + +variable "cidr_pub_lb_subnet" { + type = string + default = "10.0.128.0/27" + description = "The IPv4 CIDR block to be used for the public load balancer subnet." +} + +variable "cidr_workers_subnet" { + type = string + default = "10.0.144.0/20" + description = "The IPv4 CIDR block to be used for the kubernetes workers subnet." +} + +variable "cidr_pods_subnet" { + type = string + default = "10.0.64.0/18" + description = "The IPv4 CIDR block to be used for the pods subnet." +} + +variable "vcn_id" { + type = string + default = null + description = "Optional ID of existing VCN. Takes priority over vcn_name filter. Ignored when `create_vcn = true`." +} + +variable "ig_route_table_id" { + default = null + description = "Optional ID of existing public subnets route table in VCN." + type = string +} + +variable "nat_route_table_id" { + default = null + description = "Optional ID of existing private subnets route table in VCN." + type = string +} + +variable "vcn_name" { + type = string + default = null + description = "Display name for the created VCN. Defaults to 'oke' suffixed with the generated Terraform 'state_id' value." +} + + +### OKE Module - Worker NodePool Variables +variable "gpu_np_size" { + type = number + default = 0 + description = "The size of the nodepool with GPU shapes." +} + +variable "gpu_np_boot_volume_size" { + type = number + default = 100 + description = "The size of the boot volume for the nodes in the GPU nodepool." 
+} + +variable "gpu_np_shape" { + type = string + default = "VM.GPU.A10.1" + description = "The compute shape to use for the GPUs nodepool." +} + + +variable "gpu_np_placement_ads" { + type = list(any) + default = [] + description = "List with the ADs where to attempt the placement of the GPU worker nodes." +} + +variable "simple_np_flex_shape" { + type = map(any) + default = { + "instanceShape" = "VM.Standard.E4.Flex" + "ocpus" = 2 + "memory" = 16 + } + description = "The compute shape and configuration to use for the non-GPU kubernetes nodepool." +} + +variable "simple_np_size" { + type = number + default = 1 + description = "The size of the non-GPU kubernetes nodepool." +} + +variable "simple_np_boot_volume_size" { + type = number + default = 50 + description = "The boot volume size for the nodes in the non-GPU kubernetes nodepool." +} + + +### Helm chart deployments +variable "deploy_nginx" { + type = bool + default = true + description = "Controls the deployment of the nginx helm chart." +} + +variable "nginx_user_values_override" { + type = string + default = "" + description = "User provided values to override the Nginx helm chart defaults and those generated by Terraform using the templates." +} + +variable "deploy_cert_manager" { + type = bool + default = true + description = "Controls the deployment of the cert-manager helm chart." +} + +variable "cert_manager_user_values_override" { + type = string + default = "" + description = "User provided values to override the Cert-Manager helm chart defaults and those generated by Terraform using the templates." +} + +variable "deploy_jupyterhub" { + type = bool + default = true + description = "Controls the deployment of the jupyterhub helm chart." +} + +variable "jupyterhub_user_values_override" { + type = string + default = "" + description = "User provided values to override the JupyterHub helm chart defaults and those generated by Terraform using the templates." 
+} + +variable "deploy_nim" { + type = bool + default = true + description = "Controls the deployment of the NIM helm chart." +} + +variable "nim_user_values_override" { + type = string + default = "" + description = "User provided values to override the NIM helm chart defaults and those generated by Terraform using the templates." +} + +### JupyterHub Values +variable "jupyter_admin_user" { + type = string + description = "JupyterHub administrative user name." + default = "oracle-ai" +} + +variable "jupyter_admin_password" { + type = string + description = "JupyterHub administrative user password." +} + +variable "jupyter_playbooks_repo" { + type = string + default = "https://github.com/robo-cap/llm-jupyter-notebooks.git" + description = "Link for the Git repository that will be automatically imported in the home directory of the JupyterHub container." +} + +### NIM Values +variable "nvcr_username" { + type = string + default = "$oauthtoken" + description = "User to be used to pull the NIM container image." +} + +variable "nvcr_password" { + type = string + description = "Password to be used to pull NIM container image. If no password is set, NGC_API_KEY is used instead." + default = "" +} + +variable "nim_image_repository" { + type = string + default = "nvcr.io/nim/meta/llama3-8b-instruct" + description = "The NIM container image repository." +} + +variable "nim_image_tag" { + type = string + default = "latest" + description = "The NIM container image tag." +} + +variable "NGC_API_KEY" { + type = string + description = "NGC API KEY. https://org.ngc.nvidia.com/setup/api-key" + default = "" +} \ No newline at end of file