diff --git a/configure-job-templates.yaml b/configure-job-templates.yaml
index 569201e..d9c4169 100644
--- a/configure-job-templates.yaml
+++ b/configure-job-templates.yaml
@@ -55,7 +55,8 @@
               "manage-with-acm",
               "redhat-internal-certificate",
               "coe-sso",
-              "lvms"
+              "lvms",
+              "rhoai"
             ],
             "default": "manage-with-acm\nredhat-internal-certificate\ncoe-sso",
             "max": 1024,
diff --git a/inventory/host_vars/ocp11.yaml b/inventory/host_vars/ocp11.yaml
index 27ae677..21df5a5 100644
--- a/inventory/host_vars/ocp11.yaml
+++ b/inventory/host_vars/ocp11.yaml
@@ -16,8 +16,8 @@ cluster_name: ocp11
 # cluster_api_vip:
 # cluster_ingress_vip:
 
-control_plan_cores: 16
-control_plan_ram_gb: 32
+control_plan_cores: 12
+control_plan_ram_gb: 48
 control_plans:
   - vm_network_ip_address: 10.32.105.128
     vm_network_mac_address: 0E:C0:EF:20:6F:80
@@ -25,9 +25,11 @@ control_plans:
 
 openshift_version: 4.17.2
 stormshift_cluster_features:
-  - lvms
-  - coe-sso
   - redhat-internal-certificate
+  - coe-sso
+  - lvms
+  - rhoai
+
 
 
 # See https://docs.openshift.com/container-platform/4.17/installing/overview/cluster-capabilities.html
diff --git a/roles/feature-lvms/tasks/add-disk-to-vm.yaml b/roles/feature-lvms/tasks/add-disk-to-vm.yaml
index 08ba2a4..6319e77 100644
--- a/roles/feature-lvms/tasks/add-disk-to-vm.yaml
+++ b/roles/feature-lvms/tasks/add-disk-to-vm.yaml
@@ -25,8 +25,10 @@
             requests:
               storage: "{{ disk_size }}"
           storageClassName: "{{ disk_storageclass }}"
+  register: dataVolume
 
 - name: Hotplug virtual disk to VM
+  when: dataVolume is changed
   kubernetes.core.k8s:
     api_key: "{{ hostvars['isar']['k8s_auth_api_key'] }}"
     host: "{{ hostvars['isar']['k8s_auth_host'] }}"
@@ -70,6 +72,7 @@
   register: diskAdded
 
 - name: Stop VM (hotplug did not work via ansible)
+  when: diskAdded is defined and diskAdded is changed
   kubevirt.core.kubevirt_vm:
     api_key: "{{ hostvars['isar']['k8s_auth_api_key'] }}"
     host: "{{ hostvars['isar']['k8s_auth_host'] }}"
@@ -81,9 +84,9 @@
     running: false
     wait: yes
   register: vmStopped
-  when: diskAdded is changed
 
 - name: Start VM (hotplug did not work via ansible)
+  when: vmStopped is defined and vmStopped is changed
   kubevirt.core.kubevirt_vm:
     api_key: "{{ hostvars['isar']['k8s_auth_api_key'] }}"
     host: "{{ hostvars['isar']['k8s_auth_host'] }}"
@@ -94,4 +97,3 @@
     namespace: "{{ target_namespace }}"
     running: true
     wait: yes
-  when: vmStopped is defined and vmStopped is changed
diff --git a/roles/feature-lvms/tasks/post-deploy.yaml b/roles/feature-lvms/tasks/post-deploy.yaml
index 0552324..04d21be 100644
--- a/roles/feature-lvms/tasks/post-deploy.yaml
+++ b/roles/feature-lvms/tasks/post-deploy.yaml
@@ -41,7 +41,7 @@
     host: "api.{{ inventory_hostname }}.{{ cluster_base_domain }}"
     port: 443
     sleep: 1
-    timeout: 300
+    timeout: 600
 
 
 - name: Install LVMS operator
@@ -68,7 +68,6 @@
         spec:
           targetNamespaces:
             - openshift-storage
-
       - kind: Subscription
         apiVersion: operators.coreos.com/v1alpha1
         metadata:
@@ -87,10 +86,23 @@
     kind: CustomResourceDefinition
     name: lvmclusters.lvm.topolvm.io
     wait: yes
+    wait_timeout: 300
     wait_condition:
       type: Established
       status: True
 
+- name: Wait for operator pod to be ready
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: Pod
+    namespace: openshift-storage
+    label_selectors:
+      - "app.kubernetes.io/name = lvms-operator"
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Ready
+      status: True
 
 - name: Deploy LVMS
   kubernetes.core.k8s:
diff --git a/roles/feature-redhat-internal-certificate/tasks/post-deploy.yaml b/roles/feature-redhat-internal-certificate/tasks/post-deploy.yaml
index 43b7a0f..1c7b90c 100644
--- a/roles/feature-redhat-internal-certificate/tasks/post-deploy.yaml
+++ b/roles/feature-redhat-internal-certificate/tasks/post-deploy.yaml
@@ -17,7 +17,7 @@
     engine_mount_point: apps
 
     path: "coe-lab/cluster-credential/stormshift-{{ inventory_hostname }}"
-  register: cluster_credential 
+  register: cluster_credential
 
 - name: Temp folder
   ansible.builtin.tempfile:
@@ -37,7 +37,7 @@
     definition:
       apiVersion: v1
       data:
-        ca-bundle.crt: | 
+        ca-bundle.crt: |
           -----BEGIN CERTIFICATE-----
           MIIENDCCAxygAwIBAgIJANunI0D662cnMA0GCSqGSIb3DQEBCwUAMIGlMQswCQYD
           VQQGEwJVUzEXMBUGA1UECAwOTm9ydGggQ2Fyb2xpbmExEDAOBgNVBAcMB1JhbGVp
@@ -103,7 +103,7 @@
       kind: ConfigMap
       metadata:
         name: redhat-current-it-root-cas
-        namespace: openshift-config 
+        namespace: openshift-config
 
 - name: Configure Red Hat Root CA
   kubernetes.core.k8s:
@@ -119,7 +119,7 @@
         trustedCA:
           name: redhat-current-it-root-cas
 
-- name: Fetch cluster certificate 
+- name: Fetch cluster certificate
   community.hashi_vault.vault_kv2_get:
     url: "{{ lookup('ansible.builtin.env', 'RH_VAULT_URL' ) }}"
     auth_method: approle
@@ -129,7 +129,7 @@
     engine_mount_point: apps
 
     path: "coe-lab/certificate/api.{{ inventory_hostname }}.{{ cluster_base_domain }}"
-  register: certificate 
+  register: certificate
 
 # certificate.secret.cert_and_intermediate_pem
 # certificate.secret.key
@@ -153,7 +153,6 @@
 - name: Apply certificate
   kubernetes.core.k8s:
     kubeconfig: "{{ temp.path }}/kubeconfig"
-
     state: present
     definition:
       apiVersion: operator.openshift.io/v1
@@ -164,4 +163,47 @@
       spec:
         defaultCertificate:
           name: redhat-cluster-certificate
+    wait: yes
+  register: apply_cert
+
+- name: Wait for IngressController to start Progressing
+  when: apply_cert is changed
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: IngressController
+    name: default
+    namespace: openshift-ingress-operator
+    wait: yes
+    wait_timeout: 480
+    wait_condition:
+      type: Progressing
+      status: True
+
+- name: Wait for IngressController to be stable again
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: IngressController
+    name: default
+    namespace: openshift-ingress-operator
+    wait: yes
+    wait_timeout: 480
+    wait_condition:
+      type: Progressing
+      status: False
+
+- name: Wait for API to be stable again
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: ClusterOperator
+    name: kube-apiserver
+    api_version: config.openshift.io/v1
+    wait: yes
+    wait_timeout: 480
+    wait_condition:
+      type: Progressing
+      status: False
+
+#
+# Todo:
+# to avoid subsequent features to fail while it is restarting
 
diff --git a/roles/feature-rhoai/defaults/main.yaml b/roles/feature-rhoai/defaults/main.yaml
new file mode 100644
index 0000000..0332eb2
--- /dev/null
+++ b/roles/feature-rhoai/defaults/main.yaml
@@ -0,0 +1,8 @@
+---
+# defaults file
+rhods_operator_channel: fast
+service_mesh_operator_channel: stable
+serverless_operator_channel: stable
+certmgr_operator_channel: stable-v1
+
+kserve_manifest_url: "https://github.com/kserve/kserve/releases/download/v0.14.1/kserve.yaml"
diff --git a/roles/feature-rhoai/tasks/post-deploy.yaml b/roles/feature-rhoai/tasks/post-deploy.yaml
new file mode 100644
index 0000000..a49b9f0
--- /dev/null
+++ b/roles/feature-rhoai/tasks/post-deploy.yaml
@@ -0,0 +1,280 @@
+# This deploys RHOAI Operator to a cluster
+# Based on:
+# https://docs.redhat.com/en/documentation/red_hat_openshift_ai_self-managed/2.16/html/installing_and_uninstalling_openshift_ai_self-managed/installing-and-deploying-openshift-ai_install#installing-openshift-ai-operator-using-cli_operator-install
+
+# TODO: Do we need to create Object Storage on ISAR ODF???
+
+
+- name: Get child cluster access
+  community.hashi_vault.vault_kv2_get:
+    url: "{{ lookup('ansible.builtin.env', 'RH_VAULT_URL' ) }}"
+    auth_method: approle
+    role_id: "{{ lookup('ansible.builtin.env', 'RH_VAULT_ROLE_ID' ) }}"
+    secret_id: "{{ lookup('ansible.builtin.env', 'RH_VAULT_SECRET_ID' ) }}"
+    ca_cert: "{{ lookup('ansible.builtin.env', 'RH_VAULT_CA_CERT_FILENAME' ) }}"
+    engine_mount_point: apps
+
+    path: "coe-lab/cluster-credential/stormshift-{{ inventory_hostname }}"
+  register: cluster_credential
+
+- name: Temp folder
+  ansible.builtin.tempfile:
+    state: "directory"
+  register: temp
+  changed_when: False
+
+- name: Write kubeconfig
+  ansible.builtin.copy:
+    dest: "{{ temp.path }}/kubeconfig"
+    content: "{{ cluster_credential.secret.kubeconfig }}"
+  changed_when: False
+
+- name: Ensure cluster is online
+  wait_for:
+    host: "api.{{ inventory_hostname }}.{{ cluster_base_domain }}"
+    port: 443
+    sleep: 1
+    timeout: 10
+
+# --------------------------------------------
+# ----- Service Mesh Operator
+# --------------------------------------------
+- name: Install Service Mesh operator
+  kubernetes.core.k8s:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    state: present
+    definition:
+      - kind: Subscription
+        apiVersion: operators.coreos.com/v1alpha1
+        metadata:
+          name: servicemeshoperator
+          namespace: openshift-operators
+        spec:
+          name: servicemeshoperator
+          installPlanApproval: Automatic
+          channel: "{{ service_mesh_operator_channel }}"
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
+
+# --------------------------------------------
+# ----- Serverless Operator
+# --------------------------------------------
+# See: https://docs.redhat.com/en/documentation/red_hat_openshift_serverless/1.33/html/installing_openshift_serverless/install-serverless-operator#serverless-install-cli_install-serverless-operator
+- name: Install Serverless operator
+  kubernetes.core.k8s:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    state: present
+    definition:
+      - apiVersion: v1
+        kind: Namespace
+        metadata:
+          name: openshift-serverless
+      - apiVersion: operators.coreos.com/v1
+        kind: OperatorGroup
+        metadata:
+          name: serverless-operators
+          namespace: openshift-serverless
+        spec: {}
+      - apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          name: serverless-operator
+          namespace: openshift-serverless
+        spec:
+          channel: "{{ serverless_operator_channel }}"
+          name: serverless-operator
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
+
+# --------------------------------------------
+# ----- Cert-Manager (Required for KServe)
+# --------------------------------------------
+- name: Install Cert-Manager operator
+  kubernetes.core.k8s:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    state: present
+    definition:
+      - apiVersion: v1
+        kind: Namespace
+        metadata:
+          name: cert-manager-operator
+      - apiVersion: operators.coreos.com/v1
+        kind: OperatorGroup
+        metadata:
+          name: openshift-cert-manager-operator
+          namespace: cert-manager-operator
+        spec:
+          targetNamespaces:
+            - "cert-manager-operator"
+
+      - apiVersion: operators.coreos.com/v1alpha1
+        kind: Subscription
+        metadata:
+          name: openshift-cert-manager-operator
+          namespace: cert-manager-operator
+        spec:
+          channel: "{{ certmgr_operator_channel }}"
+          name: openshift-cert-manager-operator
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
+          installPlanApproval: Automatic
+
+# --------------------------------------------
+# ----- KServe
+# --------------------------------------------
+# Source: https://github.com/opendatahub-io/kserve
+# This failed using k8s module, doing brute force "oc apply" here.
+- name: "Probe for KServe"
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: CustomResourceDefinition
+    name: inferenceservices.serving.kserve.io
+  register: probe4kserve
+
+# Needs some retries for e.g. the cert-mgr operator to be fully deployed
+- name: "Apply KServe manifests"
+  when: probe4kserve.resources | length == 0
+  ansible.builtin.command: "oc apply --kubeconfig={{ temp.path }}/kubeconfig --server-side -f {{ kserve_manifest_url }}"
+  retries: 12
+  delay: 10
+  register: result
+  until: result.rc == 0
+
+# --------------------------------------------
+# ----- RHOAI Operator
+# --------------------------------------------
+
+- name: Install RHOAI operator
+  register: rhoai_operator
+  kubernetes.core.k8s:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    state: present
+    definition:
+      - kind: Namespace
+        apiVersion: v1
+        metadata:
+          name: redhat-ods-operator
+
+      - kind: OperatorGroup
+        apiVersion: operators.coreos.com/v1
+        metadata:
+          name: rhods-operator
+          namespace: redhat-ods-operator
+
+      - kind: Subscription
+        apiVersion: operators.coreos.com/v1alpha1
+        metadata:
+          name: rhods-operator
+          namespace: redhat-ods-operator
+        spec:
+          name: rhods-operator
+          installPlanApproval: Automatic
+          channel: "{{ rhods_operator_channel }}"
+          source: redhat-operators
+          sourceNamespace: openshift-marketplace
+
+- name: Wait for Operator to be installed
+  when: rhoai_operator is changed
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: InstallPlan
+    api_version: operators.coreos.com/v1alpha1
+    namespace: redhat-ods-operator
+    label_selectors:
+      - operators.coreos.com/rhods-operator.redhat-ods-operator = ''
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Installed
+      status: True
+
+- name: Wait for DSCInitialization to start Progressing
+  when: rhoai_operator is changed
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: DSCInitialization
+    name: default-dsci
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Progressing
+      status: True
+
+- name: Wait for DSCInitialization to stop Progressing
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: DSCInitialization
+    name: default-dsci
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Progressing
+      status: False
+
+- name: Wait for DSCInitialization to be completed
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: DSCInitialization
+    name: default-dsci
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Available
+      status: True
+
+- name: Wait for operator pod to be ready
+  kubernetes.core.k8s_info:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    kind: Pod
+    namespace: redhat-ods-operator
+    label_selectors:
+      - "name = rhods-operator"
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Ready
+      status: True
+
+- name: Install DataScienceCluster (time for coffee)
+  kubernetes.core.k8s:
+    kubeconfig: "{{ temp.path }}/kubeconfig"
+    state: present
+    wait: yes
+    wait_timeout: 300
+    wait_condition:
+      type: Available
+      status: True
+    definition:
+      - apiVersion: datasciencecluster.opendatahub.io/v1
+        kind: DataScienceCluster
+        metadata:
+          name: default-dsc
+        spec:
+          components:
+            codeflare:
+              managementState: Removed
+            dashboard:
+              managementState: Managed
+            datasciencepipelines:
+              managementState: Managed
+            kserve:
+              managementState: Managed
+              serving:
+                name: knative-serving
+                managementState: Managed
+                ingressGateway:
+                  certificate:
+                    secretName: knative-serving-cert
+                    type: OpenshiftDefaultIngress
+            kueue:
+              managementState: Removed
+            modelmeshserving:
+              managementState: Removed
+            ray:
+              managementState: Removed
+            trainingoperator:
+              managementState: Removed
+            trustyai:
+              managementState: Removed
+            workbenches:
+              managementState: Removed
diff --git a/roles/feature-rhoai/tasks/pre-destroy.yaml b/roles/feature-rhoai/tasks/pre-destroy.yaml
new file mode 100644
index 0000000..be241f7
--- /dev/null
+++ b/roles/feature-rhoai/tasks/pre-destroy.yaml
@@ -0,0 +1,2 @@
+---
+# Nothing to implement here