From 5d379bb888209ab66e61fc3dbe57433daffdda33 Mon Sep 17 00:00:00 2001 From: Arka Pramanik Date: Tue, 23 Jul 2024 12:51:38 +0530 Subject: [PATCH] Update rollout-restart.sh logic --- .../cray-istio/files/rollout-restart.sh | 110 ------------ .../cray-istio/templates/rollout-restart.yaml | 170 ++++++++++++++++-- kubernetes/cray-istio/values.yaml | 3 + 3 files changed, 163 insertions(+), 120 deletions(-) delete mode 100644 kubernetes/cray-istio/files/rollout-restart.sh diff --git a/kubernetes/cray-istio/files/rollout-restart.sh b/kubernetes/cray-istio/files/rollout-restart.sh deleted file mode 100644 index 5de0e60..0000000 --- a/kubernetes/cray-istio/files/rollout-restart.sh +++ /dev/null @@ -1,110 +0,0 @@ -#!/bin/sh -# -# MIT License -# -# (C) Copyright 2024 Hewlett Packard Enterprise Development LP -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. 
-# -# When upgrading from Istio 1.11.8 to 1.19.10 we need to rollout-restart because -# the istio-injection enabled namespaces doesn't get the latest image of istio. - -set -x - -# Function to check rollout status and handle errors -check_rollout_status() { - resource_type=$1 - resource_name=$2 - namespace=$3 - - kubectl rollout status $resource_type/$resource_name -n $namespace --timeout=1m - if [ $? -ne 0 ]; then - echo "Error: Rollout status check failed for $resource_type/$resource_name in namespace $namespace" >> errors.log - fi -} - -echo "**** Performing rollout restart for namespace: nexus ****" - -timeout 60 kubectl rollout restart deployment -n nexus -echo "Waiting for 2.5 minutes before nexus rollout is finished..." -sleep 150 - -# Check rollout status for nexus -echo "**** Checking rollout status for namespace: nexus ****" -deployments=$(kubectl get deployments -n nexus -o jsonpath='{.items[*].metadata.name}') -for deployment in $deployments; do - check_rollout_status deployment $deployment nexus -done - - -# If there were any errors in the nexus rollout status checks, exit with status 1 -if [ -f errors.log ]; then - echo "Errors were encountered during the rollout status checks for nexus:" - cat errors.log - rm errors.log - exit 1 -fi - -# Retrieve namespaces with istio-injection=enabled label except nexus (handled separately) to prevent IPBO issue -namespaces=$(kubectl get ns -l istio-injection=enabled -o jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep -v '^nexus$' | tr '\n' ' ') - -# Perform rollout restart for deployments, statefulsets, and daemonsets in each namespace -for namespace in $namespaces; do - echo "**** Performing rollout restart for namespace: $namespace ****" - timeout 60 kubectl rollout restart deployment -n $namespace - timeout 60 kubectl rollout restart statefulset -n $namespace - timeout 60 kubectl rollout restart daemonset -n $namespace -done - -echo "Rollout restart completed for all istio-injection enabled namespaces." 
-echo "Waiting for 5 minutes before checking rollout status..." -sleep 300 - -# Check rollout status for deployments, statefulsets, and daemonsets in each namespace -for namespace in $namespaces; do - echo "**** Checking rollout status for namespace: $namespace ****" - # Check rollout status for deployments - deployments=$(kubectl get deployments -n $namespace -o jsonpath='{.items[*].metadata.name}') - for deployment in $deployments; do - check_rollout_status deployment $deployment $namespace - done - - # Check rollout status for statefulsets - statefulsets=$(kubectl get statefulsets -n $namespace -o jsonpath='{.items[*].metadata.name}') - for statefulset in $statefulsets; do - check_rollout_status statefulset $statefulset $namespace - done - - # Check rollout status for daemonsets - daemonsets=$(kubectl get daemonsets -n $namespace -o jsonpath='{.items[*].metadata.name}') - for daemonset in $daemonsets; do - check_rollout_status daemonset $daemonset $namespace - done -done - -echo "Rollout restart and status check completed for all istio-injection enabled namespaces." - -# Report errors if any -if [ -f errors.log ]; then - echo "Errors were encountered during the rollout status checks:" - cat errors.log - rm errors.log -else - echo "No errors encountered during the rollout status checks." 
-fi diff --git a/kubernetes/cray-istio/templates/rollout-restart.yaml b/kubernetes/cray-istio/templates/rollout-restart.yaml index b42974b..82420e1 100644 --- a/kubernetes/cray-istio/templates/rollout-restart.yaml +++ b/kubernetes/cray-istio/templates/rollout-restart.yaml @@ -31,23 +31,23 @@ metadata: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: rollout-restart-clusterrole + name: rollout-restart-job-clusterrole namespace: istio-system annotations: helm.sh/hook: post-upgrade helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded rules: - apiGroups: [""] - resources: ["namespaces"] + resources: ["namespaces", "pods"] verbs: ["get", "list", "watch"] - apiGroups: ["apps"] - resources: ["deployments", "statefulsets", "daemonsets"] - verbs: ["get", "patch", "list"] + resources: ["deployments", "statefulsets", "daemonsets", "replicasets"] + verbs: ["get", "patch", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: rollout-restart-clusterrolebinding + name: rollout-restart-job-clusterrolebinding namespace: istio-system annotations: helm.sh/hook: post-upgrade @@ -55,7 +55,7 @@ metadata: roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: rollout-restart-clusterrole + name: rollout-restart-job-clusterrole subjects: - kind: ServiceAccount name: rollout-restart-job @@ -64,7 +64,7 @@ subjects: apiVersion: batch/v1 kind: Job metadata: - name: rollout-restart + name: "rollout-restart-post-upgrade" annotations: "helm.sh/hook": post-upgrade "helm.sh/hook-weight": "5" @@ -73,14 +73,164 @@ spec: ttlSecondsAfterFinished: 86400 # Clean up the job automatically after one day template: spec: - serviceAccountName: rollout-restart-job + serviceAccountName: "rollout-restart-job" restartPolicy: Never containers: - - name: rollout-restart + - name: rollout-restart-post-upgrade image: "{{ .Values.kubectl.image.repository }}:{{ .Values.kubectl.image.tag }}" command: - '/bin/sh' 
args: - '-c' - | - {{- .Files.Get "files/rollout-restart.sh" | nindent 14 }} + #!/bin/sh + + # Extract Istio version from values.yaml + + # Function to check if any container in a pod has the specified Istio image version + check_pod_istio_versions() { + namespace=$1 + pod=$2 + images=$(kubectl get pod $pod -n $namespace -o jsonpath="{.spec.containers[*].image}") + + # Check if any of the images is the specified Istio version + if echo $images | grep -q ":{{ .Values.istio_prev_version }}"; then + return 1 # Pod has the specified Istio versions + else + return 0 # Pod does not have the specified Istio versions + fi + } + + # Function to determine the controlling resource of a pod + get_controlling_resource() { + namespace=$1 + pod=$2 + owner_references=$(kubectl get pod $pod -n $namespace -o jsonpath="{.metadata.ownerReferences[0].kind}/{.metadata.ownerReferences[0].name}") + + if [ -n "$owner_references" ]; then + echo $owner_references + else + # Fallback to describe to get owner information if needed + controlling_resource=$(kubectl describe pod $pod -n $namespace | grep -E "Controlled By" | awk -F: '{print $2}' | xargs) + echo $controlling_resource + fi + } + + # Function to perform rollout restart and check status for a given resource + restart_and_check_status() { + namespace=$1 + resource_type=$2 + resource_name=$3 + + if [ "$resource_type" = "ReplicaSet" ]; then + # Find the corresponding Deployment + deployment=$(kubectl get replicasets $resource_name -n $namespace -o jsonpath="{.metadata.ownerReferences[0].name}") + if [ -n "$deployment" ]; then + resource_type="Deployment" + resource_name=$deployment + else + echo "No corresponding Deployment found for ReplicaSet $resource_name" + return 1 + fi + fi + + echo "Rolling out restart for $resource_type/$resource_name in namespace: $namespace" + timeout 60 kubectl rollout restart $resource_type/$resource_name -n $namespace + + echo "Checking rollout status for $resource_type/$resource_name in namespace: 
$namespace till 3 minutes" + kubectl rollout status $resource_type/$resource_name -n $namespace --timeout=3m + } + + # Function to check if all pods in a namespace are running + are_pods_running() { + namespace=$1 + pods=$(kubectl get pods -n $namespace -o jsonpath="{.items[*].status.phase}") + + for pod_status in $pods; do + if [ "$pod_status" != "Running" ]; then + return 1 + fi + done + return 0 + } + + # Get all namespaces + namespaces=$(kubectl get namespaces -l istio-injection=enabled -o jsonpath="{.items[*].metadata.name}") + + # Initialize an array to keep track of restarted resources + restarted_resources="" + + # First check the nexus namespace + nexus_namespace="nexus" + echo "Checking nexus namespace: $nexus_namespace" + pods=$(kubectl get pods -n $nexus_namespace -o jsonpath="{.items[*].metadata.name}") + + for pod in $pods; do + if ! check_pod_istio_versions $nexus_namespace $pod; then + echo "Pod $pod in namespace $nexus_namespace does not have the latest Istio version. Checking its controlling resource..." + controlling_resource=$(get_controlling_resource $nexus_namespace $pod) + + # Extract resource type and name from controlling_resource + if echo $controlling_resource | grep -qE "^(Deployment|StatefulSet|DaemonSet|ReplicaSet)/"; then + resource_type=$(echo $controlling_resource | cut -d'/' -f1) + resource_name=$(echo $controlling_resource | cut -d'/' -f2) + + resource_key="$nexus_namespace/$resource_type/$resource_name" + + if ! echo "$restarted_resources" | grep -q "$resource_key"; then + restart_and_check_status $nexus_namespace $resource_type $resource_name + restarted_resources="$restarted_resources $resource_key" + else + echo "Resource $resource_key has already been restarted, skipping..." + fi + else + echo "Skipping unknown or unhandled resource type: $controlling_resource for pod $pod" + fi + else + echo "Pod $pod in namespace $nexus_namespace not needed to be restarted." 
+ fi + done + + # Wait for pods in nexus namespace to be in Running state + echo "Waiting for pods in nexus namespace to be in Running state..." + if ! are_pods_running $nexus_namespace; then + echo "Some pods in nexus namespace are not running. Exiting script." + exit 1 + fi + + # Proceed with other namespaces if nexus namespace is okay + echo "Checking remaining namespaces..." + for ns in $namespaces; do + if [ "$ns" = "$nexus_namespace" ]; then + continue + fi + + echo "Checking namespace: $ns" + pods=$(kubectl get pods -n $ns -o jsonpath="{.items[*].metadata.name}") + + for pod in $pods; do + if ! check_pod_istio_versions $ns $pod; then + echo "Pod $pod in namespace $ns does not have the latest Istio version. Checking its controlling resource..." + controlling_resource=$(get_controlling_resource $ns $pod) + + # Extract resource type and name from controlling_resource + if echo $controlling_resource | grep -qE "^(Deployment|StatefulSet|DaemonSet|ReplicaSet)/"; then + resource_type=$(echo $controlling_resource | cut -d'/' -f1) + resource_name=$(echo $controlling_resource | cut -d'/' -f2) + + resource_key="$ns/$resource_type/$resource_name" + + if ! echo "$restarted_resources" | grep -q "$resource_key"; then + restart_and_check_status $ns $resource_type $resource_name + restarted_resources="$restarted_resources $resource_key" + else + echo "Resource $resource_key has already been restarted, skipping..." 
+ fi
+ else
+ echo "Skipping unknown or unhandled resource type: $controlling_resource for pod $pod"
+ fi
+ else
+ echo "Pod $pod in namespace $ns already has the latest Istio image versions"
+ fi
+ done
+ done
diff --git a/kubernetes/cray-istio/values.yaml b/kubernetes/cray-istio/values.yaml
index 2532d29..56cd139 100644
--- a/kubernetes/cray-istio/values.yaml
+++ b/kubernetes/cray-istio/values.yaml
@@ -1120,3 +1120,6 @@ istio:
 env:
 USE_ISTIO_JWT_FILTER: "true"
 PILOT_ENABLE_UNSAFE_REGEX: "true"
+
+# Istio version that was in use before the current upgrade; used by the rollout-restart job to detect pods still running the old Istio image
+istio_prev_version: 1.11.8