From 56bbc6dccdab8a429a63f74ad0e185797cf8530e Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 19 Aug 2024 14:15:46 +1000 Subject: [PATCH 1/5] Light Pending phase refactor help with future work to deal with problem nodes --- .../cyclenoderequest/transitioner/node.go | 58 +++++++++ .../transitioner/transitions.go | 112 +++++++----------- .../cyclenoderequest/transitioner/util.go | 49 ++++++-- .../transitioner/util_test.go | 2 +- 4 files changed, 136 insertions(+), 85 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/node.go b/pkg/controller/cyclenoderequest/transitioner/node.go index d6b7b53..08ee3d7 100644 --- a/pkg/controller/cyclenoderequest/transitioner/node.go +++ b/pkg/controller/cyclenoderequest/transitioner/node.go @@ -3,6 +3,8 @@ package transitioner import ( "fmt" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" @@ -140,6 +142,62 @@ func (t *CycleNodeRequestTransitioner) addNamedNodesToTerminate(kubeNodes map[st return nil } +// Find all the nodes in kube and the cloud provider that match the node selector and nodegroups +// specified in the CNR. These are two separate sets and the contents of one does not affect the +// contents of the other. 
+func (t *CycleNodeRequestTransitioner) findAllNodesForCycle() (kubeNodes map[string]corev1.Node, cloudProviderInstances map[string]cloudprovider.Instance, err error) { + kubeNodes, err = t.listReadyNodes(true) + if err != nil { + return kubeNodes, cloudProviderInstances, err + } + + if len(kubeNodes) == 0 { + return kubeNodes, cloudProviderInstances, fmt.Errorf("no nodes matched selector") + } + + // Only retain nodes which still exist inside cloud provider + var nodeProviderIDs []string + + for _, node := range kubeNodes { + nodeProviderIDs = append(nodeProviderIDs, node.Spec.ProviderID) + } + + existingProviderIDs, err := t.rm.CloudProvider.InstancesExist(nodeProviderIDs) + if err != nil { + return kubeNodes, cloudProviderInstances, errors.Wrap(err, "failed to check instances that exist from cloud provider") + } + + existingKubeNodes := make(map[string]corev1.Node) + + for _, validProviderID := range existingProviderIDs { + if node, found := kubeNodes[validProviderID]; found { + existingKubeNodes[node.Spec.ProviderID] = node + } + } + + kubeNodes = existingKubeNodes + + if len(kubeNodes) == 0 { + return kubeNodes, cloudProviderInstances, fmt.Errorf("no existing nodes in cloud provider matched selector") + } + + nodeGroupNames := t.cycleNodeRequest.GetNodeGroupNames() + + // Describe the node group for the request + t.rm.LogEvent(t.cycleNodeRequest, "FetchingNodeGroup", "Fetching node group: %v", nodeGroupNames) + + if len(nodeGroupNames) == 0 { + return kubeNodes, cloudProviderInstances, fmt.Errorf("must have at least one nodegroup name defined") + } + + nodeGroups, err := t.rm.CloudProvider.GetNodeGroups(nodeGroupNames) + if err != nil { + return kubeNodes, cloudProviderInstances, err + } + + return kubeNodes, nodeGroups.Instances(), nil +} + // newCycleNodeRequestNode converts a corev1.Node to a v1.CycleNodeRequestNode. 
This is done multiple // times across the code, this function standardises the process func newCycleNodeRequestNode(kubeNode *corev1.Node, nodeGroupName string) v1.CycleNodeRequestNode { diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index c122b89..b439544 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -6,7 +6,6 @@ import ( "strings" "time" - corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -62,72 +61,51 @@ func (t *CycleNodeRequestTransitioner) transitionUndefined() (reconcile.Result, // 2. describes the node group and checks that the number of instances in the node group matches the number we // are planning on terminating func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, error) { - // Fetch the node names for the cycleNodeRequest, using the label selector provided - t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Selecting nodes with label selector") - - kubeNodes, err := t.listReadyNodes(true) + // Start the equilibrium wait timer, if this times out as the set of nodes in kube and + // the cloud provider is not considered valid, then transition to the Healing phase as + // cycling should not proceed. 
+ timedOut, err := t.equilibriumWaitTimedOut() if err != nil { return t.transitionToHealing(err) } - if len(kubeNodes) == 0 { - return t.transitionToHealing(fmt.Errorf("no nodes matched selector")) - } - - // Only retain nodes which still exist inside cloud provider - var nodeProviderIDs []string - - for _, node := range kubeNodes { - nodeProviderIDs = append(nodeProviderIDs, node.Spec.ProviderID) - } - - existingProviderIDs, err := t.rm.CloudProvider.InstancesExist(nodeProviderIDs) - if err != nil { - return t.transitionToHealing(errors.Wrap(err, "failed to check instances that exist from cloud provider")) - } - - existingKubeNodes := make(map[string]corev1.Node) - - for _, validProviderID := range existingProviderIDs { - if node, found := kubeNodes[validProviderID]; found { - existingKubeNodes[node.Spec.ProviderID] = node - } - } - - kubeNodes = existingKubeNodes - - if len(kubeNodes) == 0 { - return t.transitionToHealing(fmt.Errorf("no existing nodes in cloud provider matched selector")) + if timedOut { + return t.transitionToHealing(fmt.Errorf( + "node count mismatch, number of kubernetes nodes does not match number of cloud provider instances after %v", + nodeEquilibriumWaitLimit, + )) } - nodeGroupNames := t.cycleNodeRequest.GetNodeGroupNames() - - // Describe the node group for the request - t.rm.LogEvent(t.cycleNodeRequest, "FetchingNodeGroup", "Fetching node group: %v", nodeGroupNames) - - if len(nodeGroupNames) == 0 { - return t.transitionToHealing(fmt.Errorf("must have at least one nodegroup name defined")) - } + // Fetch the node names for the cycleNodeRequest, using the label selector provided + t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Selecting nodes with label selector") - nodeGroups, err := t.rm.CloudProvider.GetNodeGroups(nodeGroupNames) + // Find all the nodes in kube and the cloud provider nodegroups selected by the CNR. These + // should be all the nodes in each, regardless of whether they exist in both. 
+ kubeNodes, nodeGroupInstances, err := t.findAllNodesForCycle() if err != nil { return t.transitionToHealing(err) } - // get instances inside cloud provider node groups - nodeGroupInstances := nodeGroups.Instances() + // Find all the nodes that exist in both kube and the cloud provider nodegroups. This is + // the valid set of nodes and can be worked on. This is an AND condition on the two initial + // sets of nodes. + validKubeNodes, validNodeGroupInstances := findValidNodes(kubeNodes, nodeGroupInstances) + + // Find all the nodes that exist in either kube or the cloud provider nodegroups, but not both. + // The nodes in the cloud provider can either not exist or be detached from one of the nodegroups + // and this will be determined when dealt with. This is an XOR condition on the two initial sets + // of nodes. + nodesNotInCloudProvider, nodesNotInKube := findProblemNodes(kubeNodes, nodeGroupInstances) // Do some sanity checking before we start filtering things // Check the instance count of the node group matches the number of nodes found in Kubernetes - if len(kubeNodes) != len(nodeGroupInstances) { + if len(nodesNotInCloudProvider) > 0 || len(nodesNotInKube) > 0 { var offendingNodesInfo string - nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(kubeNodes, nodeGroupInstances) - - if len(nodesNotInCPNodeGroup) > 0 { + if len(nodesNotInCloudProvider) > 0 { providerIDs := make([]string, 0) - for providerID := range nodesNotInCPNodeGroup { + for providerID := range nodesNotInCloudProvider { providerIDs = append(providerIDs, fmt.Sprintf("id %q", providerID), ) @@ -156,22 +134,7 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er t.rm.LogEvent(t.cycleNodeRequest, "NodeCountMismatch", "node group: %v, kube: %v. 
%v", - len(nodeGroupInstances), len(kubeNodes), offendingNodesInfo) - - // If it doesn't, then retry for a while in case something just scaled the node group - timedOut, err := t.equilibriumWaitTimedOut() - if err != nil { - return t.transitionToHealing(err) - } - - if timedOut { - err := fmt.Errorf( - "node count mismatch, number of kubernetes nodes does not match number of cloud provider instances after %v", - nodeEquilibriumWaitLimit, - ) - - return t.transitionToHealing(err) - } + len(validNodeGroupInstances), len(validKubeNodes), offendingNodesInfo) return reconcile.Result{Requeue: true, RequeueAfter: requeueDuration}, nil } @@ -180,7 +143,7 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er if len(t.cycleNodeRequest.Spec.NodeNames) > 0 { // If specific node names are provided, check they actually exist in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding named nodes to NodesToTerminate") - err := t.addNamedNodesToTerminate(kubeNodes, nodeGroupInstances) + err := t.addNamedNodesToTerminate(validKubeNodes, validNodeGroupInstances) if err != nil { return t.transitionToHealing(err) } @@ -188,10 +151,10 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er // Otherwise just add all the nodes in the node group t.rm.LogEvent(t.cycleNodeRequest, "SelectingNodes", "Adding all node group nodes to NodesToTerminate") - for _, kubeNode := range kubeNodes { + for _, kubeNode := range validKubeNodes { // Check to ensure the kubeNode object maps to an existing node in the ASG // If this isn't the case, this is a phantom node. Fail the cnr to be safe. 
- nodeGroupName, ok := nodeGroupInstances[kubeNode.Spec.ProviderID] + nodeGroupName, ok := validNodeGroupInstances[kubeNode.Spec.ProviderID] if !ok { return t.transitionToHealing(fmt.Errorf("kubeNode %s not found in the list of instances in the ASG", kubeNode.Name)) } @@ -209,7 +172,7 @@ func (t *CycleNodeRequestTransitioner) transitionPending() (reconcile.Result, er } if len(t.cycleNodeRequest.Spec.HealthChecks) > 0 { - if err = t.performInitialHealthChecks(kubeNodes); err != nil { + if err = t.performInitialHealthChecks(validKubeNodes); err != nil { return t.transitionToHealing(err) } } @@ -595,9 +558,16 @@ func (t *CycleNodeRequestTransitioner) transitionHealing() (reconcile.Result, er // un-cordon after attach as well t.rm.LogEvent(t.cycleNodeRequest, "UncordoningNodes", "Uncordoning nodes in node group: %v", node.Name) - if err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + + err = retry.RetryOnConflict(retry.DefaultRetry, func() error { return k8s.UncordonNode(node.Name, t.rm.RawClient) - }); err != nil { + }) + + if apierrors.IsNotFound(err) { + continue + } + + if err != nil { return t.transitionToFailed(err) } } diff --git a/pkg/controller/cyclenoderequest/transitioner/util.go b/pkg/controller/cyclenoderequest/transitioner/util.go index 8bade32..ef429bc 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util.go +++ b/pkg/controller/cyclenoderequest/transitioner/util.go @@ -251,25 +251,48 @@ func (t *CycleNodeRequestTransitioner) checkIfTransitioning(numNodesToCycle, num return false, reconcile.Result{}, nil } -// findOffendingNodes finds the offending nodes information which cause number of nodes mismatch between -// cloud provider node group and nodes inside kubernetes cluster using label selector -func findOffendingNodes(kubeNodes map[string]corev1.Node, cloudProviderNodes map[string]cloudprovider.Instance) (map[string]corev1.Node, map[string]cloudprovider.Instance) { - var nodesNotInCPNodeGroup = make(map[string]corev1.Node) - var 
nodesNotInKube = make(map[string]cloudprovider.Instance) - - for _, kubeNode := range kubeNodes { - if _, ok := cloudProviderNodes[kubeNode.Spec.ProviderID]; !ok { - nodesNotInCPNodeGroup[kubeNode.Spec.ProviderID] = kubeNode +// findValidNodes performs an AND operation on the two sets of nodes. It finds all the nodes +// in both kube and the cloud provider nodegroups. This is considered the valid set of nodes +// that can be operated on. +func findValidNodes(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) (map[string]corev1.Node, map[string]cloudprovider.Instance) { + validKubeNodes := make(map[string]corev1.Node) + validNodegroupInstances := make(map[string]cloudprovider.Instance) + + for providerId, kubeNode := range kubeNodes { + if _, exists := nodeGroupInstances[providerId]; exists { + validKubeNodes[providerId] = kubeNode } } - for providerID, cpNode := range cloudProviderNodes { - if _, ok := kubeNodes[providerID]; !ok { - nodesNotInKube[providerID] = cpNode + for providerId, nodeGroupInstance := range nodeGroupInstances { + if _, exists := kubeNodes[providerId]; exists { + validNodegroupInstances[providerId] = nodeGroupInstance } } - return nodesNotInCPNodeGroup, nodesNotInKube + return validKubeNodes, validNodegroupInstances +} + +// findProblemNodes performs an XOR operation on the two sets of nodes. It finds all the nodes +// in either kube or the cloud provider nodegroups, but not both. These are considered the +// problem sets of nodes that need to be dealt with before cycling can occur. 
+func findProblemNodes(kubeNodes map[string]corev1.Node, nodeGroupInstances map[string]cloudprovider.Instance) (map[string]corev1.Node, map[string]cloudprovider.Instance) { + problemKubeNodes := make(map[string]corev1.Node) + problemNodegroupInstances := make(map[string]cloudprovider.Instance) + + for providerId, kubeNode := range kubeNodes { + if _, exists := nodeGroupInstances[providerId]; !exists { + problemKubeNodes[providerId] = kubeNode + } + } + + for providerId, nodeGroupInstance := range nodeGroupInstances { + if _, exists := kubeNodes[providerId]; !exists { + problemNodegroupInstances[providerId] = nodeGroupInstance + } + } + + return problemKubeNodes, problemNodegroupInstances } // transitionToUnsuccessful transitions the current cycleNodeRequest to healing/failed diff --git a/pkg/controller/cyclenoderequest/transitioner/util_test.go b/pkg/controller/cyclenoderequest/transitioner/util_test.go index 22c326e..724bf10 100644 --- a/pkg/controller/cyclenoderequest/transitioner/util_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/util_test.go @@ -135,7 +135,7 @@ func TestFindOffendingNodes(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - nodesNotInCPNodeGroup, nodesNotInKube := findOffendingNodes(test.knodes, test.cnodes) + nodesNotInCPNodeGroup, nodesNotInKube := findProblemNodes(test.knodes, test.cnodes) assert.Equal(t, true, reflect.DeepEqual(test.expectNotInCPNodeGroup, nodesNotInCPNodeGroup)) assert.Equal(t, true, reflect.DeepEqual(test.expectNotInKube, nodesNotInKube)) }) From 68fc4cb691570a1ae5bc18213abe324182000116 Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 19 Aug 2024 16:38:33 +1000 Subject: [PATCH 2/5] Add test to confirm the new functionality --- pkg/cloudprovider/aws/fake/fake.go | 10 +++ .../transitioner/transitions_pending_test.go | 66 +++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/pkg/cloudprovider/aws/fake/fake.go b/pkg/cloudprovider/aws/fake/fake.go index 
06aa72c..60435fc 100644 --- a/pkg/cloudprovider/aws/fake/fake.go +++ b/pkg/cloudprovider/aws/fake/fake.go @@ -111,6 +111,16 @@ func (m *Autoscaling) DescribeAutoScalingGroups(input *autoscaling.DescribeAutoS }, nil } +func (m *Autoscaling) AttachInstances(input *autoscaling.AttachInstancesInput) (*autoscaling.AttachInstancesOutput, error) { + for _, instanceId := range input.InstanceIds { + if instance, exists := m.Instances[*instanceId]; exists { + instance.AutoscalingGroupName = *input.AutoScalingGroupName + } + } + + return &autoscaling.AttachInstancesOutput{}, nil +} + // *************** EC2 *************** // func (m *Ec2) DescribeInstances(input *ec2.DescribeInstancesInput) (*ec2.DescribeInstancesOutput, error) { diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index c95ccaa..f947da2 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -6,6 +6,8 @@ import ( v1 "github.com/atlassian-labs/cyclops/pkg/apis/atlassian/v1" "github.com/atlassian-labs/cyclops/pkg/mock" + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/autoscaling" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -346,6 +348,70 @@ func TestPendingDetachedCloudProviderNode(t *testing.T) { assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase) } +// Test to ensure that Cyclops will not proceed if there is node detached from +// the nodegroup on the cloud provider. It should wait and especially should not +// succeed if the instance is re-attached by the final requeuing of the Pending +// phase which would occur after the timeout period. 
+func TestPendingReattachedCloudProviderNode(t *testing.T) { + nodegroup, err := mock.NewNodegroup("ng-1", 2) + if err != nil { + assert.NoError(t, err) + } + + // "detach" one instance from the asg + nodegroup[0].Nodegroup = "" + + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cnr-1", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + CycleSettings: v1.CycleSettings{ + Concurrency: 1, + Method: v1.CycleNodeRequestMethodDrain, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "customer": "kitt", + }, + }, + }, + Status: v1.CycleNodeRequestStatus{ + Phase: v1.CycleNodeRequestPending, + }, + } + + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + ) + + // Should requeue while it tries to wait + _, err = fakeTransitioner.Run() + assert.NoError(t, err) + assert.Equal(t, v1.CycleNodeRequestPending, cnr.Status.Phase) + + // Simulate waiting for 1s more than the wait limit + cnr.Status.EquilibriumWaitStarted = &metav1.Time{ + Time: time.Now().Add(-nodeEquilibriumWaitLimit - time.Second), + } + + fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{ + AutoScalingGroupName: aws.String("ng-1"), + InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}), + }) + + // "re-attach" one instance from the asg + fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1" + + // This time should transition to the healing phase + _, err = fakeTransitioner.Run() + assert.Error(t, err) + assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase) +} + // Test that if no nodegroup names are given. The CNR should transition to the // Healing phase since no nodes will match in the cloud provider. 
func TestPendingNoNodegroupNamesGiven(t *testing.T) { From 21b6fd56c19a6e9fa04f85e3d0bf0820b459e032 Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 19 Aug 2024 16:41:05 +1000 Subject: [PATCH 3/5] Fix test to pass the linting --- .../cyclenoderequest/transitioner/transitions_pending_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index f947da2..6e4fc06 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -398,11 +398,13 @@ func TestPendingReattachedCloudProviderNode(t *testing.T) { Time: time.Now().Add(-nodeEquilibriumWaitLimit - time.Second), } - fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{ + _, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{ AutoScalingGroupName: aws.String("ng-1"), InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}), }) + assert.NoError(t, err) + // "re-attach" one instance from the asg fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1" From ac3fff086bb8a5be532771092e9831ccaf64f45d Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 19 Aug 2024 16:45:29 +1000 Subject: [PATCH 4/5] Fix comments for the test --- .../transitioner/transitions_pending_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index 6e4fc06..dc00d3a 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -405,10 +405,11 @@ func TestPendingReattachedCloudProviderNode(t *testing.T) { assert.NoError(t, err) - // 
"re-attach" one instance from the asg + // "re-attach" the instance to the asg fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1" - // This time should transition to the healing phase + // This time should transition to the healing phase even though the state + // is correct because the timeout check happens first _, err = fakeTransitioner.Run() assert.Error(t, err) assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase) From ccc6ae50c5a27f24279e58a92cd3c4c41c23945a Mon Sep 17 00:00:00 2001 From: vportella Date: Mon, 19 Aug 2024 16:53:51 +1000 Subject: [PATCH 5/5] Add another test to simulate node state fixed within timeout period of Pending phase --- .../transitioner/transitions_pending_test.go | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go index dc00d3a..456b511 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions_pending_test.go @@ -295,7 +295,7 @@ func TestPendingNoKubeNodes(t *testing.T) { // Test to ensure that Cyclops will not proceed if there is node detached from // the nodegroup on the cloud provider. It should try to wait for the issue to -// resolve transition to the Healing phase if it doesn't. +// resolve to transition to the Healing phase if it doesn't. func TestPendingDetachedCloudProviderNode(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { @@ -348,11 +348,79 @@ func TestPendingDetachedCloudProviderNode(t *testing.T) { assert.Equal(t, v1.CycleNodeRequestHealing, cnr.Status.Phase) } +// Test to ensure that Cyclops will not proceed if there is node detached from +// the nodegroup on the cloud provider. 
It should try to wait for the issue to +// resolve and transition to Initialised when it does before reaching the +// timeout period. +func TestPendingReattachedCloudProviderNode(t *testing.T) { + nodegroup, err := mock.NewNodegroup("ng-1", 2) + if err != nil { + assert.NoError(t, err) + } + + // "detach" one instance from the asg + nodegroup[0].Nodegroup = "" + + cnr := &v1.CycleNodeRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: "cnr-1", + Namespace: "kube-system", + }, + Spec: v1.CycleNodeRequestSpec{ + NodeGroupsList: []string{"ng-1"}, + CycleSettings: v1.CycleSettings{ + Concurrency: 1, + Method: v1.CycleNodeRequestMethodDrain, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "customer": "kitt", + }, + }, + }, + Status: v1.CycleNodeRequestStatus{ + Phase: v1.CycleNodeRequestPending, + }, + } + + fakeTransitioner := NewFakeTransitioner(cnr, + WithKubeNodes(nodegroup), + WithCloudProviderInstances(nodegroup), + ) + + // Should requeue while it tries to wait + _, err = fakeTransitioner.Run() + assert.NoError(t, err) + assert.Equal(t, v1.CycleNodeRequestPending, cnr.Status.Phase) + + // Simulate waiting for 1s less than the wait limit + cnr.Status.EquilibriumWaitStarted = &metav1.Time{ + Time: time.Now().Add(-nodeEquilibriumWaitLimit + time.Second), + } + + _, err = fakeTransitioner.Autoscaling.AttachInstances(&autoscaling.AttachInstancesInput{ + AutoScalingGroupName: aws.String("ng-1"), + InstanceIds: aws.StringSlice([]string{nodegroup[0].InstanceID}), + }) + + assert.NoError(t, err) + + // "re-attach" the instance to the asg + fakeTransitioner.cloudProviderInstances[0].Nodegroup = "ng-1" + + // The CNR should transition to the Initialised phase because the state of + // the nodes is now correct and this happened within the timeout period. 
+ _, err = fakeTransitioner.Run() + assert.NoError(t, err) + assert.Equal(t, v1.CycleNodeRequestInitialised, cnr.Status.Phase) + assert.Len(t, cnr.Status.NodesToTerminate, 2) +} + // Test to ensure that Cyclops will not proceed if there is node detached from // the nodegroup on the cloud provider. It should wait and especially should not // succeed if the instance is re-attached by the final requeuing of the Pending // phase which would occur after the timeout period. -func TestPendingReattachedCloudProviderNode(t *testing.T) { +func TestPendingReattachedCloudProviderNodeTooLate(t *testing.T) { nodegroup, err := mock.NewNodegroup("ng-1", 2) if err != nil { assert.NoError(t, err)