Skip to content

Commit

Permalink
feat: enhance reconcile process to fix inconsistency
Browse files Browse the repository at this point in the history
  • Loading branch information
soma00333 committed Jan 21, 2025
1 parent c2343a0 commit d3d6d29
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 11 deletions.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ require (
go.etcd.io/etcd/client/v2 v2.305.17 // indirect
go.etcd.io/etcd/pkg/v3 v3.5.17 // indirect
go.etcd.io/etcd/raft/v3 v3.5.17 // indirect
go.etcd.io/gofail v0.2.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect
go.opentelemetry.io/otel v1.28.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ go.etcd.io/etcd/raft/v3 v3.5.17 h1:wHPW/b1oFBw/+HjDAQ9vfr17OIInejTIsmwMZpK1dNo=
go.etcd.io/etcd/raft/v3 v3.5.17/go.mod h1:uapEfOMPaJ45CqBYIraLO5+fqyIY2d57nFfxzFwy4D4=
go.etcd.io/etcd/server/v3 v3.5.17 h1:xykBwLZk9IdDsB8z8rMdCCPRvhrG+fwvARaGA0TRiyc=
go.etcd.io/etcd/server/v3 v3.5.17/go.mod h1:40sqgtGt6ZJNKm8nk8x6LexZakPu+NDl/DCgZTZ69Cc=
go.etcd.io/gofail v0.2.0 h1:p19drv16FKK345a09a1iubchlw/vmRuksmRzgBIGjcA=
go.etcd.io/gofail v0.2.0/go.mod h1:nL3ILMGfkXTekKI3clMBNazKnjUZjYLKmBHzsVAnC1o=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 h1:9G6E0TXzGFVfTnawRzrPl83iHOAV7L8NJiR8RSGYV1g=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0/go.mod h1:azvtTADFQJA8mX80jIH/akaE7h+dbm/sVuaHqN13w74=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA=
Expand Down
44 changes: 33 additions & 11 deletions internal/controller/etcdcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,21 +142,32 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
targetReplica := *sts.Spec.Replicas // Start with the current size of the stateful set

// TODO: finish the logic later
// The number of replicas in the StatefulSet doesn't match the number of etcd members in the cluster.
if int(targetReplica) != memberCnt {
// TODO: finish the logic later
// nolint:staticcheck // Temporarily disable staticcheck
logger.Info("The expected number of replicas doesn't match the number of etcd members in the cluster", "targetReplica", targetReplica, "memberCnt", memberCnt)
if int(targetReplica) < memberCnt {
// a newly added learner hasn't started yet

// re-generate configuration for the new learner member;
// increase the StatefulSet's replicas by 1
// A new member has been added to the etcd cluster
// but the corresponding Pod hasn't been created yet in the StatefulSet.
// Increase the StatefulSet replicas by 1 to match the new cluster member.
newReplicaCount := targetReplica + 1
logger.Info("Increasing StatefulSet replicas to match the new etcd learner.", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
if err != nil {
return ctrl.Result{}, err
}
} else {
// an already removed member hasn't stopped yet.

// Decrease the StatefulSet's replicas by 1
// A member has been removed from the etcd cluster
// but the corresponding Pod is still running.
// Decrease the StatefulSet replicas by 1 to remove the unneeded Pod.
logger.Info("An etcd member was removed from the cluster, but the StatefulSet hasn't scaled down yet.")
newReplicaCount := targetReplica - 1
logger.Info("Decreasing StatefulSet replicas to remove the unneeded Pod.", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
if err != nil {
return ctrl.Result{}, err
}
}
// return
return ctrl.Result{RequeueAfter: requeueDuration}, nil
}

var (
Expand Down Expand Up @@ -214,6 +225,10 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if _, err := etcdutils.AddMember(eps, []string{peerURL}, true); err != nil {
return ctrl.Result{}, err
}
// Failpoint: simulate an operator crash after the learner has been added but before the StatefulSet replicas are updated.
// gofail: var CrashAfterAddMember struct{}
// logger.Info("gofail CrashAfterAddMember triggered")
// os.Exit(1)

logger.Info("Learner member added successfully", "peerURLs", peerURL)
} else {
Expand All @@ -228,6 +243,13 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := etcdutils.RemoveMember(eps, memberID); err != nil {
return ctrl.Result{}, err
}

// Failpoint: simulate an operator crash after the member has been removed but before the StatefulSet replicas are updated.
// gofail: var CrashAfterRemoveMember struct{}
// logger.Info("gofail CrashAfterRemoveMember triggered")
// os.Exit(1)

logger.Info("Member removed successfully", "memberID", memberID)
}

sts, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, targetReplica, r.Scheme)
Expand Down

0 comments on commit d3d6d29

Please sign in to comment.