From 182fabfe28342abd8f623fd70f10cbe740863920 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 29 May 2024 13:37:30 +0100 Subject: [PATCH 1/2] raft: re-enable config change safety Config changes in this raft implementation require a safety constraint: the leader must not append a config change if it hasn't applied all config changes in its log. The DisableConfChangeValidation flag disables this check under the assumption that the state machine layer provides the equivalent guarantee. However, it is hard to argue that this is true in split leaseholder/leader scenarios. This commit re-enables this check, to bring the safety back. The other two state-machine-level checks concerned with entering and leaving joint configs can still be disabled. Epic: none Release note: none --- pkg/raft/raft.go | 12 +++- .../confchange_disable_validation.txt | 66 ++++++++++--------- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/pkg/raft/raft.go b/pkg/raft/raft.go index 83e16e54f812..5ee4e6482e71 100644 --- a/pkg/raft/raft.go +++ b/pkg/raft/raft.go @@ -1279,7 +1279,13 @@ func stepLeader(r *raft, m pb.Message) error { cc = ccc } if cc != nil { + // Per the "Apply" invariant in the config change safety argument[^1], + // the leader must not append a config change if it hasn't applied all + // config changes in its log. + // + // [^1]: https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411 alreadyPending := r.pendingConfIndex > r.raftLog.applied + alreadyJoint := len(r.trk.Config.Voters[1]) > 0 wantsLeaveJoint := len(cc.AsV2().Changes) == 0 @@ -1292,7 +1298,11 @@ func stepLeader(r *raft, m pb.Message) error { failedCheck = "not in joint state; refusing empty conf change" } - if failedCheck != "" && !r.disableConfChangeValidation { + // Allow disabling config change constraints that are guaranteed by the + // upper state machine layer (incorrect ones will apply as no-ops). + // + // NB: !alreadyPending requirement is always respected, for safety. + if alreadyPending || (failedCheck != "" && !r.disableConfChangeValidation) { r.logger.Infof("%x ignoring conf change %v at config %s: %s", r.id, cc, r.trk.Config, failedCheck) m.Entries[i] = pb.Entry{Type: pb.EntryNormal} } else { diff --git a/pkg/raft/testdata/confchange_disable_validation.txt b/pkg/raft/testdata/confchange_disable_validation.txt index 1a2bc4fdf812..66c1113505f5 100644 --- a/pkg/raft/testdata/confchange_disable_validation.txt +++ b/pkg/raft/testdata/confchange_disable_validation.txt @@ -32,45 +32,47 @@ l2 l3 ---- ok -# Entries both get appended. -process-ready 1 ----- -Ready MustSync=true: -Entries: -1/4 EntryNormal "foo" -1/5 EntryConfChangeV2 l2 l3 - -# Dummy entry comes up for application. -process-ready 1 ----- -Ready MustSync=false: -HardState Term:1 Vote:1 Commit:5 -CommittedEntries: -1/4 EntryNormal "foo" - -# Propose new config change. Note how it isn't rejected, -# which is due to DisableConfChangeValidation=true. -propose-conf-change 1 ----- -ok - -# Turn on autopilot: the first config change applies, the -# second one gets committed and also applies. -stabilize +# Entries both get appended and applied. +stabilize 1 ---- > 1 handling Ready Ready MustSync=true: Entries: - 1/6 EntryConfChangeV2 + 1/4 EntryNormal "foo" + 1/5 EntryConfChangeV2 l2 l3 +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/4 EntryNormal "foo" +> 1 handling Ready + Ready MustSync=false: CommittedEntries: 1/5 EntryConfChangeV2 l2 l3 INFO 1 switched to configuration voters=(1)&&(1) learners=(2 3) > 1 handling Ready Ready MustSync=false: - HardState Term:1 Vote:1 Commit:6 - CommittedEntries: - 1/6 EntryConfChangeV2 Messages: - 1->2 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2] - 1->3 MsgApp Term:1 Log:1/5 Commit:5 Entries:[1/6 EntryConfChangeV2] - INFO 1 switched to configuration voters=(1) learners=(2 3) + 1->2 MsgApp Term:1 Log:1/4 Commit:5 Entries:[1/5 EntryConfChangeV2 l2 l3] + 1->3 MsgApp Term:1 Log:1/4 Commit:5 Entries:[1/5 EntryConfChangeV2 l2 l3] + +# Propose new config change. Note how it isn't rejected, +# which is due to DisableConfChangeValidation=true. +propose-conf-change 1 +l4 +---- +ok + +# The new config change is appended to the log. +process-ready 1 +---- +Ready MustSync=true: +Entries: +1/6 EntryConfChangeV2 l4 + +# If we process-ready on node 1 now, the second config change will come up for +# application, and the node will panic with "config is already joint". The state +# machine must ensure not to apply it. +# +# TODO(pav-kv): support no-op command application in tests, or support asserting +# that the node panics. From 858781595ed3e60ffbf69b43480a0480033fec68 Mon Sep 17 00:00:00 2001 From: Pavel Kalinnikov Date: Wed, 29 May 2024 18:34:46 +0100 Subject: [PATCH 2/2] testdata: add test for dropping conf change This commit adds a test which ensures a config change is not proposed if the leader has another yet unapplied config change. Epic: none Release note: none --- .../testdata/confchange_drop_if_unapplied.txt | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 pkg/raft/testdata/confchange_drop_if_unapplied.txt diff --git a/pkg/raft/testdata/confchange_drop_if_unapplied.txt b/pkg/raft/testdata/confchange_drop_if_unapplied.txt new file mode 100644 index 000000000000..83f4f97f9125 --- /dev/null +++ b/pkg/raft/testdata/confchange_drop_if_unapplied.txt @@ -0,0 +1,59 @@ +# This test verifies that a config change is not proposed if the leader has +# unapplied config changes. This ensures a safety requirement stated in +# https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411 + +# The check should be performed even if conf change validation is disabled. +add-nodes 1 voters=(1) index=2 disable-conf-change-validation=true +---- +INFO 1 switched to configuration voters=(1) +INFO 1 became follower at term 0 +INFO newRaft 1 [peers: [1], term: 0, commit: 2, applied: 2, lastindex: 2, lastterm: 1] + +campaign 1 +---- +INFO 1 is starting a new election at term 0 +INFO 1 became candidate at term 1 + +stabilize log-level=none +---- +ok + +# Propose one config change. It should be accepted. +propose-conf-change 1 transition=explicit +l2 l3 +---- +ok + +# The first config change gets appended. +process-ready 1 +---- +Ready MustSync=true: +Entries: +1/4 EntryConfChangeV2 l2 l3 + +# Propose another config change. It should be rejected, because the first config +# change hasn't applied on the leader yet. +propose-conf-change 1 +l4 +---- +INFO 1 ignoring conf change {ConfChangeTransitionAuto [{ConfChangeAddLearnerNode 4}] []} at config voters=(1): possible unapplied conf change at index 4 (applied to 3) + +# The new config change is appended to the log as an empty entry. +stabilize 1 +---- +> 1 handling Ready + Ready MustSync=true: + HardState Term:1 Vote:1 Commit:4 + Entries: + 1/5 EntryNormal "" + CommittedEntries: + 1/4 EntryConfChangeV2 l2 l3 + INFO 1 switched to configuration voters=(1)&&(1) learners=(2 3) +> 1 handling Ready + Ready MustSync=false: + HardState Term:1 Vote:1 Commit:5 + CommittedEntries: + 1/5 EntryNormal "" + Messages: + 1->2 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryNormal ""] + 1->3 MsgApp Term:1 Log:1/4 Commit:4 Entries:[1/5 EntryNormal ""]