Skip to content

Commit

Permalink
Less disturbing divergent execution log if the cause is a protocol up…
Browse files Browse the repository at this point in the history
…date mismatch (#827)
  • Loading branch information
dhedey authored Jan 26, 2024
2 parents 7084e3b + d255f50 commit c5f5cf1
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 37 deletions.
3 changes: 3 additions & 0 deletions core-rust/state-manager/src/protocol/protocol_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ use ProtocolUpdateEnactmentCondition::*;

#[derive(Clone, Debug, Eq, PartialEq, ScryptoCategorize, ScryptoEncode, ScryptoDecode)]
pub struct ProtocolState {
/// A protocol version currently in use. The latest enacted version or the genesis version.
pub current_protocol_version: ProtocolVersionName,
/// A list of all protocol updates that have been enacted.
pub enacted_protocol_updates: BTreeMap<StateVersion, ProtocolVersionName>,
/// A list of protocol updates that haven't yet been enacted, but still can be in the future.
pub pending_protocol_updates: Vec<PendingProtocolUpdate>,
}

Expand Down
54 changes: 46 additions & 8 deletions core-rust/state-manager/src/state_computer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1195,14 +1195,11 @@ where
);
}

if series_executor.next_protocol_version() != commit_ledger_header.next_protocol_version {
panic!(
"resultant protocol update at version {} differs from the proof ({:?} != {:?})",
commit_state_version,
series_executor.next_protocol_version(),
commit_ledger_header.next_protocol_version
);
}
self.verify_post_commit_protocol_version(
commit_state_version,
&series_executor.next_protocol_version(),
&commit_ledger_header.next_protocol_version,
);

let final_ledger_hashes = series_executor.latest_ledger_hashes();
if final_ledger_hashes != &commit_ledger_header.hashes {
Expand Down Expand Up @@ -1270,6 +1267,47 @@ where
})
}

fn verify_post_commit_protocol_version(
&self,
commit_state_version: StateVersion,
next_version_from_local_execution: &Option<ProtocolVersionName>,
next_version_from_proof: &Option<ProtocolVersionName>,
) {
match (next_version_from_local_execution, next_version_from_proof) {
(Some(local_version), Some(proof_version)) if local_version == proof_version => {
// All good, local execution enacts the same protocol version as proof. No-op.
}
(Some(local_version), Some(proof_version)) => {
// We're enacting a different version than the proof
panic!(
"FATAL: At state version {} this node wants to enact {}, but this doesn't match the proof, which enacts {} (i.e. the validator set have enacted a different version). Make sure your node is running the latest software version.",
commit_state_version,
local_version,
proof_version,
);
}
(Some(local_version), None) => {
// We're enacting locally, but the proof doesn't enact anything
panic!(
"FATAL: At state version {} this node wants to enact {}, but this doesn't match the proof, which doesn't enact anything (i.e. the validator set haven't enacted it). Make sure your node is running the latest software version.",
commit_state_version,
local_version,
);
}
(None, Some(proof_version)) => {
// The proof enacts, but we locally don't
panic!(
"FATAL: At state version {} the validator set have enacted {}, but this node didn't expect it to happen (expected enactment at a different time, or protocol version unknown). Make sure your node is running the latest software version.",
commit_state_version,
proof_version,
);
}
(None, None) => {
// All good, nothing gets enacted. No-op.
}
}
}

pub fn handle_protocol_update(
&self,
protocol_version_name: &ProtocolVersionName,
Expand Down
109 changes: 91 additions & 18 deletions core/src/main/java/com/radixdlt/consensus/PendingVotes.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.common.hash.HashCode;
import com.google.common.util.concurrent.RateLimiter;
import com.radixdlt.SecurityCritical;
import com.radixdlt.SecurityCritical.SecurityKind;
import com.radixdlt.consensus.bft.BFTValidatorId;
Expand Down Expand Up @@ -98,48 +99,120 @@
public final class PendingVotes {
private static final Logger log = LogManager.getLogger();

private final BFTValidatorId self;
private final Map<HashCode, ValidationState> voteState = Maps.newHashMap();
private final Map<HashCode, ValidationState> timeoutVoteState = Maps.newHashMap();
private final Map<BFTValidatorId, PreviousVote> previousVotes = Maps.newHashMap();
private final Hasher hasher;
private final EventDispatcher<ConsensusByzantineEvent> doubleVoteEventDispatcher;
private final BFTValidatorSet validatorSet;
private final Metrics metrics;
private final RateLimiter divergentVertexExecutionLogRateLimiter = RateLimiter.create(0.1);

public PendingVotes(
BFTValidatorId self,
Hasher hasher,
EventDispatcher<ConsensusByzantineEvent> doubleVoteEventDispatcher,
BFTValidatorSet validatorSet,
Metrics metrics) {
this.self = self;
this.hasher = Objects.requireNonNull(hasher);
this.doubleVoteEventDispatcher = Objects.requireNonNull(doubleVoteEventDispatcher);
this.validatorSet = Objects.requireNonNull(validatorSet);
this.metrics = metrics;
}

private void checkForDivergentVertexExecution(Vote vote) {
final var voteVertexId = vote.getVoteData().getProposed().getVertexId();
final var voteLedgerHeader = vote.getVoteData().getProposed().getLedgerHeader();
private void checkForDivergentVertexExecution(Vote incomingVote) {
final var voteVertexId = incomingVote.getVoteData().getProposed().getVertexId();
final var voteLedgerHeader = incomingVote.getVoteData().getProposed().getLedgerHeader();
for (var otherVote : this.previousVotes.entrySet()) {
final var otherVertexId = otherVote.getValue().proposedHeader().getVertexId();
final var otherLedgerHeader = otherVote.getValue().proposedHeader().getLedgerHeader();
if (voteVertexId.equals(otherVertexId) && !voteLedgerHeader.equals(otherLedgerHeader)) {
log.warn(
"Divergent vertex execution detected! An incoming vote (from {}) for vertex {} claims"
+ " the following resultant ledger header: {}, while validator {} thinks that the"
+ " resultant ledger header is {}. [this_vote={}, other_vote={}]",
vote.getAuthor(),
voteVertexId,
voteLedgerHeader,
otherVote.getKey(),
otherLedgerHeader,
vote,
otherVote);
if (!voteLedgerHeader
.nextProtocolVersion()
.equals(otherLedgerHeader.nextProtocolVersion())) {
logDivergentProtocolUpdateVote(incomingVote, otherVote.getKey(), otherVote.getValue());
} else {
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"Divergent vertex execution detected! An incoming vote (from {}) for vertex {}"
+ " claims the following resultant ledger header: {}, while validator {} thinks"
+ " that the resultant ledger header is {}. [this_vote={}, other_vote={}]",
incomingVote.getAuthor(),
voteVertexId,
voteLedgerHeader,
otherVote.getKey(),
otherLedgerHeader,
incomingVote,
otherVote);
}
}

// Regardless of the reason and whether the divergence applies to this node,
// we're bumping the metrics.
this.metrics.bft().divergentVertexExecutions().inc();
}
}
}

private void logDivergentProtocolUpdateVote(
Vote incomingVote, BFTValidatorId otherVoteAuthor, PreviousVote otherVote) {
// Protocol update-related divergence is most likely caused by some nodes running an
// outdated version. This is somewhat expected, so we're not going to log these occurrences
// unless they apply to this node.
final var isIncomingVoteFromSelf = self.equals(incomingVote.getAuthor());
final var isExistingVoteFromSelf = self.equals(otherVoteAuthor);
if (isIncomingVoteFromSelf || isExistingVoteFromSelf) {
final var selfProposedHeader =
isIncomingVoteFromSelf
? incomingVote.getVoteData().getProposed()
: otherVote.proposedHeader();
final var otherProposedHeader =
isIncomingVoteFromSelf
? otherVote.proposedHeader()
: incomingVote.getVoteData().getProposed();
final var selfNextVersion = selfProposedHeader.getLedgerHeader().nextProtocolVersion();
final var otherNextVersion = otherProposedHeader.getLedgerHeader().nextProtocolVersion();
final var otherAuthor = isIncomingVoteFromSelf ? otherVoteAuthor : incomingVote.getAuthor();
if (selfNextVersion.isPresent() && otherNextVersion.isEmpty()) {
// We're enacting, they don't: just a debug log.
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.debug(
"""
Received a BFT vote from {} that doesn't enact a protocol update,
while we're enacting {}.
This indicates that they're likely running an outdated node version.
This node is unaffected, unless other error messages follow.""",
otherAuthor,
selfNextVersion);
} else if (selfNextVersion.isEmpty() && otherNextVersion.isPresent()) {
// We're not enacting, but they are: we're potentially outdated, log an early warning
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"""
Received a BFT vote from {} that enacts a protocol update {}, but this node doesn't enact it.
This node might be running an outdated version (but it's not certain).""",
otherAuthor,
otherNextVersion);
}
} else if (selfNextVersion.isPresent() && otherNextVersion.isPresent()) {
// We're enacting a different version, one of us is likely outdated.
if (divergentVertexExecutionLogRateLimiter.tryAcquire()) {
log.warn(
"Received a BFT vote from {} that enacts a protocol update {}, but this node enacts"
+ " {}. It is likely that one of the nodes (this node or {}) is running an"
+ " outdated version.",
otherAuthor,
otherNextVersion,
selfNextVersion,
otherAuthor);
}
}
} /* else: no-op; we don't care about other cases */
} /* else: no-op; we don't care if the vote doesn't affect this node */
}

/**
* Inserts a vote for a given vertex, attempting to form either a quorum certificate for that
* vertex or a timeout certificate. A quorum will only be formed if permitted by the {@link
Expand All @@ -153,14 +226,14 @@ public VoteProcessingResult insertVote(Vote vote) {
final VoteData voteData = vote.getVoteData();
final HashCode voteDataHash = this.hasher.hashDsonEncoded(voteData);

// This doesn't do anything, other than logging and bumping the metrics,
// when divergent execution is detected (which should hopefully never happen).
checkForDivergentVertexExecution(vote);

if (!validatorSet.containsValidator(author)) {
return VoteProcessingResult.rejected(VoteRejectedReason.INVALID_AUTHOR);
}

// This doesn't do anything, other than logging and bumping the metrics,
// when divergent execution is detected (which should hopefully never happen).
checkForDivergentVertexExecution(vote);

if (!replacePreviousVote(author, vote, voteDataHash)) {
return VoteProcessingResult.rejected(VoteRejectedReason.DUPLICATE_VOTE);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ public BFTEventProcessor build() {
return EmptyBFTEventProcessor.INSTANCE;
}
final PendingVotes pendingVotes =
new PendingVotes(hasher, doubleVoteDispatcher, validatorSet, metrics);
new PendingVotes(self, hasher, doubleVoteDispatcher, validatorSet, metrics);

/* Setting up the following BFT event processing pipeline:
ObsoleteEventsFilter (filters out obsolete events for past rounds)
Expand Down
22 changes: 12 additions & 10 deletions core/src/test/java/com/radixdlt/consensus/PendingVotesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
import com.radixdlt.crypto.ECDSASecp256k1Signature;
import com.radixdlt.crypto.HashUtils;
import com.radixdlt.crypto.Hasher;
import com.radixdlt.lang.Option;
import com.radixdlt.monitoring.Metrics;
import com.radixdlt.monitoring.MetricsInitializer;
import com.radixdlt.utils.RandomHasher;
Expand All @@ -90,12 +91,13 @@
import org.junit.Test;

public class PendingVotesTest {
private BFTValidatorId self;
private Hasher hasher;

private Metrics metrics;

@Before
public void setup() {
this.self = BFTValidatorId.random();
this.hasher = new RandomHasher();
this.metrics = new MetricsInitializer().initialize();
}
Expand All @@ -116,7 +118,7 @@ public void when_inserting_valid_but_unaccepted_votes__then_no_qc_is_returned()
BFTValidatorSet validatorSet =
BFTValidatorSet.from(
Collections.singleton(BFTValidator.from(vote1.getAuthor(), UInt192.ONE)));
final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

VoteData voteData = mock(VoteData.class);
BFTHeader proposed = vote1.getVoteData().getProposed();
Expand Down Expand Up @@ -145,7 +147,7 @@ public void when_inserting_valid_and_accepted_votes__then_qc_is_formed() {
BFTHeader proposed = vote.getVoteData().getProposed();
when(voteData.getProposed()).thenReturn(proposed);

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

assertTrue(pendingVotes.insertVote(vote) instanceof VoteProcessingResult.QuorumReached);
}
Expand All @@ -167,7 +169,7 @@ public void when_inserting_valid_timeout_votes__then_tc_is_formed() {
BFTValidator.from(vote1.getAuthor(), UInt192.ONE),
BFTValidator.from(vote2.getAuthor(), UInt192.ONE)));

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

assertTrue(pendingVotes.insertVote(vote1) instanceof VoteProcessingResult.VoteAccepted);

Expand Down Expand Up @@ -197,7 +199,7 @@ public void when_voting_again__previous_vote_is_removed() {
BFTHeader proposed = vote.getVoteData().getProposed();
when(voteData.getProposed()).thenReturn(proposed);

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

// Preconditions
assertEquals(VoteProcessingResult.accepted(), pendingVotes.insertVote(vote));
Expand Down Expand Up @@ -230,7 +232,7 @@ public void when_voting_again__previous_timeoutvote_is_removed() {
BFTHeader proposed = vote.getVoteData().getProposed();
when(voteData.getProposed()).thenReturn(proposed);

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

// Preconditions
assertEquals(VoteProcessingResult.accepted(), pendingVotes.insertVote(vote));
Expand Down Expand Up @@ -265,7 +267,7 @@ public void when_submitting_a_duplicate_vote__then_can_be_replaced_if_has_timeou
BFTValidator.from(vote1.getAuthor(), UInt192.ONE),
BFTValidator.from(vote2.getAuthor(), UInt192.ONE)));

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

assertTrue(pendingVotes.insertVote(vote1) instanceof VoteProcessingResult.VoteAccepted);

Expand Down Expand Up @@ -299,7 +301,7 @@ public void divergent_vertex_execution_should_be_detected() {
final var vertexId = HashUtils.random256();

final var ledgerHeader1 = mock(LedgerHeader.class);
when(ledgerHeader1.getStateVersion()).thenReturn(1L);
when(ledgerHeader1.nextProtocolVersion()).thenReturn(Option.empty());
final var bftHeader1 = mock(BFTHeader.class);
when(bftHeader1.getLedgerHeader()).thenReturn(ledgerHeader1);
when(bftHeader1.getVertexId()).thenReturn(vertexId);
Expand All @@ -311,7 +313,7 @@ public void divergent_vertex_execution_should_be_detected() {
when(vote1.getVoteData()).thenReturn(voteData1);

final var ledgerHeader2 = mock(LedgerHeader.class);
when(ledgerHeader1.getStateVersion()).thenReturn(2L);
when(ledgerHeader2.nextProtocolVersion()).thenReturn(Option.empty());
final var bftHeader2 = mock(BFTHeader.class);
when(bftHeader2.getLedgerHeader()).thenReturn(ledgerHeader2);
when(bftHeader2.getVertexId()).thenReturn(vertexId);
Expand All @@ -328,7 +330,7 @@ public void divergent_vertex_execution_should_be_detected() {
BFTValidator.from(vote1.getAuthor(), UInt192.ONE),
BFTValidator.from(vote2.getAuthor(), UInt192.ONE)));

final var pendingVotes = new PendingVotes(hasher, e -> {}, validatorSet, metrics);
final var pendingVotes = new PendingVotes(self, hasher, e -> {}, validatorSet, metrics);

// Should still accept both votes...
assertTrue(pendingVotes.insertVote(vote1) instanceof VoteProcessingResult.VoteAccepted);
Expand Down

0 comments on commit c5f5cf1

Please sign in to comment.