Skip to content

Commit

Permalink
split scrubbing critevents (#2762)
Browse files Browse the repository at this point in the history
  • Loading branch information
WilyTiger authored Dec 27, 2024
1 parent e708405 commit 9e190a1
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 22 deletions.
3 changes: 2 additions & 1 deletion cloud/blockstore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ namespace NCloud::NBlockStore {
xxx(MirroredDiskDeviceReplacementForbidden) \
xxx(MirroredDiskDeviceReplacementFailure) \
xxx(MirroredDiskDeviceReplacementRateLimitExceeded) \
xxx(MirroredDiskChecksumMismatch) \
xxx(MirroredDiskMinorityChecksumMismatch) \
xxx(MirroredDiskMajorityChecksumMismatch) \
xxx(CounterUpdateRace) \
xxx(EndpointStartingError) \
xxx(ResyncFailed) \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,12 +215,16 @@ void TMirrorPartitionActor::CompareChecksums(const TActorContext& ctx)
checksums[i]);
}
++ChecksumMismatches;
ReportMirroredDiskChecksumMismatch();

const bool hasQuorum = majorCount > checksums.size() / 2;
if (Config->GetResyncRangeAfterScrubbing() && hasQuorum) {
StartResyncRange(ctx);
return;
if (hasQuorum) {
ReportMirroredDiskMinorityChecksumMismatch();
if (Config->GetResyncRangeAfterScrubbing()) {
StartResyncRange(ctx);
return;
}
} else {
ReportMirroredDiskMajorityChecksumMismatch();
}
}

Expand Down
49 changes: 32 additions & 17 deletions cloud/blockstore/libs/storage/partition_nonrepl/part_mirror_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1190,11 +1190,12 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
WaitUntilScrubbingFinishesCurrentCycle(env);

auto& counters = env.StorageStatsServiceState->Counters;
auto mirroredDiskChecksumMismatch = critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskChecksumMismatch",
true);
auto mirroredDiskMinorityChecksumMismatch =
critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskMinorityChecksumMismatch",
true);

UNIT_ASSERT_VALUES_EQUAL(2, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(2, mirroredDiskMinorityChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(2, counters.Simple.ChecksumMismatches.Value);

const auto range3 = TBlockRange64::WithLength(1025, 50);
Expand All @@ -1207,7 +1208,7 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
WaitUntilScrubbingFinishesCurrentCycle(env);
WaitUntilScrubbingFinishesCurrentCycle(env);
UNIT_ASSERT_VALUES_EQUAL(3, counters.Simple.ChecksumMismatches.Value);
UNIT_ASSERT_VALUES_EQUAL(3, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(3, mirroredDiskMinorityChecksumMismatch->Val());

// at this point, scrubbing may not start from the beginning,
// so we need to wait for 2 cycles to be sure that
Expand All @@ -1217,7 +1218,7 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)

// check that all ranges were resynced and there are no more mismatches
UNIT_ASSERT_VALUES_EQUAL(3, counters.Simple.ChecksumMismatches.Value);
UNIT_ASSERT_VALUES_EQUAL(3, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(3, mirroredDiskMinorityChecksumMismatch->Val());
}

Y_UNIT_TEST(ShouldPostponeScrubbingIfIntersectingWritePending)
Expand Down Expand Up @@ -1271,18 +1272,18 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
});
env.WriteActor(env.ActorId, range, 'D');

auto mirroredDiskChecksumMismatch = counters->GetCounter(
"AppCriticalEvents/MirroredDiskChecksumMismatch",
auto mirroredDiskMinorityChecksumMismatch = counters->GetCounter(
"AppCriticalEvents/MirroredDiskMinorityChecksumMismatch",
true);
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());

rangeCount = 0;
ui32 iterations = 0;
while (rangeCount < 5 && iterations++ < 100) {
runtime.DispatchEvents({}, env.ScrubbingInterval);
}

UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());
}

Y_UNIT_TEST(ShouldNotFindMismatchIfChecksumIntersectedWithWrite)
Expand Down Expand Up @@ -1382,11 +1383,11 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
runtime.DispatchEvents({}, env.ScrubbingInterval);
}

auto mirroredDiskChecksumMismatch = counters->GetCounter(
"AppCriticalEvents/MirroredDiskChecksumMismatch",
auto mirroredDiskMinorityChecksumMismatch = counters->GetCounter(
"AppCriticalEvents/MirroredDiskMinorityChecksumMismatch",
true);

UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());
}

Y_UNIT_TEST(ShouldNotFindMismatchIfWriteRequestToOneReplicaHasError)
Expand Down Expand Up @@ -1445,12 +1446,12 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
runtime.AdvanceCurrentTime(UpdateCountersInterval);
runtime.DispatchEvents({}, TDuration::MilliSeconds(50));

auto mirroredDiskChecksumMismatch = critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskChecksumMismatch",
auto mirroredDiskMinorityChecksumMismatch = critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskMinorityChecksumMismatch",
true);
auto& counters = env.StorageStatsServiceState->Counters;

UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, counters.Simple.ScrubbingProgress.Value);

client.SendWriteBlocksLocalRequest(range, data);
Expand All @@ -1464,7 +1465,7 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
runtime.DispatchEvents({}, TDuration::MilliSeconds(50));
}

UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());
}

Y_UNIT_TEST(ShouldRejectRequestsIfRangeResyncingAfterChecksumMismatch)
Expand Down Expand Up @@ -1586,6 +1587,16 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)

TTestEnv env(runtime);
auto& counters = env.StorageStatsServiceState->Counters;
TDynamicCountersPtr critEventsCounters = new TDynamicCounters();
InitCriticalEventsCounter(critEventsCounters);
auto mirroredDiskMajorityChecksumMismatch =
critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskMajorityChecksumMismatch",
true);
auto mirroredDiskMinorityChecksumMismatch =
critEventsCounters->GetCounter(
"AppCriticalEvents/MirroredDiskMinorityChecksumMismatch",
true);

// Write different data to all replicas
const auto range = TBlockRange64::WithLength(2049, 50);
Expand All @@ -1599,6 +1610,8 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
WaitUntilScrubbingFinishesCurrentCycle(env);
UNIT_ASSERT_VALUES_EQUAL(2, counters.Simple.ChecksumMismatches.Value);
UNIT_ASSERT_VALUES_EQUAL(0, rangeResynced);
UNIT_ASSERT_VALUES_EQUAL(2, mirroredDiskMajorityChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(0, mirroredDiskMinorityChecksumMismatch->Val());

// Make data in 1st and 3rd replica the same.
env.WriteReplica(2, range, 'A');
Expand All @@ -1609,6 +1622,8 @@ Y_UNIT_TEST_SUITE(TMirrorPartitionTest)
WaitUntilScrubbingFinishesCurrentCycle(env);
UNIT_ASSERT_VALUES_EQUAL(3, counters.Simple.ChecksumMismatches.Value);
UNIT_ASSERT_VALUES_EQUAL(1, rangeResynced);
UNIT_ASSERT_VALUES_EQUAL(2, mirroredDiskMajorityChecksumMismatch->Val());
UNIT_ASSERT_VALUES_EQUAL(1, mirroredDiskMinorityChecksumMismatch->Val());
}

Y_UNIT_TEST(ShouldRejectReadUponChecksumMismatchIfRead2IsEnabled)
Expand Down

0 comments on commit 9e190a1

Please sign in to comment.