Skip to content

Commit

Permalink
limit the number of connect attempts for cached tablet in volume proxy (
Browse files Browse the repository at this point in the history
#799)

* limit the number of connect attempts for cached tablet in volume proxy

* update default

* update

* switch from count to duration

* rearrange code

* update
  • Loading branch information
yegorskii committed Apr 3, 2024
1 parent d646bc7 commit 15db2bd
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 7 deletions.
4 changes: 4 additions & 0 deletions cloud/blockstore/config/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -951,4 +951,8 @@ message TStorageServiceConfig

// Max number of volume MetaHistory records displayed on volume monpage.
optional uint32 VolumeMetaHistoryDisplayedRecordLimit = 359;

// How long (in ms) the volume proxy keeps retrying to connect to a cached
// tablet before falling back to describe volume.
optional uint32 VolumeProxyCacheRetryDuration = 360;
}
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,8 @@ TDuration MSeconds(ui32 value)
\
xxx(UnconfirmedBlobCountHardLimit, ui32, 1000 )\
\
xxx(VolumeProxyCacheRetryDuration, TDuration, Seconds(15) )\
\
xxx(MaxShadowDiskFillBandwidth, ui32, 500 )\
xxx(MinAcquireShadowDiskRetryDelayWhenBlocked, TDuration, MSeconds(250) )\
xxx(MaxAcquireShadowDiskRetryDelayWhenBlocked, TDuration, Seconds(1) )\
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/storage/core/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,8 @@ class TStorageConfig
TDuration GetMaxAcquireShadowDiskRetryDelayWhenNonBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenBlocked() const;
TDuration GetMaxAcquireShadowDiskTotalTimeoutWhenNonBlocked() const;

TDuration GetVolumeProxyCacheRetryDuration() const;
};

ui64 GetAllocationUnit(
Expand Down
37 changes: 30 additions & 7 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ class TVolumeProxyActor final
, RefCount(refCount)
{}

TInstant DisconnectTs;
ui64 TabletId = 0;
int RefCount = 0;
};
Expand Down Expand Up @@ -488,11 +489,23 @@ void TVolumeProxyActor::HandleConnect(
msg->TabletId,
FormatError(error).data());

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end() && !it->second.DisconnectTs)
{
it->second.DisconnectTs = ctx.Now();
}

CancelActiveRequests(ctx, *conn);
DestroyConnection(ctx, *conn, error);
return;
}

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.DisconnectTs = {};
}

if (conn->State == FAILED) {
// Tablet recovered
conn->State = STARTED;
Expand Down Expand Up @@ -552,6 +565,12 @@ void TVolumeProxyActor::HandleDescribeResponse(
*conn,
volumeDescr.GetVolumeTabletId(),
msg->Path);

if (auto it = BaseDiskIdToTabletId.find(conn->DiskId);
it != BaseDiskIdToTabletId.end())
{
it->second.DisconnectTs = {};
}
}

template <typename TMethod>
Expand Down Expand Up @@ -585,13 +604,17 @@ void TVolumeProxyActor::HandleRequest(
{
auto itr = BaseDiskIdToTabletId.find(diskId);
if (itr != BaseDiskIdToTabletId.end()) {
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
auto deadline =
itr->second.DisconnectTs + Config->GetVolumeProxyCacheRetryDuration();
if (!itr->second.DisconnectTs || deadline > ctx.Now()) {
PostponeRequest(ctx, conn, IEventHandlePtr(ev.Release()));
StartConnection(
ctx,
conn,
itr->second.TabletId,
"PartitionConfig");
break;
}
}

conn.State = RESOLVING;
Expand Down
147 changes: 147 additions & 0 deletions cloud/blockstore/libs/storage/volume_proxy/volume_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,153 @@ Y_UNIT_TEST_SUITE(TVolumeProxyTest)
RebootTablet(runtime, volumeTabletId, service1.GetSender(), nodeIdx1);
UNIT_ASSERT_VALUES_EQUAL(2, disconnections);
}

// Scenario: the volume proxy is told the tablet id of a disk (cached mapping),
// then the volume is destroyed. While VolumeProxyCacheRetryDuration has not
// elapsed, a request is expected to be rejected (E_REJECTED). After the
// current time is advanced by that duration, the request is expected to fail
// with a schemeshard "path does not exist" status instead.
Y_UNIT_TEST(ShouldRunDescribeForCachedTabletsIfDurationOfFailedConnectsExceedsThreshold)
{
    // Retry window in milliseconds; also used below to advance the clock.
    constexpr ui32 timeout = 3'000;

    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryDuration(timeout);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Initialized to 0 so that a missed describe response is detected by the
    // assertion below instead of reading an indeterminate value.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by the describe response.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            return false;
        }
    );
    service.DescribeVolume();

    // The rest of the test is meaningless without a captured tablet id.
    UNIT_ASSERT(volumeTabletId != 0);

    // Register the disk id -> tablet id mapping in the volume proxy.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.DestroyVolume();

    // Wait until a pipe client destruction event is observed.
    TDispatchOptions options;
    options.FinalEvents.emplace_back(TEvTabletPipe::EvClientDestroyed);
    runtime.DispatchEvents(options);

    {
        // Before the retry window has elapsed the request is rejected.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    runtime.AdvanceCurrentTime(TDuration::MilliSeconds(timeout));

    {
        // After the retry window the error comes from the schemeshard facility.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        auto code = response->GetStatus();
        UNIT_ASSERT_VALUES_EQUAL(
            FACILITY_SCHEMESHARD,
            FACILITY_FROM_CODE(code));
        UNIT_ASSERT_VALUES_EQUAL(
            NKikimrScheme::StatusPathDoesNotExist,
            static_cast<NKikimrScheme::EStatus>(STATUS_FROM_CODE(code)));
    }
}

// Scenario: the volume proxy is told the tablet id of a disk (cached mapping)
// and the tablet is rebooted. Pipe connects to that tablet are then forced to
// fail, and the test asserts that the proxy receives no describe response
// while connects are failing. Once connects are allowed again, StatVolume is
// expected to succeed.
// NOTE(review): the test name mentions a "fail counter"; the commit message
// says the implementation switched from a count to a duration.
Y_UNIT_TEST(ShouldResetFailCounterIfDisconnectedCachedVolumeIsOnlineAgain)
{
    NProto::TStorageServiceConfig config;
    config.SetVolumeProxyCacheRetryDuration(3'000);

    TTestEnv env;
    ui32 nodeIdx = SetupTestEnv(env, std::move(config));

    auto& runtime = env.GetRuntime();
    TServiceClient service(runtime, nodeIdx);

    service.CreateVolume();
    service.WaitForVolume();

    // Initialized to 0 so that a missed describe response is detected by the
    // assertion below instead of reading an indeterminate value.
    ui64 volumeTabletId = 0;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // Remember the tablet id reported by the describe response.
                    auto* msg = event->template Get<TEvSSProxy::TEvDescribeVolumeResponse>();
                    const auto& volumeDescription =
                        msg->PathDescription.GetBlockStoreVolumeDescription();
                    volumeTabletId = volumeDescription.GetVolumeTabletId();
                    break;
                }
            }
            return false;
        }
    );
    service.DescribeVolume();

    // The rest of the test is meaningless without a captured tablet id.
    UNIT_ASSERT(volumeTabletId != 0);

    // Register the disk id -> tablet id mapping in the volume proxy.
    service.SendRequest(
        MakeVolumeProxyServiceId(),
        std::make_unique<TEvVolume::TEvMapBaseDiskIdToTabletId>(
            DefaultDiskId,
            volumeTabletId));

    service.StatVolume();

    RebootTablet(runtime, volumeTabletId, service.GetSender(), nodeIdx);

    // Recipient of the connect notifications for the volume tablet; filled in
    // by the filter below and used to recognize describe responses sent to it.
    TActorId proxy;
    bool failConnects = true;
    runtime.SetEventFilter([&] (auto& runtime, auto& event) {
            Y_UNUSED(runtime);
            switch (event->GetTypeRewrite()) {
                case TEvTabletPipe::EvClientConnected: {
                    auto* msg = event->template Get<TEvTabletPipe::TEvClientConnected>();
                    if (msg->TabletId == volumeTabletId) {
                        proxy = event->Recipient;
                        if (failConnects) {
                            // Rewrite the status in place so the recipient
                            // sees a failed connect.
                            auto& code =
                                const_cast<NKikimrProto::EReplyStatus&>(msg->Status);
                            code = NKikimrProto::ERROR;
                        }
                    }
                    break;
                }
                case TEvSSProxy::EvDescribeVolumeResponse: {
                    // No describe response may reach the proxy while connects
                    // are still being failed.
                    if (failConnects && event->Recipient == proxy) {
                        UNIT_ASSERT(false);
                    }
                    break;
                }
            }
            return false;
        }
    );

    {
        // With connects failing the request is rejected.
        service.SendStatVolumeRequest();
        auto response = service.RecvStatVolumeResponse();
        UNIT_ASSERT_VALUES_EQUAL(E_REJECTED, response->GetStatus());
    }

    // Let connects succeed again; subsequent requests must go through.
    failConnects = false;

    service.StatVolume();
    service.StatVolume();
}
}

} // namespace NCloud::NBlockStore::NStorage

0 comments on commit 15db2bd

Please sign in to comment.