Skip to content

Commit

Permalink
issue-2074: disable IO of broken devices (#2417)
Browse files Browse the repository at this point in the history
  • Loading branch information
sharpeye authored Nov 14, 2024
1 parent e1a6591 commit 65bc209
Show file tree
Hide file tree
Showing 41 changed files with 1,344 additions and 184 deletions.
17 changes: 17 additions & 0 deletions cloud/blockstore/config/disk.proto
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,23 @@ message TDiskAgentConfig
// 2 means that one IO service will be created for every two file paths.
// etc.
optional uint32 PathsPerFileIOService = 34;

// List of device UUIDs with suspended I/O.
// I/O operations for such a device will result in errors.
// Used only in the config cache file.
repeated string DevicesWithSuspendedIO = 35;

// Disable I/O for devices that the Disk Registry (DR) has recognized as broken.
optional bool DisableBrokenDevices = 36;

// Path to serial number mapping. For testing purposes only.
message TPathToSerialNumber
{
// Key of the mapping (presumably a device file path - confirm against the
// code that consumes PathToSerialNumberMapping).
required string Path = 1;
// Serial number mapped to Path.
required string SerialNumber = 2;
}

repeated TPathToSerialNumber PathToSerialNumberMapping = 37;
}

////////////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 4 additions & 2 deletions cloud/blockstore/libs/disk_agent/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,10 @@ bool AgentHasDevices(
const TString storagePath = storageConfig->GetCachedDiskAgentConfigPath();
const TString diskAgentPath = agentConfig->GetCachedConfigPath();
const TString& path = diskAgentPath.empty() ? storagePath : diskAgentPath;
auto cachedDevices = NStorage::LoadCachedConfig(path);
if (!cachedDevices.empty()) {

if (auto [config, _] = NStorage::LoadDiskAgentConfig(path);
config.FileDevicesSize() != 0)
{
return true;
}

Expand Down
9 changes: 6 additions & 3 deletions cloud/blockstore/libs/rdma_test/rdma_test_environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,18 @@ TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize)
TDuration::Zero(), // maxRequestDuration
TDuration::Zero() // shutdownTimeout
);

TVector<TString> uuids;
for (const auto& [key, value]: devices) {
uuids.push_back(key);
}
auto deviceClient = std::make_shared<TDeviceClient>(

DeviceClient = std::make_shared<TDeviceClient>(
TDuration::MilliSeconds(100),
uuids,
Logging->CreateLog("BLOCKSTORE_DISK_AGENT"));
deviceClient->AcquireDevices(

DeviceClient->AcquireDevices(
uuids,
ClientId,
TInstant::Now(),
Expand Down Expand Up @@ -60,7 +63,7 @@ TRdmaTestEnvironment::TRdmaTestEnvironment(size_t deviceSize, ui32 poolSize)
std::move(oldRequestCounters),
Logging,
Server,
std::move(deviceClient),
DeviceClient,
std::move(devices));

RdmaTarget->Start();
Expand Down
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/rdma_test/rdma_test_environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ struct TRdmaTestEnvironment
"console",
TLogSettings{TLOG_RESOURCES});

std::shared_ptr<TDeviceClient> DeviceClient;

TRdmaTestEnvironment(size_t deviceSize = 4_MB, ui32 poolSize = 1);

virtual ~TRdmaTestEnvironment();
Expand Down
4 changes: 4 additions & 0 deletions cloud/blockstore/libs/storage/disk_agent/disk_agent_actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ class TDiskAgentActor final

bool ShouldOffloadRequest(ui32 eventType) const;

// Applies the list of device UUIDs whose I/O must be disabled.
// Does nothing unless DisableBrokenDevices is set in the agent config.
// Otherwise every device known to the agent state is disabled if its UUID is
// in |devicesToDisableIO| and enabled if it is not; then, if a config cache
// path is configured, a helper actor is registered to update the list in the
// config cache file.
void ProcessDevicesToDisableIO(
const NActors::TActorContext& ctx,
TVector<TString> devicesToDisableIO);

private:
STFUNC(StateInit);
STFUNC(StateWork);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ void TDiskAgentActor::HandleDisableConcreteAgent(
if (record.DeviceUUIDsSize()) {
for (const auto& d: record.GetDeviceUUIDs()) {
State->DisableDevice(d);
State->ReportDisabledDeviceError(d);
}
} else {
HandlePoisonPill(nullptr, ctx);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void TDiskAgentActor::InitAgent(const TActorContext& ctx)
std::move(r.Configs),
std::move(r.Errors),
std::move(r.ConfigMismatchErrors),
std::move(r.DevicesWithNewSerialNumber));
std::move(r.DevicesWithSuspendedIO));

actorSystem->Send(
new IEventHandle(
Expand Down
11 changes: 0 additions & 11 deletions cloud/blockstore/libs/storage/disk_agent/disk_agent_actor_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,17 +178,6 @@ void TDiskAgentActor::PerformIO(
started);
};

if (State->IsDeviceDisabled(deviceUUID)) {
LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT,
"Dropped %s request to device %s, session %s",
TMethod::Name,
deviceUUID.c_str(),
clientId.c_str());
State->ReportDisabledDeviceError(deviceUUID);
replyError(E_IO, "Device disabled");
return;
}

LOG_TRACE(ctx, TBlockStoreComponents::DISK_AGENT,
"%s [%s / %s]",
TMethod::Name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ void TDiskAgentActor::RenderDevices(IOutputStream& out) const
config.GetState(),
State->IsDeviceDisabled(uuid)
? EDeviceStateFlags::DISABLED
: EDeviceStateFlags::NONE);
: (State->IsDeviceSuspended(uuid)
? EDeviceStateFlags::SUSPENDED
: EDeviceStateFlags::NONE));
}
TABLED() {
if (config.GetStateTs()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#include <cloud/blockstore/libs/storage/api/disk_registry_proxy.h>
#include <cloud/blockstore/libs/storage/core/request_info.h>

#include <contrib/ydb/core/base/appdata.h>

namespace NCloud::NBlockStore::NStorage {

using namespace NActors;
Expand Down Expand Up @@ -87,6 +89,9 @@ void TRegisterActor::HandleRegisterAgentResponse(

auto response = std::make_unique<TEvDiskAgentPrivate::TEvRegisterAgentResponse>(
msg->GetError());
response->DevicesToDisableIO.assign(
msg->Record.GetDevicesToDisableIO().cbegin(),
msg->Record.GetDevicesToDisableIO().cend());
NCloud::Reply(ctx, *RequestInfo, std::move(response));
}

Expand All @@ -101,6 +106,47 @@ STFUNC(TRegisterActor::StateWork)
}
}

////////////////////////////////////////////////////////////////////////////////

// One-shot actor: on bootstrap it calls NStorage::UpdateDevicesWithSuspendedIO
// for the given config cache path and device list, logs the outcome and dies.
class TUpdateDevicesWithSuspendedIOActor
    : public TActorBootstrapped<TUpdateDevicesWithSuspendedIOActor>
{
private:
    // Path passed to NStorage::UpdateDevicesWithSuspendedIO.
    const TString CachePath;
    // Device UUIDs passed to NStorage::UpdateDevicesWithSuspendedIO.
    const TVector<TString> DevicesToDisableIO;

public:
    TUpdateDevicesWithSuspendedIOActor(
            TString cachePath,
            TVector<TString> devicesToSuspendIO)
        : CachePath{std::move(cachePath)}
        , DevicesToDisableIO{std::move(devicesToSuspendIO)}
    {}

    // Performs the update, reports the result to the log and terminates the
    // actor regardless of the outcome.
    void Bootstrap(const TActorContext& ctx)
    {
        if (auto updateError = NStorage::UpdateDevicesWithSuspendedIO(
                CachePath,
                DevicesToDisableIO);
            HasError(updateError))
        {
            LOG_ERROR_S(
                ctx,
                TBlockStoreComponents::DISK_AGENT,
                "Can't update DevicesWithSuspendedIO in the config cache "
                "file: "
                    << FormatError(updateError));
        } else {
            LOG_INFO(
                ctx,
                TBlockStoreComponents::DISK_AGENT,
                "DevicesWithSuspendedIO has been successfully updated");
        }

        Die(ctx);
    }
};

} // namespace

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -135,15 +181,60 @@ void TDiskAgentActor::HandleRegisterAgent(
std::move(config));
}

// Applies the list of device UUIDs whose I/O must be disabled.
//
// Does nothing unless DisableBrokenDevices is set in the agent config.
// Otherwise every device known to State is disabled if its UUID is in
// |devicesToDisableIO| and enabled if it is not. Finally, if a config cache
// path is configured, a helper actor is registered on the IO pool to update
// the list in the config cache file.
void TDiskAgentActor::ProcessDevicesToDisableIO(
    const NActors::TActorContext& ctx,
    TVector<TString> devicesToDisableIO)
{
    if (!AgentConfig->GetDisableBrokenDevices()) {
        return;
    }

    // Hash set for O(1) membership checks while walking the agent's devices.
    const THashSet<TString> uuids(
        devicesToDisableIO.cbegin(),
        devicesToDisableIO.cend());

    for (const auto& uuid: State->GetDeviceIds()) {
        if (uuids.contains(uuid)) {
            LOG_INFO_S(
                ctx,
                TBlockStoreComponents::DISK_AGENT,
                "Disable device " << uuid);
            State->DisableDevice(uuid);
        } else {
            State->EnableDevice(uuid);
        }
    }

    // Resolve the cache path with the same precedence that is used when the
    // cached config is loaded: the disk agent config path wins, the storage
    // config path is only a fallback. Otherwise the list could be written to
    // a file that is never read back.
    const TString storagePath = Config->GetCachedDiskAgentConfigPath();
    const TString diskAgentPath = AgentConfig->GetCachedConfigPath();
    TString cachePath = diskAgentPath.empty() ? storagePath : diskAgentPath;

    if (cachePath.empty()) {
        return;
    }

    auto actor = std::make_unique<TUpdateDevicesWithSuspendedIOActor>(
        std::move(cachePath),
        std::move(devicesToDisableIO));

    // Starting an actor on the IO pool to avoid file operations in the User
    // pool
    ctx.Register(
        actor.release(),
        TMailboxType::HTSwap,
        NKikimr::AppData()->IOPoolId);
}

void TDiskAgentActor::HandleRegisterAgentResponse(
const TEvDiskAgentPrivate::TEvRegisterAgentResponse::TPtr& ev,
const TActorContext& ctx)
{
const auto* msg = ev->Get();
auto* msg = ev->Get();

if (!HasError(msg->GetError())) {
RegistrationState = ERegistrationState::Registered;
LOG_INFO(ctx, TBlockStoreComponents::DISK_AGENT, "Register completed");
ProcessDevicesToDisableIO(ctx, std::move(msg->DevicesToDisableIO));
} else {
LOG_WARN(ctx, TBlockStoreComponents::DISK_AGENT,
"Register failed: %s. Try later", FormatError(msg->GetError()).c_str());
Expand Down
Loading

0 comments on commit 65bc209

Please sign in to comment.