Skip to content

Commit

Permalink
issue-1444: Use kernel delay accounting to calculate cpu wait
Browse files Browse the repository at this point in the history
  • Loading branch information
antonmyagkov committed Nov 1, 2024
1 parent 7aa2d73 commit a08f79b
Show file tree
Hide file tree
Showing 32 changed files with 446 additions and 77 deletions.
12 changes: 12 additions & 0 deletions cloud/blockstore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,15 @@ message TMonitoringUrlData
optional string MonitoringNBSTVDashboard = 7;
};

////////////////////////////////////////////////////////////////////////////////
// StatsFetcher type

// Selects which implementation is used to fetch CPU wait stats.
enum EStatsFetcherType
{
// Stats are read from a cgroup stats file. This is the zero value and the
// default used by the diagnostics config.
CGROUP = 0;
// Stats are obtained via kernel task delay accounting.
KERNEL_TASK_DELAYACCT = 1;
};

////////////////////////////////////////////////////////////////////////////////

message TDiagnosticsConfig
Expand Down Expand Up @@ -216,4 +225,7 @@ message TDiagnosticsConfig

// Performance measurements coefficients for local HDD disks.
optional TVolumePerfSettings LocalHDDPerfSettings = 51;

// Selects how CPU wait stats are fetched (see EStatsFetcherType).
optional EStatsFetcherType StatsFetcherType = 52;
}
4 changes: 2 additions & 2 deletions cloud/blockstore/libs/daemon/common/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -861,7 +861,7 @@ void TBootstrapBase::Start()
START_KIKIMR_COMPONENT(NotifyService);
START_COMMON_COMPONENT(Monitoring);
START_COMMON_COMPONENT(ProfileLog);
START_KIKIMR_COMPONENT(CgroupStatsFetcher);
START_KIKIMR_COMPONENT(StatsFetcher);
START_COMMON_COMPONENT(DiscoveryService);
START_COMMON_COMPONENT(TraceProcessor);
START_KIKIMR_COMPONENT(TraceSerializer);
Expand Down Expand Up @@ -957,7 +957,7 @@ void TBootstrapBase::Stop()
STOP_KIKIMR_COMPONENT(TraceSerializer);
STOP_COMMON_COMPONENT(TraceProcessor);
STOP_COMMON_COMPONENT(DiscoveryService);
STOP_KIKIMR_COMPONENT(CgroupStatsFetcher);
STOP_KIKIMR_COMPONENT(StatsFetcher);
STOP_COMMON_COMPONENT(ProfileLog);
STOP_COMMON_COMPONENT(Monitoring);
STOP_KIKIMR_COMPONENT(LogbrokerService);
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/daemon/common/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class TBootstrapBase
virtual IStartable* GetTraceSerializer() = 0;
virtual IStartable* GetLogbrokerService() = 0;
virtual IStartable* GetNotifyService() = 0;
virtual IStartable* GetCgroupStatsFetcher() = 0;
virtual IStartable* GetStatsFetcher() = 0;
virtual IStartable* GetIamTokenClient() = 0;
virtual IStartable* GetComputeClient() = 0;
virtual IStartable* GetKmsClient() = 0;
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/daemon/local/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class TBootstrapLocal final
IStartable* GetTraceSerializer() override { return nullptr; }
IStartable* GetLogbrokerService() override { return nullptr; }
IStartable* GetNotifyService() override { return nullptr; }
IStartable* GetCgroupStatsFetcher() override { return nullptr; }
IStartable* GetStatsFetcher() override { return nullptr; }
IStartable* GetIamTokenClient() override { return nullptr; }
IStartable* GetComputeClient() override { return nullptr; }
IStartable* GetKmsClient() override { return nullptr; }
Expand Down
54 changes: 49 additions & 5 deletions cloud/blockstore/libs/daemon/ydb/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,48 @@ NRdma::TClientConfigPtr CreateRdmaClientConfig(
return std::make_shared<NRdma::TClientConfig>(config->GetClient());
}

// Builds the fetcher used to obtain CPU wait stats, selected by
// statsFetcherType.
//
// CGROUP: stats are read from a file. One can either pass a service name and
// have the stats file inferred from it (BuildCpuWaitStatsFilename), or provide
// the stats file explicitly; a non-empty cpuWaitFilename takes precedence.
// If both are empty, this is logged and a stub fetcher is returned.
//
// KERNEL_TASK_DELAYACCT: the kernel task delay accounting fetcher is created;
// cpuWaitServiceName, cpuWaitFilename and
// cgroupStatsFetcherMonitoringSettings are not used on this path.
//
// Any other value: this is logged and a stub fetcher is returned, so the
// function never falls off its end without returning.
NCloud::NStorage::IStatsFetcherPtr BuildStatsFetcher(
    NProto::EStatsFetcherType statsFetcherType,
    const TString& cpuWaitServiceName,
    const TString& cpuWaitFilename,
    const TLog& log,
    ILoggingServicePtr logging,
    IMonitoringServicePtr monitoring,
    TCgroupStatsFetcherMonitoringSettings cgroupStatsFetcherMonitoringSettings)
{
    // NOTE(review): STORAGE_INFO presumably picks up a logger named `Log`
    // from the enclosing scope, hence the alias.
    const auto& Log = log;

    switch (statsFetcherType) {
        case NProto::EStatsFetcherType::CGROUP: {
            if (cpuWaitServiceName.Empty() && cpuWaitFilename.Empty()) {
                STORAGE_INFO(
                    "CpuWaitServiceName and CpuWaitFilename are empty, can't "
                    "build CgroupStatsFetcher");
                return CreateStatsFetcherStub();
            }

            // An explicitly configured file wins over the one inferred from
            // the service name.
            const TString statsFile =
                cpuWaitFilename.Empty()
                    ? NCloud::NStorage::BuildCpuWaitStatsFilename(
                          cpuWaitServiceName)
                    : cpuWaitFilename;

            // Component name matches the one this daemon used for the cgroup
            // fetcher before the fetcher type became configurable.
            return CreateCgroupStatsFetcher(
                "BLOCKSTORE_CGROUPS",
                std::move(logging),
                std::move(monitoring),
                statsFile,
                std::move(cgroupStatsFetcherMonitoringSettings));
        }
        case NProto::EStatsFetcherType::KERNEL_TASK_DELAYACCT:
            // NOTE(review): component name chosen by analogy with
            // BLOCKSTORE_CGROUPS - confirm against logging configuration.
            return CreateKernelTaskDelayAcctStatsFetcher(
                "BLOCKSTORE_KERNEL_TASK_DELAYACCT",
                std::move(logging),
                std::move(monitoring));
    }

    // Reached only when statsFetcherType holds a value not listed above.
    // Returning a stub avoids flowing off the end of a non-void function.
    STORAGE_INFO("Unknown StatsFetcherType, using StatsFetcher stub");
    return CreateStatsFetcherStub();
}

} // namespace

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -129,7 +171,7 @@ IStartable* TBootstrapYdb::GetYdbStorage() { return YdbStorage.get(); }
IStartable* TBootstrapYdb::GetTraceSerializer() { return TraceSerializer.get(); }
IStartable* TBootstrapYdb::GetLogbrokerService() { return LogbrokerService.get(); }
IStartable* TBootstrapYdb::GetNotifyService() { return NotifyService.get(); }
IStartable* TBootstrapYdb::GetCgroupStatsFetcher() { return CgroupStatsFetcher.get(); }
IStartable* TBootstrapYdb::GetStatsFetcher() { return StatsFetcher.get(); }
IStartable* TBootstrapYdb::GetIamTokenClient() { return IamTokenClient.get(); }
IStartable* TBootstrapYdb::GetComputeClient() { return ComputeClient.get(); }
IStartable* TBootstrapYdb::GetKmsClient() { return KmsClient.get(); }
Expand Down Expand Up @@ -492,11 +534,13 @@ void TBootstrapYdb::InitKikimrService()
.CounterName = "CpuWaitFailure",
};

CgroupStatsFetcher = CreateCgroupStatsFetcher(
"BLOCKSTORE_CGROUPS",
StatsFetcher = BuildStatsFetcher(
Configs->DiagnosticsConfig->GetStatsFetcherType(),
{},
Configs->DiagnosticsConfig->GetCpuWaitFilename(),
Log,
logging,
monitoring,
Configs->DiagnosticsConfig->GetCpuWaitFilename(),
std::move(cgroupStatsFetcherMonitoringSettings));

if (Configs->StorageConfig->GetBlockDigestsEnabled()) {
Expand Down Expand Up @@ -547,7 +591,7 @@ void TBootstrapYdb::InitKikimrService()
args.LogbrokerService = LogbrokerService;
args.NotifyService = NotifyService;
args.VolumeStats = VolumeStats;
args.CgroupStatsFetcher = CgroupStatsFetcher;
args.StatsFetcher = StatsFetcher;
args.RdmaServer = nullptr;
args.RdmaClient = RdmaClient;
args.Logging = logging;
Expand Down
4 changes: 2 additions & 2 deletions cloud/blockstore/libs/daemon/ydb/bootstrap.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ struct TBootstrapYdb final
ITraceSerializerPtr TraceSerializer;
NLogbroker::IServicePtr LogbrokerService;
NNotify::IServicePtr NotifyService;
NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
NIamClient::IIamTokenClientPtr IamTokenClient;
IComputeClientPtr ComputeClient;
IKmsClientPtr KmsClient;
Expand All @@ -106,7 +106,7 @@ struct TBootstrapYdb final
IStartable* GetTraceSerializer() override;
IStartable* GetLogbrokerService() override;
IStartable* GetNotifyService() override;
IStartable* GetCgroupStatsFetcher() override;
IStartable* GetStatsFetcher() override;
IStartable* GetIamTokenClient() override;
IStartable* GetComputeClient() override;
IStartable* GetKmsClient() override;
Expand Down
10 changes: 10 additions & 0 deletions cloud/blockstore/libs/diagnostics/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ namespace {
xxx(LocalHDDDowntimeThreshold, TDuration, TDuration::Seconds(15) )\
xxx(ReportHistogramAsMultipleCounters, bool, true )\
xxx(ReportHistogramAsSingleCounter, bool, false )\
xxx(StatsFetcherType, NProto::EStatsFetcherType, NProto::EStatsFetcherType::CGROUP )\
// BLOCKSTORE_DIAGNOSTICS_CONFIG

#define BLOCKSTORE_DIAGNOSTICS_DECLARE_CONFIG(name, type, value) \
Expand Down Expand Up @@ -307,3 +308,12 @@ void Out<NCloud::TRequestThresholds>(
{
OutRequestThresholds(out, value);
}

// Stream-output support for EStatsFetcherType: writes the name returned by
// EStatsFetcherType_Name for the given value.
template <>
void Out<NCloud::NBlockStore::NProto::EStatsFetcherType>(
    IOutputStream& out,
    NCloud::NBlockStore::NProto::EStatsFetcherType statsFetcherType)
{
    const auto& typeName =
        NCloud::NBlockStore::NProto::EStatsFetcherType_Name(statsFetcherType);
    out << typeName;
}
2 changes: 2 additions & 0 deletions cloud/blockstore/libs/diagnostics/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ class TDiagnosticsConfig
TRequestThresholds GetRequestThresholds() const;
EHistogramCounterOptions GetHistogramCounterOptions() const;

NProto::EStatsFetcherType GetStatsFetcherType() const;

void Dump(IOutputStream& out) const;
void DumpHtml(IOutputStream& out) const;
};
Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/init/server/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ class TStorageServicesInitializer final
auto volumeBalancerService = CreateVolumeBalancerActor(
Args.StorageConfig,
Args.VolumeStats,
Args.CgroupStatsFetcher,
Args.StatsFetcher,
Args.VolumeBalancerSwitch,
MakeStorageServiceId());

Expand Down
2 changes: 1 addition & 1 deletion cloud/blockstore/libs/storage/init/server/actorsystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct TServerActorSystemArgs
IVolumeStatsPtr VolumeStats;
NRdma::IServerPtr RdmaServer;
NRdma::IClientPtr RdmaClient;
NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
TManuallyPreemptedVolumesPtr PreemptedVolumes;
NNvme::INvmeManagerPtr NvmeManager;
IVolumeBalancerSwitchPtr VolumeBalancerSwitch;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ using namespace NActors;
IActorPtr CreateVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher,
NCloud::NStorage::IStatsFetcherPtr statFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId)
{
return std::make_unique<TVolumeBalancerActor>(
std::move(storageConfig),
std::move(volumeStats),
std::move(cgroupStatFetcher),
std::move(statFetcher),
std::move(volumeBalancerSwitch),
serviceActorId);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ namespace NCloud::NBlockStore::NStorage {
NActors::IActorPtr CreateVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatFetcher,
NCloud::NStorage::IStatsFetcherPtr cgroupStatFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,12 @@ STFUNC(TRemoteVolumeStatActor::StateWork)
TVolumeBalancerActor::TVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher,
NCloud::NStorage::IStatsFetcherPtr statsFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
TActorId serviceActorId)
: StorageConfig(std::move(storageConfig))
, VolumeStats(std::move(volumeStats))
, CgroupStatsFetcher(std::move(cgroupStatsFetcher))
, StatsFetcher(std::move(statsFetcher))
, VolumeBalancerSwitch(std::move(volumeBalancerSwitch))
, ServiceActorId(serviceActorId)
, State(std::make_unique<TVolumeBalancerState>(StorageConfig))
Expand Down Expand Up @@ -245,7 +245,7 @@ void TVolumeBalancerActor::HandleGetVolumeStatsResponse(
auto now = ctx.Now();

auto interval = (now - LastCpuWaitQuery).MicroSeconds();
auto cpuLack = CpuLackPercentsMultiplier * CgroupStatsFetcher->GetCpuWait().MicroSeconds();
auto cpuLack = CpuLackPercentsMultiplier * StatsFetcher->GetCpuWait().MicroSeconds();
cpuLack /= interval;
*CpuWait = cpuLack;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class TVolumeBalancerActor final
private:
const TStorageConfigPtr StorageConfig;
const IVolumeStatsPtr VolumeStats;
const NCloud::NStorage::ICgroupStatsFetcherPtr CgroupStatsFetcher;
const NCloud::NStorage::IStatsFetcherPtr StatsFetcher;
const IVolumeBalancerSwitchPtr VolumeBalancerSwitch;
const NActors::TActorId ServiceActorId;

Expand All @@ -49,7 +49,7 @@ class TVolumeBalancerActor final
TVolumeBalancerActor(
TStorageConfigPtr storageConfig,
IVolumeStatsPtr volumeStats,
NCloud::NStorage::ICgroupStatsFetcherPtr cgroupStatsFetcher,
NCloud::NStorage::IStatsFetcherPtr statsFetcher,
IVolumeBalancerSwitchPtr volumeBalancerSwitch,
NActors::TActorId serviceActorId);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ struct TVolumeStatsTestMock final

////////////////////////////////////////////////////////////////////////////////

struct TCgroupStatsFetcherMock: public NCloud::NStorage::ICgroupStatsFetcher
struct TCgroupStatsFetcherMock: public NCloud::NStorage::IStatsFetcher
{
TDuration Value;

Expand Down
12 changes: 12 additions & 0 deletions cloud/filestore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,15 @@ message TMonitoringUrlData
optional string MonitoringProject = 3;
};

////////////////////////////////////////////////////////////////////////////////
// StatsFetcher type

// Selects which implementation is used to fetch CPU stats.
enum EStatsFetcherType
{
// Cgroup-based stats fetcher (zero value).
CGROUP = 0;
// Fetcher based on kernel task delay accounting.
KERNEL_TASK_DELAYACCT = 1;
};

////////////////////////////////////////////////////////////////////////////////

message TDiagnosticsConfig
Expand Down Expand Up @@ -103,4 +112,7 @@ message TDiagnosticsConfig

// Report histogram as a single counter (THistogramCounter)
optional bool ReportHistogramAsSingleCounter = 25;

// Selects how CPU stats are fetched (see EStatsFetcherType).
optional EStatsFetcherType StatsFetcherType = 26;
}
Loading

0 comments on commit a08f79b

Please sign in to comment.