Skip to content

Commit

Permalink
issue-2381: 1. Collecting tablet metrics for ListNodes/GetNodeAttr/Cr…
Browse files Browse the repository at this point in the history
…eateHandle/DestroyHandle/CreateNode/RenameNode/UnlinkNode/StatFileStore requests; 2. Tablet-level filesystem performance profiles + CurrentLoad and Suffer metrics (#2581)

* issue-2381: 1. Collecting tablet metrics for ListNodes/GetNodeAttr/CreateHandle/DestroyHandle/CreateNode/RenameNode/UnlinkNode/StatFileStore requests; 2. WIP for tablet-level filesystem performance profiles and Suffer-like metric

* issue-2381: minor cleanup + TODO impl

* issue-2381: CurrentLoad and Suffer metrics calculation, some fixes, and some very basic unit tests (which only check that _something_ is calculated; no rigorous calculations were made)
  • Loading branch information
qkrorlqr authored Nov 29, 2024
1 parent 84eedea commit 7aed05a
Show file tree
Hide file tree
Showing 23 changed files with 607 additions and 25 deletions.
40 changes: 37 additions & 3 deletions cloud/filestore/config/diagnostics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,37 @@ message TMonitoringUrlData
optional string MonitoringProject = 3;
};

////////////////////////////////////////////////////////////////////////////////
// Performance profile settings for performance diagnostics (e.g. Suffer metric
// calculation).

// Performance profile of a single request type: a request rate (RPS) paired
// with a throughput value.
message TRequestPerformanceProfile
{
// Requests per second.
optional uint64 RPS = 1;
// Throughput in request-specific units: bytes per second for Read/Write,
// possibly entries per second for ListNodes; for other requests it can be
// 0 until something more useful is defined.
optional uint64 Throughput = 2;
};

////////////////////////////////////////////////////////////////////////////////
// Performance profile for filesystems / tablets / etc.

// Groups one TRequestPerformanceProfile per request type. Each field below is
// optional and carries the profile for the request type it is named after.
message TFileSystemPerformanceProfile
{
optional TRequestPerformanceProfile Read = 1;
optional TRequestPerformanceProfile Write = 2;
optional TRequestPerformanceProfile ListNodes = 3;
optional TRequestPerformanceProfile GetNodeAttr = 4;
optional TRequestPerformanceProfile CreateHandle = 5;
optional TRequestPerformanceProfile DestroyHandle = 6;
optional TRequestPerformanceProfile CreateNode = 7;
optional TRequestPerformanceProfile RenameNode = 8;
optional TRequestPerformanceProfile UnlinkNode = 9;
optional TRequestPerformanceProfile StatFileStore = 10;
};

////////////////////////////////////////////////////////////////////////////////

message TDiagnosticsConfig
Expand All @@ -27,15 +58,12 @@ message TDiagnosticsConfig
optional string BastionNameSuffix = 1;

// [obsolete]
// Solomon cluster name (e.g. yandexcloud_prod_vla).
// optional string SolomonClusterName = 2;

// [obsolete]
// Solomon host url.
// optional string SolomonUrl = 3;

// [obsolete]
// Project name in solomon.
// optional string SolomonProject = 4;

// Monserver will listen on this port.
Expand Down Expand Up @@ -103,4 +131,10 @@ message TDiagnosticsConfig

// Report histogram as a single counter (THistogramCounter)
optional bool ReportHistogramAsSingleCounter = 25;

// Performance profile for HDD filesystems.
optional TFileSystemPerformanceProfile HDDFileSystemPerformanceProfile = 26;

// Performance profile for SSD filesystems.
optional TFileSystemPerformanceProfile SSDFileSystemPerformanceProfile = 27;
}
57 changes: 57 additions & 0 deletions cloud/filestore/libs/diagnostics/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ namespace {
xxx(MonitoringUrlData, TMonitoringUrlData, {} )\
xxx(ReportHistogramAsMultipleCounters, bool, true )\
xxx(ReportHistogramAsSingleCounter, bool, false )\
\
xxx(HDDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\
xxx(SSDFileSystemPerformanceProfile, TFileSystemPerformanceProfile, {} )\
// FILESTORE_DIAGNOSTICS_CONFIG

#define FILESTORE_DIAGNOSTICS_DECLARE_CONFIG(name, type, value) \
Expand Down Expand Up @@ -77,6 +80,32 @@ ConvertValue<TMonitoringUrlData, NProto::TMonitoringUrlData>(
return TMonitoringUrlData(value);
}

// Builds the in-memory request profile from its proto counterpart by copying
// the RPS and Throughput values.
TRequestPerformanceProfile ConvertValue(
    const NProto::TRequestPerformanceProfile& value)
{
    const auto rps = value.GetRPS();
    const auto throughput = value.GetThroughput();
    return TRequestPerformanceProfile(rps, throughput);
}

// Specialization that builds a TFileSystemPerformanceProfile from its proto
// counterpart, converting every per-request sub-profile field by field.
template <>
TFileSystemPerformanceProfile ConvertValue<
    TFileSystemPerformanceProfile,
    NProto::TFileSystemPerformanceProfile>(
    const NProto::TFileSystemPerformanceProfile& value)
{
    TFileSystemPerformanceProfile profile;

    // Same field order as the struct declaration.
    profile.Read = ConvertValue(value.GetRead());
    profile.Write = ConvertValue(value.GetWrite());
    profile.ListNodes = ConvertValue(value.GetListNodes());
    profile.GetNodeAttr = ConvertValue(value.GetGetNodeAttr());
    profile.CreateHandle = ConvertValue(value.GetCreateHandle());
    profile.DestroyHandle = ConvertValue(value.GetDestroyHandle());
    profile.CreateNode = ConvertValue(value.GetCreateNode());
    profile.RenameNode = ConvertValue(value.GetRenameNode());
    profile.UnlinkNode = ConvertValue(value.GetUnlinkNode());
    profile.StatFileStore = ConvertValue(value.GetStatFileStore());

    return profile;
}

} // namespace

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -170,3 +199,31 @@ void Out<NCloud::NFileStore::TMonitoringUrlData>(
v.SetMonitoringProject(value.MonitoringProject);
SerializeToTextFormat(v, out);
}

// Copies the RPS and Throughput values of `v` into the proto message `p`.
// `p` is dereferenced unconditionally, so it must not be null.
// NOTE(review): this helper appears to be defined at global scope with
// external linkage - confirm whether it should be file-local.
void ConvertFromValue(
    const NCloud::NFileStore::TRequestPerformanceProfile& v,
    NCloud::NFileStore::NProto::TRequestPerformanceProfile* p)
{
    auto& proto = *p;
    proto.SetRPS(v.RPS);
    proto.SetThroughput(v.Throughput);
}

// Stream output for TFileSystemPerformanceProfile: mirrors every per-request
// profile into the corresponding proto message and writes that message to
// `out` via SerializeToTextFormat.
template <>
void Out<NCloud::NFileStore::TFileSystemPerformanceProfile>(
    IOutputStream& out,
    const NCloud::NFileStore::TFileSystemPerformanceProfile& value)
{
    using TProtoProfile =
        NCloud::NFileStore::NProto::TFileSystemPerformanceProfile;

    TProtoProfile proto;

    // One sub-message per request type, filled in declaration order.
    ConvertFromValue(value.Read, proto.MutableRead());
    ConvertFromValue(value.Write, proto.MutableWrite());
    ConvertFromValue(value.ListNodes, proto.MutableListNodes());
    ConvertFromValue(value.GetNodeAttr, proto.MutableGetNodeAttr());
    ConvertFromValue(value.CreateHandle, proto.MutableCreateHandle());
    ConvertFromValue(value.DestroyHandle, proto.MutableDestroyHandle());
    ConvertFromValue(value.CreateNode, proto.MutableCreateNode());
    ConvertFromValue(value.RenameNode, proto.MutableRenameNode());
    ConvertFromValue(value.UnlinkNode, proto.MutableUnlinkNode());
    ConvertFromValue(value.StatFileStore, proto.MutableStatFileStore());

    SerializeToTextFormat(proto, out);
}
66 changes: 66 additions & 0 deletions cloud/filestore/libs/diagnostics/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,69 @@ namespace NCloud::NFileStore {

////////////////////////////////////////////////////////////////////////////////

struct TRequestPerformanceProfile
{
ui64 RPS = 0;
ui64 Throughput = 0;

TRequestPerformanceProfile() = default;

TRequestPerformanceProfile(ui64 rps, ui64 throughput)
: RPS(rps)
, Throughput(throughput)
{}

TRequestPerformanceProfile(const TRequestPerformanceProfile& rhs) = default;
TRequestPerformanceProfile& operator=(
const TRequestPerformanceProfile& rhs) = default;
};

struct TFileSystemPerformanceProfile
{
TRequestPerformanceProfile Read;
TRequestPerformanceProfile Write;
TRequestPerformanceProfile ListNodes;
TRequestPerformanceProfile GetNodeAttr;
TRequestPerformanceProfile CreateHandle;
TRequestPerformanceProfile DestroyHandle;
TRequestPerformanceProfile CreateNode;
TRequestPerformanceProfile RenameNode;
TRequestPerformanceProfile UnlinkNode;
TRequestPerformanceProfile StatFileStore;

TFileSystemPerformanceProfile() = default;

TFileSystemPerformanceProfile(
TRequestPerformanceProfile read,
TRequestPerformanceProfile write,
TRequestPerformanceProfile listNodes,
TRequestPerformanceProfile getNodeAttr,
TRequestPerformanceProfile createHandle,
TRequestPerformanceProfile destroyHandle,
TRequestPerformanceProfile createNode,
TRequestPerformanceProfile renameNode,
TRequestPerformanceProfile unlinkNode,
TRequestPerformanceProfile statFileStore)
: Read(read)
, Write(write)
, ListNodes(listNodes)
, GetNodeAttr(getNodeAttr)
, CreateHandle(createHandle)
, DestroyHandle(destroyHandle)
, CreateNode(createNode)
, RenameNode(renameNode)
, UnlinkNode(unlinkNode)
, StatFileStore(statFileStore)
{}

TFileSystemPerformanceProfile(
const TFileSystemPerformanceProfile& rhs) = default;
TFileSystemPerformanceProfile& operator=(
const TFileSystemPerformanceProfile& rhs) = default;
};

////////////////////////////////////////////////////////////////////////////////

struct TMonitoringUrlData: public TAtomicRefCount<TMonitoringUrlData>
{
TString MonitoringClusterName;
Expand Down Expand Up @@ -71,6 +134,9 @@ class TDiagnosticsConfig
bool GetReportHistogramAsSingleCounter() const;
EHistogramCounterOptions GetHistogramCounterOptions() const;

TFileSystemPerformanceProfile GetHDDFileSystemPerformanceProfile() const;
TFileSystemPerformanceProfile GetSSDFileSystemPerformanceProfile() const;

void Dump(IOutputStream& out) const;
void DumpHtml(IOutputStream& out) const;
};
Expand Down
3 changes: 3 additions & 0 deletions cloud/filestore/libs/storage/init/actorsystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,9 +201,11 @@ class TCustomLocalServiceInitializer final
const TAppData* appData) override
{
auto config = Args.StorageConfig;
auto diagConfig = Args.DiagnosticsConfig;

auto tabletFactory =
[config,
diagConfig,
profileLog = Args.ProfileLog,
traceSerializer = Args.TraceSerializer,
metricsRegistry = MetricsRegistry] (
Expand All @@ -219,6 +221,7 @@ class TCustomLocalServiceInitializer final
owner,
storage,
config,
diagConfig,
std::move(profileLog),
std::move(traceSerializer),
std::move(metricsRegistry),
Expand Down
2 changes: 2 additions & 0 deletions cloud/filestore/libs/storage/tablet/tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ IActorPtr CreateIndexTablet(
const TActorId& owner,
TTabletStorageInfoPtr storage,
TStorageConfigPtr config,
TDiagnosticsConfigPtr diagConfig,
IProfileLogPtr profileLog,
ITraceSerializerPtr traceSerializer,
NMetrics::IMetricsRegistryPtr metricsRegistry,
Expand All @@ -23,6 +24,7 @@ IActorPtr CreateIndexTablet(
owner,
std::move(storage),
std::move(config),
std::move(diagConfig),
std::move(profileLog),
std::move(traceSerializer),
std::move(metricsRegistry),
Expand Down
1 change: 1 addition & 0 deletions cloud/filestore/libs/storage/tablet/tablet.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ NActors::IActorPtr CreateIndexTablet(
const NActors::TActorId& owner,
NKikimr::TTabletStorageInfoPtr storage,
TStorageConfigPtr config,
TDiagnosticsConfigPtr diagConfig,
IProfileLogPtr profileLog,
ITraceSerializerPtr traceSerializer,
NMetrics::IMetricsRegistryPtr metricsRegistry,
Expand Down
14 changes: 14 additions & 0 deletions cloud/filestore/libs/storage/tablet/tablet_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ TIndexTabletActor::TIndexTabletActor(
const TActorId& owner,
TTabletStorageInfoPtr storage,
TStorageConfigPtr config,
TDiagnosticsConfigPtr diagConfig,
IProfileLogPtr profileLog,
ITraceSerializerPtr traceSerializer,
NMetrics::IMetricsRegistryPtr metricsRegistry,
Expand All @@ -54,6 +55,7 @@ TIndexTabletActor::TIndexTabletActor(
}
)
, Config(std::move(config))
, DiagConfig(std::move(diagConfig))
, UseNoneCompactionPolicy(useNoneCompactionPolicy)
, BlobCodec(NBlockCodecs::Codec(Config->GetBlobCompressionCodec()))
{
Expand Down Expand Up @@ -921,6 +923,9 @@ STFUNC(TIndexTabletActor::StateInit)
HFunc(
TEvIndexTabletPrivate::TEvNodeUnlinkedInShard,
HandleNodeUnlinkedInShard);
HFunc(
TEvIndexTabletPrivate::TEvGetShardStatsCompleted,
HandleGetShardStatsCompleted);

FILESTORE_HANDLE_REQUEST(WaitReady, TEvIndexTablet)

Expand Down Expand Up @@ -962,6 +967,9 @@ STFUNC(TIndexTabletActor::StateWork)
HFunc(
TEvIndexTabletPrivate::TEvNodeUnlinkedInShard,
HandleNodeUnlinkedInShard);
HFunc(
TEvIndexTabletPrivate::TEvGetShardStatsCompleted,
HandleGetShardStatsCompleted);

HFunc(TEvents::TEvWakeup, HandleWakeup);
HFunc(TEvents::TEvPoisonPill, HandlePoisonPill);
Expand Down Expand Up @@ -1020,6 +1028,9 @@ STFUNC(TIndexTabletActor::StateZombie)
HFunc(
TEvIndexTabletPrivate::TEvNodeUnlinkedInShard,
HandleNodeUnlinkedInShard);
HFunc(
TEvIndexTabletPrivate::TEvGetShardStatsCompleted,
HandleGetShardStatsCompleted);

default:
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
Expand Down Expand Up @@ -1060,6 +1071,9 @@ STFUNC(TIndexTabletActor::StateBroken)
HFunc(
TEvIndexTabletPrivate::TEvNodeUnlinkedInShard,
HandleNodeUnlinkedInShard);
HFunc(
TEvIndexTabletPrivate::TEvGetShardStatsCompleted,
HandleGetShardStatsCompleted);

default:
HandleUnexpectedEvent(ev, TFileStoreComponents::TABLET);
Expand Down
Loading

0 comments on commit 7aed05a

Please sign in to comment.