From a64abe0f17161fb510fadb1fe1cbbebea0d5ec35 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 23 Aug 2021 09:28:19 -0700 Subject: [PATCH 01/35] Parallel --- .gitignore | 2 +- FabricObserver.Extensibility/ObserverBase.cs | 96 ++- .../Utilities/FabricResourceUsageData.cs | 2 +- .../Utilities/ObserverConstants.cs | 1 + .../ProcessInfo/LinuxProcessInfoProvider.cs | 5 + FabricObserver/Observers/AppObserver.cs | 613 ++++++++++-------- FabricObserver/Observers/ContainerObserver.cs | 213 +++--- .../Observers/FabricSystemObserver.cs | 85 +-- FabricObserver/Observers/NetworkObserver.cs | 2 +- FabricObserver/Observers/ObserverManager.cs | 45 +- .../PackageRoot/Config/Settings.xml | 3 + .../ApplicationManifest.xml | 18 +- FabricObserverTests/ObserverTest.cs | 152 +++-- .../Config/AppObserver.config.oldstyle.json | 10 - TelemetryLib/ClusterInformation.cs | 10 +- TelemetryLib/TelemetryConstants.cs | 2 +- TelemetryLib/TelemetryEvents.cs | 5 +- 17 files changed, 695 insertions(+), 569 deletions(-) diff --git a/.gitignore b/.gitignore index 111653e8..42a0c818 100644 --- a/.gitignore +++ b/.gitignore @@ -335,4 +335,4 @@ ASALocalRun/ **/PublishProfiles/Cloud.xml /FabricObserver/observer_logs /FabricObserver/PackageRoot/Data/Plugins/SampleNewObserver.dll -/nuget.exe \ No newline at end of file +/nuget.exe diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 443739f4..cc853bc4 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -4,6 +4,7 @@ // ------------------------------------------------------------ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; @@ -30,13 +31,9 @@ public abstract class ObserverBase : IObserver private const string FabricSystemAppName = "fabric:/System"; private bool disposed; private Dictionary ServiceDumpCountDictionary; + private object lockObj = 
new object(); - public bool EnableProcessDumps - { - get;set; - } - - /* Process dump settings. TODO: Only AppObserver and Windows is supported today. */ + // Process dump settings. Only AppObserver and Windows is supported. \\ public string DumpsPath { get; set; @@ -57,7 +54,7 @@ public DumpType DumpType get; set; } = DumpType.MiniPlus; - /* End AO procsess dump settings. */ + // End AO procsess dump settings. \\ public string ObserverName { @@ -225,10 +222,10 @@ public bool HasActiveFabricErrorOrWarning get; set; } - public List AppNames + public ConcurrentQueue AppNames { get; set; - } = new List(); + } = new ConcurrentQueue(); public int MonitoredServiceProcessCount { @@ -370,11 +367,6 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel EnableETWLogging = IsEtwProviderEnabled }; - if (string.IsNullOrWhiteSpace(DumpsPath)) - { - SetDumpPath(); - } - ConfigurationSettings = new ConfigSettings( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config")?.Settings, ConfigurationSectionName); @@ -618,21 +610,24 @@ public bool DumpWindowsServiceProcess(int processId, string procName, string met using (FileStream file = File.Create(dumpFilePath)) { - if (!NativeMethods.MiniDumpWriteDump( - processHandle, - (uint)processId, - file.SafeFileHandle, - miniDumpType, - IntPtr.Zero, - IntPtr.Zero, - IntPtr.Zero)) + lock (lockObj) { - throw new Win32Exception(Marshal.GetLastWin32Error()); - } + if (!NativeMethods.MiniDumpWriteDump( + processHandle, + (uint)processId, + file.SafeFileHandle, + miniDumpType, + IntPtr.Zero, + IntPtr.Zero, + IntPtr.Zero)) + { + throw new Win32Exception(Marshal.GetLastWin32Error()); + } - if (!string.IsNullOrWhiteSpace(metric)) - { - ServiceDumpCountDictionary[dumpKey] = (ServiceDumpCountDictionary[dumpKey].DumpCount + 1, DateTime.UtcNow); + if (!string.IsNullOrWhiteSpace(metric)) + { + ServiceDumpCountDictionary[dumpKey] = (ServiceDumpCountDictionary[dumpKey].DumpCount + 1, 
DateTime.UtcNow); + } } } } @@ -872,12 +867,16 @@ public void ProcessResourceDataReportHealth( using (var proc = Process.GetProcessById(pid)) { string procName = proc?.ProcessName; - _ = DumpWindowsServiceProcess(pid, procName, data.Property); + + lock (lockObj) + { + _ = DumpWindowsServiceProcess(pid, procName, data.Property); + } } } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { - ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); + ObserverLogger.LogWarning($"Unable to generate dmp file:{Environment.NewLine}{e}"); } } } @@ -1062,7 +1061,7 @@ public void ProcessResourceDataReportHealth( if (AppNames.All(a => a != appName?.OriginalString)) { - AppNames.Add(appName?.OriginalString); + AppNames.Enqueue(appName?.OriginalString); } // Generate a Service Fabric Health Report. @@ -1159,13 +1158,19 @@ public void ProcessResourceDataReportHealth( if (data.Data is List list) { // List impl. - list.TrimExcess(); - list.Clear(); + lock (lockObj) + { + list.TrimExcess(); + list.Clear(); + } } else { // CircularBufferCollection impl. - data.Data.Clear(); + lock (lockObj) + { + data.Data.Clear(); + } } } @@ -1211,31 +1216,6 @@ protected virtual void Dispose(bool disposing) } } - private void SetDumpPath() - { - if (ObserverName != ObserverConstants.AppObserverName) - { - return; - } - - // This only needs to be set once. - if (!string.IsNullOrWhiteSpace(DumpsPath) && Directory.Exists(DumpsPath)) - { - return; - } - - try - { - DumpsPath = Path.Combine(ObserverLogger.LogFolderBasePath, ObserverName, "MemoryDumps"); - Directory.CreateDirectory(DumpsPath); - } - catch (Exception e) when (e is ArgumentException || e is IOException || e is NotSupportedException || e is UnauthorizedAccessException) - { - ObserverLogger.LogWarning($"Unable to create dump directory {DumpsPath}."); - return; - } - } - private void SetObserverConfiguration() { // Archive file lifetime - ObserverLogger files. 
diff --git a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs index 8eda8028..fd16d11a 100644 --- a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs +++ b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs @@ -178,7 +178,7 @@ public string ActiveErrorOrWarningCode /// Returns true or false depending upon computed health state based on supplied threshold value. public bool IsUnhealthy(TU threshold) { - if (Data.Count < 1 || Convert.ToDouble(threshold) < 1) + if (Data.Count < 1 || Convert.ToDouble(threshold) <= 0) { return false; } diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 4e1e1c08..e86317dc 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -20,6 +20,7 @@ public sealed class ObserverConstants public const string FabricObserverName = "FabricObserver"; public const string FabricObserverETWEventName = "FabricObserverDataEvent"; public const string ObserverFailureHealthStateLevelParameter = "ObserverFailureHealthStateLevel"; + public const string EnableConcurrentExecution = "EnableConcurrentExecution"; // The name of the package that contains this Observer's configuration public const string ObserverConfigurationPackageName = "Config"; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index e135d9ff..db817631 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -170,6 +170,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService for (int i = 0; i < sPids.Length; ++i) { + if (sProcNames[i] == 
"ps" || sProcNames[i] == "bash") + { + continue; + } + if (int.TryParse(sPids[i], out int childProcId)) { childProcesses.Add((sProcNames[i], childProcId)); diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 51864a99..9632a4c4 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -4,6 +4,7 @@ // ------------------------------------------------------------ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; @@ -30,21 +31,23 @@ public class AppObserver : ObserverBase { // Health Report data containers - For use in analysis to determine health state. // These lists are cleared after each healthy iteration. - private List> AllAppCpuData; - private List> AllAppMemDataMb; - private List> AllAppMemDataPercent; - private List> AllAppTotalActivePortsData; - private List> AllAppEphemeralPortsData; - private List> AllAppHandlesData; + // Change this data structure to concurrentqueue.... + private ConcurrentQueue> AllAppCpuData; + private ConcurrentQueue> AllAppMemDataMb; + private ConcurrentQueue> AllAppMemDataPercent; + private ConcurrentQueue> AllAppTotalActivePortsData; + private ConcurrentQueue> AllAppEphemeralPortsData; + private ConcurrentQueue> AllAppHandlesData; // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. - private List userTargetList; + private ConcurrentQueue userTargetList; // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. 
- private List deployedTargetList; + private ConcurrentQueue deployedTargetList; private readonly ConfigSettings configSettings; private string fileName; private readonly Stopwatch stopwatch; + private readonly object lockObj = new object(); public int MaxChildProcTelemetryDataCount { @@ -56,7 +59,7 @@ public bool EnableChildProcessMonitoring get; set; } - public List ReplicaOrInstanceList + public ConcurrentQueue ReplicaOrInstanceList { get; set; } @@ -66,6 +69,11 @@ public string ConfigPackagePath get; set; } + public bool EnableProcessDumps + { + get; set; + } + /// /// Initializes a new instance of the class. /// @@ -79,7 +87,6 @@ public AppObserver(FabricClient fabricClient, StatelessServiceContext context) public override async Task ObserveAsync(CancellationToken token) { - // If set, this observer will only run during the supplied interval. if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) { return; @@ -100,11 +107,10 @@ public override async Task ObserveAsync(CancellationToken token) await MonitorDeployedAppsAsync(token); await ReportAsync(token); - - // The time it took to run this observer. stopwatch.Stop(); CleanUp(); RunDuration = stopwatch.Elapsed; + ObserverLogger.LogWarning($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? 
"with" : "without")} Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); if (EnableVerboseLogging) { @@ -126,24 +132,25 @@ public override Task ReportAsync(CancellationToken token) List childProcessTelemetryDataList = null; TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); - for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) + _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList[i]; + var repOrInst = ReplicaOrInstanceList.ElementAt(i); string processName = null; int processId = 0; ApplicationInfo app = null; bool hasChildProcs = EnableChildProcessMonitoring && repOrInst.ChildProcesses != null; - + if (hasChildProcs) { childProcessTelemetryDataList = new List(); } - app = deployedTargetList.Find( + app = deployedTargetList.First( a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || - (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); + (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); + try { using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); @@ -152,9 +159,9 @@ public override Task ReportAsync(CancellationToken token) } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { - continue; + return; } - + string appNameOrType = GetAppNameOrType(repOrInst); var id = $"{appNameOrType}:{processName}"; @@ -165,23 +172,26 @@ public override Task ReportAsync(CancellationToken token) // Please use ContainerObserver for SF container app service monitoring. if (processName == "Fabric") { - continue; + return; } - fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - - // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. 
- // This means a new folder will be added to the base path. - if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) + lock (lockObj) { - CsvFileLogger.DataLogFolder = processName; - } + fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - // Log pid.. - CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); + // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. + // This means a new folder will be added to the base path. + if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) + { + CsvFileLogger.DataLogFolder = processName; + } + + // Log pid.. + CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); - // Log resource usage data to CSV files. - LogAllAppResourceDataToCsv(id); + // Log resource usage data to CSV files. + LogAllAppResourceDataToCsv(id); + } } // CPU - Parent process @@ -191,18 +201,18 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppCpuData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppCpuData, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( - parentFrud, - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Memory MB - Parent process @@ -212,18 +222,18 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppMemDataMb, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppMemDataMb, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); + } - // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( - parentFrud, - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Memory Percent - Parent process @@ -233,39 +243,37 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppMemDataPercent, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppMemDataPercent, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); } - // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( - parentFrud, - app.MemoryErrorLimitPercent, - app.MemoryWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.MemoryErrorLimitPercent, + app.MemoryWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Active - Parent process if (AllAppTotalActivePortsData.Any(x => x.Id == id)) { var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); - + if (hasChildProcs) { - ProcessChildProcs(ref AllAppTotalActivePortsData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppTotalActivePortsData, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); } - // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( - parentFrud, - app.NetworkErrorActivePorts, - app.NetworkWarningActivePorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.NetworkErrorActivePorts, + app.NetworkWarningActivePorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - Parent process @@ -275,20 +283,19 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppEphemeralPortsData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppEphemeralPortsData, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); } - // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( - parentFrud, - app.NetworkErrorEphemeralPorts, - app.NetworkWarningEphemeralPorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.NetworkErrorEphemeralPorts, + app.NetworkWarningEphemeralPorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } - + // Allocated (in use) Handles - Parent process if (AllAppHandlesData.Any(x => x.Id == id)) { @@ -296,57 +303,62 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - ProcessChildProcs(ref AllAppHandlesData, ref childProcessTelemetryDataList, repOrInst, ref app, ref parentFrud, token); + ProcessChildProcs(AllAppHandlesData, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); } - // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( - parentFrud, - app.ErrorOpenFileHandles, - app.WarningOpenFileHandles, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.ErrorOpenFileHandles, + app.WarningOpenFileHandles, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Child proc info telemetry. 
if (IsEtwEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) { - var data = new + lock (lockObj) { - ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList) - }; + var data = new + { + ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList) + }; - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); + } } if (IsTelemetryEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) { _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); - } - - childProcessTelemetryDataList = null; - } + } + }); return Task.CompletedTask; } private void ProcessChildProcs( - ref List> fruds, - ref List childProcessTelemetryDataList, + ConcurrentQueue> fruds, + List childProcessTelemetryDataList, ReplicaOrInstanceMonitoringInfo repOrInst, - ref ApplicationInfo app, - ref FabricResourceUsageData parentFrud, + ApplicationInfo app, + FabricResourceUsageData parentFrud, CancellationToken token) where T : struct { token.ThrowIfCancellationRequested(); - try + if (childProcessTelemetryDataList == null) { + return; + } + + try + { string metric = parentFrud.Property; var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref fruds, repOrInst, ref app, token); + var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(fruds, repOrInst, app, token); double sumAllValues = Sum + parentDataAvg; childProcInfo.Metric = metric; childProcInfo.Value = sumAllValues; @@ -361,9 +373,9 @@ private void ProcessChildProcs( } private (ChildProcessTelemetryData childProcInfo, double Sum) ProcessChildFrudsGetDataSum( - ref List> fruds, + ConcurrentQueue> fruds, ReplicaOrInstanceMonitoringInfo repOrInst, - ref ApplicationInfo app, + ApplicationInfo app, CancellationToken token) where T : struct { var childProcs = repOrInst.ChildProcesses; @@ -396,15 
+408,15 @@ private void ProcessChildProcs( try { - if (fruds.Any(x => x.Id.Contains(childProcName))) + if (fruds.Any(x => x != null && x.Id.Contains(childProcName))) { - var childFruds = fruds.Where(x => x.Id.Contains(childProcName)).ToList(); + var childFruds = fruds.Where(x => x != null && x.Id.Contains(childProcName)).ToList(); metric = childFruds[0].Property; for (int j = 0; j < childFruds.Count; ++j) { token.ThrowIfCancellationRequested(); - + var frud = childFruds[j]; double value = frud.AverageDataValue; sumValues += Math.Round(value, 0); @@ -412,7 +424,7 @@ private void ProcessChildProcs( if (IsEtwEnabled || IsTelemetryEnabled) { var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = value }; - childProcessInfoData.ChildProcessInfo.Add(childProcInfo); + childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } // Windows process dump support for descendant/child processes \\ @@ -420,61 +432,75 @@ private void ProcessChildProcs( if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && app.DumpProcessOnError && EnableProcessDumps) { string prop = frud.Property; + bool dump = false; switch (prop) { case ErrorWarningProperty.TotalCpuTime: + // Test error threshold breach for supplied metric. 
if (frud.IsUnhealthy(app.CpuErrorLimitPercent)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalCpuTime); - app.DumpProcessOnError = false; + dump = true; } break; case ErrorWarningProperty.TotalMemoryConsumptionMb: if (frud.IsUnhealthy(app.MemoryErrorLimitMb)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalMemoryConsumptionMb); - app.DumpProcessOnError = false; + dump = true; } break; case ErrorWarningProperty.TotalMemoryConsumptionPct: if (frud.IsUnhealthy(app.MemoryErrorLimitPercent)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalMemoryConsumptionPct); - app.DumpProcessOnError = false; + dump = true; } break; case ErrorWarningProperty.TotalActivePorts: if (frud.IsUnhealthy(app.NetworkErrorActivePorts)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalActivePorts); - app.DumpProcessOnError = false; + dump = true; } break; case ErrorWarningProperty.TotalEphemeralPorts: if (frud.IsUnhealthy(app.NetworkErrorEphemeralPorts)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalEphemeralPorts); - app.DumpProcessOnError = false; + dump = true; } break; case ErrorWarningProperty.TotalFileHandles: if (frud.IsUnhealthy(app.ErrorOpenFileHandles)) { - DumpWindowsServiceProcess(childPid, childProcName, ErrorWarningProperty.TotalFileHandles); - app.DumpProcessOnError = false; + dump = true; } break; } + + lock (lockObj) + { + if (dump) + { + _ = DumpWindowsServiceProcess(childPid, childProcName, prop); + } + } } - // Remove child FRUD from ref FRUD. - fruds.Remove(frud); + // Remove child FRUD from FRUDs. 
+ lock (lockObj) + { + var tempQueue = new ConcurrentQueue>(); + + foreach (var f in fruds.Where(fr => fr != frud)) + { + tempQueue.Enqueue(f); + } + + fruds = tempQueue; + } } childFruds?.Clear(); @@ -483,12 +509,11 @@ private void ProcessChildProcs( } catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException) { - continue; + } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { ObserverLogger.LogWarning($"Error processing child processes:{Environment.NewLine}{e}"); - continue; } } @@ -515,9 +540,9 @@ private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst // be up to date across observer loop iterations. private async Task InitializeAsync() { - ReplicaOrInstanceList = new List(); - userTargetList = new List(); - deployedTargetList = new List(); + ReplicaOrInstanceList = new ConcurrentQueue(); + userTargetList = new ConcurrentQueue(); + deployedTargetList = new ConcurrentQueue(); /* Child/Descendant proc monitoring config */ if (bool.TryParse( @@ -540,6 +565,11 @@ private async Task InitializeAsync() if (bool.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableProcessDumpsParameter), out bool enableDumps)) { EnableProcessDumps = enableDumps; + + if (string.IsNullOrWhiteSpace(DumpsPath) && enableDumps) + { + SetDumpPath(); + } } if (Enum.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.DumpTypeParameter), out DumpType dumpType)) @@ -622,7 +652,12 @@ private async Task InitializeAsync() } await using Stream stream = new FileStream(appObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read); - userTargetList.AddRange(JsonHelper.ReadFromJsonStream(stream)); + var appInfo = JsonHelper.ReadFromJsonStream(stream); + + foreach (var app in appInfo) + { + userTargetList.Enqueue(app); + } // Does the configuration have any objects (targets) defined? 
if (userTargetList.Count == 0) @@ -676,7 +711,7 @@ private async Task InitializeAsync() // Support for specifying single configuration item for all or * applications. if (userTargetList != null && userTargetList.Any(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*")) { - ApplicationInfo application = userTargetList.Find(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); + ApplicationInfo application = userTargetList.First(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); // Get info for 50 apps at a time that are deployed to the same node this FO instance is running on. var deployedAppQueryDesc = new PagedDeployedApplicationQueryDescription(NodeName) @@ -746,7 +781,7 @@ private async Task InitializeAsync() // then the threshold specified for fabric:/MyApp will remain in place for that app target. So, target specificity overrides any global setting. if (userTargetList.Any(a => a.TargetApp == app.ApplicationName.OriginalString || a.TargetAppType == app.ApplicationTypeName)) { - var existingAppConfig = userTargetList.Find(a => a.TargetApp == app.ApplicationName.OriginalString || a.TargetAppType == app.ApplicationTypeName); + var existingAppConfig = userTargetList.First(a => a.TargetApp == app.ApplicationName.OriginalString || a.TargetAppType == app.ApplicationTypeName); if (existingAppConfig == null) { @@ -794,12 +829,19 @@ private async Task InitializeAsync() WarningOpenFileHandles = application.WarningOpenFileHandles }; - userTargetList.Add(appConfig); + userTargetList.Enqueue(appConfig); } } // Remove the All or * config item. 
- _ = userTargetList.Remove(application); + var tempQueue = new ConcurrentQueue(); + + foreach (var a in userTargetList.Where(a => !a.Equals(application))) + { + tempQueue.Enqueue(a); + } + + userTargetList = tempQueue; apps.Clear(); apps = null; } @@ -810,7 +852,7 @@ private async Task InitializeAsync() { Token.ThrowIfCancellationRequested(); Uri appUri = null; - ApplicationInfo application = userTargetList[i]; + ApplicationInfo application = userTargetList.ElementAt(i); if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) { @@ -862,16 +904,21 @@ private async Task InitializeAsync() } int repCount = ReplicaOrInstanceList.Count; - + // For use in internal telemetry. MonitoredServiceProcessCount = repCount; MonitoredAppCount = deployedTargetList.Count; + if (!EnableVerboseLogging) + { + return true; + } + for (int i = 0; i < repCount; ++i) { Token.ThrowIfCancellationRequested(); - var rep = ReplicaOrInstanceList[i]; + var rep = ReplicaOrInstanceList.ElementAt(i); try { @@ -881,7 +928,6 @@ private async Task InitializeAsync() if (p.ProcessName == "Fabric") { - MonitoredServiceProcessCount--; continue; } @@ -889,7 +935,6 @@ private async Task InitializeAsync() // If it is not, then this would mean the process has exited so move on to next process. 
if (p.HasExited) { - MonitoredServiceProcessCount--; continue; } @@ -897,42 +942,57 @@ private async Task InitializeAsync() } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) { - MonitoredServiceProcessCount--; + } } return true; } - private async Task MonitorDeployedAppsAsync(CancellationToken token) + private void SetDumpPath() + { + try + { + DumpsPath = Path.Combine(ObserverLogger.LogFolderBasePath, ObserverName, "MemoryDumps"); + Directory.CreateDirectory(DumpsPath); + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is NotSupportedException || e is UnauthorizedAccessException) + { + ObserverLogger.LogWarning($"Unable to create dump directory {DumpsPath}."); + return; + } + } + + private Task MonitorDeployedAppsAsync(CancellationToken token) { int capacity = ReplicaOrInstanceList.Count; - AllAppCpuData ??= new List>(capacity); - AllAppMemDataMb ??= new List>(capacity); - AllAppMemDataPercent ??= new List>(capacity); - AllAppTotalActivePortsData ??= new List>(capacity); - AllAppEphemeralPortsData ??= new List>(capacity); - AllAppHandlesData ??= new List>(capacity); - - for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) + AllAppCpuData ??= new ConcurrentQueue>(); + AllAppMemDataMb ??= new ConcurrentQueue>(); + AllAppMemDataPercent ??= new ConcurrentQueue>(); + AllAppTotalActivePortsData ??= new ConcurrentQueue>(); + AllAppEphemeralPortsData ??= new ConcurrentQueue>(); + AllAppHandlesData ??= new ConcurrentQueue>(); + var exceptions = new ConcurrentQueue(); + + _ = Parallel.For(0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList[i]; + var repOrInst = ReplicaOrInstanceList.ElementAt(i); var timer = new Stopwatch(); int parentPid = (int)repOrInst.HostProcessId; var cpuUsage = new CpuUsage(); bool checkCpu = false, checkMemMb = false, 
checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false; - var application = deployedTargetList?.Find( + var application = deployedTargetList?.First( app => app?.TargetApp?.ToLower() == repOrInst.ApplicationName?.OriginalString.ToLower() || !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - List<(string procName, int Pid)> procTree = null; + ConcurrentQueue<(string procName, int Pid)> procList = null; if (application?.TargetApp == null && application?.TargetAppType == null) { - continue; + return; } try @@ -946,21 +1006,21 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // This is strange and can happen during a redeployment. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && parentProc?.ProcessName == "Idle") { - continue; + return; } // This will throw Win32Exception if process is running at higher elevation than FO. // If it is not, then this would mean the process has exited so move on to next process. if (parentProc.HasExited) { - continue; + return; } } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) || ObserverManager.ObserverFailureHealthStateLevel == HealthState.Unknown) { - continue; + return; } if (e is Win32Exception exception && exception.NativeErrorCode == 5 || e.Message.ToLower().Contains("access is denied")) @@ -988,12 +1048,12 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - $"UserAccountPrivilege({parentProc?.ProcessName})", - HealthState.Warning, - message, - ObserverName, - token, - repOrInst?.ServiceName?.OriginalString); + $"UserAccountPrivilege({parentProc?.ProcessName})", + HealthState.Warning, + message, + ObserverName, + token, + repOrInst?.ServiceName?.OriginalString); } // ETW. 
@@ -1012,7 +1072,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } } - continue; + return; } string parentProcName = parentProc?.ProcessName; @@ -1021,7 +1081,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Please use ContainerObserver for SF container app service monitoring. if (parentProcName == null || parentProcName == "Fabric") { - continue; + return; } string appNameOrType = GetAppNameOrType(repOrInst); @@ -1037,63 +1097,65 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } // Add new resource data structures for each app service process where the metric is specified in configuration for related observation. - if (AllAppCpuData.All(list => list.Id != id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0)) + // 1 thread safe + // 2 or lock + + if (AllAppCpuData.All(list => list?.Id != id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0)) { - AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); + AllAppCpuData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); } - if (AllAppCpuData.Any(list => list.Id == id)) + if (AllAppCpuData.Any(list => list?.Id == id)) { checkCpu = true; } - if (AllAppMemDataMb.All(list => list.Id != id) && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0)) + if (AllAppMemDataMb.All(list => list?.Id != id) && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0)) { - AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, id, capacity, UseCircularBuffer)); + AllAppMemDataMb.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, id, capacity, UseCircularBuffer)); } - - if (AllAppMemDataMb.Any(list => list.Id == id)) + + if (AllAppMemDataMb.Any(list => list?.Id == 
id)) { checkMemMb = true; } - if (AllAppMemDataPercent.All(list => list.Id != id) && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0)) + if (AllAppMemDataPercent.All(list => list?.Id != id) && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0)) { - AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, id, capacity, UseCircularBuffer)); + AllAppMemDataPercent.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, id, capacity, UseCircularBuffer)); } - if (AllAppMemDataPercent.Any(list => list.Id == id)) + if (AllAppMemDataPercent.Any(list => list?.Id == id)) { checkMemPct = true; } - if (AllAppTotalActivePortsData.All(list => list.Id != id) && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0)) + if (AllAppTotalActivePortsData.All(list => list?.Id != id) && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0)) { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, id, 1)); + AllAppTotalActivePortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, id, 1, false)); } - - if (AllAppTotalActivePortsData.Any(list => list.Id == id)) + + if (AllAppTotalActivePortsData.Any(list => list?.Id == id)) { checkAllPorts = true; } - if (AllAppEphemeralPortsData.All(list => list.Id != id) && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0)) + if (AllAppEphemeralPortsData.All(list => list?.Id != id) && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0)) { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, id, 1)); + AllAppEphemeralPortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, id, 1, false)); } - - if 
(AllAppEphemeralPortsData.Any(list => list.Id == id)) + + if (AllAppEphemeralPortsData.Any(list => list?.Id == id)) { checkEphemeralPorts = true; } - // File Handles (FD on linux) - if (AllAppHandlesData.All(list => list.Id != id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) + if (AllAppHandlesData.All(list => list?.Id != id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, id, 1)); + AllAppHandlesData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, id, 1, false)); } - if (AllAppHandlesData.Any(list => list.Id == id)) + if (AllAppHandlesData.Any(list => list?.Id == id)) { checkHandles = true; } @@ -1101,22 +1163,24 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Get list of child processes of parentProc should they exist. // In order to provide accurate resource usage of an SF service process we need to also account for // any processes (children) that the service process (parent) created/spawned. - procTree = new List<(string procName, int Pid)> - { - // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, - // then only the parent process will be in this list. - (parentProc.ProcessName, parentProc.Id) - }; + procList = new ConcurrentQueue<(string procName, int Pid)>(); + + // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, + // then only the parent process will be in this list. 
+ procList.Enqueue((parentProc.ProcessName, parentProc.Id)); if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) { - procTree.AddRange(repOrInst.ChildProcesses); + for (int k = 0; k < repOrInst.ChildProcesses.Count; ++k) + { + procList.Enqueue(repOrInst.ChildProcesses[k]); + } } - for (int j = 0; j < procTree.Count; ++j) + for (int j = 0; j < procList.Count; ++j) { - int procId = procTree[j].Pid; - string procName = procTree[j].procName; + int procId = procList.ElementAt(j).Pid; + string procName = procList.ElementAt(j).procName; TimeSpan duration = TimeSpan.FromSeconds(1); if (MonitorDuration > TimeSpan.MinValue) @@ -1127,7 +1191,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) { - continue; + return; } /* Warm up Windows perf counters. */ @@ -1149,15 +1213,16 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) { if (procId == parentPid) { - AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + AllAppHandlesData.FirstOrDefault(x => x?.Id == id).Data.Add(handles); } else { - if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procName}")) + // Do NOT do this... + if (!AllAppHandlesData.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppHandlesData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(handles); + AllAppHandlesData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(handles); } } } @@ -1168,16 +1233,16 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Parent process (the service process). 
if (procId == parentPid) { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x?.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } else { // Child procs spawned by the parent service process. - if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procName}")) + if (!AllAppTotalActivePortsData.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppTotalActivePortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } } @@ -1186,15 +1251,15 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) { if (procId == parentPid) { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppEphemeralPortsData.FirstOrDefault(x => x?.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } else { - if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procName}")) + if (!AllAppEphemeralPortsData.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppEphemeralPortsData.Enqueue(new 
FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppEphemeralPortsData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } } @@ -1220,15 +1285,15 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + AllAppCpuData.FirstOrDefault(x => x?.Id == id).Data.Add(cpu); } else { - if (!AllAppCpuData.Any(x => x.Id == $"{id}:{procName}")) + if (!AllAppCpuData.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppCpuData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(cpu); + AllAppCpuData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(cpu); } } } @@ -1244,15 +1309,15 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); + AllAppMemDataMb.FirstOrDefault(x => x?.Id == id).Data.Add(processMem); } else { - if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{procName}")) + if (!AllAppMemDataMb.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppMemDataMb.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); } - 
AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(processMem); + AllAppMemDataMb.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(processMem); } } @@ -1272,36 +1337,44 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataPercent.FirstOrDefault(x => x?.Id == id).Data.Add(Math.Round(usedPct, 1)); } else { - if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{procName}")) + if (!AllAppMemDataPercent.Any(x => x?.Id == $"{id}:{procName}")) { - AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppMemDataPercent.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataPercent.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(Math.Round(usedPct, 1)); } } } - await Task.Delay(250, Token).ConfigureAwait(false); + Thread.Sleep(150); } timer.Stop(); timer.Reset(); - - await Task.Delay(250, Token).ConfigureAwait(false); } } - catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException) { - ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}"); - - // Fix the bug.. 
- throw; + state.Stop(); } - } + catch (Exception e) + { + exceptions.Enqueue(e); + } + }); + + if (exceptions.Count > 0) + { + var aggEx = new AggregateException(exceptions); + ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{aggEx}"); + throw new AggregateException(aggEx); + } + + return Task.CompletedTask; } private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) @@ -1354,7 +1427,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat appList = null; } - for (int i = 0; i < deployedApps.Count; ++i) + _ = Parallel.For (0, deployedApps.Count, ObserverManager.ParallelOptions, (i, state) => { Token.ThrowIfCancellationRequested(); @@ -1362,12 +1435,12 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat string[] filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. 
- var serviceFilter = userTargetList.Find(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) - && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); + var serviceFilter = userTargetList.FirstOrDefault(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) + && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); ServiceFilterType filterType = ServiceFilterType.None; - + if (serviceFilter != null) { if (!string.IsNullOrWhiteSpace(serviceFilter.ServiceExcludeList)) @@ -1382,15 +1455,27 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } } - var replicasOrInstances = await GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType); + var replicasOrInstances = GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType).GetAwaiter().GetResult(); - ReplicaOrInstanceList.AddRange(replicasOrInstances); + lock (lockObj) + { + foreach (var rep in replicasOrInstances) + { + ReplicaOrInstanceList.Enqueue(rep); + } + } - deployedTargetList.AddRange(userTargetList.Where( - x => (x.TargetApp != null || x.TargetAppType != null) - && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()))); - } + lock (lockObj) + { + var targets = userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType != null) + && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower())); + foreach (var target in targets) + { + 
deployedTargetList.Enqueue(target); + } + } + }); deployedApps.Clear(); deployedApps = null; @@ -1409,23 +1494,23 @@ private async Task> GetDeployedPrimaryRepl var replicaMonitoringList = new List(deployedReplicaList.Count); SetInstanceOrReplicaMonitoringList( - appName, - serviceFilterList, - filterType, - appTypeName, - deployedReplicaList, - ref replicaMonitoringList); + appName, + serviceFilterList, + filterType, + appTypeName, + deployedReplicaList, + replicaMonitoringList); return replicaMonitoringList; } private void SetInstanceOrReplicaMonitoringList( - Uri appName, - string[] filterList, - ServiceFilterType filterType, - string appTypeName, - DeployedServiceReplicaList deployedReplicaList, - ref List replicaMonitoringList) + Uri appName, + string[] filterList, + ServiceFilterType filterType, + string appTypeName, + DeployedServiceReplicaList deployedReplicaList, + List replicaMonitoringList) { for (int i = 0; i < deployedReplicaList.Count; ++i) { @@ -1529,37 +1614,37 @@ private void CleanUp() ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; - if (AllAppCpuData != null && AllAppCpuData.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppCpuData != null && AllAppCpuData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppCpuData?.Clear(); AllAppCpuData = null; } - if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppEphemeralPortsData?.Clear(); AllAppEphemeralPortsData = null; } - if (AllAppHandlesData != null && AllAppHandlesData.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppHandlesData != null && AllAppHandlesData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppHandlesData?.Clear(); AllAppHandlesData = null; } - if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppMemDataMb != 
null && AllAppMemDataMb.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppMemDataMb?.Clear(); AllAppMemDataMb = null; } - if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppMemDataPercent?.Clear(); AllAppMemDataPercent = null; } - if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => !frud.ActiveErrorOrWarning)) + if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) { AllAppTotalActivePortsData?.Clear(); AllAppTotalActivePortsData = null; @@ -1581,14 +1666,14 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalCpuTime, "Average", - Math.Round(AllAppCpuData.Find(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppCpuData.First(x => x.Id == appName).AverageDataValue)); CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalCpuTime, "Peak", - Math.Round(AllAppCpuData.FirstOrDefault(x => x.Id == appName).MaxDataValue)); + Math.Round(AllAppCpuData.First(x => x.Id == appName).MaxDataValue)); } // Memory - MB @@ -1599,14 +1684,14 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalMemoryConsumptionMb, "Average", - Math.Round(AllAppMemDataMb.FirstOrDefault(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppMemDataMb.First(x => x.Id == appName).AverageDataValue)); CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalMemoryConsumptionMb, "Peak", - Math.Round(Convert.ToDouble(AllAppMemDataMb.FirstOrDefault(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppMemDataMb.First(x => x.Id == appName).MaxDataValue))); } if (AllAppMemDataPercent.Any(x => x.Id == appName)) @@ -1616,7 +1701,7 @@ private void LogAllAppResourceDataToCsv(string appName) 
appName, ErrorWarningProperty.TotalMemoryConsumptionPct, "Average", - Math.Round(AllAppMemDataPercent.FirstOrDefault(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppMemDataPercent.First(x => x.Id == appName).AverageDataValue)); CsvFileLogger.LogData( fileName, @@ -1634,7 +1719,7 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalActivePorts, "Total", - Math.Round(Convert.ToDouble(AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppTotalActivePortsData.First(x => x.Id == appName).MaxDataValue))); } if (AllAppEphemeralPortsData.Any(x => x.Id == appName)) @@ -1645,7 +1730,7 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalEphemeralPorts, "Total", - Math.Round(Convert.ToDouble(AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppEphemeralPortsData.First(x => x.Id == appName).MaxDataValue))); } if (AllAppHandlesData.Any(x => x.Id == appName)) @@ -1656,7 +1741,7 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalFileHandles, "Total", - Math.Round(AllAppHandlesData.FirstOrDefault(x => x.Id == appName).MaxDataValue)); + Math.Round(AllAppHandlesData.First(x => x.Id == appName).MaxDataValue)); } DataTableFileLogger.Flush(); diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index c91fd002..5189adbf 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -16,21 +16,23 @@ using FabricObserver.Observers.Utilities; using FabricObserver.Observers.MachineInfoModel; using System.Fabric.Description; +using System.Collections.Concurrent; namespace FabricObserver.Observers { public class ContainerObserver : ObserverBase { - private List> allCpuDataPercentage; - private List> allMemDataMB; + private 
ConcurrentQueue> allCpuDataPercentage; + private ConcurrentQueue> allMemDataMB; // userTargetList is the list of ApplicationInfo objects representing apps supplied in configuration. private List userTargetList; // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. - private List deployedTargetList; + private ConcurrentQueue deployedTargetList; private List ReplicaOrInstanceList; private readonly string ConfigPackagePath; + private readonly object lockObj = new object(); private string ConfigurationFilePath = string.Empty; public ContainerObserver(FabricClient fabricClient, StatelessServiceContext context) @@ -38,7 +40,6 @@ public ContainerObserver(FabricClient fabricClient, StatelessServiceContext cont { var configSettings = new MachineInfoModel.ConfigSettings(context); ConfigPackagePath = configSettings.ConfigPackagePath; - } // OsbserverManager passes in a special token to ObserveAsync and ReportAsync that enables it to stop this observer outside of @@ -58,8 +59,8 @@ public override async Task ObserveAsync(CancellationToken token) return; } - await MonitorContainersAsync().ConfigureAwait(true); - await ReportAsync(token).ConfigureAwait(true); + MonitorContainers(); + await ReportAsync(token); CleanUp(); runDurationTimer.Stop(); RunDuration = runDurationTimer.Elapsed; @@ -68,88 +69,86 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { + if (deployedTargetList.Count == 0) + { + return Task.CompletedTask; + } + TimeSpan timeToLive = GetHealthReportTimeToLive(); - foreach (var app in deployedTargetList) + _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { - if (!ReplicaOrInstanceList.Any(rep => rep.ApplicationName.OriginalString == app.TargetApp)) + var repOrInst = ReplicaOrInstanceList[i]; + ApplicationInfo app = deployedTargetList.First( + a => (a.TargetApp != null 
&& a.TargetApp == repOrInst.ApplicationName.OriginalString) || + (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); + + string serviceName = repOrInst.ServiceName.OriginalString.Replace(app.TargetApp, "").Replace("/", ""); + string cpuId = $"{serviceName}_cpu"; + string memId = $"{serviceName}_mem"; + var cpuFrudInst = allCpuDataPercentage.FirstOrDefault(cpu => cpu.Id == cpuId); + var memFrudInst = allMemDataMB.FirstOrDefault(mem => mem.Id == memId); + + if (EnableCsvLogging) { - continue; - } + var csvFileName = $"{serviceName}Data{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; + string appName = repOrInst.ApplicationName.OriginalString.Replace("fabric:/", ""); + string id = $"{appName}:{serviceName}"; - foreach (var repOrInst in ReplicaOrInstanceList.Where(rep => rep.ApplicationName.OriginalString == app.TargetApp)) - { - string serviceName = repOrInst.ServiceName.OriginalString.Replace(app.TargetApp, "").Replace("/", ""); - string cpuId = $"{serviceName}_cpu"; - string memId = $"{serviceName}_mem"; - var cpuFrudInst = allCpuDataPercentage.Find(cpu => cpu.Id == cpuId); - var memFrudInst = allMemDataMB.Find(mem => mem.Id == memId); - - if (EnableCsvLogging) + // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. + // This means a new folder will be added to the base path. + if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) { - var csvFileName = $"{serviceName}Data{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - string appName = repOrInst.ApplicationName.OriginalString.Replace("fabric:/", ""); - string id = $"{appName}:{serviceName}"; - - // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. - // This means a new folder will be added to the base path. 
- if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) - { - CsvFileLogger.DataLogFolder = serviceName; - } + CsvFileLogger.DataLogFolder = serviceName; + } - // Log resource usage data to local CSV file(s). - // CPU Time + // Log resource usage data to local CSV file(s). + // CPU Time + lock (lockObj) + { CsvFileLogger.LogData( csvFileName, id, ErrorWarningProperty.TotalCpuTime, - "Average", + "Total", Math.Round(cpuFrudInst.AverageDataValue)); + } - CsvFileLogger.LogData( - csvFileName, - id, - ErrorWarningProperty.TotalCpuTime, - "Peak", - Math.Round(cpuFrudInst.MaxDataValue)); - - - // Memory - MB + // Memory - MB + lock (lockObj) + { CsvFileLogger.LogData( csvFileName, id, ErrorWarningProperty.TotalMemoryConsumptionMb, - "Average", + "Total", Math.Round(memFrudInst.AverageDataValue)); - - CsvFileLogger.LogData( - csvFileName, - id, - ErrorWarningProperty.TotalMemoryConsumptionMb, - "Peak", - Math.Round(memFrudInst.MaxDataValue)); - } + } - // Report -> Send Telemetry/Write ETW/Create SF Health Warnings (if threshold breach) + // Report -> Send Telemetry/Write ETW/Create SF Health Warnings (if threshold breach) + lock (lockObj) + { ProcessResourceDataReportHealth( - cpuFrudInst, - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - timeToLive, - HealthReportType.Application, - repOrInst); + cpuFrudInst, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + timeToLive, + HealthReportType.Application, + repOrInst); + } + lock (lockObj) + { ProcessResourceDataReportHealth( - memFrudInst, - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - timeToLive, - HealthReportType.Application, - repOrInst); + memFrudInst, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + timeToLive, + HealthReportType.Application, + repOrInst); } - } + }); return Task.CompletedTask; } @@ -167,7 +166,7 @@ private async Task InitializeAsync(CancellationToken token) } userTargetList = new List(); - deployedTargetList = new List(); + deployedTargetList = new 
ConcurrentQueue(); ReplicaOrInstanceList = new List(); using (Stream stream = new FileStream(ConfigurationFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) @@ -227,7 +226,7 @@ private async Task InitializeAsync(CancellationToken token) foreach (var app in apps) { - Token.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); if (app.ApplicationName.OriginalString == "fabric:/System") { @@ -247,7 +246,7 @@ private async Task InitializeAsync(CancellationToken token) if (userTargetList.Any(a => a.TargetApp == app.ApplicationName.OriginalString)) { - var existingAppConfig = userTargetList.Find(a => a.TargetApp == app.ApplicationName.OriginalString); + var existingAppConfig = userTargetList.FirstOrDefault(a => a.TargetApp == app.ApplicationName.OriginalString); if (existingAppConfig == null) { @@ -272,7 +271,7 @@ private async Task InitializeAsync(CancellationToken token) CpuWarningLimitPercent = application.CpuWarningLimitPercent, }; - userTargetList.Add(appConfig); + userTargetList.Add(appConfig); } } @@ -283,7 +282,6 @@ private async Task InitializeAsync(CancellationToken token) } int settingsFail = 0; - MonitoredAppCount = userTargetList.Count; foreach (var application in userTargetList) { @@ -326,7 +324,7 @@ private async Task InitializeAsync(CancellationToken token) null, ConfigurationSettings.AsyncTimeout, token), - Token); + token); if (codepackages.Count == 0) { @@ -340,7 +338,7 @@ private async Task InitializeAsync(CancellationToken token) continue; } - deployedTargetList.Add(application); + deployedTargetList.Enqueue(application); await SetInstanceOrReplicaMonitoringList(new Uri(application.TargetApp), filteredServiceList, filterType, null).ConfigureAwait(false); } catch (Exception e) when (e is FabricException || e is TimeoutException) @@ -349,8 +347,14 @@ private async Task InitializeAsync(CancellationToken token) } } + MonitoredAppCount = deployedTargetList.Count; MonitoredServiceProcessCount = ReplicaOrInstanceList.Count; + if 
(!EnableVerboseLogging) + { + return true; + } + foreach (var rep in ReplicaOrInstanceList) { token.ThrowIfCancellationRequested(); @@ -361,7 +365,7 @@ private async Task InitializeAsync(CancellationToken token) return true; } - private async Task MonitorContainersAsync() + private void MonitorContainers() { /* docker stats --no-stream --format "table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}" @@ -376,56 +380,29 @@ 9e380a42233c sf-243-2d2f9fde-fb93-4e77-a5d2-df1600000000_3161e2ee-3d8f-2d45-b1 fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc96-cd4e596b6b6a 0.05% 27.19MiB / 15.45GiB */ - int listcapacity = ReplicaOrInstanceList.Count; - - if (allCpuDataPercentage == null) - { - allCpuDataPercentage = new List>(listcapacity); - } - - if (allMemDataMB == null) - { - allMemDataMB = new List>(listcapacity); - } + allCpuDataPercentage ??= new ConcurrentQueue>(); + allMemDataMB ??= new ConcurrentQueue>(); try { - foreach (ReplicaOrInstanceMonitoringInfo repOrInst in ReplicaOrInstanceList) + _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { - // This is how long each measurement sequence for each container can last. - // You can set this timespan value in ApplicationManifest_Modified.xml, see ContainerObserverMonitorDuration Parameter. - TimeSpan duration = TimeSpan.FromSeconds(3); - int frudCapacity; - Token.ThrowIfCancellationRequested(); - if (MonitorDuration > TimeSpan.MinValue) - { - duration = MonitorDuration; - } - + var repOrInst = ReplicaOrInstanceList[i]; string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; string containerId = string.Empty; - if (UseCircularBuffer) - { - frudCapacity = DataCapacity > 0 ? 
DataCapacity : 5; - } - else - { - frudCapacity = (int)duration.TotalSeconds * 4; - } - if (!allCpuDataPercentage.Any(frud => frud.Id == cpuId)) { - allCpuDataPercentage.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, frudCapacity, UseCircularBuffer)); + allCpuDataPercentage.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, 1, false)); } if (!allMemDataMB.Any(frud => frud.Id == memId)) { - allMemDataMB.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, frudCapacity, UseCircularBuffer)); + allMemDataMB.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); } var monitorTimer = Stopwatch.StartNew(); @@ -454,19 +431,22 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc }; using Process p = Process.Start(ps); - List output = new List(frudCapacity); + List output = new List(2); string l; - while ((l = await p.StandardOutput.ReadLineAsync()) != null) + while ((l = p.StandardOutput.ReadLine()) != null) { - output.Add(l); + lock (lockObj) + { + output.Add(l); + } } foreach (string line in output) { Token.ThrowIfCancellationRequested(); - if (line.Contains("CPU")) + if (string.IsNullOrWhiteSpace(line) || line.Contains("CPU")) { continue; } @@ -478,7 +458,7 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc if (stats.Length < 4) { ObserverLogger.LogWarning($"docker stats not returning expected information: stats.Count = {stats.Length}. Expected 4."); - return; + state.Stop(); } if (!stats[1].Contains(repOrInst.ServicePackageActivationId)) @@ -500,14 +480,17 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc double mem_working_set_mb = double.TryParse(stats[3].Replace("MiB", ""), out double memMib) ? 
memMib : 0; allMemDataMB?.FirstOrDefault(f => f.Id == memId)?.Data.Add(mem_working_set_mb); - await Task.Delay(150, Token); + Thread.Sleep(150); } - output.Clear(); - output = null; - } + lock (lockObj) + { + output.Clear(); + output = null; + } + }); } - catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + catch (AggregateException e) when (!(e.InnerException is OperationCanceledException || e.InnerException is TaskCanceledException)) { ObserverLogger.LogWarning($"Failure in ObserveAsync:{Environment.NewLine}{e}"); diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 556cdad3..869ea274 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -4,6 +4,7 @@ // ------------------------------------------------------------ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; @@ -33,15 +34,16 @@ public class FabricSystemObserver : ObserverBase private Stopwatch stopwatch; // Health Report data container - For use in analysis to determine health state. - private List> allCpuData; - private List> allMemData; - private List> allActiveTcpPortData; - private List> allEphemeralTcpPortData; - private List> allHandlesData; + private ConcurrentQueue> allCpuData; + private ConcurrentQueue> allMemData; + private ConcurrentQueue> allActiveTcpPortData; + private ConcurrentQueue> allEphemeralTcpPortData; + private ConcurrentQueue> allHandlesData; // Windows only. (EventLog). private List evtRecordList = null; private bool monitorWinEventLog; + private readonly object lockObj = new object(); /// /// Initializes a new instance of the class. 
@@ -136,7 +138,7 @@ public override async Task ObserveAsync(CancellationToken token) { Initialize(); - for (int i = 0; i < processWatchList.Length; ++i) + _ = Parallel.For (0, processWatchList.Length, ObserverManager.ParallelOptions, async (i, state) => { Token.ThrowIfCancellationRequested(); @@ -151,15 +153,15 @@ public override async Task ObserveAsync(CancellationToken token) dotnet = "dotnet "; } - await GetProcessInfoAsync($"{dotnet}{procName}").ConfigureAwait(true); + await GetProcessInfoAsync($"{dotnet}{procName}"); } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { - + return; } - } + }); } - catch (Exception e) when (!(e is OperationCanceledException)) + catch (AggregateException e) when (!(e.InnerException is OperationCanceledException || e.InnerException is TaskCanceledException)) { ObserverLogger.LogError( $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}"); @@ -520,7 +522,6 @@ private void Initialize() // fabric:/System MonitoredAppCount = 1; MonitoredServiceProcessCount = processWatchList.Length; - int listcapacity = processWatchList.Length; int frudCapacity = 4; if (UseCircularBuffer) @@ -539,11 +540,11 @@ private void Initialize() // CPU data if (allCpuData == null && (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0)) { - allCpuData = new List>(listcapacity); + allCpuData = new ConcurrentQueue>(); foreach (var proc in processWatchList) { - allCpuData.Add( + allCpuData.Enqueue( new FabricResourceUsageData( ErrorWarningProperty.TotalCpuTime, proc, @@ -555,11 +556,11 @@ private void Initialize() // Memory data if (allMemData == null && (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0)) { - allMemData = new List>(listcapacity); + allMemData = new ConcurrentQueue>(); foreach (var proc in processWatchList) { - allMemData.Add( + allMemData.Enqueue( new FabricResourceUsageData( ErrorWarningProperty.TotalMemoryConsumptionMb, proc, @@ -571,11 +572,11 @@ private 
void Initialize() // Ports if (allActiveTcpPortData == null && (ActiveTcpPortCountError > 0 || ActiveTcpPortCountWarning > 0)) { - allActiveTcpPortData = new List>(listcapacity); + allActiveTcpPortData = new ConcurrentQueue>(); foreach (var proc in processWatchList) { - allActiveTcpPortData.Add( + allActiveTcpPortData.Enqueue( new FabricResourceUsageData( ErrorWarningProperty.TotalActivePorts, proc, @@ -586,11 +587,11 @@ private void Initialize() if (allEphemeralTcpPortData == null && (ActiveEphemeralPortCountError > 0 || ActiveEphemeralPortCountWarning > 0)) { - allEphemeralTcpPortData = new List>(listcapacity); + allEphemeralTcpPortData = new ConcurrentQueue>(); foreach (var proc in processWatchList) { - allEphemeralTcpPortData.Add( + allEphemeralTcpPortData.Enqueue( new FabricResourceUsageData( ErrorWarningProperty.TotalEphemeralPorts, proc, @@ -602,11 +603,11 @@ private void Initialize() // Handles if (allHandlesData == null && (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0)) { - allHandlesData = new List>(listcapacity); + allHandlesData = new ConcurrentQueue>(); foreach (var proc in processWatchList) { - allHandlesData.Add( + allHandlesData.Enqueue( new FabricResourceUsageData( ErrorWarningProperty.TotalFileHandles, proc, @@ -901,7 +902,7 @@ private async Task GetProcessInfoAsync(string procName) } private void ProcessResourceDataList( - List> data, + ConcurrentQueue> data, T thresholdError, T thresholdWarning) where T : struct @@ -913,15 +914,15 @@ private void ProcessResourceDataList( fileName = $"FabricSystemServices{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? 
"_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; } - for (int i = 0; i < data.Count; ++i) + _ = Parallel.For (0, data.Count, ObserverManager.ParallelOptions, (i, state) => { Token.ThrowIfCancellationRequested(); - var dataItem = data[i]; + var dataItem = data.ElementAt(i); if (dataItem.Data.Count == 0 || dataItem.AverageDataValue <= 0) { - continue; + return; } if (EnableCsvLogging) @@ -955,7 +956,10 @@ private void ProcessResourceDataList( if (procId > 0) { - CsvFileLogger.LogData(fileName, dataItem.Id, "ProcessId", "", procId); + lock (lockObj) + { + CsvFileLogger.LogData(fileName, dataItem.Id, "ProcessId", "", procId); + } } } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException) @@ -963,49 +967,52 @@ private void ProcessResourceDataList( } - CsvFileLogger.LogData(fileName, dataItem.Id, dataLogMonitorType, "Average", Math.Round(dataItem.AverageDataValue, 2)); - CsvFileLogger.LogData(fileName, dataItem.Id, dataLogMonitorType, "Peak", Math.Round(Convert.ToDouble(dataItem.MaxDataValue))); + lock (lockObj) + { + CsvFileLogger.LogData(fileName, dataItem.Id, dataLogMonitorType, "Average", Math.Round(dataItem.AverageDataValue, 2)); + CsvFileLogger.LogData(fileName, dataItem.Id, dataLogMonitorType, "Peak", Math.Round(Convert.ToDouble(dataItem.MaxDataValue))); + } } // This function will clear Data items in list (will call Clear() on the supplied FabricResourceUsageData instance's Data field..) 
ProcessResourceDataReportHealth( - dataItem, - thresholdError, - thresholdWarning, - GetHealthReportTimeToLive(), - HealthReportType.Application); - } + dataItem, + thresholdError, + thresholdWarning, + GetHealthReportTimeToLive(), + HealthReportType.Application); + }); } private void CleanUp() { processWatchList = null; - if (allCpuData != null && !allCpuData.Any(frud => frud.ActiveErrorOrWarning)) + if (allCpuData != null && !allCpuData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) { allCpuData?.Clear(); allCpuData = null; } - if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) + if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) { allEphemeralTcpPortData?.Clear(); allEphemeralTcpPortData = null; } - if (allHandlesData != null && !allHandlesData.Any(frud => frud.ActiveErrorOrWarning)) + if (allHandlesData != null && !allHandlesData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) { allHandlesData?.Clear(); allHandlesData = null; } - if (allMemData != null && !allMemData.Any(frud => frud.ActiveErrorOrWarning)) + if (allMemData != null && !allMemData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) { allMemData?.Clear(); allMemData = null; } - if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) + if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) { allActiveTcpPortData?.Clear(); allActiveTcpPortData = null; diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 09a7920c..f2a1b7b0 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -635,7 +635,7 @@ private void SetHealthState(Endpoint endpoint, string targetApp, bool passed) if (!AppNames.Contains(targetApp)) { - AppNames.Add(targetApp); + 
AppNames.Enqueue(targetApp); } } } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 0d1be3ce..3b0bf8be 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -18,9 +18,9 @@ using FabricObserver.Observers.Utilities.Telemetry; using Microsoft.Extensions.DependencyInjection; using FabricObserver.TelemetryLib; -using Newtonsoft.Json; using HealthReport = FabricObserver.Observers.Utilities.HealthReport; using System.Fabric.Description; +using System.Runtime; namespace FabricObserver.Observers { @@ -138,6 +138,20 @@ public TimeSpan OperationalTelemetryRunInterval get; private set; } = TimeSpan.FromHours(4); + /// + /// This is for observers that support parallelized monitor loops. + /// AppObserver, ContainerObserver, FabricSystemObserver. + /// + public static ParallelOptions ParallelOptions + { + get; set; + } + + public static bool EnableConcurrentExecution + { + get; set; + } + /// /// Initializes a new instance of the class. /// This is only used by unit tests. @@ -216,6 +230,13 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie HealthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); SetPropertiesFromConfigurationParameters(); + + ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = EnableConcurrentExecution && Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = linkedSFRuntimeObserverTokenSource?.Token ?? token, + TaskScheduler = TaskScheduler.Default + }; } public async Task StartObserversAsync() @@ -288,6 +309,13 @@ make that connection. 
You should generally not have to call GC.Collect from user GC.Collect(0, GCCollectionMode.Forced, true, false); GC.Collect(1, GCCollectionMode.Forced, true, false); + // LOH + if (EnableConcurrentExecution) + { + GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; + GC.Collect(2, GCCollectionMode.Forced, true, true); + } + if (ObserverExecutionLoopSleepSeconds > 0) { await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds), token); @@ -742,6 +770,21 @@ private void SetPropertiesFromConfigurationParameters(ConfigurationSettings sett { ApplicationName = FabricServiceContext.CodePackageActivationContext.ApplicationName; + // Parallelization settings for capable hardware. \\ + + if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableConcurrentExecution, settings), out bool enableConcurrency)) + { + EnableConcurrentExecution = enableConcurrency; + } + + ParallelOptions = new ParallelOptions + { + // Parallelism only makes sense for capable CPU configurations. The minimum requirement is 4 logical processors; which would map to more than 1 available core. + MaxDegreeOfParallelism = EnableConcurrentExecution && Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = linkedSFRuntimeObserverTokenSource?.Token ?? 
token, + TaskScheduler = TaskScheduler.Default + }; + // ETW - Overridable if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableETWProvider, settings), out bool etwEnabled)) { diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 3c582295..5cb77d18 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -89,6 +89,9 @@ + + + - + - - + + @@ -48,9 +49,9 @@ - - - + + + @@ -328,8 +329,9 @@ - + + diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 00db59e7..186acc35 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -271,6 +271,13 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() ObserverManager.FabricClientInstance = client; ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new AppObserver(client, context) { @@ -279,7 +286,7 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() ReplicaOrInstanceList = new List() }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -308,6 +315,13 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is ObserverManager.FabricClientInstance = client; ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? 
-1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new AppObserver(client, context) { @@ -316,7 +330,7 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is ReplicaOrInstanceList = new List() }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -346,7 +360,7 @@ public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() ClusterObserverManager.EtwEnabled = true; var obs = new ClusterObserver.ClusterObserver(); - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -399,7 +413,7 @@ public async Task CertificateObserver_validCerts() ClusterCertSecondaryThumbprint = string.Empty }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -460,7 +474,7 @@ public async Task CertificateObserver_expiredAndexpiringCerts() ClusterCertSecondaryThumbprint = string.Empty }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -502,7 +516,7 @@ public async Task NodeObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Fa CpuWarningUsageThresholdPct = 10000 }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion. 
Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -545,7 +559,7 @@ public async Task NodeObserver_Negative_Integer_CPU_Mem_Ports_Firewalls_Values_N ActivePortsWarningThreshold = -100 }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // Bad values don't crash Initialize. Assert.IsFalse(obs.IsUnhealthy); @@ -585,7 +599,7 @@ public async Task NodeObserver_Negative_Integer_Thresholds_CPU_Mem_Ports_Firewal ActivePortsWarningThreshold = -100 }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // Bad values don't crash Initialize. Assert.IsFalse(obs.IsUnhealthy); @@ -629,7 +643,7 @@ public async Task OSObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarnin // This is required since output files are only created if fo api app is also deployed to cluster.. - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -679,7 +693,7 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarn }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -736,7 +750,7 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin ApplicationName = "fabric:/TestApp0" }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -782,7 +796,7 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_IsHealthy_NoW ObserverManager.EtwEnabled = false; using var obs = new NetworkObserver(client, context); - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // Observer ran to completion with no errors. 
// The supplied config does not include deployed app network configs, so @@ -822,7 +836,7 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFi IsObserverWebApiAppDeployed = true }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // Observer ran to completion with no errors. // The supplied config does not include deployed app network configs, so @@ -877,7 +891,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin }; using var obsMgr = new ObserverManager(obs, client); - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -918,7 +932,7 @@ public async Task SFConfigurationObserver_ObserveAsync_Successful_Observer_IsHea ClusterManifestPath = Path.Combine(Environment.CurrentDirectory, "clusterManifest.xml") }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -968,6 +982,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = false; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -976,7 +997,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth MonitorDuration = TimeSpan.FromSeconds(1) }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. 
Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -1003,14 +1024,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth } using var client = new FabricClient(FabricClientRole.User); - var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); - - // This is meant to be run on your dev machine's one node test cluster. - if (nodeList?.Count > 1) - { - return; - } - var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -1018,6 +1031,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1026,24 +1046,16 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth MemWarnUsageThresholdMb = 5 // This will definitely cause Warning alerts. }; - using var obsMgr = new ObserverManager(obs, client) - { - ApplicationName = "fabric:/TestApp0" - }; + await obs.ObserveAsync(token); - await obs.ObserveAsync(token).ConfigureAwait(true); - - // observer ran to completion with no errors. + // observer ran to completion. Assert.IsTrue(obs.LastRunDateTime > startDateTime); - // Experiment with err/warn detection/reporting behavior. // observer detected errors or warnings for supplied threshold(s). Assert.IsTrue(obs.HasActiveFabricErrorOrWarning); // observer did not have any internal errors during run. 
Assert.IsFalse(obs.IsUnhealthy); - await obsMgr.StopObserversAsync().ConfigureAwait(true); - Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } @@ -1075,6 +1087,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1082,12 +1101,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ActiveTcpPortCountWarning = 5 // This will definitely cause Warning. }; - using var obsMgr = new ObserverManager(obs, client) - { - ApplicationName = "fabric:/TestApp0" - }; - - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -1098,12 +1112,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); - - // Verify that all health reports have been cleared after StopObserversAsync runs. - await obsMgr.StopObserversAsync().ConfigureAwait(true); - - // Verify that all health reports have been cleared after StopObserversAsync runs. 
- Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } /// @@ -1134,6 +1142,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1141,12 +1156,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ActiveEphemeralPortCountWarning = 1 // This will definitely cause Warning. }; - using var obsMgr = new ObserverManager(obs, client) - { - ApplicationName = "fabric:/TestApp0" - }; - - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -1157,9 +1167,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); - await obsMgr.StopObserversAsync().ConfigureAwait(true); - await Task.Delay(1000).ConfigureAwait(true); - Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } /// @@ -1190,6 +1197,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? 
-1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1197,12 +1211,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth AllocatedHandlesWarning = 100 // This will definitely cause Warning. }; - using var obsMgr = new ObserverManager(obs, client) - { - ApplicationName = "fabric:/TestApp0" - }; - - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -1213,9 +1222,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); - await obsMgr.StopObserversAsync().ConfigureAwait(true); - await Task.Delay(1000).ConfigureAwait(true); - Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } /// @@ -1246,6 +1252,13 @@ public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Un ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1253,7 +1266,7 @@ public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Un CpuWarnUsageThresholdPct = -42 }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. 
Assert.IsTrue(obs.LastRunDateTime > startDateTime); @@ -1293,6 +1306,13 @@ public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Thresho ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; + ObserverManager.EnableConcurrentExecution = true; + ObserverManager.ParallelOptions = new ParallelOptions + { + MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = token, + TaskScheduler = TaskScheduler.Default + }; using var obs = new FabricSystemObserver(client, context) { @@ -1300,7 +1320,7 @@ public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Thresho CpuWarnUsageThresholdPct = 420 }; - await obs.ObserveAsync(token).ConfigureAwait(true); + await obs.ObserveAsync(token); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); diff --git a/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json b/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json index 7d9b7732..93fe5805 100644 --- a/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json +++ b/FabricObserverTests/PackageRoot/Config/AppObserver.config.oldstyle.json @@ -9,11 +9,6 @@ "cpuWarningLimitPercent": 90, "memoryWarningLimitPercent": 60 }, - { - "targetApp": "Malformed AppName App1", - "cpuWarningLimitPercent": 70, - "memoryWarningLimitPercent": 20 - }, { "targetAppType": "ClusterObserverType", "cpuWarningLimitPercent": 30, @@ -23,10 +18,5 @@ "targetApp": "fabric:/BadApp", "cpuWarningLimitPercent": 60, "memoryWarningLimitPercent": 30 - }, - { - "targetApp": "Malformed AppName App1", - "cpuWarningLimitPercent": 70, - "memoryWarningLimitPercent": 20 } ] diff --git a/TelemetryLib/ClusterInformation.cs b/TelemetryLib/ClusterInformation.cs index caaa1b27..4fce68f7 100644 --- a/TelemetryLib/ClusterInformation.cs +++ b/TelemetryLib/ClusterInformation.cs @@ -25,6 +25,7 @@ public sealed 
class ClusterIdentificationUtility private static string paasClusterId; private static string diagnosticsClusterId; private static XmlDocument clusterManifestXdoc; + private static readonly object lockObj = new object(); /// /// Gets ClusterID, tenantID and ClusterType for current ServiceFabric cluster @@ -50,10 +51,13 @@ public sealed class ClusterIdentificationUtility { using (var xreader = XmlReader.Create(sreader, new XmlReaderSettings { XmlResolver = null })) { - clusterManifestXdoc?.Load(xreader); + lock (lockObj) + { + clusterManifestXdoc?.Load(xreader); - // Get values from cluster manifest, clusterId if it exists in either Paas or Diagnostics section. - GetValuesFromClusterManifest(); + // Get values from cluster manifest, clusterId if it exists in either Paas or Diagnostics section. + GetValuesFromClusterManifest(); + } if (paasClusterId != null) { diff --git a/TelemetryLib/TelemetryConstants.cs b/TelemetryLib/TelemetryConstants.cs index 5a6c1464..0cff1900 100644 --- a/TelemetryLib/TelemetryConstants.cs +++ b/TelemetryLib/TelemetryConstants.cs @@ -12,6 +12,6 @@ public static class TelemetryConstants internal const string ClusterTypeSfrp = "SFRP"; internal const string ClusterTypePaasV1 = "PaasV1"; internal const int AsyncOperationTimeoutSeconds = 120; - internal const string AIKey = "$Token$"; + public const string AIKey = "$Token$"; } } \ No newline at end of file diff --git a/TelemetryLib/TelemetryEvents.cs b/TelemetryLib/TelemetryEvents.cs index c8f646cf..4506a186 100644 --- a/TelemetryLib/TelemetryEvents.cs +++ b/TelemetryLib/TelemetryEvents.cs @@ -41,7 +41,10 @@ public TelemetryEvents( string config = File.ReadAllText(Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "FOAppInsightsOperational.config")); appInsightsTelemetryConf = TelemetryConfiguration.CreateFromConfiguration(config); appInsightsTelemetryConf.InstrumentationKey = TelemetryConstants.AIKey; - telemetryClient = new TelemetryClient(appInsightsTelemetryConf); + 
telemetryClient = new TelemetryClient(appInsightsTelemetryConf) + { + InstrumentationKey = TelemetryConstants.AIKey + }; var (ClusterId, TenantId, ClusterType) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).GetAwaiter().GetResult(); clusterId = ClusterId; tenantId = TenantId; From a1ee25c9cda0efd2ccdfc36a1fabb0bcb5fd47aa Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 24 Aug 2021 12:45:58 -0700 Subject: [PATCH 02/35] parallel impl (appobs, fso, contobs) --- .../Utilities/ProcessInfo/LinuxProcFS.cs | 1 - FabricObserver/Observers/AppObserver.cs | 75 ++++++++----------- FabricObserver/Observers/ObserverManager.cs | 9 +-- TelemetryLib/TelemetryConstants.cs | 2 +- 4 files changed, 36 insertions(+), 51 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs index e39af8c7..b43d1bed 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using System.IO; using System.Text; -using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 9632a4c4..a09a62ed 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -40,7 +40,7 @@ public class AppObserver : ObserverBase private ConcurrentQueue> AllAppHandlesData; // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. - private ConcurrentQueue userTargetList; + private List userTargetList; // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. 
private ConcurrentQueue deployedTargetList; @@ -110,11 +110,11 @@ public override async Task ObserveAsync(CancellationToken token) stopwatch.Stop(); CleanUp(); RunDuration = stopwatch.Elapsed; - ObserverLogger.LogWarning($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? "with" : "without")} Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); - + if (EnableVerboseLogging) { - ObserverLogger.LogInfo($"Run Duration: {RunDuration}"); + ObserverLogger.LogInfo($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? "with" : "without")} " + + $"Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); } stopwatch.Reset(); @@ -129,7 +129,7 @@ public override Task ReportAsync(CancellationToken token) } // For use in process family tree monitoring. - List childProcessTelemetryDataList = null; + ConcurrentQueue childProcessTelemetryDataList = null; TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => @@ -144,7 +144,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - childProcessTelemetryDataList = new List(); + childProcessTelemetryDataList = new ConcurrentQueue(); } app = deployedTargetList.First( @@ -323,7 +323,7 @@ public override Task ReportAsync(CancellationToken token) { var data = new { - ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList) + ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList.ToList()) }; ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); @@ -332,7 +332,10 @@ public override Task ReportAsync(CancellationToken token) if (IsTelemetryEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) { - _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); + lock (lockObj) + { + _ = 
TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList.ToList(), token); + } } }); @@ -341,7 +344,7 @@ public override Task ReportAsync(CancellationToken token) private void ProcessChildProcs( ConcurrentQueue> fruds, - List childProcessTelemetryDataList, + ConcurrentQueue childProcessTelemetryDataList, ReplicaOrInstanceMonitoringInfo repOrInst, ApplicationInfo app, FabricResourceUsageData parentFrud, @@ -362,7 +365,7 @@ private void ProcessChildProcs( double sumAllValues = Sum + parentDataAvg; childProcInfo.Metric = metric; childProcInfo.Value = sumAllValues; - childProcessTelemetryDataList.Add(childProcInfo); + childProcessTelemetryDataList.Enqueue(childProcInfo); parentFrud.Data.Clear(); parentFrud.Data.Add((T)Convert.ChangeType(sumAllValues, typeof(T))); } @@ -541,7 +544,7 @@ private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst private async Task InitializeAsync() { ReplicaOrInstanceList = new ConcurrentQueue(); - userTargetList = new ConcurrentQueue(); + userTargetList = new List(); deployedTargetList = new ConcurrentQueue(); /* Child/Descendant proc monitoring config */ @@ -653,11 +656,7 @@ private async Task InitializeAsync() await using Stream stream = new FileStream(appObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read); var appInfo = JsonHelper.ReadFromJsonStream(stream); - - foreach (var app in appInfo) - { - userTargetList.Enqueue(app); - } + userTargetList.AddRange(appInfo); // Does the configuration have any objects (targets) defined? if (userTargetList.Count == 0) @@ -829,19 +828,12 @@ private async Task InitializeAsync() WarningOpenFileHandles = application.WarningOpenFileHandles }; - userTargetList.Enqueue(appConfig); + userTargetList.Add(appConfig); } } // Remove the All or * config item. 
- var tempQueue = new ConcurrentQueue(); - - foreach (var a in userTargetList.Where(a => !a.Equals(application))) - { - tempQueue.Enqueue(a); - } - - userTargetList = tempQueue; + _ = userTargetList.Remove(application); apps.Clear(); apps = null; } @@ -1427,7 +1419,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat appList = null; } - _ = Parallel.For (0, deployedApps.Count, ObserverManager.ParallelOptions, (i, state) => + for (int i = 0; i < deployedApps.Count; ++i) { Token.ThrowIfCancellationRequested(); @@ -1435,9 +1427,10 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat string[] filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. - var serviceFilter = userTargetList.FirstOrDefault(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) - && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); + var serviceFilter = + userTargetList.FirstOrDefault(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) + && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); ServiceFilterType filterType = ServiceFilterType.None; @@ -1457,25 +1450,20 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat var replicasOrInstances = GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType).GetAwaiter().GetResult(); - lock (lockObj) + foreach (var rep in replicasOrInstances) { - foreach (var rep in replicasOrInstances) - { - ReplicaOrInstanceList.Enqueue(rep); - } + ReplicaOrInstanceList.Enqueue(rep); } + - lock (lockObj) - { - var targets = 
userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType != null) + var targets = userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType != null) && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower())); - foreach (var target in targets) - { - deployedTargetList.Enqueue(target); - } + foreach (var target in targets) + { + deployedTargetList.Enqueue(target); } - }); + } deployedApps.Clear(); deployedApps = null; @@ -1521,8 +1509,7 @@ private void SetInstanceOrReplicaMonitoringList( switch (deployedReplica) { - case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary || - statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary: + case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary: { if (filterList != null && filterType != ServiceFilterType.None) { diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 3b0bf8be..789a6352 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -1074,17 +1074,16 @@ await File.WriteAllLinesAsync( { IsObserverRunning = false; - if (ex.InnerException is FabricException || - ex.InnerException is OperationCanceledException || - ex.InnerException is TaskCanceledException) + if (ex.InnerExceptions != null && + (ex.InnerExceptions.Any(e => e is FabricException) || + ex.InnerExceptions.Any(e => e is OperationCanceledException) || + ex.InnerExceptions.Any(e => e is TaskCanceledException))) { if (isConfigurationUpdateInProgress) { IsObserverRunning = false; - return true; } - continue; } diff --git a/TelemetryLib/TelemetryConstants.cs b/TelemetryLib/TelemetryConstants.cs index 0cff1900..383b76ec 100644 --- a/TelemetryLib/TelemetryConstants.cs +++ b/TelemetryLib/TelemetryConstants.cs @@ -12,6 +12,6 @@ public 
static class TelemetryConstants internal const string ClusterTypeSfrp = "SFRP"; internal const string ClusterTypePaasV1 = "PaasV1"; internal const int AsyncOperationTimeoutSeconds = 120; - public const string AIKey = "$Token$"; + public const string AIKey = "c065641b-ec84-43fe-a8e7-c2bcbb697995"; } } \ No newline at end of file From a65f67f4f545bb55c5f38960f710ff925542a08f Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 15 Sep 2021 13:58:11 -0700 Subject: [PATCH 03/35] Latest --- .../CpuUtilization/CpuUtilizationProvider.cs | 5 +- .../Utilities/ObserverConstants.cs | 7 +- .../OperatingSystemInfo/LinuxInfoProvider.cs | 8 +- .../WindowsInfoProvider.cs | 155 ++++++++--------- .../ProcessInfo/WindowsProcessInfoProvider.cs | 6 +- .../Telemetry/AppInsightsTelemetry.cs | 1 - .../Telemetry/LogAnalyticsTelemetry.cs | 1 - FabricObserver/FabricObserver.csproj | 4 +- FabricObserver/Observers/AppObserver.cs | 3 +- .../Observers/CertificateObserver.cs | 1 + FabricObserver/Observers/ContainerObserver.cs | 126 +++++++++++--- .../Observers/FabricSystemObserver.cs | 5 + FabricObserver/Observers/NodeObserver.cs | 33 ++-- FabricObserver/Observers/OSObserver.cs | 17 +- FabricObserver/Observers/ObserverManager.cs | 158 +++++++++++------- .../PackageRoot/Config/Settings.xml | 3 +- .../ApplicationManifest.xml | 8 +- .../FabricObserverTests.csproj | 2 +- FabricObserverTests/ObserverTest.cs | 7 +- .../FabricObserverOperationalEventData.cs | 34 ++-- TelemetryLib/TelemetryEvents.cs | 35 ++-- 21 files changed, 390 insertions(+), 229 deletions(-) diff --git a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs index dda84542..001c7232 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs @@ -5,7 +5,6 @@ using System; using System.Runtime.InteropServices; 
-using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { @@ -39,6 +38,10 @@ public static CpuUtilizationProvider Instance return instance; } + set + { + instance = value; + } } public abstract void Dispose(); diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index e86317dc..98478573 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -19,8 +19,8 @@ public sealed class ObserverConstants public const string AsyncClusterOperationTimeoutSeconds = "ClusterOperationTimeoutSeconds"; public const string FabricObserverName = "FabricObserver"; public const string FabricObserverETWEventName = "FabricObserverDataEvent"; - public const string ObserverFailureHealthStateLevelParameter = "ObserverFailureHealthStateLevel"; public const string EnableConcurrentExecution = "EnableConcurrentExecution"; + public const string ObserverFailureHealthStateLevelParameter = "ObserverFailureHealthStateLevel"; // The name of the package that contains this Observer's configuration public const string ObserverConfigurationPackageName = "Config"; @@ -64,7 +64,8 @@ public sealed class ObserverConstants public const string MaxDumpsParameter = "MaxDumps"; public const string MaxDumpsTimeWindowParameter = "MaxDumpsTimeWindow"; - // AzureStorageObserver + // AzureStorageUploadObserver + public const string AzureStorageUploadObserverName = "AzureStorageUploadObserver"; public const string AzureStorageConnectionStringParameter = "AzureStorageConnectionString"; public const string AzureBlobContainerNameParameter = "BlobContainerName"; public const string AzureStorageAccountNameParameter = "AzureStorageAccountName"; @@ -123,7 +124,7 @@ public sealed class ObserverConstants public const string NodeObserverNetworkWarningEphemeralPorts = "NetworkWarningEphemeralPorts"; public const string 
NodeObserverNetworkErrorFirewallRules = "NetworkErrorFirewallRules"; public const string NodeObserverNetworkWarningFirewallRules = "NetworkWarningFirewallRules"; - + // For use by Linux File Descriptors monitor. public const string NodeObserverLinuxFileHandlesErrorLimitPct = "LinuxFileHandlesErrorLimitPercent"; public const string NodeObserverLinuxFileHandlesWarningLimitPct = "LinuxFileHandlesWarningLimitPercent"; diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs index ef95cf00..e4b35597 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs @@ -42,10 +42,10 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte (int lowPort, int highPort) = TupleGetDynamicPortRange(); int count = GetPortCount(processId, line => - { - int port = GetPortFromNetstatOutput(line); - return port >= lowPort && port <= highPort; - }, context); + { + int port = GetPortFromNetstatOutput(line); + return port >= lowPort && port <= highPort; + }, context); return count; } diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 835a8c93..65723fc3 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -11,7 +11,6 @@ using System.IO; using System.Linq; using System.Management; -using System.Runtime.InteropServices; using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; @@ -22,25 +21,6 @@ public class WindowsInfoProvider : OSInfoProvider { private const string TcpProtocol = "tcp"; - public override (long TotalMemoryGb, long 
MemoryInUseMb, double PercentInUse) TupleGetMemoryInfo() - { - NativeMethods.PerfomanceInfoData perfData; - - try - { - perfData = GetWindowsPerformanceInfo(); - double used = ((double)(perfData.PhysicalTotalBytes - perfData.PhysicalAvailableBytes)) / perfData.PhysicalTotalBytes; - double usedPct = used * 100; - - return (perfData.PhysicalTotalBytes / 1024 / 1024 / 1024, perfData.InUse / 1024 / 1024, Math.Round(usedPct, 2)); - } - catch (Win32Exception e) - { - Logger.LogWarning($"Error getting performance information:{Environment.NewLine}{e}"); - return (-1, -1, -1); - } - } - public override (int LowPort, int HighPort) TupleGetDynamicPortRange() { using (var p = new Process()) @@ -85,12 +65,13 @@ public override (int LowPort, int HighPort) TupleGetDynamicPortRange() return (lowPortRange, highPortRange); } catch (Exception e) when ( - e is ArgumentException - || e is IOException - || e is InvalidOperationException - || e is RegexMatchTimeoutException - || e is Win32Exception) + e is ArgumentException + || e is IOException + || e is InvalidOperationException + || e is RegexMatchTimeoutException + || e is Win32Exception) { + } } @@ -163,6 +144,8 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) { while (enumerator.MoveNext()) { + cancellationToken.ThrowIfCancellationRequested(); + try { using (ManagementObject mObj = (ManagementObject)enumerator.Current) @@ -178,7 +161,7 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) object totalVisibleObj = mObj.Properties["TotalVisibleMemorySize"].Value; object installDateObj = mObj.Properties["InstallDate"].Value; object lastBootDateObj = mObj.Properties["LastBootUpTime"].Value; - + osInfo.Name = captionObj?.ToString(); if (int.TryParse(numProcsObj?.ToString(), out int numProcesses)) @@ -190,16 +173,16 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) osInfo.NumberOfProcesses = -1; } - osInfo.Status = statusObj?.ToString(); - osInfo.Language = 
osLanguageObj?.ToString(); - osInfo.Version = versionObj?.ToString(); - osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(installDateObj?.ToString()).ToUniversalTime().ToString("o"); - osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(lastBootDateObj?.ToString()).ToUniversalTime().ToString("o"); - osInfo.FreePhysicalMemoryKB = ulong.TryParse(freePhysicalObj?.ToString(), out ulong freePhysical) ? freePhysical : 0; - osInfo.FreeVirtualMemoryKB = ulong.TryParse(freeVirtualTotalObj?.ToString(), out ulong freeVirtual) ? freeVirtual : 0; - osInfo.TotalVirtualMemorySizeKB = ulong.TryParse(totalVirtualObj?.ToString(), out ulong totalVirtual) ? totalVirtual : 0; + osInfo.Status = statusObj?.ToString(); + osInfo.Language = osLanguageObj?.ToString(); + osInfo.Version = versionObj?.ToString(); + osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(installDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(lastBootDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.FreePhysicalMemoryKB = ulong.TryParse(freePhysicalObj?.ToString(), out ulong freePhysical) ? freePhysical : 0; + osInfo.FreeVirtualMemoryKB = ulong.TryParse(freeVirtualTotalObj?.ToString(), out ulong freeVirtual) ? freeVirtual : 0; + osInfo.TotalVirtualMemorySizeKB = ulong.TryParse(totalVirtualObj?.ToString(), out ulong totalVirtual) ? totalVirtual : 0; osInfo.TotalVisibleMemorySizeKB = ulong.TryParse(totalVisibleObj?.ToString(), out ulong totalVisible) ? totalVisible : 0; - } + } } catch (ManagementException me) { @@ -215,7 +198,9 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) finally { results?.Dispose(); + results = null; win32OsInfo?.Dispose(); + win32OsInfo = null; } return Task.FromResult(osInfo); @@ -233,47 +218,6 @@ public override int GetTotalAllocatedFileHandlesCount() return -1; } - /// - /// Windows performance information. - /// - /// PerformanceInfoData structure. 
- public static NativeMethods.PerfomanceInfoData GetWindowsPerformanceInfo() - { - if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - throw new PlatformNotSupportedException("This only works on Windows."); - } - - NativeMethods.PerfomanceInfoData perfData = new NativeMethods.PerfomanceInfoData(); - - if (!NativeMethods.GetPerformanceInfo( - out NativeMethods.PsApiPerformanceInformation perfInfo, - (uint)Marshal.SizeOf(typeof(NativeMethods.PsApiPerformanceInformation)))) - { - throw new Win32Exception(Marshal.GetLastWin32Error()); - } - - perfData.CommitTotalPages = perfInfo.CommitTotal.ToInt64(); - perfData.CommitLimitPages = perfInfo.CommitLimit.ToInt64(); - perfData.CommitPeakPages = perfInfo.CommitPeak.ToInt64(); - - long pageSize = perfInfo.PageSize.ToInt64(); - perfData.PhysicalTotalBytes = perfInfo.PhysicalTotal.ToInt64() * pageSize; - perfData.PhysicalAvailableBytes = perfInfo.PhysicalAvailable.ToInt64() * pageSize; - perfData.InUse = perfData.PhysicalTotalBytes - perfData.PhysicalAvailableBytes; - perfData.SystemCacheBytes = perfInfo.SystemCache.ToInt64() * pageSize; - perfData.KernelTotalBytes = perfInfo.KernelTotal.ToInt64() * pageSize; - perfData.KernelPagedBytes = perfInfo.KernelPaged.ToInt64() * pageSize; - perfData.KernelNonPagedBytes = perfInfo.KernelNonPaged.ToInt64() * pageSize; - perfData.PageSizeBytes = pageSize; - - perfData.HandlesCount = perfInfo.HandlesCount; - perfData.ProcessCount = perfInfo.ProcessCount; - perfData.ThreadCount = perfInfo.ThreadCount; - - return perfData; - } - private int GetTcpPortCount(int processId = -1, bool ephemeral = false) { var tempLocalPortData = new List<(int Pid, int Port)>(); @@ -354,7 +298,7 @@ private int GetTcpPortCount(int processId = -1, bool ephemeral = false) { continue; } - + tempLocalPortData.Add((pid, localPort)); } @@ -429,5 +373,62 @@ private static (int, int) TupleGetLocalPortPidPairFromNetStatString(string netst return (-1, -1); } } + + public override (long TotalMemoryGb, long 
MemoryInUseMb, double PercentInUse) TupleGetMemoryInfo() + { + ManagementObjectSearcher win32OsInfo = null; + ManagementObjectCollection results = null; + + try + { + win32OsInfo = new ManagementObjectSearcher("SELECT TotalVisibleMemorySize, FreePhysicalMemory FROM Win32_OperatingSystem"); + results = win32OsInfo.Get(); + + using (ManagementObjectCollection.ManagementObjectEnumerator enumerator = results.GetEnumerator()) + { + while (enumerator.MoveNext()) + { + try + { + using (ManagementObject mObj = (ManagementObject)enumerator.Current) + { + object freePhysicalObj = mObj.Properties["FreePhysicalMemory"].Value; + object totalVisibleObj = mObj.Properties["TotalVisibleMemorySize"].Value; + ulong freePhysicalMemoryKB = ulong.TryParse(freePhysicalObj?.ToString(), out ulong freePhysical) ? freePhysical : 0; + ulong totalVisibleMemorySizeKB = ulong.TryParse(totalVisibleObj?.ToString(), out ulong totalVisible) ? totalVisible : 0; + + if (totalVisibleMemorySizeKB == 0) + { + return (0, 0, 0); + } + + ulong inUse = totalVisibleMemorySizeKB - freePhysicalMemoryKB; + double used = ((double)(totalVisibleMemorySizeKB - freePhysicalMemoryKB)) / totalVisibleMemorySizeKB; + double usedPct = used * 100; + + return ((long)totalVisibleMemorySizeKB / 1024 / 1024, (long)inUse / 1024, usedPct); + } + } + catch (ManagementException me) + { + Logger.LogInfo($"Handled ManagementException in GetOSInfoAsync retrieval:{Environment.NewLine}{me}"); + } + catch (Exception e) + { + Logger.LogInfo($"Bug? 
=> Exception in GetOSInfoAsync:{Environment.NewLine}{e}"); + } + } + } + } + finally + { + results?.Dispose(); + results = null; + win32OsInfo?.Dispose(); + win32OsInfo = null; + } + + return (0, 0, 0); + } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 635d2bea..2e493328 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -40,7 +40,7 @@ public override float GetProcessWorkingSetMb(int processId, bool getPrivateWorki return memoryCounters.WorkingSetSize.ToInt64() / 1024 / 1024; } } - catch(Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { Logger.LogWarning($"Exception getting working set for process {processId}:{Environment.NewLine}{e}"); return 0F; @@ -62,7 +62,7 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService using (var searcher = new ManagementObjectSearcher(query)) { var results = searcher.Get(); - + if (results.Count == 0) { return 0F; @@ -131,7 +131,7 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return childProcesses.Take(MaxDescendants).ToList(); } - for (int j = 0; j < c1.Count; ++j) + for (int j = 0; j < c1.Count; ++j) { List<(string ProcName, int Pid)> c2 = TupleGetChildProcessInfo(c1[j].Pid); diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 5f7e2380..6fef01be 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -508,7 +508,6 @@ 
public Task ReportMetricAsync( // Track the telemetry. telemetryClient.TrackMetric(mt); - return Task.CompletedTask; } } diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs index 5050f4ff..a9d09ca9 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -23,7 +23,6 @@ namespace FabricObserver.Observers.Utilities.Telemetry // LogAnalyticsTelemetry class is partially (SendTelemetryAsync/GetSignature) based on public sample: https://dejanstojanovic.net/aspnet/2018/february/send-data-to-azure-log-analytics-from-c-code/ public class LogAnalyticsTelemetry : ITelemetryProvider { - private const int MaxRetries = 5; private readonly FabricClient fabricClient; private readonly CancellationToken token; private readonly Logger logger; diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index eea7c03c..7c1502f0 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -4,7 +4,7 @@ Exe FabricObserver FabricObserver - netcoreapp3.1 + net5.0 x64 linux-x64;win-x64 3.1.17.0 - Copyright © 2020 + Copyright © 2021 FabricObserver Service Fabric Observer 3.1.17 diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index a09a62ed..050d8aae 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -1448,14 +1448,13 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } } - var replicasOrInstances = GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType).GetAwaiter().GetResult(); + var replicasOrInstances = await GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType); foreach 
(var rep in replicasOrInstances) { ReplicaOrInstanceList.Enqueue(rep); } - var targets = userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType != null) && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower())); diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 74e020f4..bad921b5 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -234,6 +234,7 @@ public override async Task ReportAsync(CancellationToken token) }; HasActiveFabricErrorOrWarning = true; + CurrentWarningCount++; if (IsTelemetryEnabled) { diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index 5189adbf..b0ae2d23 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -17,11 +17,14 @@ using FabricObserver.Observers.MachineInfoModel; using System.Fabric.Description; using System.Collections.Concurrent; +using System.ComponentModel; +using System.Fabric.Health; namespace FabricObserver.Observers { public class ContainerObserver : ObserverBase { + private const int MaxProcessExitWaitTimeMS = 60000; private ConcurrentQueue> allCpuDataPercentage; private ConcurrentQueue> allMemDataMB; @@ -30,7 +33,7 @@ public class ContainerObserver : ObserverBase // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. 
private ConcurrentQueue deployedTargetList; - private List ReplicaOrInstanceList; + private ConcurrentQueue ReplicaOrInstanceList; private readonly string ConfigPackagePath; private readonly object lockObj = new object(); private string ConfigurationFilePath = string.Empty; @@ -78,7 +81,7 @@ public override Task ReportAsync(CancellationToken token) _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { - var repOrInst = ReplicaOrInstanceList[i]; + var repOrInst = ReplicaOrInstanceList.ElementAt(i); ApplicationInfo app = deployedTargetList.First( a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); @@ -153,6 +156,8 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } + // Runs each time ObserveAsync is run to ensure that any new app targets and config changes will + // be up to date across observer loop iterations. // Runs each time ObserveAsync is run to ensure that any new app targets and config changes will // be up to date across observer loop iterations. 
private async Task InitializeAsync(CancellationToken token) @@ -167,7 +172,7 @@ private async Task InitializeAsync(CancellationToken token) userTargetList = new List(); deployedTargetList = new ConcurrentQueue(); - ReplicaOrInstanceList = new List(); + ReplicaOrInstanceList = new ConcurrentQueue(); using (Stream stream = new FileStream(ConfigurationFilePath, FileMode.Open, FileAccess.Read, FileShare.Read)) { @@ -226,7 +231,7 @@ private async Task InitializeAsync(CancellationToken token) foreach (var app in apps) { - token.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); if (app.ApplicationName.OriginalString == "fabric:/System") { @@ -246,7 +251,7 @@ private async Task InitializeAsync(CancellationToken token) if (userTargetList.Any(a => a.TargetApp == app.ApplicationName.OriginalString)) { - var existingAppConfig = userTargetList.FirstOrDefault(a => a.TargetApp == app.ApplicationName.OriginalString); + var existingAppConfig = userTargetList.Find(a => a.TargetApp == app.ApplicationName.OriginalString); if (existingAppConfig == null) { @@ -271,7 +276,7 @@ private async Task InitializeAsync(CancellationToken token) CpuWarningLimitPercent = application.CpuWarningLimitPercent, }; - userTargetList.Add(appConfig); + userTargetList.Add(appConfig); } } @@ -282,6 +287,7 @@ private async Task InitializeAsync(CancellationToken token) } int settingsFail = 0; + MonitoredAppCount = userTargetList.Count; foreach (var application in userTargetList) { @@ -324,7 +330,7 @@ private async Task InitializeAsync(CancellationToken token) null, ConfigurationSettings.AsyncTimeout, token), - token); + Token); if (codepackages.Count == 0) { @@ -347,14 +353,8 @@ private async Task InitializeAsync(CancellationToken token) } } - MonitoredAppCount = deployedTargetList.Count; MonitoredServiceProcessCount = ReplicaOrInstanceList.Count; - if (!EnableVerboseLogging) - { - return true; - } - foreach (var rep in ReplicaOrInstanceList) { token.ThrowIfCancellationRequested(); @@ 
-389,11 +389,12 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc { Token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList[i]; + var repOrInst = ReplicaOrInstanceList.ElementAt(i); string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; string containerId = string.Empty; + string error = string.Empty; if (!allCpuDataPercentage.Any(frud => frud.Id == cpuId)) { @@ -405,7 +406,6 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc allMemDataMB.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); } - var monitorTimer = Stopwatch.StartNew(); string args = "/c docker stats --no-stream --format \"table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\""; string filename = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe"; @@ -427,19 +427,101 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc WindowStyle = ProcessWindowStyle.Hidden, RedirectStandardInput = false, RedirectStandardOutput = true, - RedirectStandardError = false, + RedirectStandardError = true }; - using Process p = Process.Start(ps); - List output = new List(2); + using Process p = new Process(); + + // Capture any error information from docker. + p.ErrorDataReceived += (sender, e) => { error += e.Data; }; + p.StartInfo = ps; + _ = p.Start(); + var stdOutput = p.StandardOutput; + + // Start asynchronous read operation on error stream. 
+ p.BeginErrorReadLine(); + + List output = new List(); string l; while ((l = p.StandardOutput.ReadLine()) != null) { - lock (lockObj) + output.Add(l); + } + + if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) + { + try + { + p?.Kill(); + } + catch (Exception e) when (e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) + { + + } + + ObserverLogger.LogWarning($"docker process has run too long ({MaxProcessExitWaitTimeMS} ms). Aborting."); + state.Stop(); + } + + int exitStatus = p.ExitCode; + stdOutput.Close(); + + // Was there an error running docker stats? + if (exitStatus != 0) + { + string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + msg += " NOTE: You must run FabricObserver as System user or Admin user on Windows " + + "in order for ContainerObserver to function correctly on Windows."; + } + + ObserverLogger.LogWarning(msg); + + var healthReport = new Utilities.HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = $"{msg}", + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = "docker_stats_failure", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); + + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) { - output.Add(l); + _ = TelemetryClient?.ReportHealthAsync( + "docker_stats_failure", + HealthState.Warning, + msg, + ObserverName, + Token); } + + // ETW. 
+ if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = "docker_stats_failure", + Level = "Warning", + Message = msg, + ObserverName + }); + } + + state.Stop(); } foreach (string line in output) @@ -525,7 +607,7 @@ private async Task SetInstanceOrReplicaMonitoringList( switch (deployedReplica) { - case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary || statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary: + case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary: replicaInfo = new ReplicaOrInstanceMonitoringInfo() { @@ -583,7 +665,7 @@ private async Task SetInstanceOrReplicaMonitoringList( continue; } - ReplicaOrInstanceList.Add(replicaInfo); + ReplicaOrInstanceList.Enqueue(replicaInfo); } } diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 869ea274..bbfd2487 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -362,6 +362,11 @@ public override Task ReportAsync(CancellationToken token) /// private void ReadServiceFabricWindowsEventLog() { + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + return; + } + string sfOperationalLogSource = "Microsoft-ServiceFabric/Operational"; string sfAdminLogSource = "Microsoft-ServiceFabric/Admin"; string systemLogSource = "System"; diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 47f20827..59f90efb 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -192,12 +192,12 @@ public override Task ReportAsync(CancellationToken token) "Average", Math.Round(MemDataInUse.AverageDataValue, 1)); - CsvFileLogger.LogData( - fileName, - NodeName, - "Committed Memory (MB)", - "Peak", - Math.Round(MemDataInUse.MaxDataValue)); 
+ CsvFileLogger.LogData( + fileName, + NodeName, + "Committed Memory (MB)", + "Peak", + Math.Round(MemDataInUse.MaxDataValue)); } // % of Total @@ -365,11 +365,11 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) - { - ObserverLogger.LogWarning($"Unhandled exception re-thrown:{Environment.NewLine}{e}"); - + { + ObserverLogger.LogWarning($"Unhandled exception re-thrown:{Environment.NewLine}{e}"); + // Fix the bug.. - throw; + throw; } } @@ -680,7 +680,7 @@ error on these conditions. } timer.Start(); - + while (timer.Elapsed <= duration) { token.ThrowIfCancellationRequested(); @@ -707,24 +707,25 @@ error on these conditions. } } - await Task.Delay(150, Token).ConfigureAwait(true); + await Task.Delay(250, Token).ConfigureAwait(true); } timer.Stop(); timer.Reset(); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) - { - ObserverLogger.LogWarning($"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); - + { + ObserverLogger.LogWarning($"Unhandled exception in GetSystemCpuMemoryValuesAsync:{Environment.NewLine}{e}"); + // Fix the bug.. 
- throw; + throw; } finally { if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { CpuUtilizationProvider.Instance?.Dispose(); + CpuUtilizationProvider.Instance = null; } } } diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 37d6c223..8866d923 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -72,7 +72,7 @@ public override async Task ObserveAsync(CancellationToken token) if (IsAUCheckSettingEnabled) { - await CheckWuAutoDownloadEnabledAsync(token).ConfigureAwait(true); + await CheckWuAutoDownloadEnabledAsync(token).ConfigureAwait(true); } } @@ -304,7 +304,7 @@ private async Task> GetInfrastructureServiceInstancesAsync( { var allSystemServices = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( - () => + () => FabricClientInstance.QueryManager.GetServiceListAsync( new Uri("fabric:/System"), null, @@ -322,6 +322,11 @@ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( private static string GetWindowsHotFixes(bool generateKbUrl, CancellationToken token) { + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + return null; + } + ManagementObject[] resultsOrdered; string ret = string.Empty; @@ -331,15 +336,19 @@ private static string GetWindowsHotFixes(bool generateKbUrl, CancellationToken t { using var searcher = new ManagementObjectSearcher("SELECT HotFixID,InstalledOn FROM Win32_QuickFixEngineering"); var results = searcher.Get(); - + if (results.Count < 1) { return string.Empty; } resultsOrdered = results.Cast() +#pragma warning disable CA1416 // Validate platform compatibility .Where(obj => obj["InstalledOn"] != null && obj["InstalledOn"].ToString() != string.Empty) +#pragma warning restore CA1416 // Validate platform compatibility +#pragma warning disable CA1416 // Validate platform compatibility .OrderByDescending(obj => DateTime.Parse(obj["InstalledOn"].ToString() ?? 
string.Empty)).ToArray(); +#pragma warning restore CA1416 // Validate platform compatibility var sb = new StringBuilder(); var baseUrl = "https://support.microsoft.com/help/"; @@ -371,7 +380,7 @@ private static string GetWindowsHotFixes(bool generateKbUrl, CancellationToken t ret = sb.ToString().Trim(); _ = sb.Clear(); sb = null; - + } catch (Exception e) when ( e is ArgumentException || diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 789a6352..c470c1c2 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -128,15 +128,15 @@ private int MaxArchivedLogFileLifetimeDays get; } - public DateTime LastTelemetrySendDate - { - get; private set; + public DateTime LastTelemetrySendDate + { + get; private set; } public TimeSpan OperationalTelemetryRunInterval { get; private set; - } = TimeSpan.FromHours(4); + } = TimeSpan.FromHours(8); /// /// This is for observers that support parallelized monitor loops. @@ -147,9 +147,9 @@ public static ParallelOptions ParallelOptions get; set; } - public static bool EnableConcurrentExecution + public static bool EnableConcurrentExecution { - get; set; + get; set; } /// @@ -188,7 +188,7 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie linkedSFRuntimeObserverTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cts.Token, this.token); FabricClientInstance = fabricClient; FabricServiceContext = serviceProvider.GetRequiredService(); - nodeName = FabricServiceContext?.NodeContext.NodeName; + nodeName = FabricServiceContext.NodeContext.NodeName; FabricServiceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent += CodePackageActivationContext_ConfigurationPackageModifiedEvent; // Observer Logger setup. 
@@ -274,10 +274,11 @@ public async Task StartObserversAsync() FabricClientInstance, FabricServiceContext, ServiceEventSource.Current, - this.token); + token, + EtwEnabled); var foData = GetFabricObserverInternalTelemetryData(); - + if (foData != null) { string filepath = Path.Combine(Logger.LogFolderBasePath, $"fo_operational_telemetry.log"); @@ -309,13 +310,6 @@ make that connection. You should generally not have to call GC.Collect from user GC.Collect(0, GCCollectionMode.Forced, true, false); GC.Collect(1, GCCollectionMode.Forced, true, false); - // LOH - if (EnableConcurrentExecution) - { - GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; - GC.Collect(2, GCCollectionMode.Forced, true, true); - } - if (ObserverExecutionLoopSleepSeconds > 0) { await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds), token); @@ -383,7 +377,8 @@ make that connection. You should generally not have to call GC.Collect from user FabricClientInstance, FabricServiceContext, ServiceEventSource.Current, - token); + token, + EtwEnabled); var foData = new FabricObserverCriticalErrorEventData { @@ -426,7 +421,7 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf if (isConfigurationUpdateLinux) { - configUpdateLinux = + configUpdateLinux = $" Note: This is due to a configuration update which requires an FO process restart on Linux (with UD walk (one by one) and safety checks).{Environment.NewLine}" + "The reason FO needs to be restarted as part of a parameter-only upgrade is due to the Linux Capabilities set FO employs not persisting across application upgrades (by design) " + "even when the upgrade is just a configuration parameter update. In order to re-create the Capabilities set, FO's setup script must be re-run by SF. 
Restarting FO is therefore required here."; @@ -457,7 +452,7 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf if (isConfigurationUpdateInProgress) { fabricObserverAppHealthEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) - && s.HealthInformation.HealthState == HealthState.Warning + && s.HealthInformation.HealthState == HealthState.Warning || s.HealthInformation.HealthState == HealthState.Error); } @@ -507,7 +502,7 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf await Task.Delay(250).ConfigureAwait(true); } - + } catch (FabricException) { @@ -542,7 +537,7 @@ private void Dispose(bool disposing) } if (disposing) - { + { FabricClientInstance?.Dispose(); FabricClientInstance = null; linkedSFRuntimeObserverTokenSource?.Dispose(); @@ -624,17 +619,40 @@ private FabricObserverOperationalEventData GetFabricObserverInternalTelemetryDat try { + // plugins + bool hasPlugins = false; + string pluginsDir = Path.Combine(FabricServiceContext.CodePackageActivationContext.GetDataPackageObject("Data").Path, "Plugins"); + + if (!Directory.Exists(pluginsDir)) + { + hasPlugins = false; + } + else + { + try + { + string[] pluginDlls = Directory.GetFiles(pluginsDir, "*.dll", SearchOption.AllDirectories); + hasPlugins = pluginDlls.Length > 0; + } + catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException || e is PathTooLongException) + { + + } + } + telemetryData = new FabricObserverOperationalEventData { UpTime = DateTime.UtcNow.Subtract(StartDateTime).ToString(), Version = InternalVersionNumber, EnabledObserverCount = observers.Count(obs => obs.IsEnabled), + HasPlugins = hasPlugins, + ParallelExecutionEnabled = EnableConcurrentExecution, ObserverData = GetObserverData(), }; } catch (Exception e) when (e is ArgumentException) { - + } return telemetryData; @@ -644,9 +662,28 @@ private List GetObserverData() { var 
observerData = new List(); var enabledObs = observers.Where(o => o.IsEnabled); + string[] builtInObservers = new string[] + { + ObserverConstants.AppObserverName, + ObserverConstants.AzureStorageUploadObserverName, + ObserverConstants.CertificateObserverName, + ObserverConstants.ContainerObserverName, + ObserverConstants.DiskObserverName, + ObserverConstants.FabricSystemObserverName, + ObserverConstants.NetworkObserverName, + ObserverConstants.NodeObserverName, + ObserverConstants.OSObserverName, + ObserverConstants.SFConfigurationObserverName + }; foreach (var obs in enabledObs) { + // We don't need to have any information about plugins besides whether or not there are any. + if (!builtInObservers.Any(o => o == obs.ObserverName)) + { + continue; + } + // These built-in (non-plugin) observers monitor apps and/or services. if (obs.ObserverName == ObserverConstants.AppObserverName || obs.ObserverName == ObserverConstants.ContainerObserverName || @@ -769,7 +806,7 @@ private async void CodePackageActivationContext_ConfigurationPackageModifiedEven private void SetPropertiesFromConfigurationParameters(ConfigurationSettings settings = null) { ApplicationName = FabricServiceContext.CodePackageActivationContext.ApplicationName; - + // Parallelization settings for capable hardware. 
\\ if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableConcurrentExecution, settings), out bool enableConcurrency)) @@ -829,7 +866,7 @@ private void SetPropertiesFromConfigurationParameters(ConfigurationSettings sett // ObserverFailure HealthState Level - Override \\ string state = GetConfigSettingValue(ObserverConstants.ObserverFailureHealthStateLevelParameter, settings); - + if (string.IsNullOrWhiteSpace(state) || state?.ToLower() == "none") { ObserverFailureHealthStateLevel = HealthState.Unknown; @@ -869,40 +906,40 @@ private void SetPropertiesFromConfigurationParameters(ConfigurationSettings sett switch (telemetryProvider) { case TelemetryProviderType.AzureLogAnalytics: - { - string logAnalyticsLogType = GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter, settings); - string logAnalyticsSharedKey = GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter, settings); - string logAnalyticsWorkspaceId = GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter, settings); - - if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) { - TelemetryEnabled = false; - return; - } + string logAnalyticsLogType = GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter, settings); + string logAnalyticsSharedKey = GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter, settings); + string logAnalyticsWorkspaceId = GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter, settings); - TelemetryClient = new LogAnalyticsTelemetry( - logAnalyticsWorkspaceId, - logAnalyticsSharedKey, - logAnalyticsLogType, - FabricClientInstance, - token); + if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) + { + TelemetryEnabled = false; + return; + } - break; - } + TelemetryClient = new LogAnalyticsTelemetry( + logAnalyticsWorkspaceId, + logAnalyticsSharedKey, + logAnalyticsLogType, + 
FabricClientInstance, + token); - case TelemetryProviderType.AzureApplicationInsights: - { - string aiKey = GetConfigSettingValue(ObserverConstants.AiKey, settings); + break; + } - if (string.IsNullOrEmpty(aiKey)) + case TelemetryProviderType.AzureApplicationInsights: { - TelemetryEnabled = false; - return; - } + string aiKey = GetConfigSettingValue(ObserverConstants.AiKey, settings); - TelemetryClient = new AppInsightsTelemetry(aiKey); - break; - } + if (string.IsNullOrEmpty(aiKey)) + { + TelemetryEnabled = false; + return; + } + + TelemetryClient = new AppInsightsTelemetry(aiKey); + break; + } default: TelemetryEnabled = false; @@ -919,7 +956,7 @@ private void SetPropertiesFromConfigurationParameters(ConfigurationSettings sett private void SignalAbortToRunningObserver() { Logger.LogInfo("Signalling task cancellation to currently running Observer."); - + try { cts?.Cancel(); @@ -928,7 +965,7 @@ private void SignalAbortToRunningObserver() { } - + Logger.LogInfo("Successfully signaled cancellation to currently running Observer."); } @@ -941,7 +978,7 @@ private async Task RunObserversAsync() var exceptionBuilder = new StringBuilder(); bool allExecuted = true; - for (int i = 0; i < observers.Count(); ++i) + for (int i = 0; i < observers.Count; ++i) { var observer = observers[i]; @@ -964,7 +1001,6 @@ private async Task RunObserversAsync() } Logger.LogInfo($"Starting {observer.ObserverName}"); - IsObserverRunning = true; // Synchronous call. @@ -974,7 +1010,7 @@ private async Task RunObserversAsync() // Currently, this observer will not run again for the lifetime of this FO service instance. if (!isCompleted && !(TaskCancelled || shutdownSignaled)) { - string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified Maximum run time of {ObserverExecutionTimeout.TotalSeconds} seconds. 
" + + string observerHealthWarning = $"{observer.ObserverName} on node {nodeName} has exceeded its specified Maximum run time of {ObserverExecutionTimeout.TotalSeconds} seconds. " + $"This means something is wrong with {observer.ObserverName}. It will not be run again. Please look into it."; Logger.LogError(observerHealthWarning); @@ -1045,7 +1081,6 @@ private async Task RunObserversAsync() if (observer.HasActiveFabricErrorOrWarning) { var errWarnMsg = !string.IsNullOrEmpty(Fqdn) ? $"One or more errors or warnings detected." : $"One or more errors or warnings detected. Check {observer.ObserverName} logs for details."; - Logger.LogWarning($"{observer.ObserverName}: " + errWarnMsg); } else @@ -1074,16 +1109,17 @@ await File.WriteAllLinesAsync( { IsObserverRunning = false; - if (ex.InnerExceptions != null && - (ex.InnerExceptions.Any(e => e is FabricException) || - ex.InnerExceptions.Any(e => e is OperationCanceledException) || - ex.InnerExceptions.Any(e => e is TaskCanceledException))) + if (ex.InnerException is FabricException || + ex.InnerException is OperationCanceledException || + ex.InnerException is TaskCanceledException) { if (isConfigurationUpdateInProgress) { IsObserverRunning = false; + return true; } + continue; } diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 5cb77d18..fe9ce638 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -40,7 +40,8 @@ + **NOTE: For Linux runtime target, just supply the directory name(s)/path (not a path with drive letter like you would do for Windows). + The below (default) works on both platforms: on Windows it will be C:\observer_logs. 
On Linux the folder will live in the deployed code package directory.** --> - --> + - --> + diff --git a/FabricObserverTests/FabricObserverTests.csproj b/FabricObserverTests/FabricObserverTests.csproj index a58b85e1..c89b6ee7 100644 --- a/FabricObserverTests/FabricObserverTests.csproj +++ b/FabricObserverTests/FabricObserverTests.csproj @@ -3,7 +3,7 @@ false {48C88BEB-9960-4183-861B-DF25C193E4C9} FabricObserverTests - netcoreapp3.1 + net5.0 x64 CS0414 1.0.0.0 diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 186acc35..82739950 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Fabric; @@ -282,8 +283,7 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() using var obs = new AppObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), - ReplicaOrInstanceList = new List() + ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json") }; await obs.ObserveAsync(token); @@ -326,8 +326,7 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is using var obs = new AppObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.oldstyle.json"), - ReplicaOrInstanceList = new List() + ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.oldstyle.json") }; await obs.ObserveAsync(token); diff --git a/TelemetryLib/FabricObserverOperationalEventData.cs b/TelemetryLib/FabricObserverOperationalEventData.cs index 05cf2bd9..8c0d8531 100644 --- 
a/TelemetryLib/FabricObserverOperationalEventData.cs +++ b/TelemetryLib/FabricObserverOperationalEventData.cs @@ -11,24 +11,34 @@ namespace FabricObserver.TelemetryLib { public class FabricObserverOperationalEventData { - public string UpTime - { - get; set; + public string UpTime + { + get; set; } - public string Version - { - get; set; + public string Version + { + get; set; } - public int EnabledObserverCount - { - get; set; + public int EnabledObserverCount + { + get; set; } - public List ObserverData - { - get; set; + public bool HasPlugins + { + get; set; + } + + public bool ParallelExecutionEnabled + { + get; set; + } + + public List ObserverData + { + get; set; } public string OS => RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux"; diff --git a/TelemetryLib/TelemetryEvents.cs b/TelemetryLib/TelemetryEvents.cs index 4506a186..3b795f48 100644 --- a/TelemetryLib/TelemetryEvents.cs +++ b/TelemetryLib/TelemetryEvents.cs @@ -29,26 +29,26 @@ public class TelemetryEvents : IDisposable private readonly ITelemetryEventSource serviceEventSource; private readonly string clusterId, tenantId, clusterType; private readonly TelemetryConfiguration appInsightsTelemetryConf; + private readonly bool isEtwEnabled; public TelemetryEvents( FabricClient fabricClient, ServiceContext context, ITelemetryEventSource eventSource, - CancellationToken token) + CancellationToken token, + bool etwEnabled) { serviceEventSource = eventSource; serviceContext = context; string config = File.ReadAllText(Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), "FOAppInsightsOperational.config")); appInsightsTelemetryConf = TelemetryConfiguration.CreateFromConfiguration(config); appInsightsTelemetryConf.InstrumentationKey = TelemetryConstants.AIKey; - telemetryClient = new TelemetryClient(appInsightsTelemetryConf) - { - InstrumentationKey = TelemetryConstants.AIKey - }; + telemetryClient = new TelemetryClient(appInsightsTelemetryConf); var (ClusterId, 
TenantId, ClusterType) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).GetAwaiter().GetResult(); clusterId = ClusterId; tenantId = TenantId; clusterType = ClusterType; + isEtwEnabled = etwEnabled; } public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventData foData, TimeSpan runInterval, string logFilePath) @@ -61,7 +61,10 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat try { // ETW - serviceEventSource.InternalFODataEvent(new { FOInternalTelemtryData = JsonConvert.SerializeObject(foData) }); + if (isEtwEnabled) + { + serviceEventSource.InternalFODataEvent(new { FOInternalTelemtryData = JsonConvert.SerializeObject(foData) }); + } string nodeHashString = string.Empty; int nodeNameHash = serviceContext?.NodeContext.NodeName.GetHashCode() ?? -1; @@ -78,17 +81,26 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat { "EventRunInterval", runInterval.ToString() }, { "ClusterId", clusterId }, { "ClusterType", clusterType }, - { "TenantId", tenantId }, { "NodeNameHash", nodeHashString }, { "FOVersion", foData.Version }, + { "HasPlugins", foData.HasPlugins.ToString() }, + { "ParallelExecution", foData.ParallelExecutionEnabled.ToString() }, { "UpTime", foData.UpTime }, { "Timestamp", DateTime.UtcNow.ToString("o") }, { "OS", foData.OS } }; + if (eventProperties.TryGetValue("ClusterType", out string clustType)) + { + if (clustType != TelemetryConstants.ClusterTypeSfrp) + { + eventProperties.Add("TenantId", tenantId); + } + } + IDictionary metrics = new Dictionary { - { "EnabledObserversCount", foData.EnabledObserverCount } + { "EnabledObserverCount", foData.EnabledObserverCount } }; const string err = "ErrorDetections"; @@ -104,7 +116,7 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat // These observers monitor app services/containers. 
if (obData.ObserverName.Contains("AppObserver") || obData.ObserverName.Contains("FabricSystemObserver") - || obData.ObserverName.Contains("NetworkObserver") || obData.ObserverName.Contains("ContainerObserver")) + || obData.ObserverName.Contains("NetworkObserver") || obData.ObserverName.Contains("ContainerObserver")) { // App count. data = ((AppServiceObserverData)obData).MonitoredAppCount; @@ -180,7 +192,10 @@ public bool EmitFabricObserverCriticalErrorEvent(FabricObserverCriticalErrorEven try { // ETW - serviceEventSource.InternalFOCriticalErrorDataEvent(new { FOCriticalErrorData = JsonConvert.SerializeObject(foErrorData) }); + if (isEtwEnabled) + { + serviceEventSource.InternalFOCriticalErrorDataEvent(new { FOCriticalErrorData = JsonConvert.SerializeObject(foErrorData) }); + } string nodeHashString = string.Empty; int nodeNameHash = serviceContext?.NodeContext.NodeName.GetHashCode() ?? -1; From 0baf5caf9e17a49e1711e0b6c9cb778e369ca250 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 20 Sep 2021 14:48:25 -0700 Subject: [PATCH 04/35] Latest (update to ContainerObs, Logger, Tests). 
--- ClusterObserver/ClusterObserver.cs | 12 +- .../Interfaces/IObserverLogger.cs | 2 + FabricObserver.Extensibility/ObserverBase.cs | 2 +- .../Utilities/DataTableFileLogger.cs | 2 +- .../Utilities/Logger.cs | 6 +- FabricObserver/Observers/AppObserver.cs | 17 +- FabricObserver/Observers/ContainerObserver.cs | 316 +++++++++--------- FabricObserver/Observers/ObserverManager.cs | 71 ++-- .../PackageRoot/Config/Settings.xml | 25 +- .../FabricObserverTests.csproj | 3 + FabricObserverTests/ObserverTest.cs | 60 ++++ .../Config/ContainerObserver.config.json | 7 + .../SampleObserverPlugin.csproj | 2 +- TelemetryLib/TelemetryEvents.cs | 11 +- 14 files changed, 308 insertions(+), 228 deletions(-) create mode 100644 FabricObserverTests/PackageRoot/Config/ContainerObserver.config.json diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index bde01b26..f0aaea63 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -215,6 +215,12 @@ private async Task ReportClusterHealthAsync(CancellationToken token) } else { + // Cluster is healthy. Don't do anything. + if (clusterHealth.AggregatedHealthState == HealthState.Ok) + { + return; + } + // If in Warning and you are not sending Warning state reports, then end here. if (!ConfigSettings.EmitWarningDetails && clusterHealth.AggregatedHealthState == HealthState.Warning) { @@ -229,12 +235,6 @@ private async Task ReportClusterHealthAsync(CancellationToken token) return; } - // Cluster is healthy. Don't do anything. 
- if (clusterHealth.AggregatedHealthState == HealthState.Ok) - { - return; - } - foreach (var evaluation in unhealthyEvaluations) { token.ThrowIfCancellationRequested(); diff --git a/FabricObserver.Extensibility/Interfaces/IObserverLogger.cs b/FabricObserver.Extensibility/Interfaces/IObserverLogger.cs index 317754b5..1cfe9a96 100644 --- a/FabricObserver.Extensibility/Interfaces/IObserverLogger.cs +++ b/FabricObserver.Extensibility/Interfaces/IObserverLogger.cs @@ -3,6 +3,8 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using NLog; + namespace FabricObserver.Observers.Interfaces { public interface IObserverLogger diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index cc853bc4..7b1c2d48 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -31,7 +31,7 @@ public abstract class ObserverBase : IObserver private const string FabricSystemAppName = "fabric:/System"; private bool disposed; private Dictionary ServiceDumpCountDictionary; - private object lockObj = new object(); + private readonly object lockObj = new object(); // Process dump settings. Only AppObserver and Windows is supported. 
\\ public string DumpsPath diff --git a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs index a7acdea6..714c4f05 100644 --- a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs +++ b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs @@ -109,7 +109,7 @@ public void ConfigureLogger(string filename) FileTarget dataLog = (FileTarget)LogManager.Configuration.FindTargetByName("AvgTargetDataStore"); dataLog.FileName = csvPath; dataLog.AutoFlush = true; - dataLog.ConcurrentWrites = false; + dataLog.ConcurrentWrites = true; dataLog.EnableFileDelete = true; dataLog.AutoFlush = true; dataLog.CreateDirs = true; diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index af1a9f5f..7d757676 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -23,10 +23,6 @@ namespace FabricObserver.Observers.Utilities public sealed class Logger : IObserverLogger { private const int Retries = 5; - - // This needs to be static to prevent internal EventSource instantiation errors. - //private static EventSource etwLogger; - private readonly string loggerName; // Text file logger for observers - info/warn/error. 
@@ -284,7 +280,7 @@ private void InitializeLoggers() { Name = targetName, OptimizeBufferReuse = true, - ConcurrentWrites = false, + ConcurrentWrites = true, EnableFileDelete = true, FileName = file, Layout = "${longdate}--${uppercase:${level}}--${message}", diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 050d8aae..301ff260 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -123,7 +123,7 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { - if (deployedTargetList.Count == 0) + if (deployedTargetList.IsEmpty) { return Task.CompletedTask; } @@ -223,7 +223,6 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { ProcessChildProcs(AllAppMemDataMb, childProcessTelemetryDataList, repOrInst, app, parentFrud, token); - } ProcessResourceDataReportHealth( @@ -984,6 +983,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) if (application?.TargetApp == null && application?.TargetAppType == null) { + // return in a parallel loop is equivalent to a standard loop's continue. return; } @@ -1173,11 +1173,11 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) { int procId = procList.ElementAt(j).Pid; string procName = procList.ElementAt(j).procName; - TimeSpan duration = TimeSpan.FromSeconds(1); + TimeSpan maxDuration = TimeSpan.FromSeconds(1); if (MonitorDuration > TimeSpan.MinValue) { - duration = MonitorDuration; + maxDuration = MonitorDuration; } // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. @@ -1209,7 +1209,6 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) } else { - // Do NOT do this... 
if (!AllAppHandlesData.Any(x => x?.Id == $"{id}:{procName}")) { AllAppHandlesData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); @@ -1258,7 +1257,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) // Monitor Duration applies to the code below. timer.Start(); - while (timer.Elapsed.Seconds <= duration.Seconds) + while (timer.Elapsed <= maxDuration) { token.ThrowIfCancellationRequested(); @@ -1359,13 +1358,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) } }); - if (exceptions.Count > 0) + if (!exceptions.IsEmpty) { var aggEx = new AggregateException(exceptions); ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{aggEx}"); throw new AggregateException(aggEx); } - +#if DEBUG + ObserverLogger.LogInfo($"MonitorDeployedAppsAsync execution time: {stopwatch?.Elapsed}"); +#endif return Task.CompletedTask; } diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index b0ae2d23..4144c4ef 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -36,7 +36,8 @@ public class ContainerObserver : ObserverBase private ConcurrentQueue ReplicaOrInstanceList; private readonly string ConfigPackagePath; private readonly object lockObj = new object(); - private string ConfigurationFilePath = string.Empty; + private Stopwatch runDurationTimer; + public string ConfigurationFilePath = string.Empty; public ContainerObserver(FabricClient fabricClient, StatelessServiceContext context) : base(fabricClient, context) @@ -55,24 +56,30 @@ public override async Task ObserveAsync(CancellationToken token) return; } - var runDurationTimer = Stopwatch.StartNew(); + runDurationTimer = Stopwatch.StartNew(); if (!await InitializeAsync(token).ConfigureAwait(false)) { return; } + Token = token; MonitorContainers(); await ReportAsync(token); 
CleanUp(); runDurationTimer.Stop(); RunDuration = runDurationTimer.Elapsed; + if (EnableVerboseLogging) + { + ObserverLogger.LogInfo($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? "with" : "without")} " + + $"Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); + } LastRunDateTime = DateTime.Now; } public override Task ReportAsync(CancellationToken token) { - if (deployedTargetList.Count == 0) + if (deployedTargetList.IsEmpty) { return Task.CompletedTask; } @@ -81,6 +88,8 @@ public override Task ReportAsync(CancellationToken token) _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { + token.ThrowIfCancellationRequested(); + var repOrInst = ReplicaOrInstanceList.ElementAt(i); ApplicationInfo app = deployedTargetList.First( a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || @@ -162,9 +171,7 @@ public override Task ReportAsync(CancellationToken token) // be up to date across observer loop iterations. private async Task InitializeAsync(CancellationToken token) { - SetConfigurationFilePath(); - - if (!File.Exists(ConfigurationFilePath)) + if (!SetConfigurationFilePath()) { ObserverLogger.LogWarning($"Will not observe container resource consumption as no configuration file has been supplied."); return false; @@ -202,10 +209,10 @@ private async Task InitializeAsync(CancellationToken token) var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token), - Token); + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. 
// In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports @@ -216,22 +223,22 @@ private async Task InitializeAsync(CancellationToken token) // Check that it is not null, and make a new query passing back the token it gave you. while (appList.ContinuationToken != null) { - Token.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( - deployedAppQueryDesc, - ConfigurationSettings.AsyncTimeout, - Token), + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), Token); apps.AddRange(appList.ToList()); - await Task.Delay(250, Token).ConfigureAwait(false); + await Task.Delay(250, Token); } foreach (var app in apps) { - Token.ThrowIfCancellationRequested(); + token.ThrowIfCancellationRequested(); if (app.ApplicationName.OriginalString == "fabric:/System") { @@ -289,21 +296,23 @@ private async Task InitializeAsync(CancellationToken token) int settingsFail = 0; MonitoredAppCount = userTargetList.Count; - foreach (var application in userTargetList) + _ = Parallel.For(0, MonitoredAppCount, async (i, state) => { token.ThrowIfCancellationRequested(); + var application = userTargetList.ElementAt(i); + if (string.IsNullOrWhiteSpace(application.TargetApp)) { ObserverLogger.LogWarning($"InitializeAsync: Required setting, targetApp, is not set."); settingsFail++; - continue; + return; } // No required settings for supplied application(s). 
if (settingsFail == userTargetList.Count) { - return false; + state.Stop(); } ServiceFilterType filterType = ServiceFilterType.None; @@ -324,24 +333,24 @@ private async Task InitializeAsync(CancellationToken token) { var codepackages = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => FabricClientInstance.QueryManager.GetDeployedCodePackageListAsync( - NodeName, - new Uri(application.TargetApp), - null, - null, - ConfigurationSettings.AsyncTimeout, - token), + NodeName, + new Uri(application.TargetApp), + null, + null, + ConfigurationSettings.AsyncTimeout, + token), Token); if (codepackages.Count == 0) { - continue; + return; } int containerHostCount = codepackages.Count(c => c.HostType == HostType.ContainerHost); if (containerHostCount == 0) { - continue; + return; } deployedTargetList.Enqueue(application); @@ -351,7 +360,7 @@ private async Task InitializeAsync(CancellationToken token) { ObserverLogger.LogInfo($"Handled Exception in function InitializeAsync:{e.GetType().Name}."); } - } + }); MonitoredServiceProcessCount = ReplicaOrInstanceList.Count; @@ -385,143 +394,134 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc try { - _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => + string args = "/c docker stats --no-stream --format \"table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\""; + string filename = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe"; + string error = string.Empty; + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) { - Token.ThrowIfCancellationRequested(); + args = string.Empty; - var repOrInst = ReplicaOrInstanceList.ElementAt(i); - string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); - string cpuId = $"{serviceName}_cpu"; - string memId = $"{serviceName}_mem"; - string containerId = string.Empty; - string error = string.Empty; 
+ // We need the full path to the currently deployed FO CodePackage, which is where our + // linux Capabilities-laced proxy binary lives, which is used for elevated_docker_stats call. + string path = FabricServiceContext.CodePackageActivationContext.GetCodePackageObject("Code").Path; + filename = $"{path}/elevated_docker_stats"; + } - if (!allCpuDataPercentage.Any(frud => frud.Id == cpuId)) - { - allCpuDataPercentage.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, 1, false)); - } + var ps = new ProcessStartInfo + { + Arguments = args, + FileName = filename, + UseShellExecute = false, + WindowStyle = ProcessWindowStyle.Hidden, + RedirectStandardInput = false, + RedirectStandardOutput = true, + RedirectStandardError = true + }; - if (!allMemDataMB.Any(frud => frud.Id == memId)) - { - allMemDataMB.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); - } + var output = new List(); + using Process p = new Process(); + p.ErrorDataReceived += (sender, e) => { error += e.Data; }; + p.OutputDataReceived += (sender, e) => { output.Add(e.Data); }; + p.StartInfo = ps; + _ = p.Start(); - string args = "/c docker stats --no-stream --format \"table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\""; - string filename = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe"; + // Start async reads. + p.BeginErrorReadLine(); + p.BeginOutputReadLine(); - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + // It should not take 60 seconds for the process that calls docker stats to exit. + // If so, then end execution of the outer loop: stop monitoring for this run of ContainerObserver. + if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) + { + try { - args = string.Empty; - - // We need the full path to the currently deployed FO CodePackage, which is where our - // linux Capabilities-laced proxy binary lives, which is used for elevated_docker_stats call. 
- string path = FabricServiceContext.CodePackageActivationContext.GetCodePackageObject("Code").Path; - filename = $"{path}/elevated_docker_stats"; + p?.Kill(true); } - - var ps = new ProcessStartInfo + catch (Exception e) when (e is AggregateException || e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) { - Arguments = args, - FileName = filename, - UseShellExecute = false, - WindowStyle = ProcessWindowStyle.Hidden, - RedirectStandardInput = false, - RedirectStandardOutput = true, - RedirectStandardError = true - }; - using Process p = new Process(); + } - // Capture any error information from docker. - p.ErrorDataReceived += (sender, e) => { error += e.Data; }; - p.StartInfo = ps; - _ = p.Start(); - var stdOutput = p.StandardOutput; + ObserverLogger.LogWarning($"docker process has run too long ({MaxProcessExitWaitTimeMS} ms). Aborting."); + return; + } - // Start asynchronous read operation on error stream. - p.BeginErrorReadLine(); + int exitStatus = p.ExitCode; - List output = new List(); - string l; + // Was there an error running docker stats? 
+ if (exitStatus != 0) + { + string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; - while ((l = p.StandardOutput.ReadLine()) != null) + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - output.Add(l); + msg += " NOTE: You must run FabricObserver as System user or Admin user on Windows " + + "in order for ContainerObserver to function correctly on Windows."; } - if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) + ObserverLogger.LogWarning(msg); + + var healthReport = new Utilities.HealthReport { - try - { - p?.Kill(); - } - catch (Exception e) when (e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) - { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = $"{msg}", + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = "docker_stats_failure", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverName, + }; - } + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); - ObserverLogger.LogWarning($"docker process has run too long ({MaxProcessExitWaitTimeMS} ms). Aborting."); - state.Stop(); + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + "docker_stats_failure", + HealthState.Warning, + msg, + ObserverName, + Token); } - int exitStatus = p.ExitCode; - stdOutput.Close(); - - // Was there an error running docker stats? - if (exitStatus != 0) + // ETW. 
+ if (IsEtwEnabled) { - string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - msg += " NOTE: You must run FabricObserver as System user or Admin user on Windows " + - "in order for ContainerObserver to function correctly on Windows."; - } - - ObserverLogger.LogWarning(msg); - - var healthReport = new Utilities.HealthReport - { - AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), - EmitLogEvent = EnableVerboseLogging, - HealthMessage = $"{msg}", - HealthReportTimeToLive = GetHealthReportTimeToLive(), - Property = "docker_stats_failure", - ReportType = HealthReportType.Application, - State = HealthState.Warning, - NodeName = NodeName, - Observer = ObserverName, - }; + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = "docker_stats_failure", + Level = "Warning", + Message = msg, + ObserverName + }); + } - // Generate a Service Fabric Health Report. - HealthReporter.ReportHealthToServiceFabric(healthReport); + return; + } - // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). - if (IsTelemetryEnabled) - { - _ = TelemetryClient?.ReportHealthAsync( - "docker_stats_failure", - HealthState.Warning, - msg, - ObserverName, - Token); - } + _ = Parallel.For(0, ReplicaOrInstanceList.Count, (i, state) => + { + var repOrInst = ReplicaOrInstanceList.ElementAt(i); + string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); + string cpuId = $"{serviceName}_cpu"; + string memId = $"{serviceName}_mem"; + string containerId = string.Empty; - // ETW. 
- if (IsEtwEnabled) - { - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Property = "docker_stats_failure", - Level = "Warning", - Message = msg, - ObserverName - }); - } + if (!allCpuDataPercentage.Any(frud => frud.Id == cpuId)) + { + allCpuDataPercentage.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, 1, false)); + } - state.Stop(); + if (!allMemDataMB.Any(frud => frud.Id == memId)) + { + allMemDataMB.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); } foreach (string line in output) @@ -540,7 +540,7 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc if (stats.Length < 4) { ObserverLogger.LogWarning($"docker stats not returning expected information: stats.Count = {stats.Length}. Expected 4."); - state.Stop(); + return; } if (!stats[1].Contains(repOrInst.ServicePackageActivationId)) @@ -562,33 +562,41 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc double mem_working_set_mb = double.TryParse(stats[3].Replace("MiB", ""), out double memMib) ? memMib : 0; allMemDataMB?.FirstOrDefault(f => f.Id == memId)?.Data.Add(mem_working_set_mb); - Thread.Sleep(150); - } - - lock (lockObj) - { - output.Clear(); - output = null; + break; } - }); + }); } - catch (AggregateException e) when (!(e.InnerException is OperationCanceledException || e.InnerException is TaskCanceledException)) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - ObserverLogger.LogWarning($"Failure in ObserveAsync:{Environment.NewLine}{e}"); - - // fix the bug.. + ObserverLogger.LogError($"Exception in MonitorContainers:{Environment.NewLine}{e}"); throw; } } - private void SetConfigurationFilePath() + private bool SetConfigurationFilePath() { + // Already set. 
+ if (File.Exists(ConfigurationFilePath)) + { + return true; + } + string configDataFilename = GetSettingParameterValue(ConfigurationSectionName, "ConfigFileName"); + + if (string.IsNullOrWhiteSpace(configDataFilename)) + { + return false; + } - if (!string.IsNullOrEmpty(configDataFilename) && !ConfigurationFilePath.Contains(configDataFilename)) + string path = Path.Combine(ConfigPackagePath, configDataFilename); + + if (File.Exists(path)) { - ConfigurationFilePath = Path.Combine(ConfigPackagePath, configDataFilename); + ConfigurationFilePath = path; + return true; } + + return false; } private async Task SetInstanceOrReplicaMonitoringList( diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index c470c1c2..28460344 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -39,6 +39,7 @@ public class ObserverManager : IDisposable private readonly IEnumerable serviceCollection; private bool isConfigurationUpdateInProgress; private DateTime StartDateTime; + private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromHours(8); // Folks often use their own version numbers. This is for internal diagnostic telemetry. private const string InternalVersionNumber = "3.1.17"; @@ -96,18 +97,26 @@ public string ApplicationName get; set; } - public bool IsObserverRunning - { - get; - private set; - } - public static HealthState ObserverFailureHealthStateLevel { get; set; } = HealthState.Unknown; + /// + /// This is for observers that support parallelized monitor loops. + /// AppObserver, ContainerObserver, FabricSystemObserver. 
+ /// + public static ParallelOptions ParallelOptions + { + get; set; + } + + public static bool EnableConcurrentExecution + { + get; set; + } + private ObserverHealthReporter HealthReporter { get; @@ -128,26 +137,17 @@ private int MaxArchivedLogFileLifetimeDays get; } - public DateTime LastTelemetrySendDate + private DateTime LastForcedGCDateTime { - get; private set; + get; set; } - public TimeSpan OperationalTelemetryRunInterval - { - get; private set; - } = TimeSpan.FromHours(8); - - /// - /// This is for observers that support parallelized monitor loops. - /// AppObserver, ContainerObserver, FabricSystemObserver. - /// - public static ParallelOptions ParallelOptions + private TimeSpan ForcedGCInterval { get; set; - } + } = TimeSpan.FromMinutes(15); - public static bool EnableConcurrentExecution + private DateTime LastTelemetrySendDate { get; set; } @@ -212,6 +212,7 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie // this logs error/warning/info messages for ObserverManager. Logger = new Logger("ObserverManager", logFolderBasePath, MaxArchivedLogFileLifetimeDays > 0 ? MaxArchivedLogFileLifetimeDays : 7); + SetPropertiesFromConfigurationParameters(); serviceCollection = serviceProvider.GetServices(); // Populate the Observer list for the sequential run loop. @@ -229,7 +230,6 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie } HealthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); - SetPropertiesFromConfigurationParameters(); ParallelOptions = new ParallelOptions { @@ -296,19 +296,13 @@ public async Task StartObserversAsync() } } - /* Note the below use of GC.Collect is NOT a general recommendation for what to do in your own managed service code or app code. Please don't - make that connection. You should generally not have to call GC.Collect from user service code. It just depends on your performance needs. 
- As always, measure and understand what impact your code has on memory before employing the GC API in your own projects. - This is only used here to ensure gen0 and gen1 do not hold unecessary objects for any amount of time before FO goes to sleep and to compact the LOH. - - All observers clear and null their internal lists, objects that maintain internal lists. They also dispose/null disposable objects, etc before this code runs. - This is a micro "optimization" and not really necessary. However, it does modestly decrease the already reasonable memory footprint of FO. - Out of the box, FO will generally consume less than 100MB of workingset. Most of this (~65-70%) is held in native memory. - FO workingset can increase depending upon how many services you monitor, how you write your plugins with respect to memory consumption, etc.. */ - - // SOH, sweep-only collection (no compaction). This will clear the early generation objects (short-lived) from memory. This only impacts the FO process. - GC.Collect(0, GCCollectionMode.Forced, true, false); - GC.Collect(1, GCCollectionMode.Forced, true, false); + // Force Gen0-Gen2 collection with compaction, including LOH. This runs every 15 minutes. + if (DateTime.UtcNow.Subtract(LastForcedGCDateTime) >= ForcedGCInterval) + { + GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; + GC.Collect(2, GCCollectionMode.Forced, true, true); + LastForcedGCDateTime = DateTime.UtcNow; + } if (ObserverExecutionLoopSleepSeconds > 0) { @@ -520,7 +514,6 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf if (!isConfigurationUpdateInProgress) { SignalAbortToRunningObserver(); - IsObserverRunning = false; } } @@ -1001,7 +994,6 @@ private async Task RunObserversAsync() } Logger.LogInfo($"Starting {observer.ObserverName}"); - IsObserverRunning = true; // Synchronous call. bool isCompleted = observer.ObserveAsync(linkedSFRuntimeObserverTokenSource?.Token ?? 
token).Wait(ObserverExecutionTimeout); @@ -1107,16 +1099,12 @@ await File.WriteAllLinesAsync( } catch (AggregateException ex) { - IsObserverRunning = false; - if (ex.InnerException is FabricException || ex.InnerException is OperationCanceledException || ex.InnerException is TaskCanceledException) { if (isConfigurationUpdateInProgress) { - IsObserverRunning = false; - return true; } @@ -1130,7 +1118,6 @@ ex.InnerException is OperationCanceledException || { if (isConfigurationUpdateInProgress) { - IsObserverRunning = false; return true; } @@ -1142,8 +1129,6 @@ ex.InnerException is OperationCanceledException || Logger.LogWarning($"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); allExecuted = false; } - - IsObserverRunning = false; } if (allExecuted) diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index fe9ce638..6cf9cd24 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -7,6 +7,7 @@ 0 means run continuously with no pausing - not recommended. If you only enable one observer, FO will sleep for 10 seconds between runs, regardless of this setting. SF Error/Warning Reports have TTLs that are computed in part with this value. 
--> + - + + + + + + + + @@ -90,9 +98,6 @@ - - - + + + + + @@ -237,6 +247,7 @@ + @@ -251,8 +262,10 @@ + + @@ -265,6 +278,7 @@ + @@ -297,6 +311,7 @@ + @@ -304,9 +319,11 @@ + + + + diff --git a/FabricObserverTests/FabricObserverTests.csproj b/FabricObserverTests/FabricObserverTests.csproj index 3c12d6ae..4cbec7a5 100644 --- a/FabricObserverTests/FabricObserverTests.csproj +++ b/FabricObserverTests/FabricObserverTests.csproj @@ -3,7 +3,7 @@ false {48C88BEB-9960-4183-861B-DF25C193E4C9} FabricObserverTests - net5.0 + netcoreapp3.1 x64 CS0414 1.0.0.0 From d9bbd84460263117a0f0e2f7b09717e4258102fd Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 23 Sep 2021 10:00:06 -0700 Subject: [PATCH 11/35] 3.1.18 begin --- Build-SFPkgs.ps1 | 8 +- Documentation/OperationalTelemetry.md | 2 +- Documentation/Plugins.md | 9 +- Documentation/Using.md | 2 +- FabricObserver.nuspec.template | 4 +- FabricObserver/FabricObserver.csproj | 4 +- FabricObserver/Observers/ContainerObserver.cs | 238 +++++++++--------- FabricObserver/Observers/ObserverManager.cs | 42 ++-- .../PackageRoot/ServiceManifest._linux.xml | 8 +- .../PackageRoot/ServiceManifest.xml | 8 +- README.md | 8 +- .../SampleObserverPlugin.csproj | 2 +- 12 files changed, 158 insertions(+), 177 deletions(-) diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index 2888f579..c7f58ff4 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.17" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.17" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.18" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg 
"Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.18" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.17" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.17" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.18" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.18" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/Documentation/OperationalTelemetry.md b/Documentation/OperationalTelemetry.md index 65f0fd3e..9b66f929 100644 --- a/Documentation/OperationalTelemetry.md +++ b/Documentation/OperationalTelemetry.md @@ -34,7 +34,7 @@ Here is a full example of exactly what is sent in one of these telemetry events, "ClusterId": "50bf5602-1611-459c-aed2-45b960e9eb16", "ClusterType": "SFRP", "NodeNameHash": "1672329571", - "FOVersion": "3.1.17", + "FOVersion": "3.1.18", "HasPlugins": "False", "UpTime": "00:00:27.2535830", "Timestamp": "2021-08-26T20:51:42.8588118Z", diff --git a/Documentation/Plugins.md b/Documentation/Plugins.md index 8ba2d484..368e47a7 100644 --- a/Documentation/Plugins.md +++ b/Documentation/Plugins.md @@ -1,9 +1,6 @@ ## How to implement an observer plugin using FO's extensibility model - -**FabricObserver version 3.1.0 introduces a refactored plugin implementation that will break existing plugins. The changes required by plugin authors are trivial, however. 
Please see the [SampleObserver project](/SampleObserverPlugin) for a complete sample observer plugin implementation with code comments and readme with examples of the new format.** - -This document is a simple overview of how to get started with building an observer plugin. Also, for a more advanced sample, please see [ContainerObserver](https://github.com/gittorre/containerobserver) reference project (ContainerObserver is a part of FO as of 3.1.17). +This document is a simple overview of how to get started with building an observer plugin. Also, for a more advanced sample, please see [ContainerObserver](https://github.com/gittorre/containerobserver) reference project (ContainerObserver is a part of FO as of 3.1.18). Note: The plugin model depends on the following packages, which **must have the same versions in both your plugin project and FabricObserver**: @@ -24,11 +21,11 @@ Create a new .NET Standard 2.0 library project, install the nupkg you need for y You can find the Microsoft-signed packages in the nuget.org gallery [here](https://www.nuget.org/profiles/ServiceFabricApps) or just run this in the package manager console: ``` -Install-Package Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained -Version 3.1.12 +Install-Package Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained -Version 3.1.18 or for Linux: -Install-Package Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained -Version 3.1.12 +Install-Package Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained -Version 3.1.18 ``` Note: diff --git a/Documentation/Using.md b/Documentation/Using.md index 0e8139b5..82469b73 100644 --- a/Documentation/Using.md +++ b/Documentation/Using.md @@ -559,7 +559,7 @@ $appParams = @{ "FabricSystemObserverEnabled" = "true"; "FabricSystemObserverMem Then execute the application upgrade with ```Powershell -Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.17 
-ApplicationParameter $appParams -Monitored -FailureAction rollback +Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.18 -ApplicationParameter $appParams -Monitored -FailureAction rollback ``` Note: On *Linux*, this will restart FO processes (one at a time, UD Walk with safety checks) due to the way Linux Capabilites work. In a nutshell, for any kind of application upgrade, we have to re-run the FO setup script to get the Capabilities in place. For Windows, FO processes will NOT be restarted. diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index dcb50122..7d1440fd 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 3.1.17 - This version brings ContainerObserver into the mix, promoted to a built-in observer. Many ObserverManager settings are now overridable. 3.1.17 also includes monitoring and performance improvements for AppObserver, FabricSystemObserver and NodeObserver. Finally, this version ships with an updated operational telemetry impl for Non-PII data sharing with Microsoft (opt out). The only stuff we care about is if FO is working/healthy, it's finding issues (generating health events) and the total number of issues it finds, transmitted every 8 hours. Please see the release notes on the FO repo for more information. + 3.1.18 + This version brings ContainerObserver into the mix, promoted to a built-in observer. Many ObserverManager settings are now overridable. 3.1.18 also includes monitoring and performance improvements for AppObserver, FabricSystemObserver and NodeObserver. Finally, this version ships with an updated operational telemetry impl for Non-PII data sharing with Microsoft (opt out). The only stuff we care about is if FO is working/healthy, it's finding issues (generating health events) and the total number of issues it finds, transmitted every 8 hours. 
Please see the release notes on the FO repo for more information. Microsoft MIT true diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index eea7c03c..9ce6a8c6 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -12,11 +12,11 @@ linux-x64;win-x64 - 3.1.17.0 + 3.1.18.0 Copyright © 2020 FabricObserver Service Fabric Observer - 3.1.17 + 3.1.18 true true FabricObserver.Program diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index 680ebf37..8a6b4604 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -381,13 +381,124 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc int listcapacity = ReplicaOrInstanceList.Count; allCpuDataPercentage ??= new List>(listcapacity); allMemDataMB ??= new List>(listcapacity); - + string error = string.Empty; + string args = "/c docker stats --no-stream --format \"table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\""; + string filename = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe"; + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + args = string.Empty; + + // We need the full path to the currently deployed FO CodePackage, which is where our + // linux Capabilities-laced proxy binary lives, which is used for elevated_docker_stats call. 
+ string path = FabricServiceContext.CodePackageActivationContext.GetCodePackageObject("Code").Path; + filename = $"{path}/elevated_docker_stats"; + } + + var ps = new ProcessStartInfo + { + Arguments = args, + FileName = filename, + UseShellExecute = false, + WindowStyle = ProcessWindowStyle.Hidden, + RedirectStandardInput = false, + RedirectStandardOutput = true, + RedirectStandardError = true + }; + + var output = new List(); + using Process p = new Process(); + p.ErrorDataReceived += (sender, e) => { error += e.Data; }; + p.OutputDataReceived += (sender, e) => { output.Add(e.Data); }; + p.StartInfo = ps; + _ = p.Start(); + + // Start async reads. + p.BeginErrorReadLine(); + p.BeginOutputReadLine(); + + // It should not take 60 seconds for the process that calls docker stats to exit. + // If so, then end execution of the outer loop: stop monitoring for this run of ContainerObserver. + if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) + { + try + { + p?.Kill(true); + } + catch (Exception e) when (e is AggregateException || e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) + { + + } + + ObserverLogger.LogWarning($"docker process has run too long ({MaxProcessExitWaitTimeMS} ms). Aborting."); + return; + } + + int exitStatus = p.ExitCode; + + // Was there an error running docker stats? 
+ if (exitStatus != 0) + { + string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + msg += " NOTE: docker must be running and you must run FabricObserver as System user or Admin user on Windows " + + "in order for ContainerObserver to function correctly on Windows."; + } + + ObserverLogger.LogWarning(msg); + CurrentWarningCount++; + + var healthReport = new Utilities.HealthReport + { + AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), + EmitLogEvent = EnableVerboseLogging, + HealthMessage = $"{msg}", + HealthReportTimeToLive = GetHealthReportTimeToLive(), + Property = "docker_stats_failure", + ReportType = HealthReportType.Application, + State = HealthState.Warning, + NodeName = NodeName, + Observer = ObserverName, + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); + + // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + "docker_stats_failure", + HealthState.Warning, + msg, + ObserverName, + Token); + } + + // ETW. 
+ if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Property = "docker_stats_failure", + Level = "Warning", + Message = msg, + ObserverName + }); + } + + return; + } + try { foreach (ReplicaOrInstanceMonitoringInfo repOrInst in ReplicaOrInstanceList) { Token.ThrowIfCancellationRequested(); - string error = string.Empty; string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; @@ -403,129 +514,11 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc allMemDataMB.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1)); } - var monitorTimer = Stopwatch.StartNew(); - string args = "/c docker stats --no-stream --format \"table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}\""; - string filename = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe"; - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - args = string.Empty; - - // We need the full path to the currently deployed FO CodePackage, which is where our - // linux Capabilities-laced proxy binary lives, which is used for elevated_docker_stats call. - string path = FabricServiceContext.CodePackageActivationContext.GetCodePackageObject("Code").Path; - filename = $"{path}/elevated_docker_stats"; - } - - var ps = new ProcessStartInfo - { - Arguments = args, - FileName = filename, - UseShellExecute = false, - WindowStyle = ProcessWindowStyle.Hidden, - RedirectStandardInput = false, - RedirectStandardOutput = true, - RedirectStandardError = true - }; - - using Process p = new Process(); - - // Capture any error information from docker. 
- p.ErrorDataReceived += (sender, e) => { error += e.Data; }; - p.StartInfo = ps; - _ = p.Start(); - var stdOutput = p.StandardOutput; - - // Start asynchronous read operation on error stream. - p.BeginErrorReadLine(); - - List output = new List(); - string l; - - while ((l = await p.StandardOutput.ReadLineAsync()) != null) - { - output.Add(l); - } - - if (!p.WaitForExit(MaxProcessExitWaitTimeMS)) - { - try - { - p?.Kill(); - } - catch (Exception e) when (e is InvalidOperationException || e is NotSupportedException || e is Win32Exception) - { - - } - - return; - } - - int exitStatus = p.ExitCode; - stdOutput.Close(); - - // Was there an error running docker stats? - if (exitStatus != 0) - { - string msg = $"docker stats --no-stream exited with {exitStatus}: {error}"; - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - msg += " NOTE: You must run FabricObserver as System user or Admin user on Windows " + - "in order for ContainerObserver to function correctly on Windows."; - } - - ObserverLogger.LogWarning(msg); - - var healthReport = new Utilities.HealthReport - { - AppName = new Uri($"fabric:/{ObserverConstants.FabricObserverName}"), - EmitLogEvent = EnableVerboseLogging, - HealthMessage = $"{msg}", - HealthReportTimeToLive = GetHealthReportTimeToLive(), - Property = "docker_stats_failure", - ReportType = HealthReportType.Application, - State = HealthState.Warning, - NodeName = NodeName, - Observer = ObserverName, - }; - - // Generate a Service Fabric Health Report. - HealthReporter.ReportHealthToServiceFabric(healthReport); - - // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). - if (IsTelemetryEnabled) - { - _ = TelemetryClient?.ReportHealthAsync( - "docker_stats_failure", - HealthState.Warning, - msg, - ObserverName, - Token); - } - - // ETW. 
- if (IsEtwEnabled) - { - ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Property = "docker_stats_failure", - Level = "Warning", - Message = msg, - ObserverName - }); - } - - return; - } - foreach (string line in output) { Token.ThrowIfCancellationRequested(); - if (line.Contains("CPU")) + if (line == null || line.Contains("CPU")) { continue; } @@ -561,9 +554,6 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc await Task.Delay(150, Token); } - - output.Clear(); - output = null; } } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index bf6adc03..78618b0b 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -20,6 +20,7 @@ using FabricObserver.TelemetryLib; using HealthReport = FabricObserver.Observers.Utilities.HealthReport; using System.Fabric.Description; +using System.Runtime; namespace FabricObserver.Observers { @@ -28,10 +29,14 @@ namespace FabricObserver.Observers // with optional sleeps, and reliable shutdown event handling. public class ObserverManager : IDisposable { + // Folks often use their own version numbers. This is for internal diagnostic telemetry. 
+ private const string InternalVersionNumber = "3.1.18"; private readonly string nodeName; private readonly List observers; - private volatile bool shutdownSignaled; private readonly CancellationToken token; + private readonly TimeSpan ForcedGCInterval = TimeSpan.FromMinutes(15); + private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); + private volatile bool shutdownSignaled; private CancellationTokenSource cts; private CancellationTokenSource linkedSFRuntimeObserverTokenSource; private bool disposed; @@ -39,9 +44,6 @@ public class ObserverManager : IDisposable private bool isConfigurationUpdateInProgress; private DateTime StartDateTime; - // Folks often use their own version numbers. This is for internal diagnostic telemetry. - private const string InternalVersionNumber = "3.1.17"; - private bool TaskCancelled => linkedSFRuntimeObserverTokenSource?.Token.IsCancellationRequested ?? token.IsCancellationRequested; @@ -97,14 +99,12 @@ public string ApplicationName public bool IsObserverRunning { - get; - private set; + get; private set; } public static HealthState ObserverFailureHealthStateLevel { - get; - set; + get; set; } = HealthState.Unknown; private ObserverHealthReporter HealthReporter @@ -132,10 +132,10 @@ public DateTime LastTelemetrySendDate get; private set; } - public TimeSpan OperationalTelemetryRunInterval + private DateTime LastForcedGCDateTime { - get; private set; - } = TimeSpan.FromHours(8); + get; set; + } /// /// Initializes a new instance of the class. @@ -274,19 +274,13 @@ public async Task StartObserversAsync() } } - /* Note the below use of GC.Collect is NOT a general recommendation for what to do in your own managed service code or app code. Please don't - make that connection. You should generally not have to call GC.Collect from user service code. It just depends on your performance needs. - As always, measure and understand what impact your code has on memory before employing the GC API in your own projects. 
- This is only used here to ensure gen0 and gen1 do not hold unecessary objects for any amount of time before FO goes to sleep and to compact the LOH. - - All observers clear and null their internal lists, objects that maintain internal lists. They also dispose/null disposable objects, etc before this code runs. - This is a micro "optimization" and not really necessary. However, it does modestly decrease the already reasonable memory footprint of FO. - Out of the box, FO will generally consume less than 100MB of workingset. Most of this (~65-70%) is held in native memory. - FO workingset can increase depending upon how many services you monitor, how you write your plugins with respect to memory consumption, etc.. */ - - // SOH, sweep-only collection (no compaction). This will clear the early generation objects (short-lived) from memory. This only impacts the FO process. - GC.Collect(0, GCCollectionMode.Forced, true, false); - GC.Collect(1, GCCollectionMode.Forced, true, false); + // Force Gen0-Gen2 collection with compaction, including LOH. This runs every 15 minutes. 
+ if (DateTime.UtcNow.Subtract(LastForcedGCDateTime) >= ForcedGCInterval) + { + GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; + GC.Collect(2, GCCollectionMode.Forced, true, true); + LastForcedGCDateTime = DateTime.UtcNow; + } if (ObserverExecutionLoopSleepSeconds > 0) { diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index 621605bc..e80c456a 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,10 +27,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 2ffc2acf..02632dc8 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,10 +21,10 @@ - + - + \ No newline at end of file diff --git a/README.md b/README.md index 6f238d11..4850a854 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# FabricObserver 3.1.17 +# FabricObserver 3.1.18 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of machine resources that tend to be very important to all Service Fabric applications, like disk space consumption, CPU use, memory use, endpoint availability, ephemeral TCP port use, and app/cluster certificate health out-of-the-box. 2. Runs on multiple versions of Windows Server and Ubuntu 16.04 and 18.04 -3. Provides [an easy-to-use extensibility model](/Documentation/Plugins.md) for creating [custom Observers](/SampleObserverPlugin) out of band (so, you don't need to clone the repo to build an Observer). 
See [ContainerObserver](https://github.com/GitTorre/ContainerObserver) for a complete plugin impl that extends FO with SF container app resource monitoring and alerting (note that this observer is built into FO as of version 3.1.17). +3. Provides [an easy-to-use extensibility model](/Documentation/Plugins.md) for creating [custom Observers](/SampleObserverPlugin) out of band (so, you don't need to clone the repo to build an Observer). See [ContainerObserver](https://github.com/GitTorre/ContainerObserver) for a complete plugin impl that extends FO with SF container app resource monitoring and alerting (note that this observer is built into FO as of version 3.1.18). 4. Supports [Configuration Setting Application Updates](/Documentation/Using.md#parameterUpdates) for any observer for any supported setting. 5. Is actively developed completely in the open. The latest code (generally in flight and not meant for production) lives in the develop branch. It is highly recommended that you only deploy code built from the main branch into your production clusters. 
@@ -143,11 +143,11 @@ Register-ServiceFabricApplicationType -ApplicationPathInImageStore FO3117 #Create FO application (if not already deployed at lesser version): -New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.17 +New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.18 #OR if updating existing version: -Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.17 -Monitored -FailureAction rollback +Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.18 -Monitored -FailureAction rollback ``` ## Observer Model diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index 89b3d574..e4993892 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -13,7 +13,7 @@ - + From 8953027e696e07720bd437b5c860b14c9cba2b54 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 23 Sep 2021 17:41:48 -0700 Subject: [PATCH 12/35] Bug fix - ContainerObserver. 3.1.18 version change. 
--- Build-SFPkgs.ps1 | 8 +++---- Documentation/Using.md | 2 +- FabricObserver.nuspec.template | 4 ++-- FabricObserver/FabricObserver.csproj | 4 ++-- FabricObserver/Observers/ContainerObserver.cs | 23 +++++++++---------- FabricObserver/Observers/ObserverManager.cs | 22 +++++++++--------- .../PackageRoot/ServiceManifest._linux.xml | 8 +++---- .../PackageRoot/ServiceManifest.xml | 8 +++---- .../ApplicationManifest.xml | 12 +++++----- README.md | 8 +++---- .../SampleObserverPlugin.csproj | 2 +- 11 files changed, 50 insertions(+), 51 deletions(-) diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index 2888f579..c7f58ff4 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.17" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.17" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.18" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.18" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.17" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.17" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.18" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg 
"Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.18" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/Documentation/Using.md b/Documentation/Using.md index 0e8139b5..82469b73 100644 --- a/Documentation/Using.md +++ b/Documentation/Using.md @@ -559,7 +559,7 @@ $appParams = @{ "FabricSystemObserverEnabled" = "true"; "FabricSystemObserverMem Then execute the application upgrade with ```Powershell -Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.17 -ApplicationParameter $appParams -Monitored -FailureAction rollback +Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.18 -ApplicationParameter $appParams -Monitored -FailureAction rollback ``` Note: On *Linux*, this will restart FO processes (one at a time, UD Walk with safety checks) due to the way Linux Capabilites work. In a nutshell, for any kind of application upgrade, we have to re-run the FO setup script to get the Capabilities in place. For Windows, FO processes will NOT be restarted. diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index b0178dfc..0b71ded9 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 3.1.17 - This version brings ContainerObserver into the mix, promoted to a built-in observer. It also includes monitoring and performance improvements for AppObserver, FabricSystemObserver and NodeObserver. Finally, this version ships with an updated operational telemetry impl for Non-PII data sharing with Micrsoft (opt out). The only stuff we care about is if FO is working/healthy, it's finding issues (generating health events) and the total number of issues it finds, transmitted every 4 hours. Please see the release notes on the FO repo for more information. 
+ 3.1.18 + This release introduces support for parallel execution of service process monitoring in AppObserver, ContainerObserver and FabricSystemObserver. Please see the release notes on the FO repo for more information. Microsoft MIT true diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index bd3eb355..c00e1e0e 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -12,11 +12,11 @@ linux-x64;win-x64 - 3.1.17.0 + 3.1.18.0 Copyright © 2021 FabricObserver Service Fabric Observer - 3.1.17 + 3.1.18 true true FabricObserver.Program diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index d8f6f8fd..b14b1454 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -295,23 +295,22 @@ private async Task InitializeAsync(CancellationToken token) int settingsFail = 0; - _ = Parallel.For(0, userTargetList.Count, async (i, state) => + // This doesn't add any real value for parallelization unless there are hundreds of apps on the node. + foreach (var application in userTargetList) { token.ThrowIfCancellationRequested(); - var application = userTargetList.ElementAt(i); - if (string.IsNullOrWhiteSpace(application.TargetApp)) { ObserverLogger.LogWarning($"InitializeAsync: Required setting, targetApp, is not set."); settingsFail++; - return; + continue; } // No required settings for supplied application(s). 
if (settingsFail == userTargetList.Count) { - state.Stop(); + return false; } ServiceFilterType filterType = ServiceFilterType.None; @@ -342,17 +341,16 @@ private async Task InitializeAsync(CancellationToken token) if (codepackages.Count == 0) { - return; + continue; } int containerHostCount = codepackages.Count(c => c.HostType == HostType.ContainerHost); if (containerHostCount == 0) { - return; + continue; } - MonitoredAppCount++; deployedTargetList.Enqueue(application); await SetInstanceOrReplicaMonitoringList(new Uri(application.TargetApp), filteredServiceList, filterType, null).ConfigureAwait(false); } @@ -360,8 +358,9 @@ private async Task InitializeAsync(CancellationToken token) { ObserverLogger.LogInfo($"Handled Exception in function InitializeAsync:{e.GetType().Name}."); } - }); + } + MonitoredAppCount = deployedTargetList.Count; MonitoredServiceProcessCount = ReplicaOrInstanceList.Count; foreach (var rep in ReplicaOrInstanceList) @@ -419,10 +418,10 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc RedirectStandardError = true }; - var output = new List(); + var output = new ConcurrentQueue(); using Process p = new Process(); p.ErrorDataReceived += (sender, e) => { error += e.Data; }; - p.OutputDataReceived += (sender, e) => { output.Add(e.Data); }; + p.OutputDataReceived += (sender, e) => { if (!string.IsNullOrWhiteSpace(e.Data)) { output.Enqueue(e.Data); } }; p.StartInfo = ps; _ = p.Start(); @@ -529,7 +528,7 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc { Token.ThrowIfCancellationRequested(); - if (string.IsNullOrWhiteSpace(line) || line.Contains("CPU")) + if (line.Contains("CPU")) { continue; } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 29cfa239..50fdaa37 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -31,18 +31,18 @@ public class ObserverManager : IDisposable 
{ private readonly string nodeName; private readonly List observers; - private volatile bool shutdownSignaled; + private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); private readonly CancellationToken token; - private CancellationTokenSource cts; - private CancellationTokenSource linkedSFRuntimeObserverTokenSource; - private bool disposed; private readonly IEnumerable serviceCollection; + private bool disposed; private bool isConfigurationUpdateInProgress; + private CancellationTokenSource cts; + private CancellationTokenSource linkedSFRuntimeObserverTokenSource; private DateTime StartDateTime; - private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); + private volatile bool shutdownSignaled; // Folks often use their own version numbers. This is for internal diagnostic telemetry. - private const string InternalVersionNumber = "3.1.17"; + private const string InternalVersionNumber = "3.1.18"; private bool TaskCancelled => linkedSFRuntimeObserverTokenSource?.Token.IsCancellationRequested ?? 
token.IsCancellationRequested; @@ -72,11 +72,6 @@ public static bool TelemetryEnabled get; set; } - private TimeSpan ObserverExecutionTimeout - { - get; set; - } = TimeSpan.FromMinutes(30); - private static bool FabricObserverOperationalTelemetryEnabled { get; set; @@ -132,6 +127,11 @@ private Logger Logger get; } + private TimeSpan ObserverExecutionTimeout + { + get; set; + } = TimeSpan.FromMinutes(30); + private int MaxArchivedLogFileLifetimeDays { get; diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index 621605bc..e80c456a 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,10 +27,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 2ffc2acf..02632dc8 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,10 +21,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index b5de2b21..1ef33ac1 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + - + @@ -336,10 +336,10 @@ - - --> + - --> + diff --git a/README.md b/README.md index 96d92a12..28036611 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ -# FabricObserver 3.1.17 +# FabricObserver 3.1.18 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. 
Monitors a broad range of machine resources that tend to be very important to all Service Fabric applications, like disk space consumption, CPU use, memory use, endpoint availability, ephemeral TCP port use, and app/cluster certificate health out-of-the-box. 2. Runs on multiple versions of Windows Server and Ubuntu 16.04 and 18.04 -3. Provides [an easy-to-use extensibility model](/Documentation/Plugins.md) for creating [custom Observers](/SampleObserverPlugin) out of band (so, you don't need to clone the repo to build an Observer). See [ContainerObserver](https://github.com/GitTorre/ContainerObserver) for a complete plugin impl that extends FO with SF container app resource monitoring and alerting (note that this observer is built into FO as of version 3.1.17). +3. Provides [an easy-to-use extensibility model](/Documentation/Plugins.md) for creating [custom Observers](/SampleObserverPlugin) out of band (so, you don't need to clone the repo to build an Observer). See [ContainerObserver](https://github.com/GitTorre/ContainerObserver) for a complete plugin impl that extends FO with SF container app resource monitoring and alerting (note that this observer is built into FO as of version 3.1.18). 4. Supports [Configuration Setting Application Updates](/Documentation/Using.md#parameterUpdates) for any observer for any supported setting. 5. Is actively developed completely in the open. The latest code (generally in flight and not meant for production) lives in the develop branch. It is highly recommended that you only deploy code built from the main branch into your production clusters. 
@@ -143,11 +143,11 @@ Register-ServiceFabricApplicationType -ApplicationPathInImageStore FO3117 #Create FO application (if not already deployed at lesser version): -New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.17 +New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.18 #OR if updating existing version: -Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.17 -Monitored -FailureAction rollback +Start-ServiceFabricApplicationUpgrade -ApplicationName fabric:/FabricObserver -ApplicationTypeVersion 3.1.18 -Monitored -FailureAction rollback ``` ## Observer Model diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index e4993892..36eecc63 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -13,7 +13,7 @@ - + From 499904b189c1dcc75d81b6da267fed0004d83cc2 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 27 Sep 2021 14:50:05 -0700 Subject: [PATCH 13/35] Latest (new data structures, bug fixes) --- .gitignore | 24 ++ .../Utilities/ConfigSettings.cs | 8 +- FabricObserver/Observers/AppObserver.cs | 253 +++++++++--------- FabricObserver/Observers/ContainerObserver.cs | 89 +++--- .../Observers/FabricSystemObserver.cs | 129 ++++----- FabricObserver/Observers/ObserverManager.cs | 19 +- .../PackageRoot/Config/Settings.xml | 22 +- FabricObserverApp/FabricObserverApp.sfproj | 2 +- .../FabricObserverTests.csproj | 6 +- FabricObserverTests/ObserverTest.cs | 84 +++--- .../SampleObserverPlugin.csproj | 2 +- TelemetryLib/TelemetryEvents.cs | 2 +- 12 files changed, 331 insertions(+), 309 deletions(-) diff --git a/.gitignore b/.gitignore index 42a0c818..1b56b5ff 100644 --- a/.gitignore +++ b/.gitignore @@ -336,3 +336,27 @@ 
ASALocalRun/ /FabricObserver/observer_logs /FabricObserver/PackageRoot/Data/Plugins/SampleNewObserver.dll /nuget.exe +/FabricObserver/PackageRoot/Data/Plugins/vcruntime140.dll +/FabricObserver/PackageRoot/Data/Plugins/vccorlib140.dll +/FabricObserver/PackageRoot/Data/Plugins/Tfx_x86.lib +/FabricObserver/PackageRoot/Data/Plugins/Tfx_x86.dll +/FabricObserver/PackageRoot/Data/Plugins/TfxPerfCounter.man +/FabricObserver/PackageRoot/Data/Plugins/Tfx.lib +/FabricObserver/PackageRoot/Data/Plugins/Tfx.dll +/FabricObserver/PackageRoot/Data/Plugins/msvcp140.dll +/FabricObserver/PackageRoot/Data/Plugins/Microsoft.Cloud.InstrumentationFramework.Metrics.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxMetrics_x86.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxMetrics_x86.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxMetrics.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxMetrics.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxHealth_x86.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxHealth_x86.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxHealth.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxHealth.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxEvents_x86.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxEvents_x86.dll +/FabricObserver/PackageRoot/Data/Plugins/IfxEvents.lib +/FabricObserver/PackageRoot/Data/Plugins/IfxEvents.dll +/FabricObserver/PackageRoot/Data/Plugins/FabricObserverMdm.dll +/FabricObserver/PackageRoot/Data/Plugins/concrt140.dll +/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest_MDM.xml diff --git a/FabricObserver.Extensibility/Utilities/ConfigSettings.cs b/FabricObserver.Extensibility/Utilities/ConfigSettings.cs index 328f17a3..cf89c334 100644 --- a/FabricObserver.Extensibility/Utilities/ConfigSettings.cs +++ b/FabricObserver.Extensibility/Utilities/ConfigSettings.cs @@ -79,9 +79,13 @@ public bool IsObserverEtwEnabled public ConfigSettings(ConfigurationSettings settings, string observerConfiguration) { - Settings = settings; - 
section = settings?.Sections[observerConfiguration]; + if (settings == null || !settings.Sections.Contains(observerConfiguration)) + { + return; + } + Settings = settings; + section = settings.Sections[observerConfiguration]; UpdateConfigSettings(); } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 301ff260..8b576471 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -32,12 +32,14 @@ public class AppObserver : ObserverBase // Health Report data containers - For use in analysis to determine health state. // These lists are cleared after each healthy iteration. // Change this data structure to concurrentqueue.... - private ConcurrentQueue> AllAppCpuData; - private ConcurrentQueue> AllAppMemDataMb; - private ConcurrentQueue> AllAppMemDataPercent; - private ConcurrentQueue> AllAppTotalActivePortsData; - private ConcurrentQueue> AllAppEphemeralPortsData; - private ConcurrentQueue> AllAppHandlesData; + private ConcurrentDictionary> AllAppCpuData; + private ConcurrentDictionary> AllAppMemDataMb; + private ConcurrentDictionary> AllAppMemDataPercent; + private ConcurrentDictionary> AllAppTotalActivePortsData; + private ConcurrentDictionary> AllAppEphemeralPortsData; + private ConcurrentDictionary> AllAppHandlesData; + + // TOTHINK: experiment with ConcurrentDictionary instead of queues... Why are you using queues in the first place? // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. private List userTargetList; @@ -128,15 +130,20 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - // For use in process family tree monitoring. 
- ConcurrentQueue childProcessTelemetryDataList = null; TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => { token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList.ElementAt(i); + // For use in process family tree monitoring. + ConcurrentQueue childProcessTelemetryDataList = null; + + if (!ReplicaOrInstanceList.TryDequeue(out ReplicaOrInstanceMonitoringInfo repOrInst)) + { + return; + } + string processName = null; int processId = 0; ApplicationInfo app = null; @@ -175,6 +182,7 @@ public override Task ReportAsync(CancellationToken token) return; } + // This lock is required. lock (lockObj) { fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; @@ -195,9 +203,9 @@ public override Task ReportAsync(CancellationToken token) } // CPU - Parent process - if (AllAppCpuData.Any(x => x.Id == id)) + if (AllAppCpuData.ContainsKey(id)) { - var parentFrud = AllAppCpuData.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppCpuData[id]; if (hasChildProcs) { @@ -216,9 +224,9 @@ public override Task ReportAsync(CancellationToken token) } // Memory MB - Parent process - if (AllAppMemDataMb.Any(x => x.Id == id)) + if (AllAppMemDataMb.ContainsKey(id)) { - var parentFrud = AllAppMemDataMb.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppMemDataMb[id]; if (hasChildProcs) { @@ -236,9 +244,9 @@ public override Task ReportAsync(CancellationToken token) } // Memory Percent - Parent process - if (AllAppMemDataPercent.Any(x => x.Id == id)) + if (AllAppMemDataPercent.ContainsKey(id)) { - var parentFrud = AllAppMemDataPercent.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppMemDataPercent[id]; if (hasChildProcs) { @@ -256,9 +264,9 @@ public override Task ReportAsync(CancellationToken token) } // TCP Ports - Active - Parent process - if 
(AllAppTotalActivePortsData.Any(x => x.Id == id)) + if (AllAppTotalActivePortsData.ContainsKey(id)) { - var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppTotalActivePortsData[id]; if (hasChildProcs) { @@ -276,9 +284,9 @@ public override Task ReportAsync(CancellationToken token) } // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - Parent process - if (AllAppEphemeralPortsData.Any(x => x.Id == id)) + if (AllAppEphemeralPortsData.ContainsKey(id)) { - var parentFrud = AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppEphemeralPortsData[id]; if (hasChildProcs) { @@ -296,9 +304,9 @@ public override Task ReportAsync(CancellationToken token) } // Allocated (in use) Handles - Parent process - if (AllAppHandlesData.Any(x => x.Id == id)) + if (AllAppHandlesData.ContainsKey(id)) { - var parentFrud = AllAppHandlesData.FirstOrDefault(x => x.Id == id); + var parentFrud = AllAppHandlesData[id]; if (hasChildProcs) { @@ -316,9 +324,9 @@ public override Task ReportAsync(CancellationToken token) } // Child proc info telemetry. 
- if (IsEtwEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) + if (hasChildProcs && MaxChildProcTelemetryDataCount > 0) { - lock (lockObj) + if (IsEtwEnabled) { var data = new { @@ -327,22 +335,19 @@ public override Task ReportAsync(CancellationToken token) ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); } - } - if (IsTelemetryEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) - { - lock (lockObj) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList.ToList(), token); } - } + } }); return Task.CompletedTask; } private void ProcessChildProcs( - ConcurrentQueue> fruds, + ConcurrentDictionary> fruds, ConcurrentQueue childProcessTelemetryDataList, ReplicaOrInstanceMonitoringInfo repOrInst, ApplicationInfo app, @@ -375,7 +380,7 @@ private void ProcessChildProcs( } private (ChildProcessTelemetryData childProcInfo, double Sum) ProcessChildFrudsGetDataSum( - ConcurrentQueue> fruds, + ConcurrentDictionary> fruds, ReplicaOrInstanceMonitoringInfo repOrInst, ApplicationInfo app, CancellationToken token) where T : struct @@ -410,17 +415,17 @@ private void ProcessChildProcs( try { - if (fruds.Any(x => x != null && x.Id.Contains(childProcName))) + if (fruds.Any(x => x.Key.Contains(childProcName))) { - var childFruds = fruds.Where(x => x != null && x.Id.Contains(childProcName)).ToList(); - metric = childFruds[0].Property; + var childFruds = fruds.Where(x => x.Key.Contains(childProcName)).ToList(); + metric = childFruds[0].Value.Property; for (int j = 0; j < childFruds.Count; ++j) { token.ThrowIfCancellationRequested(); var frud = childFruds[j]; - double value = frud.AverageDataValue; + double value = frud.Value.AverageDataValue; sumValues += Math.Round(value, 0); if (IsEtwEnabled || IsTelemetryEnabled) @@ -433,49 +438,49 @@ private void ProcessChildProcs( if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && app.DumpProcessOnError && EnableProcessDumps) { - string prop = 
frud.Property; + string prop = frud.Value.Property; bool dump = false; switch (prop) { case ErrorWarningProperty.TotalCpuTime: // Test error threshold breach for supplied metric. - if (frud.IsUnhealthy(app.CpuErrorLimitPercent)) + if (frud.Value.IsUnhealthy(app.CpuErrorLimitPercent)) { dump = true; } break; case ErrorWarningProperty.TotalMemoryConsumptionMb: - if (frud.IsUnhealthy(app.MemoryErrorLimitMb)) + if (frud.Value.IsUnhealthy(app.MemoryErrorLimitMb)) { dump = true; } break; case ErrorWarningProperty.TotalMemoryConsumptionPct: - if (frud.IsUnhealthy(app.MemoryErrorLimitPercent)) + if (frud.Value.IsUnhealthy(app.MemoryErrorLimitPercent)) { dump = true; } break; case ErrorWarningProperty.TotalActivePorts: - if (frud.IsUnhealthy(app.NetworkErrorActivePorts)) + if (frud.Value.IsUnhealthy(app.NetworkErrorActivePorts)) { dump = true; } break; case ErrorWarningProperty.TotalEphemeralPorts: - if (frud.IsUnhealthy(app.NetworkErrorEphemeralPorts)) + if (frud.Value.IsUnhealthy(app.NetworkErrorEphemeralPorts)) { dump = true; } break; case ErrorWarningProperty.TotalFileHandles: - if (frud.IsUnhealthy(app.ErrorOpenFileHandles)) + if (frud.Value.IsUnhealthy(app.ErrorOpenFileHandles)) { dump = true; } @@ -492,17 +497,7 @@ private void ProcessChildProcs( } // Remove child FRUD from FRUDs. 
- lock (lockObj) - { - var tempQueue = new ConcurrentQueue>(); - - foreach (var f in fruds.Where(fr => fr != frud)) - { - tempQueue.Enqueue(f); - } - - fruds = tempQueue; - } + _ = fruds.TryRemove(frud.Key, out _); } childFruds?.Clear(); @@ -957,12 +952,12 @@ private void SetDumpPath() private Task MonitorDeployedAppsAsync(CancellationToken token) { int capacity = ReplicaOrInstanceList.Count; - AllAppCpuData ??= new ConcurrentQueue>(); - AllAppMemDataMb ??= new ConcurrentQueue>(); - AllAppMemDataPercent ??= new ConcurrentQueue>(); - AllAppTotalActivePortsData ??= new ConcurrentQueue>(); - AllAppEphemeralPortsData ??= new ConcurrentQueue>(); - AllAppHandlesData ??= new ConcurrentQueue>(); + AllAppCpuData ??= new ConcurrentDictionary>(); + AllAppMemDataMb ??= new ConcurrentDictionary>(); + AllAppMemDataPercent ??= new ConcurrentDictionary>(); + AllAppTotalActivePortsData ??= new ConcurrentDictionary>(); + AllAppEphemeralPortsData ??= new ConcurrentDictionary>(); + AllAppHandlesData ??= new ConcurrentDictionary>(); var exceptions = new ConcurrentQueue(); _ = Parallel.For(0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => @@ -979,7 +974,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - ConcurrentQueue<(string procName, int Pid)> procList = null; + ConcurrentDictionary procList = null; if (application?.TargetApp == null && application?.TargetAppType == null) { @@ -1088,66 +1083,62 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) capacity = (int)MonitorDuration.TotalSeconds * 4; } - // Add new resource data structures for each app service process where the metric is specified in configuration for related observation. 
- // 1 thread safe - // 2 or lock - - if (AllAppCpuData.All(list => list?.Id != id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0)) + if (!AllAppCpuData.ContainsKey(id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0)) { - AllAppCpuData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); + _ = AllAppCpuData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); } - if (AllAppCpuData.Any(list => list?.Id == id)) + if (AllAppCpuData.ContainsKey(id)) { checkCpu = true; } - if (AllAppMemDataMb.All(list => list?.Id != id) && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0)) + if (!AllAppMemDataMb.ContainsKey(id) && (application.MemoryErrorLimitMb > 0 || application.MemoryWarningLimitMb > 0)) { - AllAppMemDataMb.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, id, capacity, UseCircularBuffer)); + _ = AllAppMemDataMb.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, id, capacity, UseCircularBuffer)); } - if (AllAppMemDataMb.Any(list => list?.Id == id)) + if (AllAppMemDataMb.ContainsKey(id)) { checkMemMb = true; } - if (AllAppMemDataPercent.All(list => list?.Id != id) && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0)) + if (!AllAppMemDataPercent.ContainsKey(id) && (application.MemoryErrorLimitPercent > 0 || application.MemoryWarningLimitPercent > 0)) { - AllAppMemDataPercent.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, id, capacity, UseCircularBuffer)); + _ = AllAppMemDataPercent.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, id, capacity, UseCircularBuffer)); } - if (AllAppMemDataPercent.Any(list => list?.Id == id)) + if (AllAppMemDataPercent.ContainsKey(id)) { checkMemPct = true; } 
- if (AllAppTotalActivePortsData.All(list => list?.Id != id) && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0)) + if (!AllAppTotalActivePortsData.ContainsKey(id) && (application.NetworkErrorActivePorts > 0 || application.NetworkWarningActivePorts > 0)) { - AllAppTotalActivePortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, id, 1, false)); + _ = AllAppTotalActivePortsData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, id, 1, false)); } - if (AllAppTotalActivePortsData.Any(list => list?.Id == id)) + if (AllAppTotalActivePortsData.ContainsKey(id)) { checkAllPorts = true; } - if (AllAppEphemeralPortsData.All(list => list?.Id != id) && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0)) + if (!AllAppEphemeralPortsData.ContainsKey(id) && (application.NetworkErrorEphemeralPorts > 0 || application.NetworkWarningEphemeralPorts > 0)) { - AllAppEphemeralPortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, id, 1, false)); + _ = AllAppEphemeralPortsData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, id, 1, false)); } - if (AllAppEphemeralPortsData.Any(list => list?.Id == id)) + if (AllAppEphemeralPortsData.ContainsKey(id)) { checkEphemeralPorts = true; } - if (AllAppHandlesData.All(list => list?.Id != id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) + if (!AllAppHandlesData.ContainsKey(id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) { - AllAppHandlesData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, id, 1, false)); + _ = AllAppHandlesData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, id, 1, false)); } - if (AllAppHandlesData.Any(list => list?.Id == id)) + if (AllAppHandlesData.ContainsKey(id)) { checkHandles = true; } @@ 
-1155,24 +1146,24 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) // Get list of child processes of parentProc should they exist. // In order to provide accurate resource usage of an SF service process we need to also account for // any processes (children) that the service process (parent) created/spawned. - procList = new ConcurrentQueue<(string procName, int Pid)>(); + procList = new ConcurrentDictionary(); // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, // then only the parent process will be in this list. - procList.Enqueue((parentProc.ProcessName, parentProc.Id)); + _ = procList.TryAdd(parentProc.ProcessName, parentProc.Id); if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) { for (int k = 0; k < repOrInst.ChildProcesses.Count; ++k) { - procList.Enqueue(repOrInst.ChildProcesses[k]); + _ = procList.TryAdd(repOrInst.ChildProcesses[k].procName, repOrInst.ChildProcesses[k].Pid); } } for (int j = 0; j < procList.Count; ++j) { - int procId = procList.ElementAt(j).Pid; - string procName = procList.ElementAt(j).procName; + int procId = procList.ElementAt(j).Value; + string procName = procList.ElementAt(j).Key; TimeSpan maxDuration = TimeSpan.FromSeconds(1); if (MonitorDuration > TimeSpan.MinValue) @@ -1205,15 +1196,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) { if (procId == parentPid) { - AllAppHandlesData.FirstOrDefault(x => x?.Id == id).Data.Add(handles); + AllAppHandlesData[id].Data.Add(handles); } else { - if (!AllAppHandlesData.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppHandlesData.ContainsKey($"{id}:{procName}")) { - AllAppHandlesData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppHandlesData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, 
UseCircularBuffer)); } - AllAppHandlesData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(handles); + AllAppHandlesData[$"{id}:{procName}"].Data.Add(handles); } } } @@ -1224,16 +1215,16 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) // Parent process (the service process). if (procId == parentPid) { - AllAppTotalActivePortsData.FirstOrDefault(x => x?.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + AllAppTotalActivePortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } else { // Child procs spawned by the parent service process. - if (!AllAppTotalActivePortsData.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppTotalActivePortsData.ContainsKey($"{id}:{procName}")) { - AllAppTotalActivePortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppTotalActivePortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppTotalActivePortsData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + AllAppTotalActivePortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } } @@ -1242,15 +1233,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) { if (procId == parentPid) { - AllAppEphemeralPortsData.FirstOrDefault(x => x?.Id == id).Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppEphemeralPortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } else { - if (!AllAppEphemeralPortsData.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppEphemeralPortsData.ContainsKey($"{id}:{procName}")) { - 
AllAppEphemeralPortsData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppEphemeralPortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppEphemeralPortsData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppEphemeralPortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } } @@ -1276,15 +1267,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppCpuData.FirstOrDefault(x => x?.Id == id).Data.Add(cpu); + AllAppCpuData[id].Data.Add(cpu); } else { - if (!AllAppCpuData.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppCpuData.ContainsKey($"{id}:{procName}")) { - AllAppCpuData.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppCpuData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppCpuData.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(cpu); + AllAppCpuData[$"{id}:{procName}"].Data.Add(cpu); } } } @@ -1300,15 +1291,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppMemDataMb.FirstOrDefault(x => x?.Id == id).Data.Add(processMem); + AllAppMemDataMb[id].Data.Add(processMem); } else { - if (!AllAppMemDataMb.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppMemDataMb.ContainsKey($"{id}:{procName}")) { - AllAppMemDataMb.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppMemDataMb.TryAdd($"{id}:{procName}", new 
FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppMemDataMb.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(processMem); + AllAppMemDataMb[$"{id}:{procName}"].Data.Add(processMem); } } @@ -1328,15 +1319,15 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) if (procId == parentPid) { - AllAppMemDataPercent.FirstOrDefault(x => x?.Id == id).Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataPercent[id].Data.Add(Math.Round(usedPct, 1)); } else { - if (!AllAppMemDataPercent.Any(x => x?.Id == $"{id}:{procName}")) + if (!AllAppMemDataPercent.ContainsKey($"{id}:{procName}")) { - AllAppMemDataPercent.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); + AllAppMemDataPercent.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppMemDataPercent.FirstOrDefault(x => x?.Id == $"{id}:{procName}").Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataPercent[$"{id}:{procName}"].Data.Add(Math.Round(usedPct, 1)); } } } @@ -1601,37 +1592,37 @@ private void CleanUp() ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; - if (AllAppCpuData != null && AllAppCpuData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) + if (AllAppCpuData != null && AllAppCpuData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppCpuData?.Clear(); AllAppCpuData = null; } - if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) + if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppEphemeralPortsData?.Clear(); AllAppEphemeralPortsData = null; } - if (AllAppHandlesData != null && AllAppHandlesData.All(frud => frud != null && 
!frud.ActiveErrorOrWarning)) + if (AllAppHandlesData != null && AllAppHandlesData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppHandlesData?.Clear(); AllAppHandlesData = null; } - if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => frud != null && !frud.ActiveErrorOrWarning)) + if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppMemDataMb?.Clear(); AllAppMemDataMb = null; } - if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => frud != null && !frud.ActiveErrorOrWarning)) + if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppMemDataPercent?.Clear(); AllAppMemDataPercent = null; } - if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => frud != null && !frud.ActiveErrorOrWarning)) + if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) { AllAppTotalActivePortsData?.Clear(); AllAppTotalActivePortsData = null; @@ -1646,59 +1637,59 @@ private void LogAllAppResourceDataToCsv(string appName) } // CPU Time - if (AllAppCpuData.Any(x => x.Id == appName)) + if (AllAppCpuData.ContainsKey(appName)) { CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalCpuTime, "Average", - Math.Round(AllAppCpuData.First(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppCpuData.First(x => x.Key == appName).Value.AverageDataValue)); CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalCpuTime, "Peak", - Math.Round(AllAppCpuData.First(x => x.Id == appName).MaxDataValue)); + Math.Round(AllAppCpuData.First(x => x.Key == appName).Value.MaxDataValue)); } // Memory - MB - if (AllAppMemDataMb.Any(x => x.Id == appName)) + if (AllAppMemDataMb.ContainsKey(appName)) { CsvFileLogger.LogData( fileName, appName, 
ErrorWarningProperty.TotalMemoryConsumptionMb, "Average", - Math.Round(AllAppMemDataMb.First(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppMemDataMb.First(x => x.Key == appName).Value.AverageDataValue)); CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalMemoryConsumptionMb, "Peak", - Math.Round(Convert.ToDouble(AllAppMemDataMb.First(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppMemDataMb.First(x => x.Key == appName).Value.MaxDataValue))); } - if (AllAppMemDataPercent.Any(x => x.Id == appName)) + if (AllAppMemDataPercent.ContainsKey(appName)) { CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalMemoryConsumptionPct, "Average", - Math.Round(AllAppMemDataPercent.First(x => x.Id == appName).AverageDataValue)); + Math.Round(AllAppMemDataPercent.First(x => x.Key == appName).Value.AverageDataValue)); CsvFileLogger.LogData( fileName, appName, ErrorWarningProperty.TotalMemoryConsumptionPct, "Peak", - Math.Round(Convert.ToDouble(AllAppMemDataPercent.FirstOrDefault(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppMemDataPercent.FirstOrDefault(x => x.Key == appName).Value.MaxDataValue))); } - if (AllAppTotalActivePortsData.Any(x => x.Id == appName)) + if (AllAppTotalActivePortsData.ContainsKey(appName)) { // Network CsvFileLogger.LogData( @@ -1706,10 +1697,10 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalActivePorts, "Total", - Math.Round(Convert.ToDouble(AllAppTotalActivePortsData.First(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppTotalActivePortsData.First(x => x.Key == appName).Value.MaxDataValue))); } - if (AllAppEphemeralPortsData.Any(x => x.Id == appName)) + if (AllAppEphemeralPortsData.ContainsKey(appName)) { // Network CsvFileLogger.LogData( @@ -1717,10 +1708,10 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalEphemeralPorts, "Total", - 
Math.Round(Convert.ToDouble(AllAppEphemeralPortsData.First(x => x.Id == appName).MaxDataValue))); + Math.Round(Convert.ToDouble(AllAppEphemeralPortsData.First(x => x.Key == appName).Value.MaxDataValue))); } - if (AllAppHandlesData.Any(x => x.Id == appName)) + if (AllAppHandlesData.ContainsKey(appName)) { // Handles CsvFileLogger.LogData( @@ -1728,7 +1719,7 @@ private void LogAllAppResourceDataToCsv(string appName) appName, ErrorWarningProperty.TotalFileHandles, "Total", - Math.Round(AllAppHandlesData.First(x => x.Id == appName).MaxDataValue)); + Math.Round(AllAppHandlesData.First(x => x.Key == appName).Value.MaxDataValue)); } DataTableFileLogger.Flush(); diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index b14b1454..f6d2e840 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -25,8 +25,8 @@ namespace FabricObserver.Observers public class ContainerObserver : ObserverBase { private const int MaxProcessExitWaitTimeMS = 60000; - private ConcurrentQueue> allCpuDataPercentage; - private ConcurrentQueue> allMemDataMB; + private ConcurrentDictionary> allCpuDataPercentage; + private ConcurrentDictionary> allMemDataMB; // userTargetList is the list of ApplicationInfo objects representing apps supplied in configuration. private List userTargetList; @@ -64,16 +64,22 @@ public override async Task ObserveAsync(CancellationToken token) } Token = token; - MonitorContainers(); - await ReportAsync(token); + + if (MonitorContainers()) + { + await ReportAsync(token); + } + CleanUp(); runDurationTimer.Stop(); RunDuration = runDurationTimer.Elapsed; + if (EnableVerboseLogging) { ObserverLogger.LogInfo($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? 
"with" : "without")} " + $"Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); } + LastRunDateTime = DateTime.Now; } @@ -90,7 +96,11 @@ public override Task ReportAsync(CancellationToken token) { token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList.ElementAt(i); + if (!ReplicaOrInstanceList.TryDequeue(out ReplicaOrInstanceMonitoringInfo repOrInst)) + { + return; + } + ApplicationInfo app = deployedTargetList.First( a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); @@ -98,8 +108,8 @@ public override Task ReportAsync(CancellationToken token) string serviceName = repOrInst.ServiceName.OriginalString.Replace(app.TargetApp, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; - var cpuFrudInst = allCpuDataPercentage.FirstOrDefault(cpu => cpu.Id == cpuId); - var memFrudInst = allMemDataMB.FirstOrDefault(mem => mem.Id == memId); + var cpuFrudInst = allCpuDataPercentage[cpuId]; + var memFrudInst = allMemDataMB[memId]; if (EnableCsvLogging) { @@ -114,7 +124,7 @@ public override Task ReportAsync(CancellationToken token) CsvFileLogger.DataLogFolder = serviceName; } - // Log resource usage data to local CSV file(s). + // Log resource usage data to local CSV file(s). locks are required here. 
// CPU Time lock (lockObj) { @@ -139,27 +149,23 @@ public override Task ReportAsync(CancellationToken token) } // Report -> Send Telemetry/Write ETW/Create SF Health Warnings (if threshold breach) - lock (lockObj) - { - ProcessResourceDataReportHealth( - cpuFrudInst, - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - timeToLive, - HealthReportType.Application, - repOrInst); - } - - lock (lockObj) - { - ProcessResourceDataReportHealth( - memFrudInst, - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - timeToLive, - HealthReportType.Application, - repOrInst); - } + + ProcessResourceDataReportHealth( + cpuFrudInst, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + timeToLive, + HealthReportType.Application, + repOrInst); + + ProcessResourceDataReportHealth( + memFrudInst, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + timeToLive, + HealthReportType.Application, + repOrInst); + }); return Task.CompletedTask; @@ -373,7 +379,7 @@ private async Task InitializeAsync(CancellationToken token) return true; } - private void MonitorContainers() + private bool MonitorContainers() { /* docker stats --no-stream --format "table {{.Container}}\t{{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}" @@ -388,8 +394,8 @@ 9e380a42233c sf-243-2d2f9fde-fb93-4e77-a5d2-df1600000000_3161e2ee-3d8f-2d45-b1 fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc96-cd4e596b6b6a 0.05% 27.19MiB / 15.45GiB */ - allCpuDataPercentage ??= new ConcurrentQueue>(); - allMemDataMB ??= new ConcurrentQueue>(); + allCpuDataPercentage ??= new ConcurrentDictionary>(); + allMemDataMB ??= new ConcurrentDictionary>(); try { @@ -443,7 +449,7 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc } ObserverLogger.LogWarning($"docker process has run too long ({MaxProcessExitWaitTimeMS} ms). 
Aborting."); - return; + return false; } int exitStatus = p.ExitCode; @@ -503,25 +509,26 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc }); } - return; + return false; } _ = Parallel.For(0, ReplicaOrInstanceList.Count, (i, state) => { + // Do not TryDequeue here as ReplicaOrInstanceList is used in other functions (like ReportAsync). var repOrInst = ReplicaOrInstanceList.ElementAt(i); string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; string containerId = string.Empty; - if (!allCpuDataPercentage.Any(frud => frud.Id == cpuId)) + if (!allCpuDataPercentage.ContainsKey(cpuId)) { - allCpuDataPercentage.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, 1, false)); + _ = allCpuDataPercentage.TryAdd(cpuId, new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, cpuId, 1, false)); } - if (!allMemDataMB.Any(frud => frud.Id == memId)) + if (!allMemDataMB.ContainsKey(memId)) { - allMemDataMB.Enqueue(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); + _ = allMemDataMB.TryAdd(memId, new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, memId, 1, false)); } foreach (string line in output) @@ -556,11 +563,11 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc // CPU (%) double cpu_percent = double.TryParse(stats[2].Replace("%", ""), out double cpuPerc) ? cpuPerc : 0; - allCpuDataPercentage?.FirstOrDefault(f => f.Id == cpuId)?.Data.Add(cpu_percent); + allCpuDataPercentage[cpuId].Data.Add(cpu_percent); // Memory (MiB) double mem_working_set_mb = double.TryParse(stats[3].Replace("MiB", ""), out double memMib) ? 
memMib : 0; - allMemDataMB?.FirstOrDefault(f => f.Id == memId)?.Data.Add(mem_working_set_mb); + allMemDataMB[memId].Data.Add(mem_working_set_mb); break; } @@ -571,6 +578,8 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc ObserverLogger.LogError($"Exception in MonitorContainers:{Environment.NewLine}{e}"); throw; } + + return true; } private bool SetConfigurationFilePath() diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index bbfd2487..8bc45a37 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -34,11 +34,11 @@ public class FabricSystemObserver : ObserverBase private Stopwatch stopwatch; // Health Report data container - For use in analysis to determine health state. - private ConcurrentQueue> allCpuData; - private ConcurrentQueue> allMemData; - private ConcurrentQueue> allActiveTcpPortData; - private ConcurrentQueue> allEphemeralTcpPortData; - private ConcurrentQueue> allHandlesData; + private ConcurrentDictionary> allCpuData; + private ConcurrentDictionary> allMemData; + private ConcurrentDictionary> allActiveTcpPortData; + private ConcurrentDictionary> allEphemeralTcpPortData; + private ConcurrentDictionary> allHandlesData; // Windows only. (EventLog). 
private List evtRecordList = null; @@ -200,24 +200,24 @@ public override Task ReportAsync(CancellationToken token) if (allMemData != null) { - memHandlesInfo += $"Fabric memory: {allMemData.FirstOrDefault(x => x.Id == "Fabric")?.AverageDataValue} MB{Environment.NewLine}" + - $"FabricDCA memory: {allMemData.FirstOrDefault(x => x.Id.Contains("FabricDCA"))?.AverageDataValue} MB{Environment.NewLine}" + - $"FabricGateway memory: {allMemData.FirstOrDefault(x => x.Id.Contains("FabricGateway"))?.AverageDataValue} MB{Environment.NewLine}" + + memHandlesInfo += $"Fabric memory: {allMemData["Fabric"].AverageDataValue} MB{Environment.NewLine}" + + $"FabricDCA memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue} MB{Environment.NewLine}" + + $"FabricGateway memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue} MB{Environment.NewLine}" + // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? 
- $"FabricHost memory: {allMemData.FirstOrDefault(x => x.Id == "FabricHost")?.AverageDataValue} MB{Environment.NewLine}" : string.Empty); + $"FabricHost memory: {allMemData["FabricHost"].AverageDataValue} MB{Environment.NewLine}" : string.Empty); } if (allHandlesData != null) { - memHandlesInfo += $"Fabric file handles: {allHandlesData.FirstOrDefault(x => x.Id == "Fabric")?.AverageDataValue}{Environment.NewLine}" + - $"FabricDCA file handles: {allHandlesData.FirstOrDefault(x => x.Id.Contains("FabricDCA"))?.AverageDataValue}{Environment.NewLine}" + - $"FabricGateway file handles: {allHandlesData.FirstOrDefault(x => x.Id.Contains("FabricGateway"))?.AverageDataValue}{Environment.NewLine}" + + memHandlesInfo += $"Fabric file handles: {allHandlesData["Fabric"].AverageDataValue}{Environment.NewLine}" + + $"FabricDCA file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue}{Environment.NewLine}" + + $"FabricGateway file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue}{Environment.NewLine}" + // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? - $"FabricHost file handles: {allHandlesData.FirstOrDefault(x => x.Id == "FabricHost")?.AverageDataValue}" : string.Empty); + $"FabricHost file handles: {allHandlesData["FabricHost"]?.AverageDataValue}" : string.Empty); } // Informational report. 
@@ -545,79 +545,84 @@ private void Initialize() // CPU data if (allCpuData == null && (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0)) { - allCpuData = new ConcurrentQueue>(); + allCpuData = new ConcurrentDictionary>(); foreach (var proc in processWatchList) { - allCpuData.Enqueue( - new FabricResourceUsageData( - ErrorWarningProperty.TotalCpuTime, - proc, - frudCapacity, - UseCircularBuffer)); + _ = allCpuData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalCpuTime, + proc, + frudCapacity, + UseCircularBuffer)); } } // Memory data if (allMemData == null && (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0)) { - allMemData = new ConcurrentQueue>(); + allMemData = new ConcurrentDictionary>(); foreach (var proc in processWatchList) { - allMemData.Enqueue( - new FabricResourceUsageData( - ErrorWarningProperty.TotalMemoryConsumptionMb, - proc, - frudCapacity, - UseCircularBuffer)); + _ = allMemData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalMemoryConsumptionMb, + proc, + frudCapacity, + UseCircularBuffer)); } } // Ports if (allActiveTcpPortData == null && (ActiveTcpPortCountError > 0 || ActiveTcpPortCountWarning > 0)) { - allActiveTcpPortData = new ConcurrentQueue>(); + allActiveTcpPortData = new ConcurrentDictionary>(); foreach (var proc in processWatchList) { - allActiveTcpPortData.Enqueue( - new FabricResourceUsageData( - ErrorWarningProperty.TotalActivePorts, - proc, - frudCapacity, - UseCircularBuffer)); + _ = allActiveTcpPortData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalActivePorts, + proc, + frudCapacity, + UseCircularBuffer)); } } if (allEphemeralTcpPortData == null && (ActiveEphemeralPortCountError > 0 || ActiveEphemeralPortCountWarning > 0)) { - allEphemeralTcpPortData = new ConcurrentQueue>(); + allEphemeralTcpPortData = new ConcurrentDictionary>(); foreach (var proc in processWatchList) { - allEphemeralTcpPortData.Enqueue( - new 
FabricResourceUsageData( - ErrorWarningProperty.TotalEphemeralPorts, - proc, - frudCapacity, - UseCircularBuffer)); + _ = allEphemeralTcpPortData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalEphemeralPorts, + proc, + frudCapacity, + UseCircularBuffer)); } } // Handles if (allHandlesData == null && (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0)) { - allHandlesData = new ConcurrentQueue>(); + allHandlesData = new ConcurrentDictionary>(); foreach (var proc in processWatchList) { - allHandlesData.Enqueue( - new FabricResourceUsageData( - ErrorWarningProperty.TotalFileHandles, - proc, - frudCapacity, - UseCircularBuffer)); + _ = allHandlesData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalFileHandles, + proc, + frudCapacity, + UseCircularBuffer)); } } @@ -800,7 +805,7 @@ private async Task GetProcessInfoAsync(string procName) if (ActiveTcpPortCountError > 0 || ActiveTcpPortCountWarning > 0) { - allActiveTcpPortData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(activePortCount); + allActiveTcpPortData[dotnetArg].Data.Add(activePortCount); } // Ports - Active TCP Ephemeral @@ -811,7 +816,7 @@ private async Task GetProcessInfoAsync(string procName) if (ActiveEphemeralPortCountError > 0 || ActiveEphemeralPortCountWarning > 0) { - allEphemeralTcpPortData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(activeEphemeralPortCount); + allEphemeralTcpPortData[dotnetArg].Data.Add(activeEphemeralPortCount); } // Allocated Handles @@ -831,7 +836,7 @@ private async Task GetProcessInfoAsync(string procName) // Handles/FDs if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) { - allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handles); + allHandlesData[dotnetArg].Data.Add(handles); } CpuUsage cpuUsage = new CpuUsage(); @@ -840,7 +845,7 @@ private async Task GetProcessInfoAsync(string procName) if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) { float mem = 
ProcessInfoProvider.Instance.GetProcessWorkingSetMb(process.Id, true); - allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); + allMemData[dotnetArg].Data.Add(mem); } TimeSpan duration = TimeSpan.FromSeconds(1); @@ -862,7 +867,7 @@ private async Task GetProcessInfoAsync(string procName) if (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0) { int cpu = (int)cpuUsage.GetCpuUsagePercentageProcess(process.Id); - allCpuData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(cpu); + allCpuData[dotnetArg].Data.Add(cpu); } await Task.Delay(250, Token).ConfigureAwait(true); @@ -907,7 +912,7 @@ private async Task GetProcessInfoAsync(string procName) } private void ProcessResourceDataList( - ConcurrentQueue> data, + ConcurrentDictionary> data, T thresholdError, T thresholdWarning) where T : struct @@ -919,11 +924,11 @@ private void ProcessResourceDataList( fileName = $"FabricSystemServices{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; } - _ = Parallel.For (0, data.Count, ObserverManager.ParallelOptions, (i, state) => + _ = Parallel.ForEach (data, ObserverManager.ParallelOptions, (state) => { Token.ThrowIfCancellationRequested(); - var dataItem = data.ElementAt(i); + var dataItem = state.Value; if (dataItem.Data.Count == 0 || dataItem.AverageDataValue <= 0) { @@ -932,7 +937,7 @@ private void ProcessResourceDataList( if (EnableCsvLogging) { - var propertyName = data.First().Property; + var propertyName = dataItem.Property; /* Log average data value to long-running store (CSV).*/ @@ -993,31 +998,31 @@ private void CleanUp() { processWatchList = null; - if (allCpuData != null && !allCpuData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) + if (allCpuData != null && !allCpuData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allCpuData?.Clear(); allCpuData = null; } - if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud != null && 
frud.ActiveErrorOrWarning)) + if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allEphemeralTcpPortData?.Clear(); allEphemeralTcpPortData = null; } - if (allHandlesData != null && !allHandlesData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) + if (allHandlesData != null && !allHandlesData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allHandlesData?.Clear(); allHandlesData = null; } - if (allMemData != null && !allMemData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) + if (allMemData != null && !allMemData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allMemData?.Clear(); allMemData = null; } - if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud != null && frud.ActiveErrorOrWarning)) + if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allActiveTcpPortData?.Clear(); allActiveTcpPortData = null; diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 50fdaa37..336c609b 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -32,14 +32,15 @@ public class ObserverManager : IDisposable private readonly string nodeName; private readonly List observers; private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); + private readonly TimeSpan ForcedGCInterval = TimeSpan.FromMinutes(15); private readonly CancellationToken token; private readonly IEnumerable serviceCollection; + private volatile bool shutdownSignaled; private bool disposed; private bool isConfigurationUpdateInProgress; private CancellationTokenSource cts; private CancellationTokenSource linkedSFRuntimeObserverTokenSource; private DateTime StartDateTime; - private volatile bool shutdownSignaled; // Folks often use their own version numbers. This is for internal diagnostic telemetry. 
private const string InternalVersionNumber = "3.1.18"; @@ -94,8 +95,7 @@ public string ApplicationName public static HealthState ObserverFailureHealthStateLevel { - get; - set; + get; set; } = HealthState.Unknown; /// @@ -142,11 +142,6 @@ private DateTime LastForcedGCDateTime get; set; } - private TimeSpan ForcedGCInterval - { - get; set; - } = TimeSpan.FromMinutes(15); - private DateTime LastTelemetrySendDate { get; set; @@ -462,15 +457,13 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); healthReporter.ReportHealthToServiceFabric(healthReport); - await Task.Delay(250).ConfigureAwait(true); + await Task.Delay(50).ConfigureAwait(true); } } catch (FabricException) { } - - await Task.Delay(250).ConfigureAwait(true); } } else @@ -497,7 +490,7 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); healthReporter.ReportHealthToServiceFabric(healthReport); - await Task.Delay(250).ConfigureAwait(true); + await Task.Delay(50).ConfigureAwait(true); } } @@ -505,8 +498,6 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf { } - - await Task.Delay(250).ConfigureAwait(true); } obs.HasActiveFabricErrorOrWarning = false; diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 6cf9cd24..6fa64914 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -323,12 +323,7 @@ - - -
+
@@ -338,4 +333,19 @@
+ \ No newline at end of file diff --git a/FabricObserverApp/FabricObserverApp.sfproj b/FabricObserverApp/FabricObserverApp.sfproj index c0a3a27b..e815e38d 100644 --- a/FabricObserverApp/FabricObserverApp.sfproj +++ b/FabricObserverApp/FabricObserverApp.sfproj @@ -19,7 +19,6 @@ - @@ -31,6 +30,7 @@ + diff --git a/FabricObserverTests/FabricObserverTests.csproj b/FabricObserverTests/FabricObserverTests.csproj index 4cbec7a5..c9f612ab 100644 --- a/FabricObserverTests/FabricObserverTests.csproj +++ b/FabricObserverTests/FabricObserverTests.csproj @@ -44,9 +44,9 @@ - - - + + + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 50992b9b..8d99133b 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -63,7 +63,7 @@ private static readonly StatelessServiceContext context private static readonly bool isSFRuntimePresentOnTestMachine; private static readonly CancellationToken token = new CancellationToken(); - private static readonly FabricClient fabricClient = new FabricClient(FabricClientRole.User); + private static readonly FabricClient fabricClient = new FabricClient(); static ObserverTest() { @@ -247,7 +247,7 @@ public void OSObserver_Constructor_Test() [TestMethod] public void SFConfigurationObserver_Constructor_Test() { - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); ObserverManager.FabricServiceContext = context; ObserverManager.FabricClientInstance = client; @@ -322,7 +322,7 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -412,7 +412,7 @@ public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() } var startDateTime 
= DateTime.Now; - var client = new FabricClient(FabricClientRole.User); + var client = new FabricClient(); ClusterObserverManager.FabricServiceContext = context; ClusterObserverManager.FabricClientInstance = client; @@ -442,7 +442,7 @@ public async Task CertificateObserver_validCerts() try { var startDateTime = DateTime.Now; - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); ObserverManager.FabricServiceContext = context; ObserverManager.FabricClientInstance = client; @@ -497,7 +497,7 @@ public async Task CertificateObserver_expiredAndexpiringCerts() return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -560,7 +560,7 @@ public async Task NodeObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Fa return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -599,7 +599,7 @@ public async Task NodeObserver_Negative_Integer_CPU_Mem_Ports_Firewalls_Values_N return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -639,7 +639,7 @@ public async Task NodeObserver_Negative_Integer_Thresholds_CPU_Mem_Ports_Firewal return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -686,7 +686,7 @@ public async Task OSObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarnin return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -736,7 
+736,7 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarn return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -786,7 +786,7 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -846,7 +846,7 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_IsHealthy_NoW return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -881,7 +881,7 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFi return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -974,7 +974,7 @@ public async Task SFConfigurationObserver_ObserveAsync_Successful_Observer_IsHea return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -1025,7 +1025,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. 
@@ -1082,7 +1082,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var startDateTime = DateTime.Now; ObserverManager.FabricServiceContext = context; @@ -1130,7 +1130,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. @@ -1185,7 +1185,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. @@ -1240,7 +1240,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. @@ -1295,7 +1295,7 @@ public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Un return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. 
@@ -1349,7 +1349,7 @@ public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Thresho return; } - using var client = new FabricClient(FabricClientRole.User); + using var client = new FabricClient(); var nodeList = await client.QueryManager.GetNodeListAsync().ConfigureAwait(true); // This is meant to be run on your dev machine's one node test cluster. @@ -1406,18 +1406,6 @@ private static bool IsLocalSFRuntimePresent() } } - private static async Task WaitAsync(Func predicate, int timeoutInMilliseconds) - { - var stopwatch = Stopwatch.StartNew(); - - while (stopwatch.Elapsed < TimeSpan.FromMilliseconds(timeoutInMilliseconds) && !predicate()) - { - await Task.Delay(1).ConfigureAwait(false); - } - - return predicate(); - } - private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) { // Clear any existing user app, node or fabric:/System app Test Health Reports. @@ -1433,7 +1421,7 @@ private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) }; var logger = new Logger("TestCleanUp"); - var client = new FabricClient(FabricClientRole.User); + var client = new FabricClient(); // App reports if (obs is {HasActiveFabricErrorOrWarning: true} && obs.ObserverName != ObserverConstants.NetworkObserverName) @@ -1463,11 +1451,12 @@ private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) var healthReporter = new ObserverHealthReporter(logger, client); healthReporter.ReportHealthToServiceFabric(healthReport); - Thread.Sleep(250); + Thread.Sleep(50); } } catch (FabricException) { + } } } @@ -1478,13 +1467,11 @@ private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) if (sysAppHealth != null) { - foreach (var evt in sysAppHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("FabricSystemObserver"))) + foreach (var evt in sysAppHealth.HealthEvents?.Where( + s => s.HealthInformation.SourceId.Contains("FabricSystemObserver") + && 
(s.HealthInformation.HealthState == HealthState.Error + || s.HealthInformation.HealthState == HealthState.Warning))) { - if (evt.HealthInformation.HealthState == HealthState.Ok) - { - continue; - } - healthReport.AppName = new Uri("fabric:/System"); healthReport.Property = evt.HealthInformation.Property; healthReport.SourceId = evt.HealthInformation.SourceId; @@ -1492,17 +1479,18 @@ private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) var healthReporter = new ObserverHealthReporter(logger, client); healthReporter.ReportHealthToServiceFabric(healthReport); - Thread.Sleep(250); + Thread.Sleep(50); } } // Node reports var nodeHealth = await client.HealthManager.GetNodeHealthAsync(context.NodeContext.NodeName).ConfigureAwait(false); - var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("NodeObserver") - || s.HealthInformation.SourceId.Contains("DiskObserver") - && (s.HealthInformation.HealthState == HealthState.Error - || s.HealthInformation.HealthState == HealthState.Warning)); + var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where( + s => s.HealthInformation.SourceId.Contains("NodeObserver") + || s.HealthInformation.SourceId.Contains("DiskObserver") + && (s.HealthInformation.HealthState == HealthState.Error + || s.HealthInformation.HealthState == HealthState.Warning)); healthReport.ReportType = HealthReportType.Node; @@ -1516,7 +1504,7 @@ private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) var healthReporter = new ObserverHealthReporter(logger, client); healthReporter.ReportHealthToServiceFabric(healthReport); - Thread.Sleep(250); + Thread.Sleep(50); } } } diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index 36eecc63..e4993892 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -13,7 +13,7 @@ - + diff --git 
a/TelemetryLib/TelemetryEvents.cs b/TelemetryLib/TelemetryEvents.cs index 347f3dde..0b50e7c7 100644 --- a/TelemetryLib/TelemetryEvents.cs +++ b/TelemetryLib/TelemetryEvents.cs @@ -63,7 +63,7 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat // ETW if (isEtwEnabled) { - serviceEventSource.InternalFODataEvent(new { FOInternalTelemtryData = JsonConvert.SerializeObject(foData) }); + serviceEventSource.InternalFODataEvent(new { FOInternalTelemetryData = JsonConvert.SerializeObject(foData) }); } string nodeHashString = string.Empty; From 963a684edfe141713b7935a390fced8a26f8113a Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 27 Sep 2021 16:22:43 -0700 Subject: [PATCH 14/35] comments --- FabricObserver/Observers/AppObserver.cs | 3 --- 1 file changed, 3 deletions(-) diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 8b576471..fbd95382 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -29,9 +29,6 @@ namespace FabricObserver.Observers // in AppObserver.config.json. This observer will also emit telemetry (ETW, LogAnalytics/AppInsights) if enabled in Settings.xml (ObserverManagerConfiguration) and ApplicationManifest.xml (AppObserverEnableEtw). public class AppObserver : ObserverBase { - // Health Report data containers - For use in analysis to determine health state. - // These lists are cleared after each healthy iteration. - // Change this data structure to concurrentqueue.... 
private ConcurrentDictionary> AllAppCpuData; private ConcurrentDictionary> AllAppMemDataMb; private ConcurrentDictionary> AllAppMemDataPercent; From 3527316dbb23b97d5911ca027655d03ce16024e9 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 27 Sep 2021 16:23:44 -0700 Subject: [PATCH 15/35] removed TOTHINK --- FabricObserver/Observers/AppObserver.cs | 3 --- 1 file changed, 3 deletions(-) diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index fbd95382..4e6a72b6 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -35,9 +35,6 @@ public class AppObserver : ObserverBase private ConcurrentDictionary> AllAppTotalActivePortsData; private ConcurrentDictionary> AllAppEphemeralPortsData; private ConcurrentDictionary> AllAppHandlesData; - - // TOTHINK: experiment with ConcurrentDictionary instead of queues... Why are you using queues in the first place? - // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. private List userTargetList; From 6d5570373a048714be01ced548872e2920c2d3c8 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 28 Sep 2021 13:01:37 -0700 Subject: [PATCH 16/35] updated yaml.. 
--- FOAzurePipeline.yaml | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/FOAzurePipeline.yaml b/FOAzurePipeline.yaml index 71ff6e3c..df4988d7 100644 --- a/FOAzurePipeline.yaml +++ b/FOAzurePipeline.yaml @@ -1,18 +1,11 @@ pool: name: Hosted Windows 2019 with VS2019 - demands: - - msbuild - - visualstudio steps: -- task: VSBuild@1 - displayName: 'Build TelemetryLib.csproj' +- task: NuGetToolInstaller@1 + displayName: 'Use NuGet ' inputs: - solution: TelemetryLib/TelemetryLib.csproj - msbuildArgs: '/property:AppInsightsKey="AIF-58ef8eab-a250-4b11-aea8-36435e5be1a7" /restore' - platform: x64 - configuration: Release - msbuildArchitecture: x64 + checkLatest: true - task: PowerShell@2 displayName: 'Build FabricObserver' @@ -49,30 +42,45 @@ steps: inputs: targetType: filePath filePath: './Build-ClusterObserver.ps1' + enabled: false - task: PowerShell@2 displayName: 'Build Nuget Packages - CO' inputs: targetType: filePath filePath: './Build-CONugetPackages.ps1' + enabled: false - task: PublishBuildArtifacts@1 displayName: 'Publish Nuget Packages - CO' inputs: PathtoPublish: '$(Build.SourcesDirectory)/bin/Release/ClusterObserver/Nugets' ArtifactName: ClusterObserverNugets + enabled: false - task: PowerShell@2 displayName: 'Build Service Fabric Packages - CO' inputs: targetType: filePath filePath: './Build-COSFPkgs.ps1' + enabled: false - task: PublishBuildArtifacts@1 displayName: 'Publish Service Fabric Packages - CO' inputs: PathtoPublish: '$(Build.SourcesDirectory)/bin/Release/ClusterObserver/SFPkgs' ArtifactName: ClusterObserverSFPackage + enabled: false + +- task: VSTest@2 + displayName: 'VsTest - FabricObserver Unit Tests' + inputs: + testAssemblyVer2: '$(Parameters.testAssemblyVer2)' + vsTestVersion: 16.0 + runTestsInIsolation: false + publishRunAttachments: false + rerunFailedTests: true + enabled: false - task: PowerShell@2 displayName: 'Build FabricObserverWebApi' @@ -107,4 +115,4 @@ steps: inputs: PathtoPublish: 
'$(Build.SourcesDirectory)/bin/Release/FabricObserverWeb/SFPkgs' ArtifactName: FabricObserverSFPackage - enabled: false + enabled: false \ No newline at end of file From 008fc8152a288c9fba37adedf02b19f13739b454 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 28 Sep 2021 13:05:25 -0700 Subject: [PATCH 17/35] Default AppManifest --- .../ApplicationPackageRoot/ApplicationManifest.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 1ef33ac1..7852ea14 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -336,10 +336,10 @@ - + + From fa9b6189cff18366358c08946334cd9d617e70a9 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 30 Sep 2021 14:54:05 -0700 Subject: [PATCH 18/35] Bug fixes, concurrent child process monitoring --- FabricObserver.Extensibility/ObserverBase.cs | 2 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 79 ++-- FabricObserver/Observers/AppObserver.cs | 370 ++++++++++-------- FabricObserver/Observers/OSObserver.cs | 9 + .../ApplicationManifest.xml | 28 +- TelemetryLib/TelemetryEvents.cs | 3 +- TelemetryLib/TelemetryLib.csproj | 8 +- 7 files changed, 273 insertions(+), 226 deletions(-) diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 7b1c2d48..a224e9a2 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -27,7 +27,7 @@ namespace FabricObserver.Observers { public abstract class ObserverBase : IObserver { - private const int TtlAddMinutes = 5; + private const int TtlAddMinutes = 1; private const string FabricSystemAppName = "fabric:/System"; private bool disposed; private Dictionary ServiceDumpCountDictionary; diff --git 
a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 2e493328..96e640b8 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -122,56 +122,64 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService { List<(string ProcName, int Pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].Pid); - if (c1 != null && c1.Count > 0) + if (c1 == null || c1.Count <= 0) { - childProcesses.AddRange(c1); + continue; + } + + childProcesses.AddRange(c1); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int j = 0; j < c1.Count; ++j) + { + List<(string ProcName, int Pid)> c2 = TupleGetChildProcessInfo(c1[j].Pid); + + if (c2 == null || c2.Count <= 0) + { + continue; + } + + childProcesses.AddRange(c2); if (childProcesses.Count >= MaxDescendants) { return childProcesses.Take(MaxDescendants).ToList(); } - for (int j = 0; j < c1.Count; ++j) + for (int k = 0; k < c2.Count; ++k) { - List<(string ProcName, int Pid)> c2 = TupleGetChildProcessInfo(c1[j].Pid); + List<(string ProcName, int Pid)> c3 = TupleGetChildProcessInfo(c2[k].Pid); - if (c2 != null && c2.Count > 0) + if (c3 == null || c3.Count <= 0) { - childProcesses.AddRange(c2); + continue; + } - if (childProcesses.Count >= MaxDescendants) - { - return childProcesses.Take(MaxDescendants).ToList(); - } + childProcesses.AddRange(c3); - for (int k = 0; k < c2.Count; ++k) - { - List<(string ProcName, int Pid)> c3 = TupleGetChildProcessInfo(c2[k].Pid); - - if (c3 != null && c3.Count > 0) - { - childProcesses.AddRange(c3); + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } - if (childProcesses.Count >= MaxDescendants) - { - return 
childProcesses.Take(MaxDescendants).ToList(); - } + for (int l = 0; l < c3.Count; ++l) + { + List<(string ProcName, int Pid)> c4 = TupleGetChildProcessInfo(c3[l].Pid); - for (int l = 0; l < c3.Count; ++l) - { - List<(string ProcName, int Pid)> c4 = TupleGetChildProcessInfo(c3[l].Pid); + if (c4 == null || c4.Count <= 0) + { + continue; + } - if (c4 != null && c4.Count > 0) - { - childProcesses.AddRange(c4); + childProcesses.AddRange(c4); - if (childProcesses.Count >= MaxDescendants) - { - return childProcesses.Take(MaxDescendants).ToList(); - } - } - } - } + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); } } } @@ -188,6 +196,7 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return null; } + string[] ignoreProcessList = new string[] { "conhost.exe", "csrss.exe", "svchost.exe", "wininit.exe" }; List<(string procName, int pid)> childProcesses = null; string query = $"select caption,processid from win32_process where parentprocessid = {processId}"; @@ -213,7 +222,7 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService continue; } - if (childProcessNameObj.ToString() == "conhost.exe") + if (ignoreProcessList.Contains(childProcessNameObj.ToString())) { continue; } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 4e6a72b6..c00b8e47 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -308,13 +308,13 @@ public override Task ReportAsync(CancellationToken token) } ProcessResourceDataReportHealth( - parentFrud, - app.ErrorOpenFileHandles, - app.WarningOpenFileHandles, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.ErrorOpenFileHandles, + app.WarningOpenFileHandles, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && 
EnableProcessDumps); } // Child proc info telemetry. @@ -945,6 +945,8 @@ private void SetDumpPath() private Task MonitorDeployedAppsAsync(CancellationToken token) { + Stopwatch execTimer = Stopwatch.StartNew(); + int capacity = ReplicaOrInstanceList.Count; AllAppCpuData ??= new ConcurrentDictionary>(); AllAppMemDataMb ??= new ConcurrentDictionary>(); @@ -961,14 +963,13 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) var repOrInst = ReplicaOrInstanceList.ElementAt(i); var timer = new Stopwatch(); int parentPid = (int)repOrInst.HostProcessId; - var cpuUsage = new CpuUsage(); bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false; var application = deployedTargetList?.First( app => app?.TargetApp?.ToLower() == repOrInst.ApplicationName?.OriginalString.ToLower() || !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - ConcurrentDictionary procList = null; + ConcurrentDictionary procs = null; if (application?.TargetApp == null && application?.TargetAppType == null) { @@ -1079,7 +1080,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) if (!AllAppCpuData.ContainsKey(id) && (application.CpuErrorLimitPercent > 0 || application.CpuWarningLimitPercent > 0)) { - _ = AllAppCpuData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); + _ = AllAppCpuData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, id, capacity, UseCircularBuffer)); } if (AllAppCpuData.ContainsKey(id)) @@ -1091,7 +1092,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) { _ = AllAppMemDataMb.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, id, capacity, UseCircularBuffer)); } - + if (AllAppMemDataMb.ContainsKey(id)) { checkMemMb = true; @@ -1111,7 +1112,7 @@ private Task 
MonitorDeployedAppsAsync(CancellationToken token) { _ = AllAppTotalActivePortsData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, id, 1, false)); } - + if (AllAppTotalActivePortsData.ContainsKey(id)) { checkAllPorts = true; @@ -1121,7 +1122,7 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) { _ = AllAppEphemeralPortsData.TryAdd(id, new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, id, 1, false)); } - + if (AllAppEphemeralPortsData.ContainsKey(id)) { checkEphemeralPorts = true; @@ -1137,222 +1138,247 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) checkHandles = true; } + // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. + if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) + { + return; + } + // Get list of child processes of parentProc should they exist. // In order to provide accurate resource usage of an SF service process we need to also account for // any processes (children) that the service process (parent) created/spawned. - procList = new ConcurrentDictionary(); + procs = new ConcurrentDictionary(); // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, - // then only the parent process will be in this list. - _ = procList.TryAdd(parentProc.ProcessName, parentProc.Id); + // then only the parent process will be in this dictionary. 
+ _ = procs.TryAdd(parentProc.ProcessName, parentProc.Id); if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) { for (int k = 0; k < repOrInst.ChildProcesses.Count; ++k) { - _ = procList.TryAdd(repOrInst.ChildProcesses[k].procName, repOrInst.ChildProcesses[k].Pid); + _ = procs.TryAdd(repOrInst.ChildProcesses[k].procName, repOrInst.ChildProcesses[k].Pid); } } - for (int j = 0; j < procList.Count; ++j) + ComputeResourceUsage( + capacity, + parentPid, + checkCpu, + checkMemMb, + checkMemPct, + checkAllPorts, + checkEphemeralPorts, + checkHandles, + procs, + id, + token); + } + catch (AggregateException e) when (e.InnerException is OperationCanceledException || e.InnerException is TaskCanceledException) + { + state.Stop(); + } + catch (Exception e) + { + exceptions.Enqueue(e); + } + }); + + if (!exceptions.IsEmpty) + { + var aggEx = new AggregateException(exceptions); + ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{aggEx}"); + throw new AggregateException(aggEx); + } +#if DEBUG + ObserverLogger.LogInfo($"MonitorDeployedAppsAsync execution time: {execTimer.Elapsed}"); +#endif + return Task.CompletedTask; + } + + private void ComputeResourceUsage( + int capacity, + int parentPid, + bool checkCpu, + bool checkMemMb, + bool checkMemPct, + bool checkAllPorts, + bool checkEphemeralPorts, + bool checkHandles, + ConcurrentDictionary procs, + string id, + CancellationToken token) + { + _ = Parallel.ForEach(procs, (proc, state) => + { + int procId = proc.Value; + string procName = proc.Key; + TimeSpan maxDuration = TimeSpan.FromSeconds(1); + CpuUsage cpuUsage = new CpuUsage(); + + if (MonitorDuration > TimeSpan.MinValue) + { + maxDuration = MonitorDuration; + } + + /* Warm up Windows perf counters. 
*/ + + if (checkCpu) + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - int procId = procList.ElementAt(j).Value; - string procName = procList.ElementAt(j).Key; - TimeSpan maxDuration = TimeSpan.FromSeconds(1); + _ = cpuUsage.GetCpuUsagePercentageProcess(procId); + } + } - if (MonitorDuration > TimeSpan.MinValue) - { - maxDuration = MonitorDuration; - } + // Handles/FDs + if (checkHandles) + { + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); - // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. - if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) + if (handles > -1) + { + if (procId == parentPid) { - return; + AllAppHandlesData[id].Data.Add(handles); } - - /* Warm up Windows perf counters. */ - - if (checkCpu) + else { - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + if (!AllAppHandlesData.ContainsKey($"{id}:{procName}")) { - _ = cpuUsage.GetCpuUsagePercentageProcess(procId); + _ = AllAppHandlesData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppHandlesData[$"{id}:{procName}"].Data.Add(handles); } + } + } - // Handles/FDs - if (checkHandles) + // Total TCP ports usage + if (checkAllPorts) + { + // Parent process (the service process). + if (procId == parentPid) + { + AllAppTotalActivePortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + else + { + // Child procs spawned by the parent service process. 
+ if (!AllAppTotalActivePortsData.ContainsKey($"{id}:{procName}")) { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + _ = AllAppTotalActivePortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppTotalActivePortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + } - if (handles > -1) - { - if (procId == parentPid) - { - AllAppHandlesData[id].Data.Add(handles); - } - else - { - if (!AllAppHandlesData.ContainsKey($"{id}:{procName}")) - { - AllAppHandlesData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppHandlesData[$"{id}:{procName}"].Data.Add(handles); - } - } + // Ephemeral TCP ports usage + if (checkEphemeralPorts) + { + if (procId == parentPid) + { + AllAppEphemeralPortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + else + { + if (!AllAppEphemeralPortsData.ContainsKey($"{id}:{procName}")) + { + _ = AllAppEphemeralPortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppEphemeralPortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + } - // Total TCP ports usage - if (checkAllPorts) + // Monitor Duration applies to the code below. + var timer = Stopwatch.StartNew(); + + while (timer.Elapsed <= maxDuration) + { + token.ThrowIfCancellationRequested(); + + // CPU (all cores) \\ + + if (checkCpu) + { + double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); + + if (cpu >= 0) { - // Parent process (the service process). 
- if (procId == parentPid) + if (cpu > 100) { - AllAppTotalActivePortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); - } - else - { - // Child procs spawned by the parent service process. - if (!AllAppTotalActivePortsData.ContainsKey($"{id}:{procName}")) - { - AllAppTotalActivePortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppTotalActivePortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + cpu = 100; } - } - // Ephemeral TCP ports usage - if (checkEphemeralPorts) - { if (procId == parentPid) { - AllAppEphemeralPortsData[id].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppCpuData[id].Data.Add(cpu); } else { - if (!AllAppEphemeralPortsData.ContainsKey($"{id}:{procName}")) + if (!AllAppCpuData.ContainsKey($"{id}:{procName}")) { - AllAppEphemeralPortsData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + _ = AllAppCpuData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); } - AllAppEphemeralPortsData[$"{id}:{procName}"].Data.Add(OSInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + AllAppCpuData[$"{id}:{procName}"].Data.Add(cpu); } } + } - // Monitor Duration applies to the code below. - timer.Start(); + // Memory \\ - while (timer.Elapsed <= maxDuration) - { - token.ThrowIfCancellationRequested(); + float processMem = 0; - // CPU (all cores) \\ + // private working set. 
+ if (checkMemMb) + { + processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); - if (checkCpu) + if (procId == parentPid) + { + AllAppMemDataMb[id].Data.Add(processMem); + } + else + { + if (!AllAppMemDataMb.ContainsKey($"{id}:{procName}")) { - double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); - - if (cpu >= 0) - { - if (cpu > 100) - { - cpu = 100; - } - - if (procId == parentPid) - { - AllAppCpuData[id].Data.Add(cpu); - } - else - { - if (!AllAppCpuData.ContainsKey($"{id}:{procName}")) - { - AllAppCpuData.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppCpuData[$"{id}:{procName}"].Data.Add(cpu); - } - } + _ = AllAppMemDataMb.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppMemDataMb[$"{id}:{procName}"].Data.Add(processMem); + } + } - // Memory \\ + // percent in use (of total). + if (checkMemPct) + { + if (processMem == 0) + { + processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); + } - float processMem = 0; + var (TotalMemoryGb, _, _) = OSInfoProvider.Instance.TupleGetMemoryInfo(); - // private working set. 
- if (checkMemMb) - { - processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); + if (TotalMemoryGb > 0) + { + double usedPct = Math.Round((double)(processMem * 100) / (TotalMemoryGb * 1024), 2); - if (procId == parentPid) - { - AllAppMemDataMb[id].Data.Add(processMem); - } - else - { - if (!AllAppMemDataMb.ContainsKey($"{id}:{procName}")) - { - AllAppMemDataMb.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppMemDataMb[$"{id}:{procName}"].Data.Add(processMem); - } + if (procId == parentPid) + { + AllAppMemDataPercent[id].Data.Add(Math.Round(usedPct, 1)); } - - // percent in use (of total). - if (checkMemPct) + else { - if (processMem == 0) - { - processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); - } - - var (TotalMemoryGb, _, _) = OSInfoProvider.Instance.TupleGetMemoryInfo(); - - if (TotalMemoryGb > 0) + if (!AllAppMemDataPercent.ContainsKey($"{id}:{procName}")) { - double usedPct = Math.Round((double)(processMem * 100) / (TotalMemoryGb * 1024), 2); - - if (procId == parentPid) - { - AllAppMemDataPercent[id].Data.Add(Math.Round(usedPct, 1)); - } - else - { - if (!AllAppMemDataPercent.ContainsKey($"{id}:{procName}")) - { - AllAppMemDataPercent.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); - } - AllAppMemDataPercent[$"{id}:{procName}"].Data.Add(Math.Round(usedPct, 1)); - } + _ = AllAppMemDataPercent.TryAdd($"{id}:{procName}", new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppMemDataPercent[$"{id}:{procName}"].Data.Add(Math.Round(usedPct, 1)); } - - Thread.Sleep(150); } - - timer.Stop(); - timer.Reset(); } + + Thread.Sleep(150); } - catch (Exception e) when (e is OperationCanceledException || e is 
TaskCanceledException) - { - state.Stop(); - } - catch (Exception e) - { - exceptions.Enqueue(e); - } - }); - - if (!exceptions.IsEmpty) - { - var aggEx = new AggregateException(exceptions); - ObserverLogger.LogError($"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{aggEx}"); - throw new AggregateException(aggEx); - } -#if DEBUG - ObserverLogger.LogInfo($"MonitorDeployedAppsAsync execution time: {stopwatch?.Elapsed}"); -#endif - return Task.CompletedTask; + }); } private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) @@ -1525,6 +1551,7 @@ private void SetInstanceOrReplicaMonitoringList( if (childPids != null && childPids.Count > 0) { replicaInfo.ChildProcesses = childPids; + ObserverLogger.LogInfo($"{replicaInfo.ServiceName}:{Environment.NewLine}Child procs (name, id): {string.Join(" ", replicaInfo.ChildProcesses)}"); } } @@ -1561,6 +1588,7 @@ private void SetInstanceOrReplicaMonitoringList( if (childProcs != null && childProcs.Count > 0) { replicaInfo.ChildProcesses = childProcs; + ObserverLogger.LogInfo($"{replicaInfo.ServiceName}:{Environment.NewLine}Child procs (name, id): {string.Join(" ", replicaInfo.ChildProcesses)}"); } } diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 8866d923..fc4d3cd7 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -93,6 +93,8 @@ public override async Task ReportAsync(CancellationToken token) // OS Health. if (osStatus != null && !string.Equals(osStatus, "OK", StringComparison.OrdinalIgnoreCase)) { + CurrentErrorCount++; + string healthMessage = $"OS reporting unhealthy: {osStatus}"; var healthReport = new HealthReport { @@ -145,6 +147,11 @@ public override async Task ReportAsync(CancellationToken token) // Clear Error or Warning with an OK Health Report. 
string healthMessage = $"OS reporting healthy: {osStatus}"; + if (CurrentErrorCount > 0) + { + CurrentErrorCount--; + } + var healthReport = new HealthReport { Observer = ObserverName, @@ -219,6 +226,8 @@ public override async Task ReportAsync(CancellationToken token) // Windows Update automatic download enabled? if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && isAUAutomaticDownloadEnabled) { + CurrentWarningCount++; + string linkText = $"{Environment.NewLine}For clusters of Silver durability or above, " + "please consider " + diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 7852ea14..02238a3c 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -49,9 +49,9 @@ - - - + + + @@ -64,19 +64,23 @@ + - - - + + + - - - - + + + + @@ -336,7 +340,7 @@ - @@ -352,7 +356,7 @@ - - From 023773a5438041b4e39ac65ca8601f89d6ba3695 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 30 Sep 2021 15:38:46 -0700 Subject: [PATCH 21/35] Fixed CriticalErrorEvent data event name.. 
--- TelemetryLib/TelemetryEvents.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TelemetryLib/TelemetryEvents.cs b/TelemetryLib/TelemetryEvents.cs index b0588077..4328f9d2 100644 --- a/TelemetryLib/TelemetryEvents.cs +++ b/TelemetryLib/TelemetryEvents.cs @@ -8,7 +8,6 @@ using System.Fabric; using System.IO; using System.Linq; -using System.Reflection; using System.Threading; using Microsoft.ApplicationInsights; using Microsoft.ApplicationInsights.Extensibility; @@ -221,7 +220,7 @@ public bool EmitFabricObserverCriticalErrorEvent(FabricObserverCriticalErrorEven { "OS", foErrorData.OS } }; - telemetryClient?.TrackEvent($"{TaskName}.{OperationalEventName}", eventProperties); + telemetryClient?.TrackEvent($"{TaskName}.{CriticalErrorEventName}", eventProperties); telemetryClient?.Flush(); // allow time for flushing From b60d64bf8a6b00f88242294d1b79aa71ff79f812 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 30 Sep 2021 16:24:24 -0700 Subject: [PATCH 22/35] comment --- .../ApplicationPackageRoot/ApplicationManifest.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 7b6eb7c5..04a6564f 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -66,7 +66,7 @@ From da93dd4231e9c5cfdb9b0227a385f5cc9306958f Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 30 Sep 2021 16:30:37 -0700 Subject: [PATCH 23/35] FO 3.1.18 nuspec --- FabricObserver.nuspec.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 0b71ded9..265338aa 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -3,7 +3,7 @@ %PACKAGE_ID% 3.1.18 - This release introduces support for parallel execution of service 
process monitoring in AppObserver, ContainerObserver and FabricSystemObserver. Please see the release notes on the FO repo for more information. + This release introduces support for parallel execution of service process monitoring in AppObserver, ContainerObserver and FabricSystemObserver on machines with capable hardware configuration (logical processors >= 4). There are important bug fixes and code improvements in this release. Please see the release notes on the FabricObserver repo for more information. Microsoft MIT true From bd7fd300319a765040a2edf8c30303dac81b51cb Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Fri, 1 Oct 2021 16:28:19 -0700 Subject: [PATCH 24/35] Removed AI config, updated AI format, bug fixes. --- .../ProcessInfo/WindowsProcessInfoProvider.cs | 5 +- .../Telemetry/AppInsightsTelemetry.cs | 20 ++- FabricObserver/ApplicationInsights.config | 141 ------------------ FabricObserver/FabricObserver.csproj | 6 +- FabricObserver/Observers/AppObserver.cs | 9 +- .../ApplicationManifest.xml | 4 +- 6 files changed, 23 insertions(+), 162 deletions(-) delete mode 100644 FabricObserver/ApplicationInsights.config diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 96e640b8..364f5f0a 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -283,8 +283,9 @@ private float GetProcessPrivateWorkingSetMb(int processId) { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - ulong workingSet = (ulong)mObj.Properties["WorkingSetPrivate"].Value / 1024 / 1024; - return workingSet; + ulong workingSet = (ulong)mObj.Properties["WorkingSetPrivate"].Value; + float privWorkingSetMb = Convert.ToSingle(workingSet); + return privWorkingSetMb / 1024 / 1024; } } catch (Exception e) when (e is 
ArgumentException || e is ManagementException) diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 6fef01be..645fc078 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -254,8 +254,6 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "ServiceName", telemetryData.ServiceName ?? string.Empty }, { "ProcessId", telemetryData.ProcessId.ToString() }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", telemetryData.Value.ToString() }, { "PartitionId", telemetryData.PartitionId }, { "ReplicaId", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, @@ -263,7 +261,12 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "OS", telemetryData.OS ?? string.Empty } }; - telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + var metric = new Dictionary + { + { telemetryData.Metric, telemetryData.Value } + }; + + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties, metric); } catch (Exception e) { @@ -306,9 +309,6 @@ public Task ReportMetricAsync(List telemetryDataList, { "ApplicationName", telemData.ApplicationName ?? string.Empty }, { "ServiceName", telemData.ServiceName ?? string.Empty }, { "ProcessId", telemData.ProcessId.ToString() }, - { "Metric", telemData.Metric ?? 
string.Empty }, - { "Value", telemData.Value.ToString() }, - { "ChildProcessCount", telemData.ChildProcessCount.ToString() }, { "ChildProcessInfo", JsonConvert.SerializeObject(telemData.ChildProcessInfo) }, { "PartitionId", telemData.PartitionId }, { "ReplicaId", telemData.ReplicaId }, @@ -317,7 +317,13 @@ public Task ReportMetricAsync(List telemetryDataList, { "OS", OS } }; - telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + var metrics = new Dictionary + { + { "ChildProcessCount", telemData.ChildProcessCount }, + { $"{telemData.Metric} (Parent + Descendants)", telemData.Value } + }; + + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties, metrics); } catch (Exception e) { diff --git a/FabricObserver/ApplicationInsights.config b/FabricObserver/ApplicationInsights.config deleted file mode 100644 index bba013fa..00000000 --- a/FabricObserver/ApplicationInsights.config +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - - - - - - - search|spider|crawl|Bot|Monitor|AlwaysOn - - - - - - - - - - - - - - - core.windows.net - core.chinacloudapi.cn - core.cloudapi.de - core.usgovcloudapi.net - - - Microsoft.Azure.EventHubs - Microsoft.Azure.ServiceBus - - - - - - - - - - - - - - - - - - - Microsoft.VisualStudio.Web.PageInspector.Runtime.Tracing.RequestDataHttpHandler - System.Web.StaticFileHandler - System.Web.Handlers.AssemblyResourceLoader - System.Web.Optimization.BundleHandler - System.Web.Script.Services.ScriptHandlerFactory - System.Web.Handlers.TraceHandler - System.Web.Services.Discovery.DiscoveryRequestHandler - System.Web.HttpDebugHandler - - - - - - - - - - - - - 5 - Event - - - 5 - Event - - - - - - diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index c00e1e0e..f5ea07b8 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -24,6 +24,9 @@ true AnyCPU;x64 + + + @@ -45,9 +48,6 @@ - - PreserveNewest - Designer diff --git 
a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index d93dcffc..6a2e82c8 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -1326,12 +1326,10 @@ private void ComputeResourceUsage( // Memory \\ - float processMem = 0; - // private working set. if (checkMemMb) { - processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); + float processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); if (procId == parentPid) { @@ -1350,10 +1348,7 @@ private void ComputeResourceUsage( // percent in use (of total). if (checkMemPct) { - if (processMem == 0) - { - processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); - } + float processMem = ProcessInfoProvider.Instance.GetProcessWorkingSetMb(procId, true); var (TotalMemoryGb, _, _) = OSInfoProvider.Instance.TupleGetMemoryInfo(); diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 04a6564f..d1324149 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -340,7 +340,7 @@ - @@ -356,7 +356,7 @@ - - - - - + + + From 4e369594b06b28c49900d41338082ca26691b496 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 5 Oct 2021 16:08:07 -0700 Subject: [PATCH 28/35] CO 2.1.11 (bug fix), FO 3.1.18 (Threads config) --- Build-COSFPkgs.ps1 | 8 +- ClusterObserver.nuspec.template | 4 +- ClusterObserver/ClusterObserver.cs | 247 +++++++++--------- ClusterObserver/ClusterObserverManager.cs | 87 +++--- .../PackageRoot/Config/Settings.xml | 2 +- .../PackageRoot/ServiceManifest.xml | 6 +- ClusterObserver/Utilities/JsonHelper.cs | 10 +- ClusterObserver/Utilities/Logger.cs | 29 +- .../ApplicationManifest.xml | 4 +- .../Config/AppObserver.config.json | 3 +- 10 files changed, 188 insertions(+), 212 deletions(-) diff 
--git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index 1d86e1ad..ba745cda 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.11" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.11" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.10" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.11" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.11" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index 5d576d76..2b1fb3ce 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,9 +2,9 @@ %PACKAGE_ID% - 2.1.10 + 2.1.11 - Updated TelemetryData and ApplicationInsights impl to match FO 3.1.15's impls. + Bug fixes. 
Microsoft MIT diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index f0aaea63..e0a24aa1 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -136,15 +136,13 @@ private async Task ReportClusterHealthAsync(CancellationToken token) foreach (var repair in repairsInProgress) { - ids += - $"TaskId: {repair.TaskId}{Environment.NewLine}State: {repair.State}{Environment.NewLine}"; + ids += $"TaskId: {repair.TaskId}{Environment.NewLine}State: {repair.State}{Environment.NewLine}"; } - repairState += - $"There are currently one or more Repair Jobs processing in the cluster.{Environment.NewLine}{ids}"; + repairState += $"There are currently one or more Repair Jobs processing in the cluster.{Environment.NewLine}{ids}"; // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetry = new TelemetryData(FabricClientInstance, token) { @@ -154,22 +152,22 @@ private async Task ReportClusterHealthAsync(CancellationToken token) Source = ObserverName }; - await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetry, token); } // ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Cluster", - HealthState = "Ok", - HealthEventDescription = repairState, - Metric = "AggregatedClusterHealth", - Source = ObserverName - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Cluster", + HealthState = "Ok", + HealthEventDescription = repairState, + Metric = "AggregatedClusterHealth", + Source = ObserverName + }); } } } @@ -185,7 +183,7 @@ private async Task ReportClusterHealthAsync(CancellationToken token) LastKnownClusterHealthState = HealthState.Ok; // Telemetry. 
- if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetry = new TelemetryData(FabricClientInstance, token) { @@ -195,22 +193,22 @@ private async Task ReportClusterHealthAsync(CancellationToken token) Source = ObserverName }; - await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetry, token); } // ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Cluster", - HealthState = "Ok", - HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", - Metric = "AggregatedClusterHealth", - Source = ObserverName - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Cluster", + HealthState = "Ok", + HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", + Metric = "AggregatedClusterHealth", + Source = ObserverName + }); } } else @@ -300,7 +298,7 @@ private async Task ReportClusterHealthAsync(CancellationToken token) ObserverLogger.LogWarning(msg); // Send Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetryData = new TelemetryData(FabricClientInstance, token) { @@ -308,19 +306,19 @@ private async Task ReportClusterHealthAsync(CancellationToken token) Description = msg }; - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetryData, token); } // Emit ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthState = "Warning", - HealthEventDescription = msg - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthState = "Warning", + HealthEventDescription = msg + }); } // Fix the bug. 
@@ -342,7 +340,8 @@ private async Task ProcessApplicationHealthAsync(IList a token.ThrowIfCancellationRequested(); string telemetryDescription = string.Empty; - ApplicationHealth appHealth = await FabricClientInstance.HealthManager.GetApplicationHealthAsync( + + var appHealth = await FabricClientInstance.HealthManager.GetApplicationHealthAsync( healthState.ApplicationName, ConfigSettings.AsyncTimeout, token).ConfigureAwait(true); @@ -386,41 +385,40 @@ private async Task ProcessApplicationHealthAsync(IList a foreach (HealthEvent healthEvent in appHealthEvents.OrderByDescending(f => f.SourceUtcTimestamp)) { + // From FabricObserver? var foTelemetryData = TryGetFOHealthStateEventData(healthEvent, HealthScope.Application); - // From FabricObserver? if (foTelemetryData != null) { foTelemetryData.Description += telemetryDescription; // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { - - await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); + await ObserverTelemetryClient?.ReportHealthAsync(foTelemetryData, token); } // ETW. 
if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - foTelemetryData.ApplicationName, - foTelemetryData.ServiceName, - foTelemetryData.HealthState, - foTelemetryData.Description, - foTelemetryData.Metric, - foTelemetryData.ObserverName, - foTelemetryData.NodeName, - foTelemetryData.Source, - foTelemetryData.PartitionId, - foTelemetryData.ProcessId, - foTelemetryData.ReplicaId, - foTelemetryData.SystemServiceProcessName, - foTelemetryData.Value - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + foTelemetryData.ApplicationName, + foTelemetryData.ServiceName, + foTelemetryData.HealthState, + foTelemetryData.Description, + foTelemetryData.Metric, + foTelemetryData.ObserverName, + foTelemetryData.NodeName, + foTelemetryData.Source, + foTelemetryData.PartitionId, + foTelemetryData.ProcessId, + foTelemetryData.ReplicaId, + foTelemetryData.SystemServiceProcessName, + foTelemetryData.Value + }); } // Reset @@ -438,7 +436,7 @@ private async Task ProcessApplicationHealthAsync(IList a } // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetryData = new TelemetryData(FabricClientInstance, token) { @@ -448,21 +446,21 @@ private async Task ProcessApplicationHealthAsync(IList a Source = ObserverName }; - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetryData, token); } // ETW. 
if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + ApplicationName = appName.OriginalString, + HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), + HealthEventDescription = telemetryDescription, + Source = ObserverName + }); } // Reset @@ -533,7 +531,7 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt targetNode = targetNodeList[0]; } - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetryData = new TelemetryData(FabricClientInstance, token) { @@ -547,29 +545,30 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt }; // Telemetry. - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetryData, token); } // ETW. - if (!EtwEnabled) + if (EtwEnabled) { - continue; + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + node.NodeName, + NodeStatus = targetNode != null ? Enum.GetName(typeof(NodeStatus), targetNode.NodeStatus) : string.Empty, + HealthScope = "Node", + HealthState = Enum.GetName(typeof(HealthState), node.AggregatedHealthState), + HealthEventDescription = telemetryDescription, + Metric = metric ?? "AggregatedClusterHealth", + ObserverName = sourceObserver ?? string.Empty, + Source = foStats != null ? foStats.Source : ObserverName, + Value = foStats != null ? foStats.Value : 0 + }); } - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - node.NodeName, - NodeStatus = targetNode != null ? 
Enum.GetName(typeof(NodeStatus), targetNode.NodeStatus) : string.Empty, - HealthScope = "Node", - HealthState = Enum.GetName(typeof(HealthState), node.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Metric = metric ?? "AggregatedClusterHealth", - ObserverName = sourceObserver ?? string.Empty, - Source = foStats != null ? foStats.Source : ObserverName, - Value = foStats != null ? foStats.Value : 0 - }); + // Reset + telemetryDescription = string.Empty; } } } @@ -589,22 +588,22 @@ private async Task ProcessGenericEntityHealthAsync(HealthEvaluation evaluation, }; // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetryData, token); } // ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthEventDescription = telemetryDescription, - HealthState = healthState, - Source = ObserverName - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthEventDescription = telemetryDescription, + HealthState = healthState, + Source = ObserverName + }); } } @@ -626,7 +625,7 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) } // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetry = new TelemetryData(FabricClientInstance, token) { @@ -638,22 +637,22 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Value = 0 }; - await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetry, token); } // ETW. 
if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthState = "Ok", - Description = $"{nodeDictItem.Key} is now Up.", - Metric = "NodeStatus", - NodeName = nodeDictItem.Key, - Source = ObserverName, - Value = 0 + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthState = "Ok", + Description = $"{nodeDictItem.Key} is now Up.", + Metric = "NodeStatus", + NodeName = nodeDictItem.Key, + Source = ObserverName, + Value = 0 }); } @@ -699,7 +698,7 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) $"for {Math.Round(kvp.Value.LastDetectedTime.Subtract(kvp.Value.FirstDetectedTime).TotalHours, 2)} hours.{Environment.NewLine}"; // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + if (TelemetryEnabled) { var telemetry = new TelemetryData(FabricClientInstance, token) { @@ -711,23 +710,23 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Value = 1, }; - await ObserverTelemetryClient.ReportHealthAsync(telemetry, token); + await ObserverTelemetryClient?.ReportHealthAsync(telemetry, token); } // ETW. 
if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthState = "Warning", - Description = message, - Metric = "NodeStatus", - NodeName = kvp.Key, - Source = ObserverName, - Value = 1, - }); + ObserverLogger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthState = "Warning", + Description = message, + Metric = "NodeStatus", + NodeName = kvp.Key, + Source = ObserverName, + Value = 1, + }); } } } diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index 55be9fbe..1fe71183 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -36,9 +36,8 @@ public bool IsObserverRunning } private static int ObserverExecutionLoopSleepSeconds - { - get; - set; + { + get; set; } = ObserverConstants.ObserverRunLoopSleepTimeSeconds; public static int AsyncOperationTimeoutSeconds @@ -71,8 +70,6 @@ public static bool TelemetryEnabled public static bool EtwEnabled { get => bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableETWProvider), out etwEnabled) && etwEnabled; - - set => etwEnabled = value; } public static string LogPath @@ -285,16 +282,16 @@ public async Task StartAsync() // ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = message, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName - }); + Logger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = message, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName + }); } // Don't swallow the unhandled exception. Fix the bug. @@ -339,16 +336,16 @@ private Task SignalAbortToRunningObserverAsync() // ETW. 
if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = $"{e}", - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName - }); + Logger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = $"{e}", + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName + }); } } @@ -392,16 +389,16 @@ private async Task RunObserverAync() if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = observerHealthWarning, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName - }); + Logger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = observerHealthWarning, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName + }); } // Create new instance of CO. 
@@ -427,16 +424,16 @@ private async Task RunObserverAync() if (EtwEnabled) { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = msg, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName - }); + Logger.LogEtw( + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = msg, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName + }); } throw; diff --git a/ClusterObserver/PackageRoot/Config/Settings.xml b/ClusterObserver/PackageRoot/Config/Settings.xml index 1340c602..924ff5e5 100644 --- a/ClusterObserver/PackageRoot/Config/Settings.xml +++ b/ClusterObserver/PackageRoot/Config/Settings.xml @@ -3,7 +3,7 @@
- + diff --git a/ClusterObserver/PackageRoot/ServiceManifest.xml b/ClusterObserver/PackageRoot/ServiceManifest.xml index 8795ad20..12e39a65 100644 --- a/ClusterObserver/PackageRoot/ServiceManifest.xml +++ b/ClusterObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Utilities/JsonHelper.cs b/ClusterObserver/Utilities/JsonHelper.cs index a3b168fb..a6e2e44f 100644 --- a/ClusterObserver/Utilities/JsonHelper.cs +++ b/ClusterObserver/Utilities/JsonHelper.cs @@ -39,15 +39,7 @@ public static bool IsJson(string text) _ = JsonConvert.DeserializeObject(text); return true; } - catch (JsonSerializationException) - { - return false; - } - catch (JsonReaderException) - { - return false; - } - catch (JsonWriterException) + catch { return false; } diff --git a/ClusterObserver/Utilities/Logger.cs b/ClusterObserver/Utilities/Logger.cs index f7ad4511..dc0016f0 100644 --- a/ClusterObserver/Utilities/Logger.cs +++ b/ClusterObserver/Utilities/Logger.cs @@ -4,7 +4,6 @@ // ------------------------------------------------------------ using System; -using System.Diagnostics.Tracing; using System.IO; using System.Runtime.InteropServices; using System.Threading; @@ -30,11 +29,6 @@ private ILogger OLogger private readonly string loggerName; - public static EventSource EtwLogger - { - get; private set; - } - public bool EnableVerboseLogging { get; set; @@ -60,14 +54,6 @@ public string Filename get; } - static Logger() - { - if (EtwLogger == null && ClusterObserverManager.EtwEnabled) - { - EtwLogger = new EventSource(ObserverConstants.EventSourceProviderName); - } - } - /// /// Initializes a new instance of the class. 
/// @@ -117,6 +103,11 @@ public void LogError(string format, params object[] parameters) OLogger.Error(format, parameters); } + public void LogEtw(string eventName, T data) + { + ServiceEventSource.Current.Write(eventName, data); + } + public void LogWarning(string format, params object[] parameters) { OLogger.Warn(format, parameters); @@ -146,11 +137,9 @@ public bool TryWriteLogFile(string path, string content) File.WriteAllText(path, content); return true; } - catch (IOException) - { - } - catch (UnauthorizedAccessException) + catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) { + } Thread.Sleep(1000); @@ -224,19 +213,17 @@ public void InitializeLoggers() { Name = targetName, OptimizeBufferReuse = true, - ConcurrentWrites = true, FileName = file, Layout = "${longdate}--${uppercase:${level}}--${message}", OpenFileCacheTimeout = 5, ArchiveNumbering = ArchiveNumberingMode.DateAndSequence, ArchiveEvery = FileArchivePeriod.Day, + MaxArchiveDays = 14, AutoFlush = true }; LogManager.Configuration.AddTarget(loggerName + "LogFile", target); - var ruleInfo = new LoggingRule(loggerName, NLog.LogLevel.Debug, target); - LogManager.Configuration.LoggingRules.Add(ruleInfo); LogManager.ReconfigExistingLoggers(); } diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 974750e2..069ca59d 100644 --- a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -15,7 +15,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. 
--> - + diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 5f25ee5e..63ce4dfe 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -3,6 +3,7 @@ "targetApp": "*", "cpuWarningLimitPercent": 85, "memoryWarningLimitMb": 1048, - "networkWarningEphemeralPorts": 7500 + "networkWarningEphemeralPorts": 7500, + "warningThreadCount": 350 } ] \ No newline at end of file From 6956eb5012bb2412041b2cbaf6bc7f02437f70df Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 5 Oct 2021 18:16:56 -0700 Subject: [PATCH 29/35] Bug fix (Thread) --- FabricObserver/PackageRoot/Config/AppObserver.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 63ce4dfe..97138295 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -4,6 +4,6 @@ "cpuWarningLimitPercent": 85, "memoryWarningLimitMb": 1048, "networkWarningEphemeralPorts": 7500, - "warningThreadCount": 350 + "warningThreadCount": 50 } ] \ No newline at end of file From a6aff4fe251790542531ac070a250dc9c9ce5105 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 5 Oct 2021 18:17:26 -0700 Subject: [PATCH 30/35] Default config --- FabricObserver/PackageRoot/Config/AppObserver.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 97138295..cb7f6858 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -4,6 +4,6 @@ "cpuWarningLimitPercent": 85, "memoryWarningLimitMb": 1048, "networkWarningEphemeralPorts": 7500, - 
"warningThreadCount": 50 + "warningThreadCount": 500 } ] \ No newline at end of file From 18ffc6df84a4fbb89859f37cbcb89a9e0914f444 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 6 Oct 2021 10:53:32 -0700 Subject: [PATCH 31/35] add setter --- ClusterObserver/ClusterObserverManager.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index 1fe71183..5a70bf85 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -70,6 +70,7 @@ public static bool TelemetryEnabled public static bool EtwEnabled { get => bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableETWProvider), out etwEnabled) && etwEnabled; + set => etwEnabled = value; } public static string LogPath From 8488a9885f339f89994b1c8cfadba8f59154937a Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 7 Oct 2021 15:34:39 -0700 Subject: [PATCH 32/35] Concurrency is per observer setting, code improvements. Threads support in FSO. --- ClusterObserver.nuspec.template | 4 +- Documentation/Observers.md | 95 +++- .../Utilities/ObserverConstants.cs | 3 + FabricObserver.nuspec.template | 2 +- FabricObserver/Observers/AppObserver.cs | 71 +-- FabricObserver/Observers/ContainerObserver.cs | 39 +- .../Observers/FabricSystemObserver.cs | 235 ++++++--- FabricObserver/Observers/ObserverManager.cs | 40 +- .../PackageRoot/Config/Settings.xml | 17 +- .../ApplicationManifest.xml | 27 +- FabricObserverTests/ObserverTest.cs | 486 +++++++----------- .../FabricObserverOperationalEventData.cs | 6 +- TelemetryLib/ObserverData.cs | 5 + TelemetryLib/TelemetryEvents.cs | 13 +- 14 files changed, 535 insertions(+), 508 deletions(-) diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index 2b1fb3ce..7c829588 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -3,9 +3,7 @@ %PACKAGE_ID% 2.1.11 - - Bug fixes. - + Bug fixes. 
Microsoft MIT true diff --git a/Documentation/Observers.md b/Documentation/Observers.md index 5b1071cc..9476a995 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -145,10 +145,14 @@ All settings are optional, ***except target OR targetType***, and can be omitted AppObserver also supports non-JSON parameters for configuration unrelated to thresholds. Like all observers these settings are located in ApplicationManifest.xml to support versionless configuration updates via application upgrade. -#### Non-json settings +#### Non-json settings set in ApplicationManifest.xml + +Version 3.1.18 introduces support for concurrent service process monitoring and reporting by AppObserver. You can enable/disable this feature by setting the boolean value for AppObserverEnableConcurrentMonitoring. Note that this is disabled by default. +If your compute configuration includes multiple CPUs (logical processors >= 4) and you monitor several services, then you should consider enabling this capability as it will significantly decrease the time it takes AppObserver to complete monitoring/reporting. +If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. ```XML - + @@ -157,16 +161,19 @@ AppObserver also supports non-JSON parameters for configuration unrelated to thr + - - - + - + + + ``` Example AppObserver Output (Warning - Ephemeral Ports Usage): @@ -178,7 +185,7 @@ AppObserver also optionally outputs CSV files for each app containing all resour AppObserver error/warning thresholds are user-supplied-only and bound to specific service instances (processes) as dictated by the user, as explained above. Like FabricSystemObserver, all data is stored in in-memory data structures for the lifetime of the run (for example, 60 seconds at 5 second intervals). 
Like all observers, the last thing this observer does is call its *ReportAsync*, which will then determine the health state based on accumulated data across resources, send a Warning if necessary (clear an existing warning if necessary), then clean out the in-memory data structures to limit impact on the system over time. So, each iteration of this observer accumulates *temporary* data for use in health determination. -This observer also monitors the FabricObserver service itself across CPU/Mem/FileHandles/Ports. +This observer can also monitor the FabricObserver service itself across CPU/Mem/FileHandles/Ports/Threads. ## AzureStorageUploadObserver Runs periodically (you can set its RunInterval setting, just like any observer) and will upload dmp files of user services that AppObserver creates when you set dumpProcessOnError to true and supply Error thresholds in AppObserver configuration. The files are compressed and uploaded to a specified Azure Storage Account (blob storage only) and blob container name (default is fodumps, but you can configure this). It will delete dmp files from local storage after each successful upload. @@ -306,6 +313,10 @@ Monitors CPU and Memory use of Service Fabric containerized (docker) services. **In order for ContainerObserver to function properly on Windows, FabricObserver must be configured to run as Admin or System user.** This is not the case for Linux deployments. +Version 3.1.18 introduces support for concurrent docker stats data parsing and reporting by ContainerObserver. You can enable/disable this feature by setting the boolean value for ContainerObserverEnableConcurrentMonitoring. Note that this is disabled by default. +If your compute configuration includes multiple CPUs (logical processors >= 4) and you monitor several containerized services, then you should consider enabling this capability as it will significantly decrease the time it takes ContainerObserver to complete monitoring/reporting. 
+If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. + ### Configuration @@ -315,8 +326,12 @@ Settings.xml ```XML -
+
+ + @@ -375,8 +390,8 @@ After DiskObserver logs basic disk information, it performs measurements on all - - + +
``` @@ -430,6 +445,10 @@ By default, FabricObserver runs as NetworkUser on Windows and sfappsuser on Linu running as System or root, default FabricObserver can't monitor process behavior (this is always true on Windows). That said, there are only a few system services you would care about: Fabric.exe and FabricGateway.exe. Fabric.exe is generally the system service that your code can directly impact with respect to machine resource usage. +Version 3.1.18 introduces support for concurrent service process monitoring and reporting by FabricSystemObserver. You can enable/disable this feature by setting the boolean value for FabricSystemObserverEnableConcurrentMonitoring. Note that this is disabled by default. +If your compute configuration includes multiple CPUs (logical processors >= 4), then you should consider enabling this capability as it will significantly decrease the time it takes FabricSystemObserver to complete monitoring/reporting. +If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. + **Input - Settings.xml**: Only ClusterOperationTimeoutSeconds is set in Settings.xml. ```xml @@ -443,22 +462,46 @@ services you would care about: Fabric.exe and FabricGateway.exe. Fabric.exe is g ```xml - - - - - - - - - - - - - - - - +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
``` **Output**: Log text(Error/Warning), Service Fabric Health Report (Error/Warning/Ok), ETW, Telemetry diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 98478573..fb097a6d 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -53,6 +53,7 @@ public sealed class ObserverConstants public const string MaxArchivedCsvFileLifetimeDays = "MaxArchivedCsvFileLifetimeDays"; public const string MaxArchivedLogFileLifetimeDays = "MaxArchivedLogFileLifetimeDays"; public const string CsvFileWriteFormat = "CsvFileWriteFormat"; + public const string EnableConcurrentMonitoring = "EnableConcurrentMonitoring"; // AppObserver. public const string AppObserverName = "AppObserver"; @@ -105,6 +106,8 @@ public sealed class ObserverConstants public const string FabricSystemObserverMonitorWindowsEventLog = "MonitorWindowsEventLog"; public const string FabricSystemObserverErrorHandles = "AllocatedHandlesErrorLimit"; public const string FabricSystemObserverWarningHandles = "AllocatedHandlesWarningLimit"; + public const string FabricSystemObserverWarningThreadCount = "ThreadCountWarningLimit"; + public const string FabricSystemObserverErrorThreadCount = "ThreadCountErrorLimit"; // NetworkObserver. public const string NetworkObserverName = "NetworkObserver"; diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 265338aa..55c561f9 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -3,7 +3,7 @@ %PACKAGE_ID% 3.1.18 - This release introduces support for parallel execution of service process monitoring in AppObserver, ContainerObserver and FabricSystemObserver on machines with capable hardware configuration (logical processors >= 4). There are important bug fixes and code improvements in this release. 
Please see the release notes on the FabricObserver repo for more information. + This release introduces support for parallel execution of service process monitoring in AppObserver, ContainerObserver and FabricSystemObserver on machines with capable hardware configurations (logical processors >= 4). Support for thread count monitoring by AppObserver and FabricSystemObserver. There are important bug fixes and code improvements in this release. Please see the release notes on the FabricObserver repo for more information. Microsoft MIT true diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index d76ae38d..35374fd6 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -41,7 +41,7 @@ public class AppObserver : ObserverBase private List userTargetList; // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. - private ConcurrentQueue deployedTargetList; + private List deployedTargetList; private readonly ConfigSettings configSettings; private string fileName; private readonly Stopwatch stopwatch; @@ -57,7 +57,7 @@ public bool EnableChildProcessMonitoring get; set; } - public ConcurrentQueue ReplicaOrInstanceList + public List ReplicaOrInstanceList { get; set; } @@ -67,6 +67,16 @@ public string ConfigPackagePath get; set; } + public bool EnableConcurrentMonitoring + { + get; set; + } + + ParallelOptions ParallelOptions + { + get; set; + } + public bool EnableProcessDumps { get; set; @@ -111,7 +121,7 @@ public override async Task ObserveAsync(CancellationToken token) if (EnableVerboseLogging) { - ObserverLogger.LogInfo($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? "with" : "without")} " + + ObserverLogger.LogInfo($"Run Duration {(ParallelOptions.MaxDegreeOfParallelism == -1 ? 
"with" : "without")} " + $"Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); } @@ -121,25 +131,20 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { - if (deployedTargetList.IsEmpty) + if (deployedTargetList.Count == 0) { return Task.CompletedTask; } TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); - _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => + _ = Parallel.ForEach(ReplicaOrInstanceList, ParallelOptions, (repOrInst, state) => { token.ThrowIfCancellationRequested(); // For use in process family tree monitoring. ConcurrentQueue childProcessTelemetryDataList = null; - if (!ReplicaOrInstanceList.TryDequeue(out ReplicaOrInstanceMonitoringInfo repOrInst)) - { - return; - } - string processName = null; int processId = 0; ApplicationInfo app = null; @@ -560,23 +565,17 @@ private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst // be up to date across observer loop iterations. 
private async Task InitializeAsync() { - ReplicaOrInstanceList = new ConcurrentQueue(); + ReplicaOrInstanceList = new List(); userTargetList = new List(); - deployedTargetList = new ConcurrentQueue(); + deployedTargetList = new List(); /* Child/Descendant proc monitoring config */ - if (bool.TryParse( - GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.EnableChildProcessMonitoringParameter), out bool enableDescendantMonitoring)) + if (bool.TryParse( GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableChildProcessMonitoringParameter), out bool enableDescendantMonitoring)) { EnableChildProcessMonitoring = enableDescendantMonitoring; } - if (int.TryParse( - GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) + if (int.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) { MaxChildProcTelemetryDataCount = maxChildProcs; } @@ -607,6 +606,20 @@ private async Task InitializeAsync() MaxDumpsTimeWindow = dumpTimeWindow; } + // Concurrency/Parallelism support. + if (bool.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableConcurrentMonitoring), out bool enableConcurrency)) + { + EnableConcurrentMonitoring = enableConcurrency; + } + + ParallelOptions = new ParallelOptions + { + // Parallelism only makes sense for capable CPU configurations. The minimum requirement is 4 logical processors; which would map to more than 1 available core. + MaxDegreeOfParallelism = EnableConcurrentMonitoring && Environment.ProcessorCount >= 4 ? 
-1 : 1, + CancellationToken = Token, + TaskScheduler = TaskScheduler.Default + }; + configSettings.Initialize( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( ObserverConstants.ObserverConfigurationPackageName)?.Settings, @@ -979,7 +992,7 @@ private void SetDumpPath() private Task MonitorDeployedAppsAsync(CancellationToken token) { Stopwatch execTimer = Stopwatch.StartNew(); - + ConcurrentQueue exceptions = new ConcurrentQueue(); int capacity = ReplicaOrInstanceList.Count; AllAppCpuData ??= new ConcurrentDictionary>(); AllAppMemDataMb ??= new ConcurrentDictionary>(); @@ -988,13 +1001,11 @@ private Task MonitorDeployedAppsAsync(CancellationToken token) AllAppEphemeralPortsData ??= new ConcurrentDictionary>(); AllAppHandlesData ??= new ConcurrentDictionary>(); AllAppThreadsData ??= new ConcurrentDictionary>(); - var exceptions = new ConcurrentQueue(); - _ = Parallel.For(0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => + _ = Parallel.ForEach(ReplicaOrInstanceList, ParallelOptions, (repOrInst, state) => { token.ThrowIfCancellationRequested(); - var repOrInst = ReplicaOrInstanceList.ElementAt(i); var timer = new Stopwatch(); int parentPid = (int)repOrInst.HostProcessId; bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false, checkThreads = false; @@ -1521,19 +1532,13 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } var replicasOrInstances = await GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType); - - foreach (var rep in replicasOrInstances) - { - ReplicaOrInstanceList.Enqueue(rep); - } + ReplicaOrInstanceList.AddRange(replicasOrInstances); + var targets = userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType != null) && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() || 
x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower())); - foreach (var target in targets) - { - deployedTargetList.Enqueue(target); - } + deployedTargetList.AddRange(targets); } deployedApps.Clear(); diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index f6d2e840..f70c5f3a 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -39,6 +39,16 @@ public class ContainerObserver : ObserverBase private Stopwatch runDurationTimer; public string ConfigurationFilePath = string.Empty; + public bool EnableConcurrentMonitoring + { + get; set; + } + + public ParallelOptions ParallelOptions + { + get; private set; + } + public ContainerObserver(FabricClient fabricClient, StatelessServiceContext context) : base(fabricClient, context) { @@ -76,7 +86,7 @@ public override async Task ObserveAsync(CancellationToken token) if (EnableVerboseLogging) { - ObserverLogger.LogInfo($"Run Duration {(ObserverManager.ParallelOptions.MaxDegreeOfParallelism == -1 ? "with" : "without")} " + + ObserverLogger.LogInfo($"Run Duration {(ParallelOptions.MaxDegreeOfParallelism == -1 ? 
"with" : "without")} " + $"Parallel (Processors: {Environment.ProcessorCount}):{RunDuration}"); } @@ -92,15 +102,10 @@ public override Task ReportAsync(CancellationToken token) TimeSpan timeToLive = GetHealthReportTimeToLive(); - _ = Parallel.For (0, ReplicaOrInstanceList.Count, ObserverManager.ParallelOptions, (i, state) => + _ = Parallel.ForEach(ReplicaOrInstanceList, ParallelOptions, (repOrInst, state) => { token.ThrowIfCancellationRequested(); - if (!ReplicaOrInstanceList.TryDequeue(out ReplicaOrInstanceMonitoringInfo repOrInst)) - { - return; - } - ApplicationInfo app = deployedTargetList.First( a => (a.TargetApp != null && a.TargetApp == repOrInst.ApplicationName.OriginalString) || (a.TargetAppType != null && a.TargetAppType == repOrInst.ApplicationTypeName)); @@ -183,6 +188,20 @@ private async Task InitializeAsync(CancellationToken token) return false; } + // Concurrency/Parallelism support. + if (bool.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableConcurrentMonitoring), out bool enableConcurrency)) + { + EnableConcurrentMonitoring = enableConcurrency; + } + + ParallelOptions = new ParallelOptions + { + // Parallelism only makes sense for capable CPU configurations. The minimum requirement is 4 logical processors; which would map to more than 1 available core. + MaxDegreeOfParallelism = EnableConcurrentMonitoring && Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = Token, + TaskScheduler = TaskScheduler.Default + }; + userTargetList = new List(); deployedTargetList = new ConcurrentQueue(); ReplicaOrInstanceList = new ConcurrentQueue(); @@ -512,10 +531,8 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc return false; } - _ = Parallel.For(0, ReplicaOrInstanceList.Count, (i, state) => + _ = Parallel.ForEach(ReplicaOrInstanceList, ParallelOptions, (repOrInst, state) => { - // Do not TryDequeue here as ReplicaOrInstanceList is used in other functions (like ReportAsync). 
- var repOrInst = ReplicaOrInstanceList.ElementAt(i); string serviceName = repOrInst.ServiceName.OriginalString.Replace(repOrInst.ApplicationName.OriginalString, "").Replace("/", ""); string cpuId = $"{serviceName}_cpu"; string memId = $"{serviceName}_mem"; @@ -707,4 +724,4 @@ private void CleanUp() } } } -} +} \ No newline at end of file diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index d90b905e..18ac2701 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -23,14 +23,11 @@ namespace FabricObserver.Observers { - // When enabled, FabricSystemObserver monitors all Fabric system service processes across various resource usage metrics (CPU Time, Private Workingset, Ephemeral and Total Active TCP ports, File Handles). - // It will signal Warnings or Errors based on settings supplied in ApplicationManifest.xml (Like many observers, most of it's settings are overridable and can be reset with application parameter updates). - // If the FabricObserverWebApi service is deployed: The output (a local file) is created for and used by the API service (http://localhost:5000/api/ObserverManager). - // SF Health Report processor will also emit ETW telemetry if configured in ApplicationManifest.xml. - // As with all observers, you should first determine the good (normal) states across resource usage before you set thresholds for the bad ones. + // FabricSystemObserver monitors all Fabric system service processes across various resource usage metrics: + // CPU Time, Private Workingset, Ephemeral and Total Active TCP ports, File Handles, Threads. public class FabricSystemObserver : ObserverBase { - private string[] processWatchList; + private readonly List processWatchList; private Stopwatch stopwatch; // Health Report data container - For use in analysis to determine health state. 
@@ -39,6 +36,7 @@ public class FabricSystemObserver : ObserverBase private ConcurrentDictionary> allActiveTcpPortData; private ConcurrentDictionary> allEphemeralTcpPortData; private ConcurrentDictionary> allHandlesData; + private ConcurrentDictionary> allThreadsData; // Windows only. (EventLog). private List evtRecordList = null; @@ -51,7 +49,39 @@ public class FabricSystemObserver : ObserverBase public FabricSystemObserver(FabricClient fabricClient, StatelessServiceContext context) : base(fabricClient, context) { - + // Linux + if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + processWatchList = new List + { + "Fabric", + "FabricDCA.dll", + "FabricDnsService", + "FabricCAS.dll", + "FabricFAS.dll", + "FabricGateway.exe", + "FabricHost", + "FabricIS.dll", + "FabricRM.exe", + "FabricUS.dll" + }; + } + else + { + // Windows + processWatchList = new List + { + "Fabric", + "FabricApplicationGateway", + "FabricDCA", + "FabricDnsService", + "FabricFAS", + "FabricGateway", + "FabricHost", + "FabricIS", + "FabricRM" + }; + } } public int CpuErrorUsageThresholdPct @@ -119,6 +149,31 @@ public int AllocatedHandlesError get; set; } + public bool EnableConcurrentMonitoring + { + get; set; + } + + public ParallelOptions ParallelOptions + { + get; set; + } + + public int ThreadCountError + { + get; set; + } + + public int ThreadCountWarning + { + get; set; + } + + public int TotalThreadsAllSystemServices + { + get; set; + } + public override async Task ObserveAsync(CancellationToken token) { // If set, this observer will only run during the supplied interval. 
@@ -138,12 +193,10 @@ public override async Task ObserveAsync(CancellationToken token) { Initialize(); - _ = Parallel.For (0, processWatchList.Length, ObserverManager.ParallelOptions, async (i, state) => + _ = Parallel.ForEach(processWatchList, ParallelOptions, async (procName, state) => { Token.ThrowIfCancellationRequested(); - string procName = processWatchList[i]; - try { string dotnet = string.Empty; @@ -196,28 +249,39 @@ public override Task ReportAsync(CancellationToken token) { Token.ThrowIfCancellationRequested(); - string memHandlesInfo = string.Empty; + string info = string.Empty; if (allMemData != null) { - memHandlesInfo += $"Fabric memory: {allMemData["Fabric"].AverageDataValue} MB{Environment.NewLine}" + - $"FabricDCA memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue} MB{Environment.NewLine}" + - $"FabricGateway memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue} MB{Environment.NewLine}" + + info += $"Fabric memory: {allMemData["Fabric"].AverageDataValue} MB{Environment.NewLine}" + + $"FabricDCA memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue} MB{Environment.NewLine}" + + $"FabricGateway memory: {allMemData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue} MB{Environment.NewLine}" + - // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. - (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? - $"FabricHost memory: {allMemData["FabricHost"].AverageDataValue} MB{Environment.NewLine}" : string.Empty); + // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. + (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? 
+ $"FabricHost memory: {allMemData["FabricHost"].AverageDataValue} MB{Environment.NewLine}" : string.Empty); } if (allHandlesData != null) { - memHandlesInfo += $"Fabric file handles: {allHandlesData["Fabric"].AverageDataValue}{Environment.NewLine}" + - $"FabricDCA file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue}{Environment.NewLine}" + - $"FabricGateway file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue}{Environment.NewLine}" + + info += $"Fabric file handles: {allHandlesData["Fabric"].AverageDataValue}{Environment.NewLine}" + + $"FabricDCA file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue}{Environment.NewLine}" + + $"FabricGateway file handles: {allHandlesData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue}{Environment.NewLine}" + + + // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. + (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? + $"FabricHost file handles: {allHandlesData["FabricHost"]?.AverageDataValue}{Environment.NewLine}" : string.Empty); + } + + if (allThreadsData != null) + { + info += $"Fabric threads: {allThreadsData["Fabric"].AverageDataValue}{Environment.NewLine}" + + $"FabricDCA threads: {allThreadsData.FirstOrDefault(x => x.Key.Contains("FabricDCA")).Value.AverageDataValue}{Environment.NewLine}" + + $"FabricGateway threads: {allThreadsData.FirstOrDefault(x => x.Key.Contains("FabricGateway")).Value.AverageDataValue}{Environment.NewLine}" + - // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. - (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? 
- $"FabricHost file handles: {allHandlesData["FabricHost"]?.AverageDataValue}" : string.Empty); + // On Windows, FO runs as NetworkUser by default and therefore can't monitor FabricHost process, which runs as System. + (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? + $"FabricHost threads: {allThreadsData["FabricHost"]?.AverageDataValue}" : string.Empty); } // Informational report. @@ -228,7 +292,8 @@ public override Task ReportAsync(CancellationToken token) NodeName = NodeName, HealthMessage = $"TCP ports in use by Fabric System services: {TotalActivePortCountAllSystemServices}{Environment.NewLine}" + $"Ephemeral TCP ports in use by Fabric System services: {TotalActiveEphemeralPortCountAllSystemServices}{Environment.NewLine}" + - $"File handles in use by Fabric System services: {TotalAllocatedHandlesAllSystemServices}{Environment.NewLine}{memHandlesInfo}", + $"File handles in use by Fabric System services: {TotalAllocatedHandlesAllSystemServices}{Environment.NewLine}" + + $"Threads in use by Fabric System services: {TotalThreadsAllSystemServices}{Environment.NewLine}{info}", State = HealthState.Ok, HealthReportTimeToLive = timeToLiveWarning, @@ -241,6 +306,7 @@ public override Task ReportAsync(CancellationToken token) TotalActivePortCountAllSystemServices = 0; TotalActiveEphemeralPortCountAllSystemServices = 0; TotalAllocatedHandlesAllSystemServices = 0; + TotalThreadsAllSystemServices = 0; // CPU if (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0) @@ -272,8 +338,20 @@ public override Task ReportAsync(CancellationToken token) ProcessResourceDataList(allHandlesData, AllocatedHandlesError, AllocatedHandlesWarning); } + // Threads + if (ThreadCountError > 0 || ThreadCountWarning > 0) + { + ProcessResourceDataList(allThreadsData, ThreadCountError, ThreadCountWarning); + } + + // No need to progress on Linux. 
+ if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) + { + return Task.CompletedTask; + } + // Windows Event Log - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && IsObserverWebApiAppDeployed && monitorWinEventLog) + if (IsObserverWebApiAppDeployed && monitorWinEventLog) { // SF Eventlog Errors? // Write this out to a new file, for use by the web front end log viewer. @@ -490,43 +568,9 @@ private void Initialize() { Token.ThrowIfCancellationRequested(); - // Linux - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - processWatchList = new [] - { - "Fabric", - "FabricDCA.dll", - "FabricDnsService", - "FabricCAS.dll", - "FabricFAS.dll", - "FabricGateway.exe", - "FabricHost", - "FabricIS.dll", - "FabricRM.exe", - "FabricUS.dll" - }; - } - else - { - // Windows - processWatchList = new [] - { - "Fabric", - "FabricApplicationGateway", - "FabricDCA", - "FabricDnsService", - "FabricFAS", - "FabricGateway", - "FabricHost", - "FabricIS", - "FabricRM" - }; - } - // fabric:/System MonitoredAppCount = 1; - MonitoredServiceProcessCount = processWatchList.Length; + MonitoredServiceProcessCount = processWatchList.Count; int frudCapacity = 4; if (UseCircularBuffer) @@ -626,6 +670,23 @@ private void Initialize() } } + // Threads + if (allThreadsData == null && (ThreadCountError > 0 || ThreadCountWarning > 0)) + { + allThreadsData = new ConcurrentDictionary>(); + + foreach (var proc in processWatchList) + { + _ = allThreadsData.TryAdd( + proc, + new FabricResourceUsageData( + ErrorWarningProperty.TotalThreadCount, + proc, + frudCapacity, + UseCircularBuffer)); + } + } + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && monitorWinEventLog && evtRecordList == null) { evtRecordList = new List(); @@ -692,6 +753,14 @@ private void SetThresholdSFromConfiguration() AllocatedHandlesError = threshold; } + var threadCountError = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.FabricSystemObserverErrorThreadCount); + + if 
(!string.IsNullOrEmpty(threadCountError)) + { + _ = int.TryParse(threadCountError, out int threshold); + ThreadCountError = threshold; + } + /* Warning thresholds */ Token.ThrowIfCancellationRequested(); @@ -750,6 +819,29 @@ private void SetThresholdSFromConfiguration() AllocatedHandlesWarning = threshold; } + // Threads + var threadCountWarning = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.FabricSystemObserverWarningThreadCount); + + if (!string.IsNullOrEmpty(threadCountWarning)) + { + _ = int.TryParse(threadCountWarning, out int threshold); + ThreadCountWarning = threshold; + } + + // Concurrency/Parallelism support. + if (bool.TryParse(GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableConcurrentMonitoring), out bool enableConcurrency)) + { + EnableConcurrentMonitoring = enableConcurrency; + } + + ParallelOptions = new ParallelOptions + { + // Parallelism only makes sense for capable CPU configurations. The minimum requirement is 4 logical processors; which would map to more than 1 available core. + MaxDegreeOfParallelism = EnableConcurrentMonitoring && Environment.ProcessorCount >= 4 ? -1 : 1, + CancellationToken = Token, + TaskScheduler = TaskScheduler.Default + }; + // Monitor Windows event log for SF and System Error/Critical events? // This can be noisy. Use wisely. Return if running on Linux. if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) @@ -799,8 +891,6 @@ private async Task GetProcessInfoAsync(string procName) { // Ports - Active TCP All int activePortCount = OSInfoProvider.Instance.GetActiveTcpPortCount(process.Id, FabricServiceContext); - - // This is used for info report. 
TotalActivePortCountAllSystemServices += activePortCount; if (ActiveTcpPortCountError > 0 || ActiveTcpPortCountWarning > 0) @@ -810,8 +900,6 @@ private async Task GetProcessInfoAsync(string procName) // Ports - Active TCP Ephemeral int activeEphemeralPortCount = OSInfoProvider.Instance.GetActiveEphemeralPortCount(process.Id, FabricServiceContext); - - // This is used for info report. TotalActiveEphemeralPortCountAllSystemServices += activeEphemeralPortCount; if (ActiveEphemeralPortCountError > 0 || ActiveEphemeralPortCountWarning > 0) @@ -821,14 +909,16 @@ private async Task GetProcessInfoAsync(string procName) // Allocated Handles float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); - - // This is used for info report. TotalAllocatedHandlesAllSystemServices += handles; + + // Threads + int threads = ProcessInfoProvider.GetProcessThreadCount(process.Id); + TotalThreadsAllSystemServices += threads; // No need to proceed further if there are no configuration settings for CPU, Memory, Handles thresholds. // Returning here is correct as supplied thresholds apply to all system services. if (CpuErrorUsageThresholdPct <= 0 && CpuWarnUsageThresholdPct <= 0 && MemErrorUsageThresholdMb <= 0 && MemWarnUsageThresholdMb <= 0 - && AllocatedHandlesError <= 0 && AllocatedHandlesWarning <= 0) + && AllocatedHandlesError <= 0 && AllocatedHandlesWarning <= 0 && ThreadCountError <= 0 && ThreadCountWarning <= 0) { return; } @@ -839,6 +929,12 @@ private async Task GetProcessInfoAsync(string procName) allHandlesData[dotnetArg].Data.Add(handles); } + // Threads + if (ThreadCountError > 0 || ThreadCountWarning > 0) + { + allThreadsData[dotnetArg].Data.Add(threads); + } + CpuUsage cpuUsage = new CpuUsage(); // Mem @@ -924,7 +1020,7 @@ private void ProcessResourceDataList( fileName = $"FabricSystemServices{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? 
"_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; } - _ = Parallel.ForEach (data, ObserverManager.ParallelOptions, (state) => + _ = Parallel.ForEach (data, ParallelOptions, (state) => { Token.ThrowIfCancellationRequested(); @@ -948,6 +1044,7 @@ private void ProcessResourceDataList( ErrorWarningProperty.TotalActivePorts => "Active TCP Ports", ErrorWarningProperty.TotalEphemeralPorts => "Active Ephemeral Ports", ErrorWarningProperty.TotalFileHandlesPct => "Allocated (in use) File Handles %", + ErrorWarningProperty.TotalThreadCount => "Threads", _ => propertyName }; @@ -996,8 +1093,6 @@ private void ProcessResourceDataList( private void CleanUp() { - processWatchList = null; - if (allCpuData != null && !allCpuData.Any(frud => frud.Value.ActiveErrorOrWarning)) { allCpuData?.Clear(); diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 336c609b..179e6d09 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -98,20 +98,6 @@ public static HealthState ObserverFailureHealthStateLevel get; set; } = HealthState.Unknown; - /// - /// This is for observers that support parallelized monitor loops. - /// AppObserver, ContainerObserver, FabricSystemObserver. - /// - public static ParallelOptions ParallelOptions - { - get; set; - } - - public static bool EnableConcurrentExecution - { - get; set; - } - private ObserverHealthReporter HealthReporter { get; @@ -225,13 +211,6 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie } HealthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); - - ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = EnableConcurrentExecution && Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = linkedSFRuntimeObserverTokenSource?.Token ?? 
token, - TaskScheduler = TaskScheduler.Default - }; } public async Task StartObserversAsync() @@ -263,7 +242,9 @@ public async Task StartObserversAsync() // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more // than 90 days. - if (FabricObserverOperationalTelemetryEnabled && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval) + if (FabricObserverOperationalTelemetryEnabled + && DateTime.UtcNow.Subtract(StartDateTime) >= OperationalTelemetryRunInterval + && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval) { try { @@ -633,7 +614,7 @@ private FabricObserverOperationalEventData GetFabricObserverInternalTelemetryDat Version = InternalVersionNumber, EnabledObserverCount = observers.Count(obs => obs.IsEnabled), HasPlugins = hasPlugins, - ParallelExecutionEnabled = EnableConcurrentExecution, + ParallelExecutionCapable = Environment.ProcessorCount >= 4, ObserverData = GetObserverData(), }; } @@ -793,21 +774,8 @@ private async void CodePackageActivationContext_ConfigurationPackageModifiedEven private void SetPropertiesFromConfigurationParameters(ConfigurationSettings settings = null) { ApplicationName = FabricServiceContext.CodePackageActivationContext.ApplicationName; - - // Parallelization settings for capable hardware. \\ - if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableConcurrentExecution, settings), out bool enableConcurrency)) - { - EnableConcurrentExecution = enableConcurrency; - } - ParallelOptions = new ParallelOptions - { - // Parallelism only makes sense for capable CPU configurations. The minimum requirement is 4 logical processors; which would map to more than 1 available core. - MaxDegreeOfParallelism = EnableConcurrentExecution && Environment.ProcessorCount >= 4 ? 
-1 : 1, - CancellationToken = linkedSFRuntimeObserverTokenSource?.Token ?? token, - TaskScheduler = TaskScheduler.Default - }; // ETW - Overridable if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableETWProvider, settings), out bool etwEnabled)) diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 6fa64914..15aa790b 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -1,7 +1,7 @@ 
- + - - - @@ -110,6 +107,11 @@ + + + @@ -209,6 +211,10 @@
+ + @@ -239,6 +245,8 @@ + +
@@ -325,6 +333,7 @@
+ diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 52c6aceb..005ab8c1 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -14,15 +14,14 @@ - - + - - + + @@ -64,11 +63,7 @@ - + @@ -103,6 +98,8 @@ + + @@ -111,6 +108,8 @@ + + @@ -120,6 +119,8 @@ + + @@ -131,6 +132,8 @@ + + @@ -183,6 +186,7 @@
+ @@ -226,6 +230,7 @@
+ @@ -249,6 +254,8 @@ + +
@@ -320,6 +327,7 @@
+ @@ -335,7 +343,6 @@ -
diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 1e3eb678..c0ac86b4 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -1,5 +1,4 @@ using System; -using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Fabric; @@ -13,7 +12,6 @@ using System.Threading.Tasks; using ClusterObserver; using FabricObserver.Observers; -using FabricObserver.Observers.MachineInfoModel; using FabricObserver.Observers.Utilities; using Microsoft.VisualStudio.TestTools.UnitTesting; using HealthReport = FabricObserver.Observers.Utilities.HealthReport; @@ -36,38 +34,154 @@ namespace FabricObserverTests [TestClass] public class ObserverTest { - private static readonly Uri ServiceName = new Uri("fabric:/app/service"); private const string NodeName = "_Node_0"; + private static readonly Uri ServiceName = new Uri("fabric:/app/service"); + private static readonly bool isSFRuntimePresentOnTestMachine = IsLocalSFRuntimePresent(); + private static readonly CancellationToken token = new CancellationToken(); + private static readonly FabricClient fabricClient = new FabricClient(); private static readonly ICodePackageActivationContext CodePackageContext = new MockCodePackageActivationContext( - ServiceName.AbsoluteUri, - "applicationType", - "Code", - "1.0.0.0", - Guid.NewGuid().ToString(), - @"C:\Log", - @"C:\Temp", - @"C:\Work", - "ServiceManifest", - "1.0.0.0"); + ServiceName.AbsoluteUri, + "applicationType", + "Code", + "1.0.0.0", + Guid.NewGuid().ToString(), + @"C:\Log", + @"C:\Temp", + @"C:\Work", + "ServiceManifest", + "1.0.0.0"); private static readonly StatelessServiceContext context = new StatelessServiceContext( - new NodeContext(NodeName, new NodeId(0, 1), 0, "NodeType0", "TEST.MACHINE"), - CodePackageContext, - "FabricObserver.FabricObserverType", - ServiceName, - null, - Guid.NewGuid(), - long.MaxValue); - - private static readonly bool 
isSFRuntimePresentOnTestMachine; - private static readonly CancellationToken token = new CancellationToken(); - private static readonly FabricClient fabricClient = new FabricClient(); + new NodeContext(NodeName, new NodeId(0, 1), 0, "NodeType0", "TEST.MACHINE"), + CodePackageContext, + "FabricObserver.FabricObserverType", + ServiceName, + null, + Guid.NewGuid(), + long.MaxValue); + /* Helpers */ + + private static bool IsLocalSFRuntimePresent() + { + try + { + var ps = Process.GetProcessesByName("Fabric"); + return ps.Length != 0; + } + catch (InvalidOperationException) + { + return false; + } + } - static ObserverTest() + private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) { - isSFRuntimePresentOnTestMachine = IsLocalSFRuntimePresent(); + // Clear any existing user app, node or fabric:/System app Test Health Reports. + try + { + var healthReport = new HealthReport + { + Code = FOErrorWarningCodes.Ok, + HealthMessage = "Clearing existing Error/Warning Test Health Reports.", + State = HealthState.Ok, + ReportType = HealthReportType.Application, + NodeName = "_Node_0" + }; + + var logger = new Logger("TestCleanUp"); + var client = new FabricClient(); + + // App reports + if (obs is { HasActiveFabricErrorOrWarning: true } && obs.ObserverName != ObserverConstants.NetworkObserverName) + { + if (obs.AppNames.Count > 0 && obs.AppNames.All(a => !string.IsNullOrWhiteSpace(a) && a.Contains("fabric:/"))) + { + foreach (var app in obs.AppNames) + { + try + { + var appName = new Uri(app); + var appHealth = await client.HealthManager.GetApplicationHealthAsync(appName).ConfigureAwait(false); + var unhealthyEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)); + + if (unhealthyEvents == null) + { + continue; + } + + foreach (HealthEvent evt in unhealthyEvents) + { + healthReport.AppName = 
appName; + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + + var healthReporter = new ObserverHealthReporter(logger, client); + healthReporter.ReportHealthToServiceFabric(healthReport); + + Thread.Sleep(50); + } + } + catch (FabricException) + { + + } + } + } + } + + // System reports + var sysAppHealth = await client.HealthManager.GetApplicationHealthAsync(new Uri("fabric:/System")).ConfigureAwait(false); + + if (sysAppHealth != null) + { + foreach (var evt in sysAppHealth.HealthEvents?.Where( + s => s.HealthInformation.SourceId.Contains("FabricSystemObserver") + && (s.HealthInformation.HealthState == HealthState.Error + || s.HealthInformation.HealthState == HealthState.Warning))) + { + healthReport.AppName = new Uri("fabric:/System"); + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + + var healthReporter = new ObserverHealthReporter(logger, client); + healthReporter.ReportHealthToServiceFabric(healthReport); + + Thread.Sleep(50); + } + } + + // Node reports + var nodeHealth = await client.HealthManager.GetNodeHealthAsync(context.NodeContext.NodeName).ConfigureAwait(false); + + var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where( + s => s.HealthInformation.SourceId.Contains("NodeObserver") + || s.HealthInformation.SourceId.Contains("DiskObserver") + && (s.HealthInformation.HealthState == HealthState.Error + || s.HealthInformation.HealthState == HealthState.Warning)); + + healthReport.ReportType = HealthReportType.Node; + + if (unhealthyFONodeEvents != null) + { + foreach (HealthEvent evt in unhealthyFONodeEvents) + { + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + + var healthReporter = new ObserverHealthReporter(logger, client); + healthReporter.ReportHealthToServiceFabric(healthReport); + + Thread.Sleep(50); + } + } + } + catch (FabricException) + { + + 
} } private static bool InstallCerts() @@ -107,7 +221,7 @@ private static void UnInstallCerts() } [ClassCleanup] - public static void TestClassCleanup() + public static async Task TestClassCleanupAsync() { // Remove any files generated. try @@ -124,9 +238,11 @@ public static void TestClassCleanup() } - CleanupTestHealthReportsAsync().GetAwaiter().GetResult(); + await CleanupTestHealthReportsAsync(); } + /* End Helpers */ + [TestMethod] public void AppObserver_Constructor_Test() { @@ -139,8 +255,6 @@ public void AppObserver_Constructor_Test() Assert.IsTrue(obs.ObserverLogger != null); Assert.IsTrue(obs.HealthReporter != null); Assert.IsTrue(obs.ObserverName == ObserverConstants.AppObserverName); - - } [TestMethod] @@ -267,10 +381,6 @@ public void SFConfigurationObserver_Constructor_Test() /****** NOTE: These tests below do NOT work without a running local SF cluster or in an Azure DevOps VSTest Pipeline ******/ - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() { @@ -286,18 +396,12 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() ObserverManager.FabricClientInstance = client; ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? 
-1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; using var obs = new AppObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json") + ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -329,18 +433,12 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is ObserverManager.FabricClientInstance = client; ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; using var obs = new AppObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.oldstyle.json") + ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.oldstyle.json"), + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -357,10 +455,6 @@ public async Task AppObserver_ObserveAsync_OldConfigStyle_Successful_Observer_Is await CleanupTestHealthReportsAsync(obs).ConfigureAwait(true); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
[TestMethod] public async Task ContainerObserver_ObserveAsync_Successful_Observer_IsHealthy() { @@ -376,17 +470,11 @@ public async Task ContainerObserver_ObserveAsync_Successful_Observer_IsHealthy() ObserverManager.FabricClientInstance = client; ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; using var obs = new ContainerObserver(client, context) { - ConfigurationFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "ContainerObserver.config.json") + ConfigurationFilePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "ContainerObserver.config.json"), + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -543,15 +631,12 @@ public async Task CertificateObserver_expiredAndexpiringCerts() // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); + + // stop clears health warning await obsMgr.StopObserversAsync().ConfigureAwait(true); - await Task.Delay(1000).ConfigureAwait(true); Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } - /// - /// NodeObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Fail. - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task NodeObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Fail() { @@ -587,10 +672,6 @@ public async Task NodeObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Fa Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
[TestMethod] public async Task NodeObserver_Negative_Integer_CPU_Mem_Ports_Firewalls_Values_No_Exceptions_In_Intialize() { @@ -627,10 +708,6 @@ public async Task NodeObserver_Negative_Integer_CPU_Mem_Ports_Firewalls_Values_N Assert.IsTrue(obs.LastRunDateTime > startDateTime); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task NodeObserver_Negative_Integer_Thresholds_CPU_Mem_Ports_Firewalls_All_Data_Containers_Are_Null() { @@ -674,10 +751,6 @@ public async Task NodeObserver_Negative_Integer_Thresholds_CPU_Mem_Ports_Firewal Assert.IsTrue(obs.LastRunDateTime > startDateTime); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task OSObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarningsOrErrors() { @@ -830,14 +903,11 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin // Output file is not empty. Assert.IsTrue((await File.ReadAllLinesAsync(outputFilePath).ConfigureAwait(false)).Length > 0); + // Stop clears health warning await obsMgr.StopObserversAsync().ConfigureAwait(false); Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task NetworkObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarningsOrErrors() { @@ -869,10 +939,6 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_IsHealthy_NoW Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFile_ObsWebDeployed() { @@ -919,10 +985,6 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFi Assert.IsTrue((await File.ReadAllLinesAsync(outputFilePath).ConfigureAwait(false)).Length > 0); } - /// - /// . 
- /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_WarningsOrErrorsDetected() { @@ -958,14 +1020,12 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); + + // Stop clears health warning await obsMgr.StopObserversAsync().ConfigureAwait(false); Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task SFConfigurationObserver_ObserveAsync_Successful_Observer_IsHealthy() { @@ -1013,10 +1073,6 @@ public async Task SFConfigurationObserver_ObserveAsync_Successful_Observer_IsHea Assert.IsTrue((await File.ReadAllLinesAsync(outputFilePath).ConfigureAwait(false)).Length > 0); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealthy_NoWarningsOrErrors() { @@ -1041,19 +1097,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = false; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? 
-1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { IsEnabled = true, DataCapacity = 5, - MonitorDuration = TimeSpan.FromSeconds(1) + MonitorDuration = TimeSpan.FromSeconds(1), + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -1070,10 +1120,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealthy_MemoryWarningsOrErrorsDetected() { @@ -1090,19 +1136,13 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { IsEnabled = true, MonitorDuration = TimeSpan.FromSeconds(1), - MemWarnUsageThresholdMb = 5 // This will definitely cause Warning alerts. + MemWarnUsageThresholdMb = 5, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -1117,11 +1157,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth Assert.IsFalse(obs.IsUnhealthy); } - - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
[TestMethod] public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealthy_ActiveTcpPortsWarningsOrErrorsDetected() { @@ -1146,18 +1181,12 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ActiveTcpPortCountWarning = 5 // This will definitely cause Warning. + ActiveTcpPortCountWarning = 5, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token); @@ -1173,10 +1202,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealthy_EphemeralPortsWarningsOrErrorsDetected() { @@ -1201,18 +1226,12 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - ActiveEphemeralPortCountWarning = 1 // This will definitely cause Warning. 
+ ActiveEphemeralPortCountWarning = 1, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token).ConfigureAwait(true); @@ -1228,10 +1247,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealthy_HandlesWarningsOrErrorsDetected() { @@ -1256,18 +1271,12 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - AllocatedHandlesWarning = 100 // This will definitely cause Warning. + AllocatedHandlesWarning = 100, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token).ConfigureAwait(true); @@ -1283,10 +1292,6 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
[TestMethod] public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Unhandled_Exception() { @@ -1311,18 +1316,12 @@ public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Un ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? -1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - CpuWarnUsageThresholdPct = -42 + CpuWarnUsageThresholdPct = -42, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token).ConfigureAwait(true); @@ -1337,10 +1336,6 @@ public async Task FabricSystemObserver_Negative_Integer_CPU_Warn_Threshold_No_Un Assert.IsFalse(obs.IsUnhealthy); } - /// - /// . - /// - /// A representing the result of the asynchronous operation. [TestMethod] public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Threshold_No_Unhandled_Exception() { @@ -1365,18 +1360,12 @@ public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Thresho ObserverManager.TelemetryEnabled = false; ObserverManager.EtwEnabled = false; ObserverManager.FabricClientInstance = client; - ObserverManager.EnableConcurrentExecution = true; - ObserverManager.ParallelOptions = new ParallelOptions - { - MaxDegreeOfParallelism = Environment.ProcessorCount >= 4 ? 
-1 : 1, - CancellationToken = token, - TaskScheduler = TaskScheduler.Default - }; - + using var obs = new FabricSystemObserver(client, context) { MonitorDuration = TimeSpan.FromSeconds(1), - CpuWarnUsageThresholdPct = 420 + CpuWarnUsageThresholdPct = 420, + EnableConcurrentMonitoring = true }; await obs.ObserveAsync(token).ConfigureAwait(true); @@ -1390,128 +1379,5 @@ public async Task FabricSystemObserver_Integer_Greater_Than_100_CPU_Warn_Thresho // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); } - - /***** End Tests that require a currently running SF Cluster. *****/ - - private static bool IsLocalSFRuntimePresent() - { - try - { - var ps = Process.GetProcessesByName("Fabric"); - return ps.Length != 0; - } - catch (InvalidOperationException) - { - return false; - } - } - - private static async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) - { - // Clear any existing user app, node or fabric:/System app Test Health Reports. - try - { - var healthReport = new HealthReport - { - Code = FOErrorWarningCodes.Ok, - HealthMessage = "Clearing existing Error/Warning Test Health Reports.", - State = HealthState.Ok, - ReportType = HealthReportType.Application, - NodeName = "_Node_0" - }; - - var logger = new Logger("TestCleanUp"); - var client = new FabricClient(); - - // App reports - if (obs is {HasActiveFabricErrorOrWarning: true} && obs.ObserverName != ObserverConstants.NetworkObserverName) - { - if (obs.AppNames.Count > 0 && obs.AppNames.All(a => !string.IsNullOrWhiteSpace(a) && a.Contains("fabric:/"))) - { - foreach (var app in obs.AppNames) - { - try - { - var appName = new Uri(app); - var appHealth = await client.HealthManager.GetApplicationHealthAsync(appName).ConfigureAwait(false); - var unhealthyEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) - && (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == 
HealthState.Warning)); - - if (unhealthyEvents == null) - { - continue; - } - - foreach (HealthEvent evt in unhealthyEvents) - { - healthReport.AppName = appName; - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; - - var healthReporter = new ObserverHealthReporter(logger, client); - healthReporter.ReportHealthToServiceFabric(healthReport); - - Thread.Sleep(50); - } - } - catch (FabricException) - { - - } - } - } - } - - // System reports - var sysAppHealth = await client.HealthManager.GetApplicationHealthAsync(new Uri("fabric:/System")).ConfigureAwait(false); - - if (sysAppHealth != null) - { - foreach (var evt in sysAppHealth.HealthEvents?.Where( - s => s.HealthInformation.SourceId.Contains("FabricSystemObserver") - && (s.HealthInformation.HealthState == HealthState.Error - || s.HealthInformation.HealthState == HealthState.Warning))) - { - healthReport.AppName = new Uri("fabric:/System"); - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; - - var healthReporter = new ObserverHealthReporter(logger, client); - healthReporter.ReportHealthToServiceFabric(healthReport); - - Thread.Sleep(50); - } - } - - // Node reports - var nodeHealth = await client.HealthManager.GetNodeHealthAsync(context.NodeContext.NodeName).ConfigureAwait(false); - - var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where( - s => s.HealthInformation.SourceId.Contains("NodeObserver") - || s.HealthInformation.SourceId.Contains("DiskObserver") - && (s.HealthInformation.HealthState == HealthState.Error - || s.HealthInformation.HealthState == HealthState.Warning)); - - healthReport.ReportType = HealthReportType.Node; - - if (unhealthyFONodeEvents != null) - { - foreach (HealthEvent evt in unhealthyFONodeEvents) - { - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; - - var healthReporter = new 
ObserverHealthReporter(logger, client); - healthReporter.ReportHealthToServiceFabric(healthReport); - - Thread.Sleep(50); - } - } - } - catch (FabricException) - { - - } - } } } \ No newline at end of file diff --git a/TelemetryLib/FabricObserverOperationalEventData.cs b/TelemetryLib/FabricObserverOperationalEventData.cs index 8c0d8531..3d260359 100644 --- a/TelemetryLib/FabricObserverOperationalEventData.cs +++ b/TelemetryLib/FabricObserverOperationalEventData.cs @@ -31,9 +31,9 @@ public bool HasPlugins get; set; } - public bool ParallelExecutionEnabled - { - get; set; + public bool ParallelExecutionCapable + { + get; set; } public List ObserverData diff --git a/TelemetryLib/ObserverData.cs b/TelemetryLib/ObserverData.cs index 9fc29a59..2e9422ca 100644 --- a/TelemetryLib/ObserverData.cs +++ b/TelemetryLib/ObserverData.cs @@ -34,5 +34,10 @@ public int MonitoredServiceProcessCount { get; set; } + + public bool ConcurrencyEnabled + { + get; set; + } } } diff --git a/TelemetryLib/TelemetryEvents.cs b/TelemetryLib/TelemetryEvents.cs index 4328f9d2..29ce0865 100644 --- a/TelemetryLib/TelemetryEvents.cs +++ b/TelemetryLib/TelemetryEvents.cs @@ -82,7 +82,7 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat { "NodeNameHash", nodeHashString }, { "FOVersion", foData.Version }, { "HasPlugins", foData.HasPlugins.ToString() }, - { "ParallelExecution", foData.ParallelExecutionEnabled.ToString() }, + { "ParallelCapable", foData.ParallelExecutionCapable.ToString() }, { "UpTime", foData.UpTime }, { "Timestamp", DateTime.UtcNow.ToString("o") }, { "OS", foData.OS } @@ -106,6 +106,7 @@ public bool EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat const string apps = "TotalMonitoredApps"; const string procs = "TotalMonitoredServiceProcesses"; const string conts = "TotalMonitoredContainers"; + const string parallel = "ConcurrencyEnabled"; foreach (var obData in foData.ObserverData) { @@ -131,6 +132,16 @@ public bool 
EmitFabricObserverOperationalEvent(FabricObserverOperationalEventDat } metrics.Add(key, data); + + } + + // Concurrency + if (obData.ObserverName.Contains("AppObserver") || obData.ObserverName.Contains("FabricSystemObserver") + || obData.ObserverName.Contains("ContainerObserver")) + { + data = ((AppServiceObserverData)obData).ConcurrencyEnabled == false ? 0 : 1; + key = $"{obData.ObserverName}{parallel}"; + metrics.Add(key, data); } // AzureStorage and SFConfig observers do not generate health events. From ed41afbb8b2ea89b142bd28165798fe175fb6bc3 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Sat, 9 Oct 2021 10:35:29 -0700 Subject: [PATCH 33/35] Remove ForceGC logic. --- FabricObserver/Observers/ObserverManager.cs | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 179e6d09..ca90c572 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -32,7 +32,6 @@ public class ObserverManager : IDisposable private readonly string nodeName; private readonly List observers; private readonly TimeSpan OperationalTelemetryRunInterval = TimeSpan.FromDays(1); - private readonly TimeSpan ForcedGCInterval = TimeSpan.FromMinutes(15); private readonly CancellationToken token; private readonly IEnumerable serviceCollection; private volatile bool shutdownSignaled; @@ -123,11 +122,6 @@ private int MaxArchivedLogFileLifetimeDays get; } - private DateTime LastForcedGCDateTime - { - get; set; - } - private DateTime LastTelemetrySendDate { get; set; @@ -242,9 +236,7 @@ public async Task StartObserversAsync() // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more // than 90 days. 
- if (FabricObserverOperationalTelemetryEnabled - && DateTime.UtcNow.Subtract(StartDateTime) >= OperationalTelemetryRunInterval - && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval) + if (FabricObserverOperationalTelemetryEnabled && DateTime.UtcNow.Subtract(LastTelemetrySendDate) >= OperationalTelemetryRunInterval) { try { @@ -275,14 +267,6 @@ public async Task StartObserversAsync() } } - // Force Gen0-Gen2 collection with compaction, including LOH. This runs every 15 minutes. - if (DateTime.UtcNow.Subtract(LastForcedGCDateTime) >= ForcedGCInterval) - { - GCSettings.LargeObjectHeapCompactionMode = GCLargeObjectHeapCompactionMode.CompactOnce; - GC.Collect(2, GCCollectionMode.Forced, true, true); - LastForcedGCDateTime = DateTime.UtcNow; - } - if (ObserverExecutionLoopSleepSeconds > 0) { await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds), token); From 7474bb3f482c9b9017cbac29e8fdfcdc5296a9fa Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Sat, 9 Oct 2021 13:59:03 -0700 Subject: [PATCH 34/35] 3.1.18 --- Build-FabricObserver.ps1 | 2 +- .../Utilities/Telemetry/TelemetryData.cs | 11 ++++- FabricObserver/Observers/AppObserver.cs | 49 ------------------- FabricObserver/Observers/ContainerObserver.cs | 22 --------- .../Observers/FabricSystemObserver.cs | 34 ------------- FabricObserver/Observers/NodeObserver.cs | 47 ------------------ FabricObserver/Observers/ObserverManager.cs | 49 +++++-------------- .../ApplicationManifest.xml | 8 +-- 8 files changed, 26 insertions(+), 196 deletions(-) diff --git a/Build-FabricObserver.ps1 b/Build-FabricObserver.ps1 index 68041150..88385c41 100644 --- a/Build-FabricObserver.ps1 +++ b/Build-FabricObserver.ps1 @@ -26,7 +26,7 @@ function Update-ApplicationManifestForLinux { $NewFileContent += $newNode } - elseif ($FileContent[$i] -like "**") { + elseif ($FileContent[$i] -like "**") { $NewFileContent += $newNode2 } diff --git 
a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index 5132e3b1..198f05a0 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -107,8 +107,15 @@ public TelemetryData() public TelemetryData(FabricClient fabricClient, CancellationToken cancellationToken) { - var (clusterId, _, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, cancellationToken).Result; - ClusterId = clusterId; + try + { + var (clusterId, _, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, cancellationToken).Result; + ClusterId = clusterId; + } + catch (AggregateException) + { + + } } } } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 35374fd6..59cfd0e4 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -116,7 +116,6 @@ public override async Task ObserveAsync(CancellationToken token) await MonitorDeployedAppsAsync(token); await ReportAsync(token); stopwatch.Stop(); - CleanUp(); RunDuration = stopwatch.Elapsed; if (EnableVerboseLogging) @@ -1671,54 +1670,6 @@ any processes (children) that the service process (parent) created/spawned. 
*/ } } - private void CleanUp() - { - deployedTargetList?.Clear(); - deployedTargetList = null; - - userTargetList?.Clear(); - userTargetList = null; - - ReplicaOrInstanceList?.Clear(); - ReplicaOrInstanceList = null; - - if (AllAppCpuData != null && AllAppCpuData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppCpuData?.Clear(); - AllAppCpuData = null; - } - - if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppEphemeralPortsData?.Clear(); - AllAppEphemeralPortsData = null; - } - - if (AllAppHandlesData != null && AllAppHandlesData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppHandlesData?.Clear(); - AllAppHandlesData = null; - } - - if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppMemDataMb?.Clear(); - AllAppMemDataMb = null; - } - - if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppMemDataPercent?.Clear(); - AllAppMemDataPercent = null; - } - - if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => frud.Value != null && !frud.Value.ActiveErrorOrWarning)) - { - AllAppTotalActivePortsData?.Clear(); - AllAppTotalActivePortsData = null; - } - } - private void LogAllAppResourceDataToCsv(string appName) { if (!EnableCsvLogging) diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index f70c5f3a..3576286c 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -80,7 +80,6 @@ public override async Task ObserveAsync(CancellationToken token) await ReportAsync(token); } - CleanUp(); runDurationTimer.Stop(); RunDuration = runDurationTimer.Elapsed; @@ -702,26 +701,5 @@ private async Task 
SetInstanceOrReplicaMonitoringList( ReplicaOrInstanceList.Enqueue(replicaInfo); } } - - private void CleanUp() - { - deployedTargetList?.Clear(); - deployedTargetList = null; - - ReplicaOrInstanceList?.Clear(); - ReplicaOrInstanceList = null; - - userTargetList?.Clear(); - userTargetList = null; - - if (!HasActiveFabricErrorOrWarning) - { - allCpuDataPercentage?.Clear(); - allCpuDataPercentage = null; - - allMemDataMB?.Clear(); - allMemDataMB = null; - } - } } } \ No newline at end of file diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 18ac2701..c6aa5369 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -228,7 +228,6 @@ public override async Task ObserveAsync(CancellationToken token) } await ReportAsync(token).ConfigureAwait(true); - CleanUp(); // The time it took to run this observer to completion. stopwatch.Stop(); @@ -1090,38 +1089,5 @@ private void ProcessResourceDataList( HealthReportType.Application); }); } - - private void CleanUp() - { - if (allCpuData != null && !allCpuData.Any(frud => frud.Value.ActiveErrorOrWarning)) - { - allCpuData?.Clear(); - allCpuData = null; - } - - if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.Value.ActiveErrorOrWarning)) - { - allEphemeralTcpPortData?.Clear(); - allEphemeralTcpPortData = null; - } - - if (allHandlesData != null && !allHandlesData.Any(frud => frud.Value.ActiveErrorOrWarning)) - { - allHandlesData?.Clear(); - allHandlesData = null; - } - - if (allMemData != null && !allMemData.Any(frud => frud.Value.ActiveErrorOrWarning)) - { - allMemData?.Clear(); - allMemData = null; - } - - if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud.Value.ActiveErrorOrWarning)) - { - allActiveTcpPortData?.Clear(); - allActiveTcpPortData = null; - } - } } } diff --git a/FabricObserver/Observers/NodeObserver.cs 
b/FabricObserver/Observers/NodeObserver.cs index 59f90efb..ab83c6e3 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -139,7 +139,6 @@ public override async Task ObserveAsync(CancellationToken token) Initialize(); await GetSystemCpuMemoryValuesAsync(token).ConfigureAwait(true); await ReportAsync(token).ConfigureAwait(true); - CleanUp(); // The time it took to run this observer. stopwatch.Stop(); @@ -729,51 +728,5 @@ error on these conditions. } } } - - private void CleanUp() - { - if (ActivePortsData != null && !ActivePortsData.ActiveErrorOrWarning) - { - ActivePortsData = null; - } - - if (CpuTimeData != null && !CpuTimeData.ActiveErrorOrWarning) - { - CpuTimeData = null; - } - - if (EphemeralPortsData != null && !EphemeralPortsData.ActiveErrorOrWarning) - { - EphemeralPortsData = null; - } - - if (MemDataInUse != null && !MemDataInUse.ActiveErrorOrWarning) - { - MemDataInUse = null; - } - - if (MemDataPercent != null && !MemDataPercent.ActiveErrorOrWarning) - { - MemDataPercent = null; - } - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && FirewallData != null && !FirewallData.ActiveErrorOrWarning) - { - FirewallData = null; - } - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) - { - if (LinuxFileHandlesDataPercentAllocated != null && !LinuxFileHandlesDataPercentAllocated.ActiveErrorOrWarning) - { - LinuxFileHandlesDataPercentAllocated = null; - } - - if (LinuxFileHandlesDataTotalAllocated != null && !LinuxFileHandlesDataTotalAllocated.ActiveErrorOrWarning) - { - LinuxFileHandlesDataTotalAllocated = null; - } - } - } } } \ No newline at end of file diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index ca90c572..b8b7bf94 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -231,7 +231,7 @@ public async Task StartObserversAsync() break; } - _ = await 
RunObserversAsync().ConfigureAwait(false); + await RunObserversAsync().ConfigureAwait(false); // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more @@ -912,25 +912,22 @@ private void SignalAbortToRunningObserver() /// Runs all observers in a sequential loop. ///
/// A boolean value indicating success of a complete observer loop run. - private async Task RunObserversAsync() + private async Task RunObserversAsync() { - var exceptionBuilder = new StringBuilder(); - bool allExecuted = true; - for (int i = 0; i < observers.Count; ++i) { var observer = observers[i]; if (isConfigurationUpdateInProgress) { - return true; + return; } try { if (TaskCancelled || shutdownSignaled) { - return true; + return; } // Is it healthy? @@ -1043,51 +1040,29 @@ await File.WriteAllLinesAsync( } } } - catch (AggregateException ex) + catch (AggregateException ae) when (ae.InnerExceptions.Any(e => e is FabricException || + e is OperationCanceledException || + e is TaskCanceledException)) { - if (ex.InnerException is FabricException || - ex.InnerException is OperationCanceledException || - ex.InnerException is TaskCanceledException) + if (isConfigurationUpdateInProgress) { - if (isConfigurationUpdateInProgress) - { - return true; - } - - continue; + return; } - _ = exceptionBuilder.AppendLine($"Handled AggregateException from {observer.ObserverName}:{Environment.NewLine}{ex.InnerException}"); - allExecuted = false; + continue; } catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) { if (isConfigurationUpdateInProgress) { - return true; + return; } - - _ = exceptionBuilder.AppendLine($"Handled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); - allExecuted = false; } catch (Exception e) { - Logger.LogWarning($"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); - allExecuted = false; + Logger.LogWarning($"Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); } } - - if (allExecuted) - { - Logger.LogInfo(ObserverConstants.AllObserversExecutedMessage); - } - else - { - Logger.LogWarning(exceptionBuilder.ToString()); - _ = exceptionBuilder.Clear(); - } - - return allExecuted; } } } diff --git 
a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 005ab8c1..4ccd0609 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -347,10 +347,10 @@ - + --> + --> From 6318e6188609606fbaeb5a8a969fced112df23f6 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Sun, 10 Oct 2021 13:21:20 -0700 Subject: [PATCH 35/35] FO 3.1.18 RC --- Documentation/Observers.md | 6 +- FabricObserver/Observers/AppObserver.cs | 93 ++++++++++--------- FabricObserver/Observers/ContainerObserver.cs | 4 +- .../Observers/FabricSystemObserver.cs | 4 +- FabricObserver/Observers/ObserverManager.cs | 44 ++++++--- 5 files changed, 88 insertions(+), 63 deletions(-) diff --git a/Documentation/Observers.md b/Documentation/Observers.md index 9476a995..8239232f 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -147,7 +147,7 @@ AppObserver also supports non-JSON parameters for configuration unrelated to thr #### Non-json settings set in ApplicationManifest.xml -Version 3.1.18 introduces support for concurrent service process monitoring and reporting by AppObserver. You can enable/disable this feature by setting the boolean value for AppObserverEnableConcurrentMonitoring. Note that this is disabled by default. +**Version 3.1.18 introduces support for concurrent service process monitoring and reporting by AppObserver**. You can enable/disable this feature by setting the boolean value for AppObserverEnableConcurrentMonitoring. Note that this is disabled by default. If your compute configuration includes multiple CPUs (logical processors >= 4) and you monitor several services, then you should consider enabling this capability as it will significantly decrease the time it takes AppObserver to complete monitoring/reporting. 
If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. @@ -313,7 +313,7 @@ Monitors CPU and Memory use of Service Fabric containerized (docker) services. **In order for ContainerObserver to function properly on Windows, FabricObserver must be configured to run as Admin or System user.** This is not the case for Linux deployments. -Version 3.1.18 introduces support for concurrent docker stats data parsing and reporting by ContainerObserver. You can enable/disable this feature by setting the boolean value for ContainerObserverEnableConcurrentMonitoring. Note that this is disabled by default. +**Version 3.1.18 introduces support for concurrent docker stats data parsing and reporting by ContainerObserver**. You can enable/disable this feature by setting the boolean value for ContainerObserverEnableConcurrentMonitoring. Note that this is disabled by default. If your compute configuration includes multiple CPUs (logical processors >= 4) and you monitor several containerized services, then you should consider enabling this capability as it will significantly decrease the time it takes ContainerObserver to complete monitoring/reporting. If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. @@ -445,7 +445,7 @@ By default, FabricObserver runs as NetworkUser on Windows and sfappsuser on Linu running as System or root, default FabricObserver can't monitor process behavior (this is always true on Windows). That said, there are only a few system services you would care about: Fabric.exe and FabricGateway.exe. Fabric.exe is generally the system service that your code can directly impact with respect to machine resource usage. -Version 3.1.18 introduces support for concurrent service process monitoring and reporting by FabricSystemObserver. You can enable/disable this feature by setting the boolean value for ContainerObserverEnableConcurrentMonitoring. 
Note that this is disabled by default. +**Version 3.1.18 introduces support for concurrent service process monitoring and reporting by FabricSystemObserver**. You can enable/disable this feature by setting the boolean value for FabricSystemObserverEnableConcurrentMonitoring. Note that this is disabled by default. If your compute configuration includes multiple CPUs (logical processors >= 4), then you should consider enabling this capability as it will significantly decrease the time it takes FabricSystemObserver to complete monitoring/reporting. If you do not have a capable CPU configuration, then enabling concurrent monitoring will not do anything. diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 59cfd0e4..4013f7d3 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -29,6 +29,7 @@ namespace FabricObserver.Observers // in AppObserver.config.json. This observer will also emit telemetry (ETW, LogAnalytics/AppInsights) if enabled in Settings.xml (ObserverManagerConfiguration) and ApplicationManifest.xml (AppObserverEnableEtw). public class AppObserver : ObserverBase { + // Support for concurrent monitoring. private ConcurrentDictionary> AllAppCpuData; private ConcurrentDictionary> AllAppMemDataMb; private ConcurrentDictionary> AllAppMemDataPercent; @@ -37,10 +38,12 @@ public class AppObserver : ObserverBase private ConcurrentDictionary> AllAppHandlesData; private ConcurrentDictionary> AllAppThreadsData; - // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. + // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. List is thread-safe for reads. + // There are no concurrent writes for this List. private List userTargetList; // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. 
+ // List is thread-safe for reads. There are no concurrent writes for this List. private List deployedTargetList; private readonly ConfigSettings configSettings; private string fileName; @@ -57,6 +60,7 @@ public bool EnableChildProcessMonitoring get; set; } + // List is thread-safe for reads. There are no concurrent writes for this List. public List ReplicaOrInstanceList { get; set; @@ -135,8 +139,9 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); + TimeSpan TTL = GetHealthReportTimeToLive(); + // This will run sequentially if the underlying CPU config does not meet the requirements for concurrency (e.g., if logical procs < 4). _ = Parallel.ForEach(ReplicaOrInstanceList, ParallelOptions, (repOrInst, state) => { token.ThrowIfCancellationRequested(); @@ -214,13 +219,13 @@ public override Task ReportAsync(CancellationToken token) // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( - parentFrud, - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + TTL, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Memory MB - Parent process @@ -234,13 +239,13 @@ public override Task ReportAsync(CancellationToken token) } ProcessResourceDataReportHealth( - parentFrud, - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + TTL, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Memory Percent - Parent process @@ -254,13 +259,13 @@ public override Task ReportAsync(CancellationToken token) } ProcessResourceDataReportHealth( - parentFrud, - app.MemoryErrorLimitPercent, - app.MemoryWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.MemoryErrorLimitPercent, + app.MemoryWarningLimitPercent, + TTL, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Active - Parent process @@ -274,13 +279,13 @@ public override Task ReportAsync(CancellationToken token) } ProcessResourceDataReportHealth( - parentFrud, - app.NetworkErrorActivePorts, - app.NetworkWarningActivePorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.NetworkErrorActivePorts, + app.NetworkWarningActivePorts, + TTL, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // TCP Ports - Ephemeral (port 
numbers fall in the dynamic range) - Parent process @@ -294,13 +299,13 @@ public override Task ReportAsync(CancellationToken token) } ProcessResourceDataReportHealth( - parentFrud, - app.NetworkErrorEphemeralPorts, - app.NetworkWarningEphemeralPorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError && EnableProcessDumps); + parentFrud, + app.NetworkErrorEphemeralPorts, + app.NetworkWarningEphemeralPorts, + TTL, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError && EnableProcessDumps); } // Allocated (in use) Handles - Parent process @@ -317,7 +322,7 @@ public override Task ReportAsync(CancellationToken token) parentFrud, app.ErrorOpenFileHandles, app.WarningOpenFileHandles, - healthReportTimeToLive, + TTL, HealthReportType.Application, repOrInst, app.DumpProcessOnError && EnableProcessDumps); @@ -337,7 +342,7 @@ public override Task ReportAsync(CancellationToken token) parentFrud, app.ErrorThreadCount, app.WarningThreadCount, - healthReportTimeToLive, + TTL, HealthReportType.Application, repOrInst, app.DumpProcessOnError && EnableProcessDumps); @@ -930,7 +935,7 @@ private async Task InitializeAsync() int repCount = ReplicaOrInstanceList.Count; - // For use in internal telemetry. + // For use in internal diagnostic telemetry. MonitoredServiceProcessCount = repCount; MonitoredAppCount = deployedTargetList.Count; @@ -991,8 +996,8 @@ private void SetDumpPath() private Task MonitorDeployedAppsAsync(CancellationToken token) { Stopwatch execTimer = Stopwatch.StartNew(); - ConcurrentQueue exceptions = new ConcurrentQueue(); int capacity = ReplicaOrInstanceList.Count; + var exceptions = new ConcurrentQueue(); AllAppCpuData ??= new ConcurrentDictionary>(); AllAppMemDataMb ??= new ConcurrentDictionary>(); AllAppMemDataPercent ??= new ConcurrentDictionary>(); @@ -1225,7 +1230,7 @@ any processes (children) that the service process (parent) created/spawned. 
*/ id, token); } - catch (AggregateException e) when (e.InnerException is OperationCanceledException || e.InnerException is TaskCanceledException) + catch (AggregateException e) when (e.InnerExceptions.Any(ex => ex is OperationCanceledException || ex is TaskCanceledException)) { state.Stop(); } @@ -1261,7 +1266,7 @@ private void ComputeResourceUsage( string id, CancellationToken token) { - _ = Parallel.ForEach(procs, (proc, state) => + _ = Parallel.ForEach(procs, ParallelOptions, (proc, state) => { int procId = proc.Value; string procName = proc.Key; @@ -1445,8 +1450,6 @@ private void ComputeResourceUsage( } } } - - Thread.Sleep(150); } }); } diff --git a/FabricObserver/Observers/ContainerObserver.cs b/FabricObserver/Observers/ContainerObserver.cs index 3576286c..a8c66d13 100644 --- a/FabricObserver/Observers/ContainerObserver.cs +++ b/FabricObserver/Observers/ContainerObserver.cs @@ -442,10 +442,10 @@ fed0da6f7bad sf-243-723d5795-01c7-477f-950e-45a400000000_2cc293c0-929c-5c40-bc RedirectStandardError = true }; - var output = new ConcurrentQueue(); + var output = new List(); using Process p = new Process(); p.ErrorDataReceived += (sender, e) => { error += e.Data; }; - p.OutputDataReceived += (sender, e) => { if (!string.IsNullOrWhiteSpace(e.Data)) { output.Enqueue(e.Data); } }; + p.OutputDataReceived += (sender, e) => { if (!string.IsNullOrWhiteSpace(e.Data)) { output.Add(e.Data); } }; p.StartInfo = ps; _ = p.Start(); diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index c6aa5369..074e25cc 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -1013,6 +1013,7 @@ private void ProcessResourceDataList( where T : struct { string fileName = null; + TimeSpan TTL = GetHealthReportTimeToLive(); if (EnableCsvLogging) { @@ -1080,12 +1081,11 @@ private void ProcessResourceDataList( } } - // This function will clear Data items in list (will call 
Clear() on the supplied FabricResourceUsageData instance's Data field..) ProcessResourceDataReportHealth( dataItem, thresholdError, thresholdWarning, - GetHealthReportTimeToLive(), + TTL, HealthReportType.Application); }); } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index b8b7bf94..505a3f39 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -112,7 +112,7 @@ private Logger Logger get; } - private TimeSpan ObserverExecutionTimeout + private TimeSpan ObserverExecutionTimeout { get; set; } = TimeSpan.FromMinutes(30); @@ -231,7 +231,7 @@ public async Task StartObserversAsync() break; } - await RunObserversAsync().ConfigureAwait(false); + _ = await RunObserversAsync().ConfigureAwait(false); // Identity-agnostic internal operational telemetry sent to Service Fabric team (only) for use in // understanding generic behavior of FH in the real world (no PII). This data is sent once a day and will be retained for no more @@ -912,22 +912,25 @@ private void SignalAbortToRunningObserver() /// Runs all observers in a sequential loop. ///
/// A boolean value indicating success of a complete observer loop run. - private async Task RunObserversAsync() + private async Task RunObserversAsync() { + var exceptionBuilder = new StringBuilder(); + bool allExecuted = true; + for (int i = 0; i < observers.Count; ++i) { var observer = observers[i]; if (isConfigurationUpdateInProgress) { - return; + return true; } try { if (TaskCancelled || shutdownSignaled) { - return; + return true; } // Is it healthy? @@ -1040,29 +1043,48 @@ await File.WriteAllLinesAsync( } } } - catch (AggregateException ae) when (ae.InnerExceptions.Any(e => e is FabricException || - e is OperationCanceledException || - e is TaskCanceledException)) + catch (AggregateException ex) when (ex.InnerExceptions.Any(e => e.InnerException is FabricException || + e.InnerException is OperationCanceledException || + e.InnerException is TaskCanceledException)) { if (isConfigurationUpdateInProgress) { - return; + return true; } + _ = exceptionBuilder.AppendLine($"Handled AggregateException from {observer.ObserverName}:{Environment.NewLine}{ex.InnerException}"); + allExecuted = false; + continue; } catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) { if (isConfigurationUpdateInProgress) { - return; + return true; } + + _ = exceptionBuilder.AppendLine($"Handled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); + allExecuted = false; } catch (Exception e) { - Logger.LogWarning($"Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); + Logger.LogWarning($"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); + allExecuted = false; } } + + if (allExecuted) + { + Logger.LogInfo(ObserverConstants.AllObserversExecutedMessage); + } + else + { + Logger.LogWarning(exceptionBuilder.ToString()); + _ = exceptionBuilder.Clear(); + } + + return allExecuted; } } }