From c3ab81e12b151fbe889f7f9bc97d27f9622c4bd7 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 23 Jun 2021 21:07:33 -0700 Subject: [PATCH 01/14] FO 3.1.15: Process Tree Monitoring (AppObs) --- Build-SFPkgs.ps1 | 8 +- .../Utilities/FabricResourceUsageData.cs | 14 +- .../OperatingSystemInfo/LinuxInfoProvider.cs | 2 +- .../ProcessInfo/IProcessInfoProvider.cs | 11 + .../ProcessInfo/LinuxProcessInfoProvider.cs | 45 ++ .../ProcessInfo/ProcessInfoProvider.cs | 15 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 128 +++- FabricObserver.nuspec.template | 11 +- FabricObserver/Observers/AppObserver.cs | 697 +++++++++++++----- .../PackageRoot/ServiceManifest._linux.xml | 8 +- .../PackageRoot/ServiceManifest.xml | 8 +- .../ApplicationManifest.xml | 12 +- README.md | 2 +- 13 files changed, 699 insertions(+), 262 deletions(-) diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index c450c32f..677e2c8f 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.14" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.14" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.14" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg 
"Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.14" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs index e4afae77..8eda8028 100644 --- a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs +++ b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs @@ -64,27 +64,27 @@ public FabricResourceUsageData( } /// - /// Gets the name of the machine resource property this instance represents. + /// Gets or sets the name of the machine resource property this instance represents. /// public string Property { - get; + get; set; } /// - /// Gets the unique Id of this instance. + /// Gets or sets the unique Id of this instance. /// public string Id { - get; + get; set; } /// - /// Gets the unit of measure for the data (%, MB/GB, etc). + /// Gets or sets the unit of measure for the data (%, MB/GB, etc). 
/// public string Units { - get; + get; set; } /// @@ -93,7 +93,7 @@ public string Units /// public IList Data { - get; + get; set; } private bool isInWarningState; diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs index cec02061..e33240ed 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs @@ -288,7 +288,7 @@ public override int GetTotalAllocatedFileHandlesCount() } // https://loune.net/2017/06/running-shell-bash-commands-in-net-core/ - internal static class LinuxShellHelper + public static class LinuxShellHelper { /// /// This string extension will run a supplied linux bash command and return the console output. diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index 61e4ea43..5755f079 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -3,6 +3,8 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System.Collections.Generic; +using System.Diagnostics; using System.Fabric; namespace FabricObserver.Observers.Utilities @@ -18,5 +20,14 @@ public interface IProcessInfoProvider /// StatelessServiceContext instance. /// float value representing number of allocated file handles for the process. float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); + + /// + /// Returns a list of Process objects that are active descendants (e.g., children and grandchildren) of the provided Process object. 
+ /// + /// + /// + List GetChildProcesses(Process process); + + void Dispose(); } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index ac8f9ec6..b288402c 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -3,8 +3,11 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System; +using System.Collections.Generic; using System.Diagnostics; using System.Fabric; +using System.Linq; namespace FabricObserver.Observers.Utilities { @@ -64,5 +67,47 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return result; } + + public override List GetChildProcesses(Process process) + { + // https://askubuntu.com/questions/512871/find-children-of-the-process (the concatenation below is parenthesized so Bash() runs the whole ps|awk pipeline, not just the trailing string literal) + string cmdResult = ("ps -o ppid= -o pid= -A | awk '$1 == " + process.Id.ToString() + " {print $2}'").Bash(); + List childProcesses = new List(); + + if (!string.IsNullOrWhiteSpace(cmdResult)) + { + var sPids = cmdResult.Split('\n')?.ToList(); + + if (sPids.Count > 0) + { + foreach (string pid in sPids) + { + if (int.TryParse(pid, out int proc)) + { + try + { + Process p = Process.GetProcessById(proc); + childProcesses.Add(p); + } + catch (ArgumentException) + { + // ignore -> process may no longer exist + } + catch (InvalidOperationException ie) + { + Logger.LogWarning("GetFlattenedProcessFamilyTree: Unsuccessful bash cmd (ps - o ppid = -o pid = -A | awk '$1 == " + process.Id.ToString() + " {print $2}')" + ie.ToString()); + } + } + } + } + } + + return childProcesses; + } + + protected override void Dispose(bool disposing) + { + // nothing to do here. 
+ } } } \ No newline at end of file diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index ca24a183..8d92a02d 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -3,12 +3,15 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System; +using System.Collections.Generic; +using System.Diagnostics; using System.Fabric; using System.Runtime.InteropServices; namespace FabricObserver.Observers.Utilities { - public abstract class ProcessInfoProvider : IProcessInfoProvider + public abstract class ProcessInfoProvider : IProcessInfoProvider, IDisposable { private static IProcessInfoProvider instance; private static readonly object lockObj = new object(); @@ -39,6 +42,12 @@ public static IProcessInfoProvider Instance } } + public void Dispose() + { + Dispose(disposing: true); + instance = null; + } + protected Logger Logger { get; @@ -47,5 +56,9 @@ protected Logger Logger public abstract float GetProcessPrivateWorkingSetInMB(int processId); public abstract float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); + + public abstract List GetChildProcesses(Process process); + + protected abstract void Dispose(bool disposing); } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 099799ce..6b391305 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -4,21 +4,37 @@ // ------------------------------------------------------------ using System; +using 
System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; using System.Fabric; +using System.Management; namespace FabricObserver.Observers.Utilities { public class WindowsProcessInfoProvider : ProcessInfoProvider { - const string CategoryName = "Process"; + const string ProcessCategoryName = "Process"; + const string WorkingSetCounterName = "Working Set - Private"; + const string FileHandlesCounterName = "Handle Count"; private readonly object memPerfCounterLock = new object(); private readonly object fileHandlesPerfCounterLock = new object(); + private PerformanceCounter memProcessPrivateWorkingSetCounter = new PerformanceCounter + { + CategoryName = ProcessCategoryName, + CounterName = WorkingSetCounterName, + ReadOnly = true + }; + + private PerformanceCounter processFileHandleCounter = new PerformanceCounter + { + CategoryName = ProcessCategoryName, + CounterName = FileHandlesCounterName, + ReadOnly = true + }; public override float GetProcessPrivateWorkingSetInMB(int processId) { - const string WorkingSetCounterName = "Working Set - Private"; string processName; try @@ -37,37 +53,24 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) lock (memPerfCounterLock) { - PerformanceCounter memProcessPrivateWorkingSetCounter = null; - try { - memProcessPrivateWorkingSetCounter = new PerformanceCounter - { - CategoryName = CategoryName, - CounterName = WorkingSetCounterName, - InstanceName = processName - }; - + memProcessPrivateWorkingSetCounter.InstanceName = processName; return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); } catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. 
return 0F; } catch (Exception e) { - Logger.LogError($"{CategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); + Logger.LogError($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); throw; } - finally - { - memProcessPrivateWorkingSetCounter?.Dispose(); - memProcessPrivateWorkingSetCounter = null; - } } } @@ -79,7 +82,6 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return -1F; } - const string FileHandlesCounterName = "Handle Count"; string processName; try @@ -98,38 +100,98 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService lock (fileHandlesPerfCounterLock) { - PerformanceCounter processFileHandleCounter = null; - try { - processFileHandleCounter = new PerformanceCounter - { - CategoryName = CategoryName, - CounterName = FileHandlesCounterName, - InstanceName = processName - }; - + processFileHandleCounter.InstanceName = processName; return processFileHandleCounter.NextValue(); } catch (Exception e) when (e is InvalidOperationException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{ProcessCategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. 
return -1F; } catch (Exception e) { - Logger.LogError($"{CategoryName} {FileHandlesCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); + Logger.LogError($"{ProcessCategoryName} {FileHandlesCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); throw; } - finally + } + } + + public override List GetChildProcesses(Process process) + { + List childProcesses = new List(); + string query = $"select processid from win32_process where parentprocessid = {process.Id}"; + + try + { + using (var searcher = new ManagementObjectSearcher(query)) { - processFileHandleCounter?.Dispose(); - processFileHandleCounter = null; + var results = searcher.Get(); + + using (ManagementObjectCollection.ManagementObjectEnumerator enumerator = results.GetEnumerator()) + { + while (enumerator.MoveNext()) + { + try + { + using (ManagementObject mObj = (ManagementObject)enumerator.Current) + { + object childProcessObj = mObj.Properties["processid"].Value; + + if (childProcessObj == null) + { + continue; + } + + Process childProcess = Process.GetProcessById(Convert.ToInt32(childProcessObj)); + + if (childProcess != null) + { + if (childProcess.ProcessName == "conhost") + { + continue; + } + + childProcesses.Add(childProcess); + + // Now get child of child, if exists. 
+ List grandChildren = GetChildProcesses(childProcess); + + if (grandChildren?.Count > 0) + { + childProcesses.AddRange(grandChildren); + } + } + } + } + catch (Exception e) when (e is ArgumentException || e is ManagementException) + { + Logger.LogWarning($"[Inner try-catch (enumeration)] Handled Exception in GetChildProcesses: {e}"); + continue; + } + } + } } } + catch (ManagementException me) + { + Logger.LogWarning($"[Containing try-catch] Handled Exception in GetChildProcesses: {me}"); + } + + return childProcesses; + } + + protected override void Dispose(bool disposing) + { + this.memProcessPrivateWorkingSetCounter?.Dispose(); + this.memProcessPrivateWorkingSetCounter = null; + + this.processFileHandleCounter?.Dispose(); + this.processFileHandleCounter = null; } } } \ No newline at end of file diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 7d9ed2de..a01a096b 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,17 +2,8 @@ %PACKAGE_ID% - 3.1.14 + 3.1.15 - - Bug fix in FO plugin loader: native dependencies will not crash loader (and thus FO). - - For Plugin authors: You MUST place all plugin dependency libraries in the same folder as your plugin dll. - - Plugins (and their dependencies) can now live in child folders in the Plugins directory, which will keep things cleaner for folks with multiple plugins. - The Plugins folder/file structure MUST be: - Config/Data/Plugins/MyPlugin/MyPlugin.dll (required), MyPlugin.pdb (optional), [ALL of MyPlugin.dll's private dependencies] (required) - OR Config/Data/Plugins/MyPlugin.dll (required), MyPlugin.pdb(optional), [ALL of MyPlugin.dll's private dependencies] (required). - A private plugin dependency is any package that you reference in your plugin project that is not already referenced by FabricObserver. - So, things like Nuget packages or Project References or COM References that are only used by your plugin. 
It is important to stress that if a dependency dll has dependencies, - then you MUST also place those in the plugin's directory. Microsoft MIT diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index efb826fd..1859c591 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -17,14 +17,14 @@ using System.Threading.Tasks; using FabricObserver.Observers.MachineInfoModel; using FabricObserver.Observers.Utilities; +using FabricObserver.Observers.Utilities.Telemetry; +using Newtonsoft.Json; using ConfigSettings = FabricObserver.Observers.MachineInfoModel.ConfigSettings; namespace FabricObserver.Observers { - // This observer monitors the behavior of user SF service processes - // and signals Warning and Error based on user-supplied resource thresholds - // in AppObserver.config.json - // Health Report processor will also emit ETW telemetry if configured in Settings.xml. + // This observer monitors the behavior of user SF service processes (and their children) and signals Warning and Error based on user-supplied resource thresholds + // in AppObserver.config.json. This observer will also emit telemetry (ETW, LogAnalytics/AppInsights) if enabled in Settings.xml (ObserverManagerConfiguration) and ApplicationManifest.xml (AppObserverEnableEtw). public class AppObserver : ObserverBase { // Health Report data containers - For use in analysis to determine health state. @@ -35,7 +35,6 @@ public class AppObserver : ObserverBase private List> AllAppTotalActivePortsData; private List> AllAppEphemeralPortsData; private List> AllAppHandlesData; - private readonly Stopwatch stopwatch; // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. 
private List userTargetList; @@ -44,6 +43,7 @@ public class AppObserver : ObserverBase private List deployedTargetList; private readonly ConfigSettings configSettings; private string fileName; + private readonly Stopwatch stopwatch; public List ReplicaOrInstanceList { @@ -113,8 +113,6 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { - token.ThrowIfCancellationRequested(); - if (deployedTargetList.Count == 0) { return Task.CompletedTask; @@ -122,163 +120,367 @@ public override Task ReportAsync(CancellationToken token) var healthReportTimeToLive = GetHealthReportTimeToLive(); - // App-specific reporting. - foreach (var app in deployedTargetList) + foreach (var repOrInst in ReplicaOrInstanceList) { token.ThrowIfCancellationRequested(); - // Process data for reporting. - foreach (var repOrInst in ReplicaOrInstanceList) + string processName = null; + int processId = 0; + ApplicationInfo app = null; + + try { - token.ThrowIfCancellationRequested(); + app = deployedTargetList.Find( + a => a.TargetApp == repOrInst.ApplicationName.OriginalString || a.TargetAppType == repOrInst.ApplicationTypeName); + + using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); - if (!string.IsNullOrWhiteSpace(app.TargetAppType) - && !string.Equals( - repOrInst.ApplicationTypeName, - app.TargetAppType, - StringComparison.CurrentCultureIgnoreCase)) + // If the process is no longer running, then don't report on it. 
+ if (p.HasExited) { continue; } - if (!string.IsNullOrWhiteSpace(app.TargetApp) - && !string.Equals( - repOrInst.ApplicationName.OriginalString, - app.TargetApp, - StringComparison.CurrentCultureIgnoreCase)) + processName = p.ProcessName; + processId = p.Id; + } + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + ObserverLogger.LogWarning($"Handled Exception in ReportAsync:{Environment.NewLine}{e}"); + continue; + } + + string appNameOrType = GetAppNameOrType(repOrInst); + var id = $"{appNameOrType}:{processName}"; + + // Locally Log (csv) CPU/Mem/FileHandles/Ports per app service process. + if (EnableCsvLogging) + { + // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. + // Please use ContainerObserver for SF container app service monitoring. + if (processName == "Fabric") { continue; } - string processName = null; - int processId = 0; + fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - try + // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. + // This means a new folder will be added to the base path. + if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) { - using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); + CsvFileLogger.DataLogFolder = processName; + } - // If the process is no longer running, then don't report on it. - if (p.HasExited) - { - continue; - } + // Log pid.. + CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); - processName = p.ProcessName; - processId = p.Id; + // Log resource usage data to CSV files. 
+ LogAllAppResourceDataToCsv(id); + } + + // CPU - Parent process + if (AllAppCpuData.Any(x => x.Id == id)) + { + var parentFrud = AllAppCpuData.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppCpuData, processName, repOrInst, app, token); + + // This will only be true if the parent has child procs that are currently executing. + if (sumValues > parentDataAvg) + { + parentFrud.Data.Clear(); + parentFrud.Data.Add(sumValues); } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + + ProcessResourceDataReportHealth( + parentFrud, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // Memory MB - Parent process + if (AllAppMemDataMb.Any(x => x.Id == id)) + { + var parentFrud = AllAppMemDataMb.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, processName, repOrInst, app, token); + + if (sumValues > parentDataAvg) { - continue; + parentFrud.Data.Clear(); + parentFrud.Data.Add((float)sumValues); } - string appNameOrType = GetAppNameOrType(repOrInst); + // Parent's aggregated (summed) spawned process data. + // This will generate an SF health event if the combined total exceeds the supplied threshold. 
+ ProcessResourceDataReportHealth( + parentFrud, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } - var id = $"{appNameOrType}:{processName}"; + // Memory Percent - Parent process + if (AllAppMemDataPercent.Any(x => x.Id == id)) + { + var parentFrud = AllAppMemDataPercent.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, processName, repOrInst, app, token); - // Locally Log (csv) CPU/Mem/FileHandles/Ports per app service process. - if (EnableCsvLogging) + if (sumValues > parentDataAvg) { - // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. - // Please use ContainerObserver for SF container app service monitoring. - if (processName == "Fabric") - { - continue; - } + parentFrud.Data.Clear(); + parentFrud.Data.Add(sumValues); + } - fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; + // Parent's aggregated (summed) spawned process data. + ProcessResourceDataReportHealth( + parentFrud, + app.MemoryErrorLimitPercent, + app.MemoryWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } - // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. - // This means a new folder will be added to the base path. 
- if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) - { - CsvFileLogger.DataLogFolder = processName; - } + // TCP Ports - Active - Parent process + if (AllAppTotalActivePortsData.Any(x => x.Id == id)) + { + var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, processName, repOrInst, app, token); - // Log pid.. - CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); + if (sumValues > parentDataAvg) + { + parentFrud.Data.Clear(); + parentFrud.Data.Add((int)sumValues); + } + + ProcessResourceDataReportHealth( + parentFrud, + app.NetworkErrorActivePorts, + app.NetworkWarningActivePorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - Parent process + if (AllAppEphemeralPortsData.Any(x => x.Id == id)) + { + var parentFrud = AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, processName, repOrInst, app, token); - // Log resource usage data to CSV files. - LogAllAppResourceDataToCsv(id); + if (sumValues > parentDataAvg) + { + parentFrud.Data.Clear(); + parentFrud.Data.Add((int)sumValues); } - // CPU - if (AllAppCpuData.Any(x => x.Id == id)) + // Parent's aggregated (summed) process data. 
+ ProcessResourceDataReportHealth( + parentFrud, + app.NetworkErrorEphemeralPorts, + app.NetworkWarningEphemeralPorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // Allocated (in use) Handles - Parent process + if (AllAppHandlesData.Any(x => x.Id == id)) + { + var parentFrud = AllAppHandlesData.FirstOrDefault(x => x.Id == id); + var parentDataAvg = Math.Round(parentFrud.AverageDataValue); + double sumValues = Math.Round(parentDataAvg, 0); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppHandlesData, processName, repOrInst, app, token); + + if (sumValues > parentDataAvg) { - ProcessResourceDataReportHealth( - AllAppCpuData.FirstOrDefault(x => x.Id == id), - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + parentFrud.Data.Clear(); + parentFrud.Data.Add((float)sumValues); } - // Memory MB - if (AllAppMemDataMb.Any(x => x.Id == id)) + // Parent's aggregated (summed) spawned process data. 
+ ProcessResourceDataReportHealth( + parentFrud, + app.ErrorOpenFileHandles, + app.WarningOpenFileHandles, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + } + + return Task.CompletedTask; + } + + private double ProcessChildFrudsGetDataSum( + ref List> fruds, + string parentProcessName, + ReplicaOrInstanceMonitoringInfo repOrInst, + ApplicationInfo app, + CancellationToken token) where T : struct + { + double sumValues = 0; + + // Child processes (sum) + if (fruds.Any(x => x.Id.Contains(parentProcessName) && x.Id.Contains("_child"))) + { + var childFruds = fruds.Where(x => x.Id.Contains(parentProcessName) && x.Id.Contains("_child")).ToList(); + + foreach (var frud in childFruds) + { + token.ThrowIfCancellationRequested(); + + sumValues += Math.Round(frud.AverageDataValue, 0); + string childProcName = frud.Id.Split("_")[1]; + int childPid = 0; + Process[] ps = null; + + try { - ProcessResourceDataReportHealth( - AllAppMemDataMb.FirstOrDefault(x => x.Id == id), - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + ps = Process.GetProcessesByName(childProcName); + childPid = ps.Length > 0 ? ps[0].Id : 0; } + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { - // Memory Percent - if (AllAppMemDataPercent.Any(x => x.Id == id)) + } + finally { - ProcessResourceDataReportHealth( - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id), - app.MemoryErrorLimitPercent, - app.MemoryWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + foreach (var proc in ps ?? new Process[0]) + { + proc?.Dispose(); + } } - // TCP Ports - Active - if (AllAppTotalActivePortsData.Any(x => x.Id == id)) + if (IsEtwEnabled) { - ProcessResourceDataReportHealth( - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id), - app.NetworkErrorActivePorts, - app.NetworkWarningActivePorts, - 
healthReportTimeToLive, - HealthReportType.Application, - repOrInst); + var rawdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + Metric = frud.Property, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? childPid : -1, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue, + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); } - // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - if (AllAppEphemeralPortsData.Any(x => x.Id == id)) + if (IsTelemetryEnabled) { - ProcessResourceDataReportHealth( - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id), - app.NetworkErrorEphemeralPorts, - app.NetworkWarningEphemeralPorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst); + var telemData = new TelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + Metric = frud.Property, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + _ = TelemetryClient?.ReportMetricAsync(telemData, token); } - // Allocated (in use) Handles - if (AllAppHandlesData.Any(x => x.Id == id)) + if (frud.IsUnhealthy(app.MemoryWarningLimitMb)) { - ProcessResourceDataReportHealth( - AllAppHandlesData.FirstOrDefault(x => x.Id == id), - app.ErrorOpenFileHandles, - app.WarningOpenFileHandles, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst); + if (IsEtwEnabled) + { + var warningdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", + Level = "Warning", + Metric = frud.Property, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? childPid : -1, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, warningdata); + } + + if (IsTelemetryEnabled) + { + var telemWarnData = new TelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", + Metric = frud.Property, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + _ = TelemetryClient?.ReportHealthAsync(telemWarnData, token); + } + + var healthReport = new Utilities.HealthReport + { + AppName = repOrInst.ApplicationName, + Code = FOErrorWarningCodes.Ok, + EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, + HealthMessage = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", + HealthReportTimeToLive = GetHealthReportTimeToLive(), + ReportType = HealthReportType.Application, + State = HealthState.Ok, + NodeName = NodeName, + Observer = ObserverName, + Property = frud.Id, + ResourceUsageDataProperty = frud.Property, + SourceId = $"{ObserverName}({FOErrorWarningCodes.Ok})" + }; + + // Generate a Service Fabric Health Report. + HealthReporter.ReportHealthToServiceFabric(healthReport); } + + fruds.Remove(frud); } } - return Task.CompletedTask; + return sumValues; } private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst) @@ -292,9 +494,9 @@ private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst // be up to date across observer loop iterations. 
private async Task InitializeAsync() { - ReplicaOrInstanceList ??= new List(); - userTargetList ??= new List(); - deployedTargetList ??= new List(); + ReplicaOrInstanceList = new List(); + userTargetList = new List(); + deployedTargetList = new List(); configSettings.Initialize( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( @@ -555,7 +757,8 @@ private async Task InitializeAsync() private async Task MonitorDeployedAppsAsync(CancellationToken token) { - Process currentProcess = null; + Process parentProc = null; + Process childProc = null; int capacity = ReplicaOrInstanceList.Count; AllAppCpuData ??= new List>(capacity); AllAppMemDataMb ??= new List>(capacity); @@ -569,7 +772,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) token.ThrowIfCancellationRequested(); var timer = new Stopwatch(); - int processId = (int)repOrInst.HostProcessId; + int parentPid = (int)repOrInst.HostProcessId; var cpuUsage = new CpuUsage(); bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false; var application = deployedTargetList?.Find( @@ -577,6 +780,8 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); + List procTree = null; + if (application?.TargetApp == null && application?.TargetAppType == null) { continue; @@ -585,18 +790,18 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) try { // App level. - currentProcess = Process.GetProcessById(processId); - string procName = currentProcess.ProcessName; + parentProc = Process.GetProcessById(parentPid); + string parentProcName = parentProc.ProcessName; // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. // Please use ContainerObserver for SF container app service monitoring. 
- if (procName == "Fabric") + if (parentProcName == "Fabric") { continue; } string appNameOrType = GetAppNameOrType(repOrInst); - string id = $"{appNameOrType}:{procName}"; + string id = $"{appNameOrType}:{parentProcName}"; if (UseCircularBuffer) { @@ -658,17 +863,6 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) checkEphemeralPorts = true; } - // Measure Total and Ephemeral ports. - if (checkAllPorts) - { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(currentProcess.Id, FabricServiceContext)); - } - - if (checkEphemeralPorts) - { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext)); - } - // File Handles (FD on linux) if (AllAppHandlesData.All(list => list.Id != id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) { @@ -680,105 +874,204 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) checkHandles = true; } - // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. - if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) - { - continue; - } - /* CPU and Memory Usage */ - TimeSpan duration = TimeSpan.FromSeconds(1); - - if (MonitorDuration > TimeSpan.MinValue) + // Get list of child processes of parentProc should they exist. + // In order to provide accurate resource usage of an SF service process we need to also account for + // any processes (children) that the service process (parent) created/spawned. + procTree = new List { - duration = MonitorDuration; - } - /* Warm up counters. */ + // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, + // then only the parent process will be in this list. 
+ parentProc + }; + procTree.AddRange(ProcessInfoProvider.Instance.GetChildProcesses(parentProc)); - if (checkCpu) + foreach (Process proc in procTree) { - _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); - } - - if (checkHandles) - { - _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); - } + // Total TCP ports usage + if (checkAllPorts) + { + // Parent process (the service process) + if (proc.ProcessName == parentProcName) + { + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); + } + else + { + // Children (spawned by the parent service process) + if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); + } + } - if (checkMemMb || checkMemPct) - { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); - } + // Ephemeral TCP ports usage + if (checkEphemeralPorts) + { + if (proc.ProcessName == parentProcName) + { + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); + } + else + { + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == 
$"{parentProcName}_{proc.ProcessName}_child").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); + } + } - timer.Start(); + TimeSpan duration = TimeSpan.FromSeconds(1); - while (!currentProcess.HasExited && timer.Elapsed.Seconds <= duration.Seconds) - { - token.ThrowIfCancellationRequested(); + if (MonitorDuration > TimeSpan.MinValue) + { + duration = MonitorDuration; + } - if (checkCpu) + // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. + if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) { - // CPU (all cores). - double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); + continue; + } - if (cpu >= 0) - { - if (cpu > 100) - { - cpu = 100; - } + /* Warm up counters. */ - AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); - } + if (checkCpu) + { + _ = cpuUsage.GetCpuUsagePercentageProcess(proc); } - float processMem = 0; + if (checkHandles) + { + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(proc.Id, FabricServiceContext); + } if (checkMemMb || checkMemPct) { - processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(proc.Id); } - if (checkMemMb) + float processMem = 0; + + if (checkMemMb || checkMemPct) { - // Memory (private working set (process)). - AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(proc.Id); } - if (checkMemPct) + if (checkHandles) { - // Memory (percent in use (total)). 
- var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(proc.Id, FabricServiceContext); - if (TotalMemory > 0) + if (handles > -1) { - double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + if (proc.ProcessName == parentProc.ProcessName) + { + AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + } + else + { + if (!AllAppHandlesData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(handles); + } } } - if (checkHandles) + timer.Start(); + + while (!proc.HasExited && timer.Elapsed.Seconds <= duration.Seconds) { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); + token.ThrowIfCancellationRequested(); - if (handles > -1) + if (checkCpu) { - AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + // CPU (all cores). 
+ double cpu = cpuUsage.GetCpuUsagePercentageProcess(proc); + + if (cpu >= 0) + { + if (cpu > 100) + { + cpu = 100; + } + + if (proc.ProcessName == parentProc.ProcessName) + { + AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + } + else + { + if (!AllAppCpuData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + + AllAppCpuData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(cpu); + } + } + + // Memory (private working set (process)). + if (checkMemMb) + { + if (proc.ProcessName == parentProcName) + { + AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); + } + else + { + if (!AllAppMemDataMb.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + + AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(processMem); + } + } + + // Memory (percent in use (total)). 
+ if (checkMemPct) + { + var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + + if (TotalMemory > 0) + { + double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); + if (proc.ProcessName == parentProc.ProcessName) + { + AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + } + else + { + if (!AllAppMemDataPercent.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + { + AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + } + + AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(Math.Round(usedPct, 1)); + } + } + } } + + await Task.Delay(250, Token); } - await Task.Delay(250, Token); + timer.Stop(); + timer.Reset(); } - - timer.Stop(); - timer.Reset(); } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { ObserverLogger.LogWarning( - $"Handled exception in MonitorDeployedAppsAsync: process {processId} is not running or it's running at a higher privilege than FabricObserver.{Environment.NewLine}" + + $"Handled exception in MonitorDeployedAppsAsync: Process {parentPid} is not running or it's running at a higher privilege than FabricObserver.{Environment.NewLine}" + $"ServiceName: {repOrInst.ServiceName?.OriginalString ?? 
"unknown"}{Environment.NewLine}Error message: {e.Message}"); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) @@ -790,10 +1083,24 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } finally { - currentProcess?.Dispose(); - currentProcess = null; + if (procTree != null) + { + foreach (var p in procTree) + { + p?.Dispose(); + } + } } } + + try + { + ProcessInfoProvider.Instance.Dispose(); + } + catch (Exception e) + { + ObserverLogger.LogWarning($"Can't dispose ProcessInfoProvider.Instance:{Environment.NewLine}{e}"); + } } private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index bc25de37..0dae5929 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,10 +27,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 12b9eb8c..52abdf89 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,10 +21,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 3366420e..c3a4ee68 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + - + @@ -274,6 +274,9 @@ + \ No newline at end of file diff --git a/README.md b/README.md index 1ad32b68..60b3da43 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.14 
+# FabricObserver 3.1.15 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of machine resources that tend to be very important to all Service Fabric applications, like disk space consumption, CPU use, memory use, endpoint availability, ephemeral TCP port use, and app/cluster certificate health out-of-the-box. From b508a2d1a6014ddc9c25812e297335fb33e0820e Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 24 Jun 2021 18:03:24 -0700 Subject: [PATCH 02/14] FO 3.1.15 RC - Process Tree Monitoring --- .../ReplicaOrInstanceMonitoringInfo.cs | 7 + FabricObserver.Extensibility/ObserverBase.cs | 15 +- FabricObserver/Observers/AppObserver.cs | 359 ++++++++++-------- FabricObserver/Observers/ObserverManager.cs | 22 +- 4 files changed, 240 insertions(+), 163 deletions(-) diff --git a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs index a6b125a1..d5d95169 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs @@ -4,6 +4,8 @@ // ------------------------------------------------------------ using System; +using System.Collections.Generic; +using System.Diagnostics; namespace FabricObserver.Observers.MachineInfoModel { @@ -48,5 +50,10 @@ public string ServicePackageActivationId { get; set; } + + public List ChildProcesses + { + get; set; + } } } diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 882d64d4..1e87cb23 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -887,9 +887,22 @@ public void 
ProcessResourceDataReportHealth( } var healthMessage = new StringBuilder(); + string childProcMsg = string.Empty; + + if (replicaOrInstance != null && replicaOrInstance.ChildProcesses != null) + { + childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcesses.Count}). " + + $"Their cumulative impact on {name}'s resource usage has been applied."; + } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); - _ = healthMessage.Append($" - {data.Property}: {Math.Round(data.AverageDataValue, 0)}{data.Units}"); + _ = healthMessage.Append($" - {data.Property}: {Math.Round(data.AverageDataValue, 0)}{data.Units} "); + + if (childProcMsg != string.Empty) + { + _ = healthMessage.AppendLine(); + _ = healthMessage.AppendLine(childProcMsg); + } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 1859c591..5fe9dbd3 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -185,7 +185,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppCpuData.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppCpuData, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppCpuData, repOrInst, app, token); // This will only be true if the parent has child procs that are currently executing. 
if (sumValues > parentDataAvg) @@ -210,7 +210,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppMemDataMb.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, repOrInst, app, token); if (sumValues > parentDataAvg) { @@ -236,7 +236,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppMemDataPercent.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, repOrInst, app, token); if (sumValues > parentDataAvg) { @@ -261,7 +261,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, repOrInst, app, token); if (sumValues > parentDataAvg) { @@ -285,7 +285,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, repOrInst, app, token); if (sumValues > 
parentDataAvg) { @@ -310,7 +310,7 @@ public override Task ReportAsync(CancellationToken token) var parentFrud = AllAppHandlesData.FirstOrDefault(x => x.Id == id); var parentDataAvg = Math.Round(parentFrud.AverageDataValue); double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppHandlesData, processName, repOrInst, app, token); + sumValues += ProcessChildFrudsGetDataSum(ref AllAppHandlesData, repOrInst, app, token); if (sumValues > parentDataAvg) { @@ -335,148 +335,159 @@ public override Task ReportAsync(CancellationToken token) private double ProcessChildFrudsGetDataSum( ref List> fruds, - string parentProcessName, ReplicaOrInstanceMonitoringInfo repOrInst, ApplicationInfo app, CancellationToken token) where T : struct { + var childProcs = repOrInst.ChildProcesses; + + if (childProcs == null) + { + return 0; + } + double sumValues = 0; - // Child processes (sum) - if (fruds.Any(x => x.Id.Contains(parentProcessName) && x.Id.Contains("_child"))) + foreach (Process childProc in childProcs) { - var childFruds = fruds.Where(x => x.Id.Contains(parentProcessName) && x.Id.Contains("_child")).ToList(); + token.ThrowIfCancellationRequested(); - foreach (var frud in childFruds) + try { - token.ThrowIfCancellationRequested(); - - sumValues += Math.Round(frud.AverageDataValue, 0); - string childProcName = frud.Id.Split("_")[1]; - int childPid = 0; - Process[] ps = null; - - try + if (childProc.HasExited) { - ps = Process.GetProcessesByName(childProcName); - childPid = ps[0].Id; + continue; } - catch (Exception e) when (e is ArgumentException || e is Win32Exception) - { - } - finally + if (fruds.Any(x => x.Id.Contains(childProc.ProcessName))) { - foreach (var proc in ps) - { - proc?.Dispose(); - } - } + var childFruds = fruds.Where(x => x.Id.Contains(childProc.ProcessName)).ToList(); - if (IsEtwEnabled) - { - var rawdata = new + foreach (var frud in childFruds) { - ApplicationName = repOrInst.ApplicationName.OriginalString, - 
Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - Metric = frud.Property, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid : -1, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue, - }; + token.ThrowIfCancellationRequested(); - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); - } + sumValues += Math.Round(frud.AverageDataValue, 0); + string childProcName = childProc.ProcessName; + int childPid = childProc.Id; - if (IsTelemetryEnabled) - { - var telemData = new TelemetryData - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - Metric = frud.Property, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid.ToString() : string.Empty, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; - - _ = TelemetryClient?.ReportMetricAsync(telemData, token); - } + if (IsEtwEnabled) + { + var rawdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + Metric = frud.Property, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid : -1, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue, + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + } - if (frud.IsUnhealthy(app.MemoryWarningLimitMb)) - { - if (IsEtwEnabled) - { - var warningdata = new + if (IsTelemetryEnabled) { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", - Level = "Warning", - Metric = frud.Property, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid : -1, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; - - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, warningdata); - } + var telemData = new TelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + Metric = frud.Property, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + _ = TelemetryClient?.ReportMetricAsync(telemData, token); + } - if (IsTelemetryEnabled) - { - var telemWarnData = new TelemetryData + if (frud.IsUnhealthy(app.CpuWarningLimitPercent) ||frud.IsUnhealthy(app.MemoryWarningLimitMb) || + frud.IsUnhealthy(app.MemoryWarningLimitPercent) || frud.IsUnhealthy(app.NetworkWarningEphemeralPorts) || + frud.IsUnhealthy(app.WarningOpenFileHandles)) { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", - Metric = frud.Property, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; - - _ = TelemetryClient?.ReportHealthAsync(telemWarnData, token); - } + if (IsEtwEnabled) + { + var warningdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", + HealthState = "Warning", + Metric = frud.Property, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = $"{repOrInst.ServiceName.OriginalString}/{childProcName}_{childPid}", + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, warningdata); + } - var healthReport = new Utilities.HealthReport - { - AppName = repOrInst.ApplicationName, - Code = FOErrorWarningCodes.Ok, - EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, - HealthMessage = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", - HealthReportTimeToLive = GetHealthReportTimeToLive(), - ReportType = HealthReportType.Application, - State = HealthState.Ok, - NodeName = NodeName, - Observer = ObserverName, - Property = frud.Id, - ResourceUsageDataProperty = frud.Property, - SourceId = $"{ObserverName}({FOErrorWarningCodes.Ok})" - }; + if (IsTelemetryEnabled) + { + var telemWarnData = new TelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", + HealthState = 
"Warning", + Metric = frud.Property, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = $"{repOrInst.ServiceName.OriginalString}/{childProcName}_{childPid}", + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; + + _ = TelemetryClient?.ReportHealthAsync(telemWarnData, token); + } - // Generate a Service Fabric Health Report. - HealthReporter.ReportHealthToServiceFabric(healthReport); - } + // This provides information in SFX to help you understand that your App is in Warning because one of its services' child processes + // is misbehaving. Now you know exactly which one you need to fix. + var healthReport = new Utilities.HealthReport + { + AppName = repOrInst.ApplicationName, + Code = FOErrorWarningCodes.Ok, + EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, + HealthMessage = $"Note that service {repOrInst.ServiceName.OriginalString} spawned a child process, {childProcName}({childPid}), " + + $"that has exceeded your supplied threshold for {frud.Property} for Application {repOrInst.ApplicationName.OriginalString}.", + HealthReportTimeToLive = GetHealthReportTimeToLive(), + ReportType = HealthReportType.Application, + State = HealthState.Ok, + NodeName = NodeName, + Observer = ObserverName, + Property = frud.Id, + ResourceUsageDataProperty = frud.Property, + SourceId = $"{ObserverName}({FOErrorWarningCodes.Ok})" + }; + + // Generate a Service Fabric Health Report. 
+ HealthReporter.ReportHealthToServiceFabric(healthReport); + } - fruds.Remove(frud); + fruds.Remove(frud); + } + + } + } + catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException) + { + continue; } } @@ -757,8 +768,6 @@ private async Task InitializeAsync() private async Task MonitorDeployedAppsAsync(CancellationToken token) { - Process parentProc = null; - Process childProc = null; int capacity = ReplicaOrInstanceList.Count; AllAppCpuData ??= new List>(capacity); AllAppMemDataMb ??= new List>(capacity); @@ -789,8 +798,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) try { - // App level. - parentProc = Process.GetProcessById(parentPid); + using Process parentProc = Process.GetProcessById(parentPid); string parentProcName = parentProc.ProcessName; // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. @@ -881,31 +889,34 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // any processes (children) that the service process (parent) created/spawned. procTree = new List { - // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, // then only the parent process will be in this list. parentProc }; - procTree.AddRange(ProcessInfoProvider.Instance.GetChildProcesses(parentProc)); + + if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) + { + procTree.AddRange(repOrInst.ChildProcesses); + } foreach (Process proc in procTree) { // Total TCP ports usage if (checkAllPorts) { - // Parent process (the service process) + // Parent process (the service process). 
if (proc.ProcessName == parentProcName) { AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); } else { - // Children (spawned by the parent service process) - if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + // Child procs spawned by the parent service process. + if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); } } @@ -918,11 +929,11 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } else { - if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == 
$"{parentProcName}_{proc.ProcessName}_child").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); } } @@ -975,12 +986,12 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } else { - if (!AllAppHandlesData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppHandlesData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(handles); + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(handles); } } } @@ -1009,12 +1020,12 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } else { - if (!AllAppCpuData.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + if (!AllAppCpuData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppCpuData.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(cpu); + AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(cpu); } } @@ -1027,12 +1038,12 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } else 
{ - if (!AllAppMemDataMb.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(processMem); + AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(processMem); } } @@ -1050,12 +1061,12 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) } else { - if (!AllAppMemDataPercent.Any(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child")) + if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{proc.ProcessName}")) { - AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{parentProcName}_{proc.ProcessName}_child", capacity, UseCircularBuffer)); + AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); } - AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{parentProcName}_{proc.ProcessName}_child").Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(Math.Round(usedPct, 1)); } } } @@ -1081,16 +1092,6 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Fix the bug.. 
throw; } - finally - { - if (procTree != null) - { - foreach (var p in procTree) - { - p?.Dispose(); - } - } - } } try @@ -1257,6 +1258,14 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statefulReplica.Partitionid, ServiceName = statefulReplica.ServiceName }; + + using var p = Process.GetProcessById((int)statefulReplica.HostProcessId); + var childProcs = ProcessInfoProvider.Instance.GetChildProcesses(p); + if (childProcs?.Count > 0) + { + replicaInfo.ChildProcesses = childProcs; + } + break; } case DeployedStatelessServiceInstance statelessInstance: @@ -1282,6 +1291,13 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statelessInstance.Partitionid, ServiceName = statelessInstance.ServiceName }; + + using var p = Process.GetProcessById((int)statelessInstance.HostProcessId); + var childProcs = ProcessInfoProvider.Instance.GetChildProcesses(p); + if (childProcs?.Count > 0) + { + replicaInfo.ChildProcesses = childProcs; + } break; } } @@ -1301,6 +1317,29 @@ private void CleanUp() userTargetList?.Clear(); userTargetList = null; + // Clean up service child Process objects, if any. + if (ReplicaOrInstanceList.Any(repOrInst => repOrInst.ChildProcesses != null)) + { + foreach (var rep in ReplicaOrInstanceList) + { + if (rep.ChildProcesses == null) + { + continue; + } + + for (int i = 0; i < rep.ChildProcesses.Count; ++i) + { + var p = rep.ChildProcesses[i]; + + p?.Dispose(); + p = null; + } + + rep.ChildProcesses.Clear(); + rep.ChildProcesses = null; + } + } + ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 026b2d34..49875be5 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -300,7 +300,7 @@ make that connection. 
You should generally not have to call GC.Collect from user catch (Exception e) { var message = - $"Unhanded Exception in {ObserverConstants.ObserverManagerName} on node " + + $"Unhandled Exception in {ObserverConstants.ObserverManagerName} on node " + $"{nodeName}. Taking down FO process. " + $"Error info:{Environment.NewLine}{e}"; @@ -379,6 +379,13 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf var appHealth = await FabricClientInstance.HealthManager.GetApplicationHealthAsync(appName).ConfigureAwait(true); var fabricObserverAppHealthEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + if (isConfigurationUpdateInProgress) + { + fabricObserverAppHealthEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && s.HealthInformation.HealthState == HealthState.Warning + || s.HealthInformation.HealthState == HealthState.Error); + } + foreach (var evt in fabricObserverAppHealthEvents) { healthReport.AppName = appName; @@ -405,6 +412,14 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf { var nodeHealth = await FabricClientInstance.HealthManager.GetNodeHealthAsync(obs.NodeName).ConfigureAwait(true); var fabricObserverNodeHealthEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + + if (isConfigurationUpdateInProgress) + { + fabricObserverNodeHealthEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && s.HealthInformation.HealthState == HealthState.Warning + || s.HealthInformation.HealthState == HealthState.Error); + } + healthReport.ReportType = HealthReportType.Node; foreach (var evt in fabricObserverNodeHealthEvents) @@ -762,8 +777,10 @@ private async Task RunObserversAsync() var exceptionBuilder = new StringBuilder(); bool allExecuted = true; - foreach (var observer in observers) + for (int i = 0; i < 
observers.Count(); ++i) { + var observer = observers[i]; + if (isConfigurationUpdateInProgress) { return true; @@ -865,6 +882,7 @@ await File.WriteAllLinesAsync( } catch (IOException) { + } } } From 51ec509ce4b6a38770fe6c54e5a41fa1b72afca9 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 30 Jun 2021 16:15:08 -0700 Subject: [PATCH 03/14] FO 3.1.15 RC2 --- .gitignore | 3 + ClusterObserver/ClusterObserver.cs | 6 +- .../Utilities/Telemetry/TelemetryData.cs | 8 +- .../ReplicaOrInstanceMonitoringInfo.cs | 2 +- FabricObserver.Extensibility/ObserverBase.cs | 4 +- .../Utilities/CpuUsage.cs | 43 +- .../Utilities/ErrorWarningProperty.cs | 3 + .../WindowsInfoProvider.cs | 127 ++-- .../ProcessInfo/IProcessInfoProvider.cs | 2 +- .../ProcessInfo/LinuxProcessInfoProvider.cs | 24 +- .../ProcessInfo/ProcessInfoProvider.cs | 3 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 93 ++- .../Telemetry/AppInsightsTelemetry.cs | 2 + .../Telemetry/LogAnalyticsTelemetry.cs | 58 +- .../Utilities/Telemetry/TelemetryData.cs | 5 + FabricObserver/FabricObserver.cs | 30 +- FabricObserver/FabricObserver.csproj | 7 +- FabricObserver/Observers/AppObserver.cs | 600 ++++++++++-------- FabricObserver/Observers/DiskObserver.cs | 24 +- .../Observers/FabricSystemObserver.cs | 109 ++-- FabricObserver/Observers/NodeObserver.cs | 64 +- FabricObserver/Observers/OSObserver.cs | 4 - FabricObserver/Observers/ObserverManager.cs | 24 +- .../Config/NetworkObserver.config.json | 6 +- 24 files changed, 649 insertions(+), 602 deletions(-) diff --git a/.gitignore b/.gitignore index 42a0c818..226269ea 100644 --- a/.gitignore +++ b/.gitignore @@ -336,3 +336,6 @@ ASALocalRun/ /FabricObserver/observer_logs /FabricObserver/PackageRoot/Data/Plugins/SampleNewObserver.dll /nuget.exe +/FabricObserver/PackageRoot/Data/Plugins/ContainerObserver +/FabricObserver/PackageRoot/Data/Plugins/FabricObserverMdm +/FabricObserver/PackageRoot/Config/containerobserver.config.json diff --git a/ClusterObserver/ClusterObserver.cs 
b/ClusterObserver/ClusterObserver.cs index fd2410e7..11b27907 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -372,7 +372,7 @@ private async Task ProcessApplicationHealthAsync(IList a udText = $"in UD {udsInAppUpgrade.First(ud => ud > -1 && ud < int.MaxValue)}"; } - telemetryDescription += $"{appName} is upgrading {udText}.{Environment.NewLine}"; + telemetryDescription += $" Note: {appName} is upgrading {udText}.{Environment.NewLine}"; } } @@ -390,9 +390,12 @@ private async Task ProcessApplicationHealthAsync(IList a // From FabricObserver? if (foTelemetryData != null) { + foTelemetryData.Description += telemetryDescription; + // Telemetry. if (TelemetryEnabled && ObserverTelemetryClient != null) { + await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); } @@ -407,6 +410,7 @@ private async Task ProcessApplicationHealthAsync(IList a { foTelemetryData.ApplicationName, foTelemetryData.ServiceName, + foTelemetryData.ChildProcessName, foTelemetryData.HealthState, foTelemetryData.Description, foTelemetryData.Metric, diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs index 370ffe53..0c06ac73 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs @@ -17,6 +17,11 @@ public string ApplicationName get; set; } + public string ChildProcessName + { + get; set; + } + public string ClusterId { get; set; @@ -100,9 +105,10 @@ public object Value [JsonConstructor] public TelemetryData() { + } - public TelemetryData( FabricClient fabricClient, CancellationToken cancellationToken) + public TelemetryData(FabricClient fabricClient, CancellationToken cancellationToken) { var (clusterId, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, cancellationToken).Result; ClusterId = clusterId; diff --git 
a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs index d5d95169..e924d4b3 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs @@ -51,7 +51,7 @@ public string ServicePackageActivationId get; set; } - public List ChildProcesses + public List ChildProcessIds { get; set; } diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 1e87cb23..fcbce434 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -889,9 +889,9 @@ public void ProcessResourceDataReportHealth( var healthMessage = new StringBuilder(); string childProcMsg = string.Empty; - if (replicaOrInstance != null && replicaOrInstance.ChildProcesses != null) + if (replicaOrInstance != null && replicaOrInstance.ChildProcessIds != null) { - childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcesses.Count}). " + + childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcessIds.Count}). 
" + $"Their cumulative impact on {name}'s resource usage has been applied."; } diff --git a/FabricObserver.Extensibility/Utilities/CpuUsage.cs b/FabricObserver.Extensibility/Utilities/CpuUsage.cs index f2587211..e7fa565c 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUsage.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUsage.cs @@ -4,6 +4,7 @@ // ------------------------------------------------------------ using System; +using System.ComponentModel; using System.Diagnostics; namespace FabricObserver.Observers.Utilities @@ -21,28 +22,38 @@ public class CpuUsage /// /// Target Process object /// CPU percentage in use as double value - public double GetCpuUsagePercentageProcess(Process p) + public double GetCpuUsagePercentageProcess(int procId) { - if (p == null || p.HasExited) + try { - return 0; - } + using (Process p = Process.GetProcessById(procId)) + { + if (p.HasExited) + { + return 0.0; + } - if (prevTime == DateTime.MinValue) - { - prevTime = DateTime.Now; - prevTotalProcessorTime = p.TotalProcessorTime; + if (prevTime == DateTime.MinValue) + { + prevTime = DateTime.Now; + prevTotalProcessorTime = p.TotalProcessorTime; + } + else + { + currentTimeTime = DateTime.Now; + currentTotalProcessorTime = p.TotalProcessorTime; + double currentUsage = (currentTotalProcessorTime.TotalMilliseconds - prevTotalProcessorTime.TotalMilliseconds) / currentTimeTime.Subtract(prevTime).TotalMilliseconds; + double cpuUsage = currentUsage / Environment.ProcessorCount; + prevTime = currentTimeTime; + prevTotalProcessorTime = currentTotalProcessorTime; + + return cpuUsage * 100.0; + } + } } - else + catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException || e is NotSupportedException) { - currentTimeTime = DateTime.Now; - currentTotalProcessorTime = p.TotalProcessorTime; - double currentUsage = (currentTotalProcessorTime.TotalMilliseconds - prevTotalProcessorTime.TotalMilliseconds) / 
currentTimeTime.Subtract(prevTime).TotalMilliseconds; - double cpuUsage = currentUsage / Environment.ProcessorCount; - prevTime = currentTimeTime; - prevTotalProcessorTime = currentTotalProcessorTime; - return cpuUsage * 100.0; } return 0.0; diff --git a/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs b/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs index 728fdfda..15ca2de9 100644 --- a/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs +++ b/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs @@ -31,5 +31,8 @@ public sealed class ErrorWarningProperty // File Handles public const string TotalFileHandles = "Total Allocated File Handles"; public const string TotalFileHandlesPct = "Total Allocated File Handles %"; + + // Child procs + public const string ChildProcessCount = "Child Process Count"; } } diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 69bced84..396ccf51 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -39,28 +39,21 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - PropertyDataCollection.PropertyDataEnumerator propEnumerator = mObj.Properties.GetEnumerator(); + object visibleTotalObj = mObj.Properties["TotalVisibleMemorySize"].Value; + object freePhysicalObj = mObj.Properties["FreePhysicalMemory"].Value; - while (propEnumerator.MoveNext()) + if (visibleTotalObj == null || freePhysicalObj == null) { - PropertyData prop = propEnumerator.Current; - string name = prop.Name; - string value = prop.Value.ToString(); - - if (name.Contains("TotalVisible")) - { - visibleTotal = !string.IsNullOrWhiteSpace(value) ? 
long.Parse(value) : -1L; - } - else - { - freePhysical = !string.IsNullOrWhiteSpace(value) ? long.Parse(value) : -1L; - } + continue; } + + visibleTotal = Convert.ToInt64(visibleTotalObj); + freePhysical = Convert.ToInt64(freePhysicalObj); } } } - if (visibleTotal == -1L || freePhysical == -1L) + if (visibleTotal < 1) { return (-1L, -1); } @@ -197,7 +190,7 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) { win32OsInfo = new ManagementObjectSearcher( "SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory," + - "TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); + "TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); results = win32OsInfo.Get(); @@ -209,80 +202,48 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - PropertyDataCollection.PropertyDataEnumerator propEnumerator = mObj.Properties.GetEnumerator(); - - while (propEnumerator.MoveNext()) + object captionObj = mObj.Properties["Caption"].Value; + object versionObj = mObj.Properties["Version"].Value; + object statusObj = mObj.Properties["Status"].Value; + object osLanguageObj = mObj.Properties["OSLanguage"].Value; + object numProcsObj = mObj.Properties["NumberOfProcesses"].Value; + object freePhysicalObj = mObj.Properties["FreePhysicalMemory"].Value; + object freeVirtualTotalObj = mObj.Properties["FreeVirtualMemory"].Value; + object totalVirtualObj = mObj.Properties["TotalVirtualMemorySize"].Value; + object totalVisibleObj = mObj.Properties["TotalVisibleMemorySize"].Value; + object installDateObj = mObj.Properties["InstallDate"].Value; + object lastBootDateObj = mObj.Properties["LastBootUpTime"].Value; + + osInfo.Name = captionObj?.ToString(); + + if (int.TryParse(numProcsObj?.ToString(), out int numProcesses)) { - PropertyData prop = 
propEnumerator.Current; - string name = prop.Name; - string value = prop.Value?.ToString(); - - if (string.IsNullOrWhiteSpace(name) || string.IsNullOrWhiteSpace(value)) - { - continue; - } - - switch (name.ToLowerInvariant()) - { - case "caption": - osInfo.Name = value; - break; - - case "numberofprocesses": - if (int.TryParse(value, out int numProcesses)) - { - osInfo.NumberOfProcesses = numProcesses; - } - else - { - osInfo.NumberOfProcesses = -1; - } - - break; - - case "status": - osInfo.Status = value; - break; - - case "oslanguage": - osInfo.Language = value; - break; - - case "version": - osInfo.Version = value; - break; - - case "installdate": - osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - - case "lastbootuptime": - osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - - case "freephysicalmemory": - osInfo.FreePhysicalMemoryKB = ulong.Parse(value); - break; - - case "freevirtualmemory": - osInfo.FreeVirtualMemoryKB = ulong.Parse(value); - break; - - case "totalvirtualmemorysize": - osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); - break; - - case "totalvisiblememorysize": - osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); - break; - } + osInfo.NumberOfProcesses = numProcesses; } - } + else + { + osInfo.NumberOfProcesses = -1; + } + + osInfo.Status = statusObj?.ToString(); + osInfo.Language = osLanguageObj?.ToString(); + osInfo.Version = versionObj?.ToString(); + osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(installDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(lastBootDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.FreePhysicalMemoryKB = ulong.TryParse(freePhysicalObj?.ToString(), out ulong freePhysical) ? 
freePhysical : 0; + osInfo.FreeVirtualMemoryKB = ulong.TryParse(freeVirtualTotalObj?.ToString(), out ulong freeVirtual) ? freeVirtual : 0; + osInfo.TotalVirtualMemorySizeKB = ulong.TryParse(totalVirtualObj?.ToString(), out ulong totalVirtual) ? totalVirtual : 0; + osInfo.TotalVisibleMemorySizeKB = ulong.TryParse(totalVisibleObj?.ToString(), out ulong totalVisible) ? totalVisible : 0; + } } catch (ManagementException me) { Logger.LogInfo($"Handled ManagementException in GetOSInfoAsync retrieval:{Environment.NewLine}{me}"); } + catch (Exception e) + { + Logger.LogInfo($"Bug? => Exception in GetOSInfoAsync:{Environment.NewLine}{e}"); + } } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index 5755f079..c34cd081 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -26,7 +26,7 @@ public interface IProcessInfoProvider /// /// /// - List GetChildProcesses(Process process); + List GetChildProcessIds(int pid); void Dispose(); } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index b288402c..5c4b6feb 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -68,11 +68,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return result; } - public override List GetChildProcesses(Process process) + public override List GetChildProcessIds(int processId) { // https://askubuntu.com/questions/512871/find-children-of-the-process - string cmdResult = "ps -o ppid= -o pid= -A | awk '$1 == " + process.Id.ToString() + " {print $2}'".Bash(); - List childProcesses = new List(); 
+ string cmdResult = "ps -o ppid= -o pid= -A | awk '$1 == " + processId.ToString() + " {print $2}'".Bash(); + List childProcesses = new List(); if (!string.IsNullOrWhiteSpace(cmdResult)) { @@ -80,23 +80,11 @@ public override List GetChildProcesses(Process process) if (sPids.Count > 0) { - foreach (string pid in sPids) + for (int i = 0; i < sPids.Count; ++i) { - if (int.TryParse(pid, out int proc)) + if (int.TryParse(sPids[i], out int childProcId)) { - try - { - Process p = Process.GetProcessById(proc); - childProcesses.Add(p); - } - catch (ArgumentException) - { - // ignore -> process may no longer exist - } - catch (InvalidOperationException ie) - { - Logger.LogWarning("GetFlattenedProcessFamilyTree: Unsuccessful bash cmd (ps - o ppid = -o pid = -A | awk '$1 == " + process.Id.ToString() + " {print $2}')" + ie.ToString()); - } + childProcesses.Add(childProcId); } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index 8d92a02d..1276e318 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -5,7 +5,6 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Fabric; using System.Runtime.InteropServices; @@ -57,7 +56,7 @@ protected Logger Logger public abstract float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); - public abstract List GetChildProcesses(Process process); + public abstract List GetChildProcessIds(int processId); protected abstract void Dispose(bool disposing); } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 6b391305..21d9a54c 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ 
b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -121,10 +121,68 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService } } - public override List GetChildProcesses(Process process) + public override List GetChildProcessIds(int processId) { - List childProcesses = new List(); - string query = $"select processid from win32_process where parentprocessid = {process.Id}"; + if (processId < 1) + { + return null; + } + + // Get child procs. + List childProcesses = GetProcessTreeIds(processId); + + if (childProcesses == null) + { + return null; + } + + // Get grandchild procs. + for (var i = 0; i < childProcesses.Count; ++i) + { + List grandChildren = GetProcessTreeIds(childProcesses[i]); + + if (grandChildren?.Count > 0) + { + childProcesses.AddRange(grandChildren); + } + } + + return childProcesses; + } + + public float GetProcessPrivateWorkingSetInMB(string processName) + { + if (string.IsNullOrWhiteSpace(processName)) + { + return 0F; + } + + lock (memPerfCounterLock) + { + try + { + memProcessPrivateWorkingSetCounter.InstanceName = processName; + return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); + } + catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) + { + Logger.LogWarning($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + + // Don't throw. 
+ return 0F; + } + catch (Exception e) + { + Logger.LogError($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); + throw; + } + } + } + + private List GetProcessTreeIds(int processId) + { + List childProcesses = null; + string query = $"select caption,processid from win32_process where parentprocessid = {processId}"; try { @@ -140,32 +198,27 @@ public override List GetChildProcesses(Process process) { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - object childProcessObj = mObj.Properties["processid"].Value; + object childProcessIdObj = mObj.Properties["processid"].Value; + object childProcessNameObj = mObj.Properties["caption"].Value; - if (childProcessObj == null) + if (childProcessIdObj == null || childProcessNameObj == null) { continue; } - Process childProcess = Process.GetProcessById(Convert.ToInt32(childProcessObj)); - - if (childProcess != null) + if (childProcessNameObj.ToString() == "conhost.exe") { - if (childProcess.ProcessName == "conhost") - { - continue; - } - - childProcesses.Add(childProcess); + continue; + } - // Now get child of child, if exists. 
- List grandChildren = GetChildProcesses(childProcess); + int childProcessId = Convert.ToInt32(childProcessIdObj); - if (grandChildren?.Count > 0) - { - childProcesses.AddRange(grandChildren); - } + if (childProcesses == null) + { + childProcesses = new List(); } + + childProcesses.Add(childProcessId); } } catch (Exception e) when (e is ArgumentException || e is ManagementException) diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index ddbca6b4..becf35e6 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -184,6 +184,7 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "HealthState", telemetryData.HealthState ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, + { "ChildProcessName", telemetryData.ChildProcessName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, @@ -260,6 +261,7 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, + { "ChildProcessName", telemetryData.ChildProcessName ?? string.Empty }, { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "Metric", telemetryData.Metric ?? 
string.Empty }, diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs index fc72de27..63c3f3d0 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -8,6 +8,7 @@ using System.Fabric; using System.Fabric.Health; using System.Net; +using System.Net.Sockets; using System.Runtime.InteropServices; using System.Security.Cryptography; using System.Text; @@ -26,7 +27,6 @@ public class LogAnalyticsTelemetry : ITelemetryProvider private readonly FabricClient fabricClient; private readonly CancellationToken token; private readonly Logger logger; - private int retries; private string WorkspaceId { @@ -100,7 +100,7 @@ public async Task ReportHealthAsync( public async Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (telemetryData == null) + if (telemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -111,7 +111,7 @@ public async Task ReportHealthAsync(TelemetryData telemetryData, CancellationTok public async Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (telemetryData == null) + if (telemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -122,7 +122,7 @@ public async Task ReportMetricAsync(TelemetryData telemetryData, CancellationTok public async Task ReportMetricAsync(MachineTelemetryData machineTelemetryData, CancellationToken cancellationToken) { - if (machineTelemetryData == null) + if (machineTelemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -226,11 +226,16 @@ public Task ReportMetricAsync( /// A completed task or task containing exception info. 
private async Task SendTelemetryAsync(string payload, CancellationToken cancellationToken) { - var requestUri = new Uri($"https://{WorkspaceId}.ods.opinsights.azure.com/api/logs?api-version={ApiVersion}"); + if (string.IsNullOrWhiteSpace(payload) || cancellationToken.IsCancellationRequested) + { + return; + } + + Uri requestUri = new Uri($"https://{WorkspaceId}.ods.opinsights.azure.com/api/logs?api-version={ApiVersion}"); string date = DateTime.UtcNow.ToString("r"); string signature = GetSignature("POST", payload.Length, "application/json", date, "/api/logs"); - - var request = (HttpWebRequest)WebRequest.Create(requestUri); + + HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri); request.ContentType = "application/json"; request.Method = "POST"; request.Headers["Log-Type"] = LogType; @@ -256,10 +261,8 @@ private async Task SendTelemetryAsync(string payload, CancellationToken cancella return; } - if (responseAsync != null && (responseAsync.StatusCode == HttpStatusCode.OK || - responseAsync.StatusCode == HttpStatusCode.Accepted)) + if (responseAsync != null && (responseAsync.StatusCode == HttpStatusCode.OK || responseAsync.StatusCode == HttpStatusCode.Accepted)) { - retries = 0; return; } @@ -271,37 +274,24 @@ private async Task SendTelemetryAsync(string payload, CancellationToken cancella } } } - catch (Exception e) + catch (Exception e) when (e is SocketException || e is WebException) { - // An Exception during telemetry data submission should never take down FO process. Log it. 
- logger.LogWarning($"Handled Exception in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{e}"); + logger.LogInfo($"Exception sending telemetry to LogAnalytics service:{Environment.NewLine}{e}"); } - - if (retries < MaxRetries) - { - if (cancellationToken.IsCancellationRequested) - { - return; - } - - retries++; - await Task.Delay(1000).ConfigureAwait(true); - await SendTelemetryAsync(payload, cancellationToken).ConfigureAwait(true); - } - else + catch (Exception e) { - // Exhausted retries. Reset counter. - logger.LogWarning($"Exhausted request retries in LogAnalyticsTelemetry.SendTelemetryAsync: {MaxRetries}. See logs for error details."); - retries = 0; + // Do not take down FO with a telemetry fault. Log it. Warning level will always log. + // This means there is either a bug in this code or something else that needs your attention.. + logger.LogWarning($"Unhandled exception sending telemetry to LogAnalytics service:{Environment.NewLine}{e}"); } } private string GetSignature( - string method, - int contentLength, - string contentType, - string date, - string resource) + string method, + int contentLength, + string contentType, + string date, + string resource) { string message = $"{method}\n{contentLength}\n{contentType}\nx-ms-date:{date}\n{resource}"; byte[] bytes = Encoding.UTF8.GetBytes(message); diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index 743cd065..f276360c 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -19,6 +19,11 @@ public string ApplicationName get; set; } + public string ChildProcessName + { + get; set; + } + public string ClusterId { get; set; diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index 88aff4ec..0c00d336 100644 --- a/FabricObserver/FabricObserver.cs +++ 
b/FabricObserver/FabricObserver.cs @@ -4,11 +4,11 @@ // ------------------------------------------------------------ using System; -using System.Collections.Generic; using System.Fabric; using System.IO; using System.Linq; using System.Reflection; +using System.Runtime.Loader; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers; @@ -89,19 +89,19 @@ private void LoadObserversFromPlugins(IServiceCollection services) return; } - var pluginLoaders = new List(pluginDlls.Length); + PluginLoader[] pluginLoaders = new PluginLoader[pluginDlls.Length]; Type[] sharedTypes = { typeof(FabricObserverStartupAttribute), typeof(IFabricObserverStartup), typeof(IServiceCollection) }; - foreach (string dll in pluginDlls) + for (int i = 0; i < pluginDlls.Length; ++i) { - // This does not create an Assembly. It creates a PluginLoader instance for each dll in the Plugins folder. - // TODO: Figure out how to only load the plugin dll in an efficient way. For now, this is fine. This is not resource intensive. - PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes); - pluginLoaders.Add(loader); + string dll = pluginDlls[i]; + PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes, a => a.IsUnloadable = true); + pluginLoaders[i] = loader; } - foreach (PluginLoader pluginLoader in pluginLoaders) + for (int i = 0; i < pluginLoaders.Length; ++i) { + var pluginLoader = pluginLoaders[i]; Assembly pluginAssembly; try @@ -109,12 +109,11 @@ private void LoadObserversFromPlugins(IServiceCollection services) // If your plugin has native library dependencies (that's fine), then we will land in the catch (BadImageFormatException). // This is by design. The Managed FO plugin assembly will successfully load, of course. 
pluginAssembly = pluginLoader.LoadDefaultAssembly(); - FabricObserverStartupAttribute[] startupAttributes = pluginAssembly.GetCustomAttributes().ToArray(); - for (int i = 0; i < startupAttributes.Length; ++i) + for (int j = 0; j < startupAttributes.Length; ++j) { - object startupObject = Activator.CreateInstance(startupAttributes[i].StartupType); + object startupObject = Activator.CreateInstance(startupAttributes[j].StartupType); if (startupObject is IFabricObserverStartup fabricObserverStartup) { @@ -123,17 +122,14 @@ private void LoadObserversFromPlugins(IServiceCollection services) else { // This will bring down FO, which it should: This means your plugin is not supported. Fix your bug. - throw new InvalidOperationException($"{startupAttributes[i].StartupType.FullName} must implement IFabricObserverStartup."); + throw new InvalidOperationException($"{startupAttributes[j].StartupType.FullName} must implement IFabricObserverStartup."); } } } - catch (BadImageFormatException) - { - continue; - } - finally + catch (Exception e) when (e is ArgumentException || e is BadImageFormatException || e is IOException) { pluginLoader?.Dispose(); + continue; } } } diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index 75df3c28..e0716841 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -12,11 +12,11 @@ linux-x64;win-x64 - 3.1.11.0 + 3.1.15.0 Copyright © 2020 FabricObserver Service Fabric Observer - 3.1.11 + 3.1.15 true true FabricObserver.Program @@ -97,6 +97,9 @@ PreserveNewest + + PreserveNewest + PreserveNewest diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 5fe9dbd3..9d3680df 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -107,7 +107,6 @@ public override async Task ObserveAsync(CancellationToken token) } stopwatch.Reset(); - LastRunDateTime = DateTime.Now; } @@ -120,13 +119,15 @@ 
public override Task ReportAsync(CancellationToken token) var healthReportTimeToLive = GetHealthReportTimeToLive(); - foreach (var repOrInst in ReplicaOrInstanceList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { token.ThrowIfCancellationRequested(); + var repOrInst = ReplicaOrInstanceList[i]; string processName = null; int processId = 0; - ApplicationInfo app = null; + ApplicationInfo app = null; + bool hasChildProcs = repOrInst.ChildProcessIds != null; try { @@ -183,17 +184,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppCpuData.Any(x => x.Id == id)) { var parentFrud = AllAppCpuData.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppCpuData, repOrInst, app, token); - // This will only be true if the parent has child procs that are currently executing. - if (sumValues > parentDataAvg) + if (hasChildProcs) { + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppCpuData, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add(sumValues); + parentFrud.Data.Add(sumAllValues); } + // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( parentFrud, app.CpuErrorLimitPercent, @@ -208,18 +208,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppMemDataMb.Any(x => x.Id == id)) { var parentFrud = AllAppMemDataMb.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, repOrInst, app, token); - if (sumValues > parentDataAvg) + if (hasChildProcs) { + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add((float)sumValues); + parentFrud.Data.Add((float)sumAllValues); } - // Parent's aggregated (summed) spawned process data. - // This will generate an SF health event if the combined total exceeds the supplied threshold. + // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( parentFrud, app.MemoryErrorLimitMb, @@ -234,17 +232,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppMemDataPercent.Any(x => x.Id == id)) { var parentFrud = AllAppMemDataPercent.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, repOrInst, app, token); - if (sumValues > parentDataAvg) + if (hasChildProcs) { + double parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add(sumValues); + parentFrud.Data.Add(sumAllValues); } - // Parent's aggregated (summed) spawned process data. + // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( parentFrud, app.MemoryErrorLimitPercent, @@ -259,16 +256,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppTotalActivePortsData.Any(x => x.Id == id)) { var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, repOrInst, app, token); - if (sumValues > parentDataAvg) + if (hasChildProcs) { + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add((int)sumValues); + parentFrud.Data.Add((int)sumAllValues); } + // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( parentFrud, app.NetworkErrorActivePorts, @@ -283,17 +280,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppEphemeralPortsData.Any(x => x.Id == id)) { var parentFrud = AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, repOrInst, app, token); - if (sumValues > parentDataAvg) + if (hasChildProcs) { + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add((int)sumValues); + parentFrud.Data.Add((int)sumAllValues); } - // Parent's aggregated (summed) process data. + // Parent's and aggregated (summed) spawned process data (if any). 
ProcessResourceDataReportHealth( parentFrud, app.NetworkErrorEphemeralPorts, @@ -308,17 +304,16 @@ public override Task ReportAsync(CancellationToken token) if (AllAppHandlesData.Any(x => x.Id == id)) { var parentFrud = AllAppHandlesData.FirstOrDefault(x => x.Id == id); - var parentDataAvg = Math.Round(parentFrud.AverageDataValue); - double sumValues = Math.Round(parentDataAvg, 0); - sumValues += ProcessChildFrudsGetDataSum(ref AllAppHandlesData, repOrInst, app, token); - if (sumValues > parentDataAvg) + if (hasChildProcs) { + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppHandlesData, repOrInst, app, token); parentFrud.Data.Clear(); - parentFrud.Data.Add((float)sumValues); + parentFrud.Data.Add((float)sumAllValues); } - // Parent's aggregated (summed) spawned process data. + // Parent's and aggregated (summed) spawned process data (if any). ProcessResourceDataReportHealth( parentFrud, app.ErrorOpenFileHandles, @@ -339,76 +334,134 @@ private double ProcessChildFrudsGetDataSum( ApplicationInfo app, CancellationToken token) where T : struct { - var childProcs = repOrInst.ChildProcesses; + var childProcs = repOrInst.ChildProcessIds; - if (childProcs == null) + if (childProcs == null || childProcs.Count == 0 || token.IsCancellationRequested) { return 0; } + if (IsEtwEnabled) + { + var rawdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: Total number of current child processes: {childProcs.Count}.", + HealthState = "Ok", + Level = "Verbose", + Metric = ErrorWarningProperty.ChildProcessCount, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = childProcs.Count, + }; + + 
ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + } + + if (IsTelemetryEnabled) + { + var telemData = new TelemetryData(FabricClientInstance, token) + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: Total number of current child processes: {childProcs.Count}.", + HealthState = "Ok", + Metric = ErrorWarningProperty.ChildProcessCount, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = childProcs.Count, + }; + + _ = TelemetryClient?.ReportHealthAsync(telemData, token); + } + double sumValues = 0; + string metric = string.Empty; - foreach (Process childProc in childProcs) + for (int i = 0; i < childProcs.Count; ++i) { token.ThrowIfCancellationRequested(); + var childPid = childProcs[i]; + try { - if (childProc.HasExited) + if (fruds.Any(x => x.Id.Contains(childPid.ToString()))) { - continue; - } + var childFruds = fruds.Where(x => x.Id.Contains(childPid.ToString())).ToList(); + metric = childFruds[0].Property; - if (fruds.Any(x => x.Id.Contains(childProc.ProcessName))) - { - var childFruds = fruds.Where(x => x.Id.Contains(childProc.ProcessName)).ToList(); + // re-order the list by data value so we can emit raw telemetry for the top 10. 
+ childFruds = childFruds.OrderByDescending(x => x.AverageDataValue).ToList(); - foreach (var frud in childFruds) + for (int j = 0; j < childFruds.Count; ++j) { token.ThrowIfCancellationRequested(); - + + var frud = childFruds[j]; sumValues += Math.Round(frud.AverageDataValue, 0); - string childProcName = childProc.ProcessName; - int childPid = childProc.Id; + using Process p = Process.GetProcessById(childPid); + string childProcName = p.ProcessName; if (IsEtwEnabled) { - var rawdata = new + if (j < 10) { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - Metric = frud.Property, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid : -1, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue, - }; + var rawdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + ChildProcessName = childProcName, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + HealthState = "Ok", + Metric = frud.Property, + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid : -1, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue, + }; - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + } } if (IsTelemetryEnabled) { - var telemData = new TelemetryData + if (j < 10) { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - Metric = frud.Property, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid.ToString() : string.Empty, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString + "::" + childProcName, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; + var telemData = new TelemetryData(FabricClientInstance, token) + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + ChildProcessName = childProcName, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", + HealthState = "Ok", + Metric = frud.Property, + NodeName = NodeName, + ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = frud.AverageDataValue + }; - _ = TelemetryClient?.ReportMetricAsync(telemData, token); + _ = TelemetryClient?.ReportMetricAsync(telemData, token); + } } if (frud.IsUnhealthy(app.CpuWarningLimitPercent) ||frud.IsUnhealthy(app.MemoryWarningLimitMb) || @@ -420,6 +473,8 @@ private double ProcessChildFrudsGetDataSum( var warningdata = new { ApplicationName = repOrInst.ApplicationName.OriginalString, + ChildProcessName = childProcName, + Code = "", Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", HealthState = "Warning", Metric = frud.Property, @@ -428,7 +483,7 @@ private double ProcessChildFrudsGetDataSum( PartitionId = repOrInst.PartitionId.ToString(), ProcessId = childPid, ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = $"{repOrInst.ServiceName.OriginalString}/{childProcName}_{childPid}", + ServiceName = repOrInst.ServiceName.OriginalString, Source = ObserverConstants.FabricObserverName, Value = frud.AverageDataValue }; @@ -438,9 +493,11 @@ private double ProcessChildFrudsGetDataSum( if (IsTelemetryEnabled) { - var telemWarnData = new TelemetryData + var telemWarnData = new TelemetryData(FabricClientInstance, token) { ApplicationName = repOrInst.ApplicationName.OriginalString, + ChildProcessName = childProcName, + Code = "", Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", HealthState = "Warning", Metric = frud.Property, @@ -449,7 +506,7 @@ private double ProcessChildFrudsGetDataSum( PartitionId = repOrInst.PartitionId.ToString(), ProcessId = childPid.ToString(), ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = 
$"{repOrInst.ServiceName.OriginalString}/{childProcName}_{childPid}", + ServiceName = repOrInst.ServiceName.OriginalString, Source = ObserverConstants.FabricObserverName, Value = frud.AverageDataValue }; @@ -471,7 +528,7 @@ private double ProcessChildFrudsGetDataSum( State = HealthState.Ok, NodeName = NodeName, Observer = ObserverName, - Property = frud.Id, + Property = $"{NodeName}_{frud.Id.Split(':')[0]}_{childProcName}", ResourceUsageDataProperty = frud.Property, SourceId = $"{ObserverName}({FOErrorWarningCodes.Ok})" }; @@ -480,9 +537,12 @@ private double ProcessChildFrudsGetDataSum( HealthReporter.ReportHealthToServiceFabric(healthReport); } + // Remove child FRUD from ref FRUD. fruds.Remove(frud); } + childFruds?.Clear(); + childFruds = null; } } catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException) @@ -491,6 +551,48 @@ private double ProcessChildFrudsGetDataSum( } } + if (IsEtwEnabled) + { + var rawdata = new + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: child processes ({childProcs.Count}) sum total for {metric}.", + HealthState = "Ok", + Metric = $"{metric} - Sum", + NodeName, + ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = sumValues, + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + } + + if (IsTelemetryEnabled) + { + var telemData = new TelemetryData(FabricClientInstance, token) + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + Code = FOErrorWarningCodes.Ok, + Description = $"{repOrInst.ServiceName.OriginalString}: child processes ({childProcs.Count}) sum total for {metric}.", + HealthState = "Ok", + Metric = $"{metric} - Sum", + NodeName = NodeName, + 
ObserverName = ObserverName, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ServiceName = repOrInst.ServiceName.OriginalString, + Source = ObserverConstants.FabricObserverName, + Value = sumValues, + }; + + _ = TelemetryClient?.ReportMetricAsync(telemData, token); + } + return sumValues; } @@ -520,11 +622,7 @@ private async Task InitializeAsync() if (!File.Exists(appObserverConfigFileName)) { - WriteToLogWithLevel( - ObserverName, - $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}", - LogLevel.Information); - + ObserverLogger.LogWarning($"Will not observe resource consumption on node {NodeName} as no configuration file has been supplied."); return false; } @@ -538,11 +636,7 @@ private async Task InitializeAsync() // Are any of the config-supplied apps deployed?. if (userTargetList.Count == 0) { - WriteToLogWithLevel( - ObserverName, - $"Will not observe resource consumption as no configuration parameters have been supplied. 
| {NodeName}", - LogLevel.Information); - + ObserverLogger.LogWarning($"Will not observe service resource consumption on node {NodeName} as no configuration parameters have been supplied."); return false; } @@ -591,10 +685,12 @@ private async Task InitializeAsync() await Task.Delay(250, Token).ConfigureAwait(true); } - foreach (var app in apps) + for (int i = 0; i < apps.Count; ++i) { Token.ThrowIfCancellationRequested(); - + + var app = apps[i]; + if (app.ApplicationName.OriginalString == "fabric:/System") { continue; @@ -677,10 +773,11 @@ private async Task InitializeAsync() int settingsFail = 0; - foreach (var application in userTargetList) + for (int i = 0; i < userTargetList.Count; ++i) { Token.ThrowIfCancellationRequested(); + var application = userTargetList[i]; Uri appUri = null; if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) @@ -741,10 +838,12 @@ private async Task InitializeAsync() } } - foreach (var rep in ReplicaOrInstanceList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { Token.ThrowIfCancellationRequested(); - + + var rep = ReplicaOrInstanceList[i]; + try { // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. 
@@ -756,7 +855,7 @@ private async Task InitializeAsync() continue; } - ObserverLogger.LogInfo($"Will observe resource consumption by {rep.ServiceName?.OriginalString}({rep.HostProcessId}) on Node {NodeName}."); + ObserverLogger.LogInfo($"Will observe resource consumption by {rep.ServiceName?.OriginalString}({rep.HostProcessId}) (and child procs, if any) on Node {NodeName}."); } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { @@ -776,10 +875,11 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) AllAppEphemeralPortsData ??= new List>(capacity); AllAppHandlesData ??= new List>(capacity); - foreach (var repOrInst in ReplicaOrInstanceList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { token.ThrowIfCancellationRequested(); + var repOrInst = ReplicaOrInstanceList[i]; var timer = new Stopwatch(); int parentPid = (int)repOrInst.HostProcessId; var cpuUsage = new CpuUsage(); @@ -789,7 +889,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - List procTree = null; + List procTree = null; if (application?.TargetApp == null && application?.TargetAppType == null) { @@ -887,53 +987,55 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Get list of child processes of parentProc should they exist. // In order to provide accurate resource usage of an SF service process we need to also account for // any processes (children) that the service process (parent) created/spawned. - procTree = new List + procTree = new List { // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, // then only the parent process will be in this list. 
- parentProc + parentProc.Id }; - if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) + if (repOrInst.ChildProcessIds != null && repOrInst.ChildProcessIds.Count > 0) { - procTree.AddRange(repOrInst.ChildProcesses); + procTree.AddRange(repOrInst.ChildProcessIds); } - foreach (Process proc in procTree) + for (int j = 0; j < procTree.Count; ++j) { + int procId = procTree[j]; + // Total TCP ports usage if (checkAllPorts) { // Parent process (the service process). - if (proc.ProcessName == parentProcName) + if (procId == parentPid) { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } else { // Child procs spawned by the parent service process. - if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) + if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procId}")) { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procId}", capacity, UseCircularBuffer)); } - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(proc.Id, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); } } // Ephemeral TCP ports usage if (checkEphemeralPorts) { - if (proc.ProcessName == parentProcName) + if (procId == parentPid) { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == 
id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } else { - if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procId}")) { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procId}", capacity, UseCircularBuffer)); } - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(proc.Id, FabricServiceContext)); + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } } @@ -954,58 +1056,103 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) if (checkCpu) { - _ = cpuUsage.GetCpuUsagePercentageProcess(proc); + _ = cpuUsage.GetCpuUsagePercentageProcess(procId); } if (checkHandles) { - _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(proc.Id, FabricServiceContext); + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); } if (checkMemMb || checkMemPct) { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(proc.Id); + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); } float processMem = 0; - if (checkMemMb || checkMemPct) + // Memory (private working set (process)). 
+ if (checkMemMb) + { + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + + if (procId == parentPid) + { + AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); + } + else + { + if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{procId}")) + { + AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procId}", capacity, UseCircularBuffer)); + } + + AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(processMem); + } + } + + // Memory (percent in use (total)). + if (checkMemPct) { - processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(proc.Id); + if (processMem == 0) + { + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + } + + var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + + if (TotalMemory > 0) + { + double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); + + if (procId == parentPid) + { + AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + } + else + { + if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{procId}")) + { + AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procId}", capacity, UseCircularBuffer)); + } + + AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(Math.Round(usedPct, 1)); + } + } } if (checkHandles) { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(proc.Id, FabricServiceContext); + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); if (handles > -1) { - if (proc.ProcessName == parentProc.ProcessName) + if (procId == parentPid) { AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); } else { - if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) + if 
(!AllAppHandlesData.Any(x => x.Id == $"{id}:{procId}")) { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procId}", capacity, UseCircularBuffer)); } - AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(handles); + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(handles); } } } timer.Start(); - while (!proc.HasExited && timer.Elapsed.Seconds <= duration.Seconds) + while (timer.Elapsed.Seconds <= duration.Seconds) { token.ThrowIfCancellationRequested(); if (checkCpu) { // CPU (all cores). - double cpu = cpuUsage.GetCpuUsagePercentageProcess(proc); + double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); if (cpu >= 0) { @@ -1014,60 +1161,18 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) cpu = 100; } - if (proc.ProcessName == parentProc.ProcessName) + if (procId == parentPid) { AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); } else { - if (!AllAppCpuData.Any(x => x.Id == $"{id}:{proc.ProcessName}")) + if (!AllAppCpuData.Any(x => x.Id == $"{id}:{procId}")) { - AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); + AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procId}", capacity, UseCircularBuffer)); } - AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(cpu); - } - } - - // Memory (private working set (process)). 
- if (checkMemMb) - { - if (proc.ProcessName == parentProcName) - { - AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); - } - else - { - if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{proc.ProcessName}")) - { - AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); - } - - AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(processMem); - } - } - - // Memory (percent in use (total)). - if (checkMemPct) - { - var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); - - if (TotalMemory > 0) - { - double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); - if (proc.ProcessName == parentProc.ProcessName) - { - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); - } - else - { - if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{proc.ProcessName}")) - { - AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{proc.ProcessName}", capacity, UseCircularBuffer)); - } - - AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{proc.ProcessName}").Data.Add(Math.Round(usedPct, 1)); - } + AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(cpu); } } } @@ -1092,16 +1197,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Fix the bug.. 
throw; } - } - - try - { - ProcessInfoProvider.Instance.Dispose(); - } - catch (Exception e) - { - ObserverLogger.LogWarning($"Can't dispose ProcessInfoProvider.Instance:{Environment.NewLine}{e}"); - } + } } private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) @@ -1151,10 +1247,11 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType).ToList(); } - foreach (var deployedApp in deployedApps) + for (int i = 0; i < deployedApps.Count; ++i) { Token.ThrowIfCancellationRequested(); + var deployedApp = deployedApps[i]; string[] filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. @@ -1227,78 +1324,78 @@ private void SetInstanceOrReplicaMonitoringList( DeployedServiceReplicaList deployedReplicaList, ref List replicaMonitoringList) { - foreach (var deployedReplica in deployedReplicaList) + for (int i = 0; i < deployedReplicaList.Count; ++i) { Token.ThrowIfCancellationRequested(); + var deployedReplica = deployedReplicaList[i]; ReplicaOrInstanceMonitoringInfo replicaInfo = null; switch (deployedReplica) { case DeployedStatefulServiceReplica {ReplicaRole: ReplicaRole.Primary} statefulReplica: { - if (filterList != null && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - - switch (filterType) + if (filterList != null && filterType != ServiceFilterType.None) { - case ServiceFilterType.Include when !isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } - } - - replicaInfo = new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = appName, - ApplicationTypeName = appTypeName, - HostProcessId = statefulReplica.HostProcessId, - ReplicaOrInstanceId = statefulReplica.ReplicaId, - 
PartitionId = statefulReplica.Partitionid, - ServiceName = statefulReplica.ServiceName - }; + bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - using var p = Process.GetProcessById((int)statefulReplica.HostProcessId); - var childProcs = ProcessInfoProvider.Instance.GetChildProcesses(p); - if (childProcs?.Count > 0) - { - replicaInfo.ChildProcesses = childProcs; - } + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case ServiceFilterType.Exclude when isInFilterList: + continue; + } + } - break; + replicaInfo = new ReplicaOrInstanceMonitoringInfo + { + ApplicationName = appName, + ApplicationTypeName = appTypeName, + HostProcessId = statefulReplica.HostProcessId, + ReplicaOrInstanceId = statefulReplica.ReplicaId, + PartitionId = statefulReplica.Partitionid, + ServiceName = statefulReplica.ServiceName + }; + + var childPids = ProcessInfoProvider.Instance.GetChildProcessIds((int)statefulReplica.HostProcessId); + + if (childPids != null && childPids.Count > 0) + { + replicaInfo.ChildProcessIds = childPids; + } + break; } case DeployedStatelessServiceInstance statelessInstance: { - if (filterList != null && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - - switch (filterType) + if (filterList != null && filterType != ServiceFilterType.None) { - case ServiceFilterType.Include when !isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } - } + bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - replicaInfo = new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = appName, - ApplicationTypeName = appTypeName, - HostProcessId = statelessInstance.HostProcessId, - ReplicaOrInstanceId = statelessInstance.InstanceId, - PartitionId = 
statelessInstance.Partitionid, - ServiceName = statelessInstance.ServiceName - }; + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case ServiceFilterType.Exclude when isInFilterList: + continue; + } + } - using var p = Process.GetProcessById((int)statelessInstance.HostProcessId); - var childProcs = ProcessInfoProvider.Instance.GetChildProcesses(p); - if (childProcs?.Count > 0) - { - replicaInfo.ChildProcesses = childProcs; - } - break; + replicaInfo = new ReplicaOrInstanceMonitoringInfo + { + ApplicationName = appName, + ApplicationTypeName = appTypeName, + HostProcessId = statelessInstance.HostProcessId, + ReplicaOrInstanceId = statelessInstance.InstanceId, + PartitionId = statelessInstance.Partitionid, + ServiceName = statelessInstance.ServiceName + }; + + var childProcs = ProcessInfoProvider.Instance.GetChildProcessIds((int)statelessInstance.HostProcessId); + + if (childProcs != null && childProcs.Count > 0) + { + replicaInfo.ChildProcessIds = childProcs; + } + break; } } @@ -1317,29 +1414,6 @@ private void CleanUp() userTargetList?.Clear(); userTargetList = null; - // Clean up service child Process objects, if any. 
- if (ReplicaOrInstanceList.Any(repOrInst => repOrInst.ChildProcesses != null)) - { - foreach (var rep in ReplicaOrInstanceList) - { - if (rep.ChildProcesses == null) - { - continue; - } - - for (int i = 0; i < rep.ChildProcesses.Count; ++i) - { - var p = rep.ChildProcesses[i]; - - p?.Dispose(); - p = null; - } - - rep.ChildProcesses.Clear(); - rep.ChildProcesses = null; - } - } - ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index b34aa6e2..ba83152c 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -218,9 +218,10 @@ public override Task ReportAsync(CancellationToken token) var timeToLiveWarning = GetHealthReportTimeToLive(); // User-supplied Disk Space Usage % thresholds from ApplicationManifest.xml. - foreach (var data in DiskSpaceUsagePercentageData) + for (int i = 0; i < DiskSpaceUsagePercentageData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceUsagePercentageData[i]; ProcessResourceDataReportHealth( data, @@ -232,9 +233,10 @@ public override Task ReportAsync(CancellationToken token) // User-supplied Average disk queue length thresholds from ApplicationManifest.xml. Windows only. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - foreach (var data in DiskAverageQueueLengthData) + for (int i = 0; i < DiskAverageQueueLengthData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskAverageQueueLengthData[i]; ProcessResourceDataReportHealth( data, @@ -249,27 +251,21 @@ in FabricObserver.Extensibility project. 
*/ if (IsEtwEnabled) { // Disk Space Available - foreach (var data in DiskSpaceAvailableMbData) + for (int i = 0; i < DiskSpaceAvailableMbData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceAvailableMbData[i]; - ProcessResourceDataReportHealth( - data, - 0, - 0, - timeToLiveWarning); + ProcessResourceDataReportHealth(data, 0, 0, timeToLiveWarning); } // Disk Space Total - foreach (var data in DiskSpaceTotalMbData) + for (int i = 0; i < DiskSpaceTotalMbData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceTotalMbData[i]; - ProcessResourceDataReportHealth( - data, - 0, - 0, - timeToLiveWarning); + ProcessResourceDataReportHealth(data, 0, 0, timeToLiveWarning); } } diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index be4456e9..b32df6bd 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -29,7 +29,7 @@ namespace FabricObserver.Observers // As with all observers, you should first determine the good (normal) states across resource usage before you set thresholds for the bad ones. public class FabricSystemObserver : ObserverBase { - private List processWatchList; + private string[] processWatchList; private Stopwatch stopwatch; // Health Report data container - For use in analysis to determine health state. @@ -40,7 +40,7 @@ public class FabricSystemObserver : ObserverBase private List> allHandlesData; // Windows only. (EventLog). - private List evtRecordList; + private List evtRecordList = null; private bool monitorWinEventLog; /// @@ -117,11 +117,6 @@ public int AllocatedHandlesError get; set; } - public string ErrorOrWarningKind - { - get; set; - } = null; - public override async Task ObserveAsync(CancellationToken token) { // If set, this observer will only run during the supplied interval. 
@@ -141,11 +136,14 @@ public override async Task ObserveAsync(CancellationToken token) { Initialize(); - foreach (var procName in processWatchList) + for (int i = 0; i < processWatchList.Length; ++i) { + Token.ThrowIfCancellationRequested(); + + string procName = processWatchList[i]; + try { - Token.ThrowIfCancellationRequested(); string dotnet = string.Empty; if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) && procName.EndsWith(".dll")) @@ -157,15 +155,13 @@ public override async Task ObserveAsync(CancellationToken token) } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { + } } } catch (Exception e) when (!(e is OperationCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError( $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -350,10 +346,7 @@ public override Task ReportAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in ReportAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError($"Unhandled exception in ReportAsync:{Environment.NewLine}{e}"); // Fix the bug.. 
throw; @@ -446,10 +439,12 @@ private Process[] GetDotnetLinuxProcessesByFirstArgument(string argument) var result = new List(); var processes = Process.GetProcessesByName("dotnet"); - foreach (var process in processes) + for (int i = 0; i < processes.Length; ++i) { Token.ThrowIfCancellationRequested(); + Process process = processes[i]; + try { string cmdline = File.ReadAllText($"/proc/{process.Id}/cmdline"); @@ -491,7 +486,7 @@ private void Initialize() // Linux if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) { - processWatchList = new List + processWatchList = new [] { "Fabric", "FabricDCA.dll", @@ -508,7 +503,7 @@ private void Initialize() else { // Windows - processWatchList = new List + processWatchList = new [] { "Fabric", "FabricApplicationGateway", @@ -522,7 +517,7 @@ private void Initialize() }; } - int listcapacity = processWatchList.Count; + int listcapacity = processWatchList.Length; int frudCapacity = 4; if (UseCircularBuffer) @@ -780,12 +775,14 @@ private async Task GetProcessInfoAsync(string procName) Stopwatch timer = new Stopwatch(); - foreach (var process in processes) + for (int i = 0; i < processes.Length; ++i) { - try - { - Token.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); + + Process process = processes[i]; + try + { // Ports - Active TCP All int activePortCount = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(process.Id, FabricServiceContext); @@ -824,10 +821,20 @@ private async Task GetProcessInfoAsync(string procName) CpuUsage cpuUsage = new CpuUsage(); - // Warm up the perf counters. + // Mem if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) { + // Warm up the perf counters. 
_ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + float mem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); + } + + // Allocated Handles + if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) + { + float handleCount = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); + allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handleCount); } TimeSpan duration = TimeSpan.FromSeconds(1); @@ -848,32 +855,15 @@ private async Task GetProcessInfoAsync(string procName) // CPU Time for service process. if (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0) { - int cpu = (int)cpuUsage.GetCpuUsagePercentageProcess(process); + int cpu = (int)cpuUsage.GetCpuUsagePercentageProcess(process.Id); allCpuData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(cpu); } - // Private Working Set for service process. - if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) - { - float mem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); - allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); - } - - // Allocated Handles - if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) - { - float handleCount = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); - allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handleCount); - } - await Task.Delay(250, Token).ConfigureAwait(true); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}", - LogLevel.Warning); + ObserverLogger.LogWarning($"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}"); // Fix the bug.. 
throw; @@ -886,19 +876,13 @@ private async Task GetProcessInfoAsync(string procName) // It's OK. Just means that the elevated process (like FabricHost.exe) won't be observed. // It is generally *not* worth running FO process as a Windows elevated user just for this scenario. On Linux, FO always should be run as normal user, not root. #if DEBUG - WriteToLogWithLevel( - ObserverName, - $"Can't observe {procName} due to it's privilege level. FabricObserver must be running as System or Admin on Windows for this specific task.", - LogLevel.Warning); + ObserverLogger.LogWarning($"Can't observe {procName} due to it's privilege level. FabricObserver must be running as System or Admin on Windows for this specific task."); #endif continue; } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError("Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -914,7 +898,7 @@ private async Task GetProcessInfoAsync(string procName) } private void ProcessResourceDataList( - IReadOnlyCollection> data, + List> data, T thresholdError, T thresholdWarning) where T : struct @@ -926,10 +910,12 @@ private void ProcessResourceDataList( fileName = $"FabricSystemServices{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? 
"_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; } - foreach (var dataItem in data) + for (int i = 0; i < data.Count; ++i) { Token.ThrowIfCancellationRequested(); + var dataItem = data[i]; + if (dataItem.Data.Count == 0 || dataItem.AverageDataValue <= 0) { continue; @@ -990,7 +976,6 @@ private void ProcessResourceDataList( private void CleanUp() { - processWatchList.Clear(); processWatchList = null; if (allCpuData != null && !allCpuData.Any(frud => frud.ActiveErrorOrWarning)) @@ -999,6 +984,12 @@ private void CleanUp() allCpuData = null; } + if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) + { + allEphemeralTcpPortData?.Clear(); + allEphemeralTcpPortData = null; + } + if (allHandlesData != null && !allHandlesData.Any(frud => frud.ActiveErrorOrWarning)) { allHandlesData?.Clear(); @@ -1011,12 +1002,6 @@ private void CleanUp() allMemData = null; } - if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) - { - allEphemeralTcpPortData?.Clear(); - allEphemeralTcpPortData = null; - } - if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) { allActiveTcpPortData?.Clear(); diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 7c98dd14..6eb1c8ed 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -22,48 +22,24 @@ public class NodeObserver : ObserverBase private readonly Stopwatch stopwatch; // These are public properties because they are used in unit tests. 
- public FabricResourceUsageData MemDataCommittedBytes - { - get; set; - } + public FabricResourceUsageData MemDataCommittedBytes; - public FabricResourceUsageData FirewallData - { - get; set; - } + public FabricResourceUsageData FirewallData; - public FabricResourceUsageData ActivePortsData - { - get; set; - } + public FabricResourceUsageData ActivePortsData; - public FabricResourceUsageData EphemeralPortsData - { - get; set; - } + public FabricResourceUsageData EphemeralPortsData; - public FabricResourceUsageData MemDataPercentUsed - { - get; set; - } + public FabricResourceUsageData MemDataPercentUsed; - public FabricResourceUsageData CpuTimeData - { - get; set; - } + public FabricResourceUsageData CpuTimeData; // These are only useful for Linux.\\ // Holds data for percentage of total configured file descriptors that are in use. - public FabricResourceUsageData LinuxFileHandlesDataPercentAllocated - { - get; set; - } + public FabricResourceUsageData LinuxFileHandlesDataPercentAllocated; - public FabricResourceUsageData LinuxFileHandlesDataTotalAllocated - { - get; set; - } + public FabricResourceUsageData LinuxFileHandlesDataTotalAllocated; public float CpuErrorUsageThresholdPct { @@ -716,6 +692,17 @@ error on these conditions. } } + if (MemDataCommittedBytes != null && (MemErrorUsageThresholdMb > 0 || MemWarningUsageThresholdMb > 0)) + { + float committedMegaBytes = MemoryUsageProvider.Instance.GetCommittedBytes() / 1048576.0f; + MemDataCommittedBytes.Data.Add(committedMegaBytes); + } + + if (MemDataPercentUsed != null && (MemoryErrorLimitPercent > 0 || MemoryWarningLimitPercent > 0)) + { + MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); + } + timer.Start(); while (timer.Elapsed <= duration) @@ -727,18 +714,7 @@ error on these conditions. 
CpuTimeData.Data.Add(await cpuUtilizationProvider.NextValueAsync()); } - if (MemDataCommittedBytes != null && (MemErrorUsageThresholdMb > 0 || MemWarningUsageThresholdMb > 0)) - { - float committedMegaBytes = MemoryUsageProvider.Instance.GetCommittedBytes() / 1048576.0f; - MemDataCommittedBytes.Data.Add(committedMegaBytes); - } - - if (MemDataPercentUsed != null && (MemoryErrorLimitPercent > 0 || MemoryWarningLimitPercent > 0)) - { - MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); - } - - await Task.Delay(250, Token).ConfigureAwait(true); + await Task.Delay(500, Token).ConfigureAwait(true); } timer.Stop(); diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 0e0283e9..9ec1748c 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -15,7 +15,6 @@ using System.Runtime.InteropServices; using System.Security; using System.Text; -using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.Utilities; @@ -670,9 +669,6 @@ await TelemetryClient.ReportMetricAsync( ObserverName, HealthState.Error, $"Unhandled Exception processing OS information:{Environment.NewLine}{e}"); - - // Fix the bug.. 
- throw; } } } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 49875be5..78b6c03f 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -912,7 +912,6 @@ ex.InnerException is OperationCanceledException || if (isConfigurationUpdateInProgress) { IsObserverRunning = false; - return true; } @@ -922,10 +921,10 @@ ex.InnerException is OperationCanceledException || catch (Exception e) { HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Error, - $"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); + ObserverConstants.ObserverManagerName, + ApplicationName, + HealthState.Error, + $"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); allExecuted = false; } @@ -939,15 +938,12 @@ ex.InnerException is OperationCanceledException || } else { - if (Logger.EnableVerboseLogging) - { - HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Warning, - exceptionBuilder.ToString()); - } - + HealthReporter.ReportFabricObserverServiceHealth( + ObserverConstants.ObserverManagerName, + ApplicationName, + HealthState.Warning, + exceptionBuilder.ToString()); + _ = exceptionBuilder.Clear(); } diff --git a/FabricObserver/PackageRoot/Config/NetworkObserver.config.json b/FabricObserver/PackageRoot/Config/NetworkObserver.config.json index 4b71e127..03113376 100644 --- a/FabricObserver/PackageRoot/Config/NetworkObserver.config.json +++ b/FabricObserver/PackageRoot/Config/NetworkObserver.config.json @@ -8,9 +8,9 @@ "protocol": "http" }, { - "hostname": "somesqlservername.database.windows.net", - "port": 1433, - "protocol": "tcp" + "hostname": "https://mycosmosdb.documents.azure.com/dbs/mydb", + "port": 443, + "protocol": "http" } ] }, From 2e9c25870749e5cdfcead2b68ffd1a7c8ff12b5f Mon Sep 17 
00:00:00 2001 From: Charles Torre Date: Tue, 6 Jul 2021 18:36:35 -0700 Subject: [PATCH 04/14] FO 3.1.15 (TODO: AppInsights change..) --- .../Interfaces/ITelemetryProvider.cs | 9 + .../ReplicaOrInstanceMonitoringInfo.cs | 2 +- FabricObserver.Extensibility/ObserverBase.cs | 4 +- .../ProcessInfo/IProcessInfoProvider.cs | 2 +- .../ProcessInfo/LinuxProcessInfoProvider.cs | 19 +- .../ProcessInfo/ProcessInfoProvider.cs | 2 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 47 +- .../Telemetry/AppInsightsTelemetry.cs | 40 +- .../Utilities/Telemetry/ChildProcessInfo.cs | 13 + .../Telemetry/ChildProcessTelemetryData.cs | 22 + .../Telemetry/LogAnalyticsTelemetry.cs | 11 + .../Utilities/Telemetry/TelemetryData.cs | 5 - FabricObserver/Observers/AppObserver.cs | 593 ++++++++---------- 13 files changed, 396 insertions(+), 373 deletions(-) create mode 100644 FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs create mode 100644 FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs diff --git a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs index f2ea5499..4da1ef3b 100644 --- a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs +++ b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs @@ -113,6 +113,15 @@ Task ReportMetricAsync( MachineTelemetryData telemetryData, CancellationToken cancellationToken); + /// + /// Calls telemetry provider to report a metric. + /// + /// List of ChildProcessTelemetry. + /// CancellationToken instance. + Task ReportMetricAsync( + List telemetryData, + CancellationToken cancellationToken); + /// /// Calls telemetry provider to report a metric. 
/// diff --git a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs index e924d4b3..c3b126b5 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs @@ -51,7 +51,7 @@ public string ServicePackageActivationId get; set; } - public List ChildProcessIds + public List<(string procName, int Pid)> ChildProcessInfo { get; set; } diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index fcbce434..5a6f69d4 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -889,9 +889,9 @@ public void ProcessResourceDataReportHealth( var healthMessage = new StringBuilder(); string childProcMsg = string.Empty; - if (replicaOrInstance != null && replicaOrInstance.ChildProcessIds != null) + if (replicaOrInstance != null && replicaOrInstance.ChildProcessInfo != null) { - childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcessIds.Count}). " + + childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcessInfo.Count}). 
" + $"Their cumulative impact on {name}'s resource usage has been applied."; } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index c34cd081..41812b61 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -26,7 +26,7 @@ public interface IProcessInfoProvider /// /// /// - List GetChildProcessIds(int pid); + List<(string ProcName, int Pid)> GetChildProcessInfo(int processId); void Dispose(); } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index 5c4b6feb..3dea34c9 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -68,23 +68,24 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return result; } - public override List GetChildProcessIds(int processId) + public override List<(string ProcName, int Pid)> GetChildProcessInfo(int processId) { - // https://askubuntu.com/questions/512871/find-children-of-the-process - string cmdResult = "ps -o ppid= -o pid= -A | awk '$1 == " + processId.ToString() + " {print $2}'".Bash(); - List childProcesses = new List(); + string pidCmdResult = $"ps -o pid= --ppid {processId}".Bash(); + string procNameCmdResult = $"ps -o comm= --ppid {processId}".Bash(); + List<(string procName, int Pid)> childProcesses = new List<(string procName, int Pid)>(); - if (!string.IsNullOrWhiteSpace(cmdResult)) + if (!string.IsNullOrWhiteSpace(pidCmdResult) && !string.IsNullOrWhiteSpace(procNameCmdResult)) { - var sPids = cmdResult.Split('\n')?.ToList(); + var sPids = pidCmdResult.Trim().Split('\n'); + var sProcNames = 
procNameCmdResult.Trim().Split('\n'); - if (sPids.Count > 0) + if (sPids?.Length > 0 && sProcNames.Length > 0) { - for (int i = 0; i < sPids.Count; ++i) + for (int i = 0; i < sPids.Length; ++i) { if (int.TryParse(sPids[i], out int childProcId)) { - childProcesses.Add(childProcId); + childProcesses.Add((sProcNames[i], childProcId)); } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index 1276e318..d5b63341 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -56,7 +56,7 @@ protected Logger Logger public abstract float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); - public abstract List GetChildProcessIds(int processId); + public abstract List<(string ProcName, int Pid)> GetChildProcessInfo(int processId); protected abstract void Dispose(bool disposing); } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 21d9a54c..b7b4d2b5 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -121,7 +121,7 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService } } - public override List GetChildProcessIds(int processId) + public override List<(string ProcName, int Pid)> GetChildProcessInfo(int processId) { if (processId < 1) { @@ -129,21 +129,41 @@ public override List GetChildProcessIds(int processId) } // Get child procs. - List childProcesses = GetProcessTreeIds(processId); + List<(string procName, int pid)> childProcesses = TupleGetChildProcessInfo(processId); if (childProcesses == null) { return null; } - // Get grandchild procs. 
- for (var i = 0; i < childProcesses.Count; ++i) + // Get descendent procs, max depth = 3. + for (int i = 0; i < childProcesses.Count; ++i) { - List grandChildren = GetProcessTreeIds(childProcesses[i]); + List<(string procName, int pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].pid); - if (grandChildren?.Count > 0) + if (c1?.Count > 0) { - childProcesses.AddRange(grandChildren); + childProcesses.AddRange(c1); + + for (int j = 0; j < c1.Count; ++j) + { + List<(string procName, int pid)> c2 = TupleGetChildProcessInfo(c1[j].pid); + + if (c2?.Count > 0) + { + childProcesses.AddRange(c2); + + for (int k = 0; k < c2.Count; ++k) + { + List<(string procName, int pid)> c3 = TupleGetChildProcessInfo(c2[k].pid); + + if (c3?.Count > 0) + { + childProcesses.AddRange(c3); + } + } + } + } } } @@ -179,9 +199,9 @@ public float GetProcessPrivateWorkingSetInMB(string processName) } } - private List GetProcessTreeIds(int processId) + private List<(string procName, int pid)> TupleGetChildProcessInfo(int processId) { - List childProcesses = null; + List<(string procName, int pid)> childProcesses = null; string query = $"select caption,processid from win32_process where parentprocessid = {processId}"; try @@ -211,14 +231,15 @@ private List GetProcessTreeIds(int processId) continue; } - int childProcessId = Convert.ToInt32(childProcessIdObj); - if (childProcesses == null) { - childProcesses = new List(); + childProcesses = new List<(string procName, int pid)>(); } - childProcesses.Add(childProcessId); + int childProcessId = Convert.ToInt32(childProcessIdObj); + string procName = childProcessNameObj.ToString(); + + childProcesses.Add((procName, childProcessId)); } } catch (Exception e) when (e is ArgumentException || e is ManagementException) diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index becf35e6..eaa6b1b4 100644 --- 
a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -184,7 +184,6 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "HealthState", telemetryData.HealthState ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, - { "ChildProcessName", telemetryData.ChildProcessName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, @@ -261,7 +260,6 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, - { "ChildProcessName", telemetryData.ChildProcessName ?? string.Empty }, { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, @@ -283,6 +281,44 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can return Task.CompletedTask; } + public Task ReportMetricAsync(List telemetryData, CancellationToken cancellationToken) + { + if (telemetryData == null || cancellationToken.IsCancellationRequested) + { + return Task.CompletedTask; + } + + // TODO... + /* + try + { + var properties = new Dictionary + { + { "ClusterId", telemetryData.ClusterId ?? string.Empty }, + { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, + { "ServiceName", telemetryData.ServiceName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? 
string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "Metric", telemetryData.Metric ?? string.Empty }, + { "Value", value ?? string.Empty }, + { "PartitionId", telemetryData.PartitionId }, + { "ReplicaId", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty } + }; + + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } + */ + return Task.CompletedTask; + } + /// /// Reports a metric to a telemetry service. /// diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs new file mode 100644 index 00000000..66f86353 --- /dev/null +++ b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs @@ -0,0 +1,13 @@ +using Newtonsoft.Json; +using System.Diagnostics.Tracing; + +namespace FabricObserver.Observers.Utilities.Telemetry +{ + [EventData] + [JsonObject] + public class ChildProcessInfo + { + public string ProcessName; + public double Value; + } +} diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs new file mode 100644 index 00000000..93767c75 --- /dev/null +++ b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs @@ -0,0 +1,22 @@ +using Newtonsoft.Json; +using System.Collections.Generic; +using System.Diagnostics.Tracing; + +namespace FabricObserver.Observers.Utilities.Telemetry +{ + [EventData] + [JsonObject] + public class ChildProcessTelemetryData + { + public string ApplicationName; + public string ServiceName; + public string Metric; + public double Value; + public 
long ProcessId; + public string PartitionId; + public string ReplicaId; + public string NodeName; + public int ChildProcessCount; + public List ChildProcessInfo; + } +} diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs index 63c3f3d0..3c1409e7 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -120,6 +120,17 @@ public async Task ReportMetricAsync(TelemetryData telemetryData, CancellationTok await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(true); } + public async Task ReportMetricAsync(List telemetryData, CancellationToken cancellationToken) + { + if (telemetryData == null || cancellationToken.IsCancellationRequested) + { + return; + } + + string jsonPayload = JsonConvert.SerializeObject(telemetryData); + await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(true); + } + public async Task ReportMetricAsync(MachineTelemetryData machineTelemetryData, CancellationToken cancellationToken) { if (machineTelemetryData == null || cancellationToken.IsCancellationRequested) diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index f276360c..743cd065 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -19,11 +19,6 @@ public string ApplicationName get; set; } - public string ChildProcessName - { - get; set; - } - public string ClusterId { get; set; diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 9d3680df..c61fde07 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -100,7 +100,7 @@ public override async Task 
ObserveAsync(CancellationToken token) stopwatch.Stop(); CleanUp(); RunDuration = stopwatch.Elapsed; - + if (EnableVerboseLogging) { ObserverLogger.LogInfo($"Run Duration: {RunDuration}"); @@ -117,7 +117,9 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - var healthReportTimeToLive = GetHealthReportTimeToLive(); + // For use in family tree monitoring. + List childProcessTelemetryDataList = null; + TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { @@ -127,7 +129,11 @@ public override Task ReportAsync(CancellationToken token) string processName = null; int processId = 0; ApplicationInfo app = null; - bool hasChildProcs = repOrInst.ChildProcessIds != null; + bool hasChildProcs = repOrInst.ChildProcessInfo != null; + if (hasChildProcs) + { + childProcessTelemetryDataList = new List(); + } try { @@ -187,10 +193,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppCpuData, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add(sumAllValues); + ProcessChildProcs(ref AllAppCpuData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -211,10 +214,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppMemDataMb, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add((float)sumAllValues); + ProcessChildProcs(ref AllAppMemDataMb, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). 
@@ -235,10 +235,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - double parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppMemDataPercent, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add(sumAllValues); + ProcessChildProcs(ref AllAppMemDataPercent, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -256,13 +253,10 @@ public override Task ReportAsync(CancellationToken token) if (AllAppTotalActivePortsData.Any(x => x.Id == id)) { var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); - + if (hasChildProcs) { - var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppTotalActivePortsData, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add((int)sumAllValues); + ProcessChildProcs(ref AllAppTotalActivePortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -283,10 +277,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppEphemeralPortsData, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add((int)sumAllValues); + ProcessChildProcs(ref AllAppEphemeralPortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). 
@@ -307,10 +298,7 @@ public override Task ReportAsync(CancellationToken token) if (hasChildProcs) { - var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - double sumAllValues = parentDataAvg + ProcessChildFrudsGetDataSum(ref AllAppHandlesData, repOrInst, app, token); - parentFrud.Data.Clear(); - parentFrud.Data.Add((float)sumAllValues); + ProcessChildProcs(ref AllAppHandlesData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } // Parent's and aggregated (summed) spawned process data (if any). @@ -323,144 +311,114 @@ public override Task ReportAsync(CancellationToken token) repOrInst, app.DumpProcessOnError); } + + // Child proc info telemetry.. + if (IsEtwEnabled && childProcessTelemetryDataList != null) + { + var data = new + { + ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList) + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); + } + + if (IsTelemetryEnabled && childProcessTelemetryDataList != null) + { + _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); + } + + childProcessTelemetryDataList = null; } return Task.CompletedTask; } - private double ProcessChildFrudsGetDataSum( - ref List> fruds, - ReplicaOrInstanceMonitoringInfo repOrInst, - ApplicationInfo app, - CancellationToken token) where T : struct + private void ProcessChildProcs( + ref List> allAppEphemeralPortsData, + ref List childProcessTelemetryDataList, + ReplicaOrInstanceMonitoringInfo repOrInst, + ApplicationInfo app, + ref FabricResourceUsageData parentFrud, + CancellationToken token) where T : struct { - var childProcs = repOrInst.ChildProcessIds; - - if (childProcs == null || childProcs.Count == 0 || token.IsCancellationRequested) + token.ThrowIfCancellationRequested(); + + try { - return 0; + string metric = parentFrud.Property; + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref 
allAppEphemeralPortsData, repOrInst, app, token); + double sumAllValues = Sum + parentDataAvg; + childProcInfo.Metric = metric; + childProcInfo.Value = sumAllValues; + childProcessTelemetryDataList.Add(childProcInfo); + parentFrud.Data.Clear(); + parentFrud.Data.Add((T)Convert.ChangeType(sumAllValues, typeof(T))); } - - if (IsEtwEnabled) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - var rawdata = new - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: Total number of current child processes: {childProcs.Count}.", - HealthState = "Ok", - Level = "Verbose", - Metric = ErrorWarningProperty.ChildProcessCount, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = childProcs.Count, - }; - - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); + ObserverLogger.LogWarning($"Error processing child processes:{Environment.NewLine}{e}"); } + } - if (IsTelemetryEnabled) - { - var telemData = new TelemetryData(FabricClientInstance, token) - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: Total number of current child processes: {childProcs.Count}.", - HealthState = "Ok", - Metric = ErrorWarningProperty.ChildProcessCount, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = childProcs.Count, - }; + private (ChildProcessTelemetryData childProcInfo, double Sum) ProcessChildFrudsGetDataSum( + 
ref List> fruds, + ReplicaOrInstanceMonitoringInfo repOrInst, + ApplicationInfo app, + CancellationToken token) where T : struct + { + var childProcs = repOrInst.ChildProcessInfo; - _ = TelemetryClient?.ReportHealthAsync(telemData, token); + if (childProcs == null || childProcs.Count == 0 || token.IsCancellationRequested) + { + return (null, 0); } double sumValues = 0; string metric = string.Empty; + var childProcessInfoData = new ChildProcessTelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + ServiceName = repOrInst.ServiceName.OriginalString, + NodeName = NodeName, + ProcessId = repOrInst.HostProcessId, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ChildProcessCount = childProcs.Count, + ChildProcessInfo = new List() + }; for (int i = 0; i < childProcs.Count; ++i) { token.ThrowIfCancellationRequested(); - var childPid = childProcs[i]; + int childPid = childProcs[i].Pid; + string childProcName = childProcs[i].procName; try { - if (fruds.Any(x => x.Id.Contains(childPid.ToString()))) + if (fruds.Any(x => x.Id.Contains(childProcName))) { - var childFruds = fruds.Where(x => x.Id.Contains(childPid.ToString())).ToList(); + var childFruds = fruds.Where(x => x.Id.Contains(childProcName)).ToList(); metric = childFruds[0].Property; // re-order the list by data value so we can emit raw telemetry for the top 10. 
- childFruds = childFruds.OrderByDescending(x => x.AverageDataValue).ToList(); + var childFrudsOrdered = childFruds.OrderByDescending(x => x.AverageDataValue).ToList(); - for (int j = 0; j < childFruds.Count; ++j) + for (int j = 0; j < childFrudsOrdered.Count; ++j) { token.ThrowIfCancellationRequested(); - var frud = childFruds[j]; + var frud = childFrudsOrdered[j]; sumValues += Math.Round(frud.AverageDataValue, 0); - using Process p = Process.GetProcessById(childPid); - string childProcName = p.ProcessName; - if (IsEtwEnabled) + if (IsEtwEnabled || IsTelemetryEnabled) { - if (j < 10) + if (j < 15) { - var rawdata = new - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - ChildProcessName = childProcName, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - HealthState = "Ok", - Metric = frud.Property, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? childPid : -1, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue, - }; - - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); - } - } - - if (IsTelemetryEnabled) - { - if (j < 10) - { - var telemData = new TelemetryData(FabricClientInstance, token) - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - ChildProcessName = childProcName, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} {frud.Property}.", - HealthState = "Ok", - Metric = frud.Property, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid > 0 ? 
childPid.ToString() : string.Empty, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; - - _ = TelemetryClient?.ReportMetricAsync(telemData, token); + var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = frud.AverageDataValue }; + childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } } @@ -473,7 +431,6 @@ private double ProcessChildFrudsGetDataSum( var warningdata = new { ApplicationName = repOrInst.ApplicationName.OriginalString, - ChildProcessName = childProcName, Code = "", Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", HealthState = "Warning", @@ -496,7 +453,6 @@ private double ProcessChildFrudsGetDataSum( var telemWarnData = new TelemetryData(FabricClientInstance, token) { ApplicationName = repOrInst.ApplicationName.OriginalString, - ChildProcessName = childProcName, Code = "", Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", HealthState = "Warning", @@ -541,6 +497,8 @@ private double ProcessChildFrudsGetDataSum( fruds.Remove(frud); } + childFrudsOrdered?.Clear(); + childFrudsOrdered = null; childFruds?.Clear(); childFruds = null; } @@ -551,49 +509,7 @@ private double ProcessChildFrudsGetDataSum( } } - if (IsEtwEnabled) - { - var rawdata = new - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: child processes ({childProcs.Count}) sum total for {metric}.", - HealthState = "Ok", - Metric = $"{metric} - Sum", - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source 
= ObserverConstants.FabricObserverName, - Value = sumValues, - }; - - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, rawdata); - } - - if (IsTelemetryEnabled) - { - var telemData = new TelemetryData(FabricClientInstance, token) - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = FOErrorWarningCodes.Ok, - Description = $"{repOrInst.ServiceName.OriginalString}: child processes ({childProcs.Count}) sum total for {metric}.", - HealthState = "Ok", - Metric = $"{metric} - Sum", - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = sumValues, - }; - - _ = TelemetryClient?.ReportMetricAsync(telemData, token); - } - - return sumValues; + return (childProcessInfoData, sumValues); } private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst) @@ -888,8 +804,8 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) app => app?.TargetApp?.ToLower() == repOrInst.ApplicationName?.OriginalString.ToLower() || !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - - List procTree = null; + + List<(string procName, int Pid)> procTree = null; if (application?.TargetApp == null && application?.TargetAppType == null) { @@ -982,62 +898,27 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) checkHandles = true; } - /* CPU and Memory Usage */ - // Get list of child processes of parentProc should they exist. // In order to provide accurate resource usage of an SF service process we need to also account for // any processes (children) that the service process (parent) created/spawned. 
- procTree = new List + procTree = new List<(string procName, int Pid)> { // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, // then only the parent process will be in this list. - parentProc.Id + (parentProc.ProcessName, parentProc.Id) }; - if (repOrInst.ChildProcessIds != null && repOrInst.ChildProcessIds.Count > 0) + if (repOrInst.ChildProcessInfo != null && repOrInst.ChildProcessInfo.Count > 0) { - procTree.AddRange(repOrInst.ChildProcessIds); + procTree.AddRange(repOrInst.ChildProcessInfo); } for (int j = 0; j < procTree.Count; ++j) { - int procId = procTree[j]; + int procId = procTree[j].Pid; + string procName = procTree[j].procName; - // Total TCP ports usage - if (checkAllPorts) - { - // Parent process (the service process). - if (procId == parentPid) - { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); - } - else - { - // Child procs spawned by the parent service process. 
- if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procId}")) - { - AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procId}", capacity, UseCircularBuffer)); - } - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); - } - } - - // Ephemeral TCP ports usage - if (checkEphemeralPorts) - { - if (procId == parentPid) - { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); - } - else - { - if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procId}")) - { - AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procId}", capacity, UseCircularBuffer)); - } - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); - } - } + TimeSpan duration = TimeSpan.FromSeconds(1); @@ -1069,111 +950,144 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); } - float processMem = 0; + // Monitor Duration applies to the code below. + timer.Start(); - // Memory (private working set (process)). - if (checkMemMb) + while (timer.Elapsed.Seconds <= duration.Seconds) { - processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + token.ThrowIfCancellationRequested(); - if (procId == parentPid) - { - AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); - } - else + float processMem = 0; + + if (checkCpu) { - if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{procId}")) + // CPU (all cores). 
+ double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); + + if (cpu >= 0) { - AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procId}", capacity, UseCircularBuffer)); - } + if (cpu > 100) + { + cpu = 100; + } - AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(processMem); + if (procId == parentPid) + { + AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + } + else + { + if (!AllAppCpuData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(cpu); + } + } } - } - // Memory (percent in use (total)). - if (checkMemPct) - { - if (processMem == 0) + // Memory (private working set (process)). + if (checkMemMb) { processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); - } - - var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); - if (TotalMemory > 0) - { - double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); - if (procId == parentPid) { - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); } else { - if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{procId}")) + if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{procName}")) { - AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procId}", capacity, UseCircularBuffer)); + AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); } - - AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(Math.Round(usedPct, 1)); + AllAppMemDataMb.FirstOrDefault(x => x.Id == 
$"{id}:{procName}").Data.Add(processMem); } } - } - - if (checkHandles) - { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); - if (handles > -1) + // Memory (percent in use (total)). + if (checkMemPct) { - if (procId == parentPid) + if (processMem == 0) { - AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); } - else + + var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + + if (TotalMemory > 0) { - if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procId}")) + double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); + + if (procId == parentPid) { - AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procId}", capacity, UseCircularBuffer)); + AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + } + else + { + if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(Math.Round(usedPct, 1)); } - - AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(handles); } } - } - - timer.Start(); - while (timer.Elapsed.Seconds <= duration.Seconds) - { - token.ThrowIfCancellationRequested(); - - if (checkCpu) + if (checkHandles) { - // CPU (all cores). 
- double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); - if (cpu >= 0) + if (handles > -1) { - if (cpu > 100) - { - cpu = 100; - } - if (procId == parentPid) { - AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); } else { - if (!AllAppCpuData.Any(x => x.Id == $"{id}:{procId}")) + if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procName}")) { - AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procId}", capacity, UseCircularBuffer)); + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(handles); + } + } + } + + // Total TCP ports usage + if (checkAllPorts) + { + // Parent process (the service process). + if (procId == parentPid) + { + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + else + { + // Child procs spawned by the parent service process. 
+ if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + } - AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{procId}").Data.Add(cpu); + // Ephemeral TCP ports usage + if (checkEphemeralPorts) + { + if (procId == parentPid) + { + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + else + { + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); } + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); } } @@ -1333,69 +1247,70 @@ private void SetInstanceOrReplicaMonitoringList( switch (deployedReplica) { - case DeployedStatefulServiceReplica {ReplicaRole: ReplicaRole.Primary} statefulReplica: + case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary || + statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary: { - if (filterList != null && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + if (filterList != null && filterType != ServiceFilterType.None) + { + bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - switch (filterType) - { - case ServiceFilterType.Include when 
!isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case ServiceFilterType.Exclude when isInFilterList: + continue; } + } - replicaInfo = new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = appName, - ApplicationTypeName = appTypeName, - HostProcessId = statefulReplica.HostProcessId, - ReplicaOrInstanceId = statefulReplica.ReplicaId, - PartitionId = statefulReplica.Partitionid, - ServiceName = statefulReplica.ServiceName - }; - - var childPids = ProcessInfoProvider.Instance.GetChildProcessIds((int)statefulReplica.HostProcessId); + replicaInfo = new ReplicaOrInstanceMonitoringInfo + { + ApplicationName = appName, + ApplicationTypeName = appTypeName, + HostProcessId = statefulReplica.HostProcessId, + ReplicaOrInstanceId = statefulReplica.ReplicaId, + PartitionId = statefulReplica.Partitionid, + ServiceName = statefulReplica.ServiceName + }; + + var childPids = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statefulReplica.HostProcessId); - if (childPids != null && childPids.Count > 0) - { - replicaInfo.ChildProcessIds = childPids; - } - break; + if (childPids != null && childPids.Count > 0) + { + replicaInfo.ChildProcessInfo = childPids; + } + break; } case DeployedStatelessServiceInstance statelessInstance: { - if (filterList != null && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + if (filterList != null && filterType != ServiceFilterType.None) + { + bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - switch (filterType) - { - case ServiceFilterType.Include when !isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case 
ServiceFilterType.Exclude when isInFilterList: + continue; } + } - replicaInfo = new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = appName, - ApplicationTypeName = appTypeName, - HostProcessId = statelessInstance.HostProcessId, - ReplicaOrInstanceId = statelessInstance.InstanceId, - PartitionId = statelessInstance.Partitionid, - ServiceName = statelessInstance.ServiceName - }; - - var childProcs = ProcessInfoProvider.Instance.GetChildProcessIds((int)statelessInstance.HostProcessId); + replicaInfo = new ReplicaOrInstanceMonitoringInfo + { + ApplicationName = appName, + ApplicationTypeName = appTypeName, + HostProcessId = statelessInstance.HostProcessId, + ReplicaOrInstanceId = statelessInstance.InstanceId, + PartitionId = statelessInstance.Partitionid, + ServiceName = statelessInstance.ServiceName + }; + + var childProcs = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statelessInstance.HostProcessId); - if (childProcs != null && childProcs.Count > 0) - { - replicaInfo.ChildProcessIds = childProcs; - } - break; + if (childProcs != null && childProcs.Count > 0) + { + replicaInfo.ChildProcessInfo = childProcs; + } + break; } } @@ -1417,37 +1332,37 @@ private void CleanUp() ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; - if (AllAppCpuData != null && !AllAppCpuData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppCpuData != null && AllAppCpuData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppCpuData?.Clear(); AllAppCpuData = null; } - if (AllAppEphemeralPortsData != null && !AllAppEphemeralPortsData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppEphemeralPortsData?.Clear(); AllAppEphemeralPortsData = null; } - if (AllAppHandlesData != null && !AllAppHandlesData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppHandlesData != null && AllAppHandlesData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppHandlesData?.Clear(); 
AllAppHandlesData = null; } - if (AllAppMemDataMb != null && !AllAppMemDataMb.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => !frud.ActiveErrorOrWarning)) { AllAppMemDataMb?.Clear(); AllAppMemDataMb = null; } - if (AllAppMemDataPercent != null && !AllAppMemDataPercent.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => !frud.ActiveErrorOrWarning)) { AllAppMemDataPercent?.Clear(); AllAppMemDataPercent = null; } - if (AllAppTotalActivePortsData != null && !AllAppTotalActivePortsData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppTotalActivePortsData?.Clear(); AllAppTotalActivePortsData = null; From 79dd78f0d0aba2e30dc4f2d5cd905f01aba6d7b1 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 7 Jul 2021 15:22:59 -0700 Subject: [PATCH 05/14] FO 3.1.5 --- ClusterObserver/ClusterObserver.cs | 1 - .../Utilities/Telemetry/TelemetryData.cs | 5 -- .../Utilities/ObserverConstants.cs | 1 + .../ProcessInfo/LinuxProcessInfoProvider.cs | 2 - .../Telemetry/AppInsightsTelemetry.cs | 72 ++++++++++++------- FabricObserver/Observers/AppObserver.cs | 50 ++++++++----- .../PackageRoot/Config/Settings.xml | 8 ++- .../ApplicationManifest.xml | 17 ++--- FabricObserverTests/ObserverTest.cs | 8 +-- 9 files changed, 94 insertions(+), 70 deletions(-) diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index 11b27907..bc39da9c 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -410,7 +410,6 @@ private async Task ProcessApplicationHealthAsync(IList a { foTelemetryData.ApplicationName, foTelemetryData.ServiceName, - foTelemetryData.ChildProcessName, foTelemetryData.HealthState, foTelemetryData.Description, foTelemetryData.Metric, diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs 
b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs index 0c06ac73..c641e710 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs @@ -17,11 +17,6 @@ public string ApplicationName get; set; } - public string ChildProcessName - { - get; set; - } - public string ClusterId { get; set; diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 49e826f2..e4b62b1f 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -55,6 +55,7 @@ public sealed class ObserverConstants // AppObserver. public const string AppObserverName = "AppObserver"; public const string AppObserverConfigurationSectionName = "AppObserverConfiguration"; + public const string MaxChildProcessesToMonitorParameter = "MaxChildProcessesToMonitor"; // Certificate Observer public const string CertificateObserverName = "CertificateObserver"; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index 3dea34c9..d9665905 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -3,11 +3,9 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
// ------------------------------------------------------------ -using System; using System.Collections.Generic; using System.Diagnostics; using System.Fabric; -using System.Linq; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index eaa6b1b4..de385434 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; +using System.Fabric; using System.Fabric.Health; using System.Globalization; using System.Runtime.InteropServices; @@ -14,6 +15,8 @@ using Microsoft.ApplicationInsights; using Microsoft.ApplicationInsights.DataContracts; using Microsoft.ApplicationInsights.Extensibility; +using Microsoft.ServiceFabric.TelemetryLib; +using Newtonsoft.Json; namespace FabricObserver.Observers.Utilities.Telemetry { @@ -281,41 +284,58 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can return Task.CompletedTask; } - public Task ReportMetricAsync(List telemetryData, CancellationToken cancellationToken) + /// + /// Reports metric for a collection (List) of ChildProcessTelemetryData instances. + /// + /// List of ChildProcessTelemetryData. + /// Cancellation Token + /// + public Task ReportMetricAsync(List telemetryDataList, CancellationToken cancellationToken) { - if (telemetryData == null || cancellationToken.IsCancellationRequested) + if (telemetryDataList == null || cancellationToken.IsCancellationRequested) { return Task.CompletedTask; } - // TODO... - /* - try - { - var properties = new Dictionary - { - { "ClusterId", telemetryData.ClusterId ?? string.Empty }, - { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, - { "ServiceName", telemetryData.ServiceName ?? 
string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, - { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, - { "PartitionId", telemetryData.PartitionId }, - { "ReplicaId", telemetryData.ReplicaId }, - { "Source", telemetryData.ObserverName }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "OS", telemetryData.OS ?? string.Empty } - }; - - telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + string clusterid = string.Empty; + using (FabricClient fabClient = new FabricClient()) + { + var (clusterId, _, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabClient, cancellationToken).Result; + clusterid = clusterId; } - catch (Exception e) + + string OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux"; + + foreach (var telemData in telemetryDataList) { - logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + try + { + var properties = new Dictionary + { + { "ClusterId", clusterid }, + { "ApplicationName", telemData.ApplicationName ?? string.Empty }, + { "ServiceName", telemData.ServiceName ?? string.Empty }, + { "ProcessId", telemData.ProcessId.ToString() }, + { "Metric", telemData.Metric ?? 
string.Empty }, + { "Value", telemData.Value.ToString() }, + { "ChildProcessCount", telemData.ChildProcessCount.ToString() }, + { "ChildProcessInfo", JsonConvert.SerializeObject(telemData.ChildProcessInfo) }, + { "PartitionId", telemData.PartitionId }, + { "ReplicaId", telemData.ReplicaId }, + { "Source", ObserverConstants.AppObserverName }, + { "NodeName", telemData.NodeName }, + { "OS", OS } + }; + + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } } - */ + return Task.CompletedTask; } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index c61fde07..2ab09808 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -45,6 +45,11 @@ public class AppObserver : ObserverBase private string fileName; private readonly Stopwatch stopwatch; + public int MaxChildProcessesToMonitor + { + get; set; + } = 15; + public List ReplicaOrInstanceList { get; set; @@ -85,7 +90,8 @@ public override async Task ObserveAsync(CancellationToken token) FabricServiceContext.ServiceName.OriginalString, ObserverName, HealthState.Warning, - "This observer was unable to initialize correctly due to missing configuration info."); + "AppObserver was unable to initialize correctly due to misconfiguration. " + + "Please check your AppObserver configuration settings."); stopwatch.Stop(); stopwatch.Reset(); @@ -93,8 +99,17 @@ public override async Task ObserveAsync(CancellationToken token) return; } - await MonitorDeployedAppsAsync(token).ConfigureAwait(true); - await ReportAsync(token).ConfigureAwait(true); + // For child process monitoring. 
+ if (int.TryParse( + GetSettingParameterValue( + ConfigurationSectionName, + ObserverConstants.MaxChildProcessesToMonitorParameter), out int maxChildProcs)) + { + MaxChildProcessesToMonitor = maxChildProcs; + } + + await MonitorDeployedAppsAsync(token); + await ReportAsync(token); // The time it took to run this observer. stopwatch.Stop(); @@ -335,7 +350,7 @@ public override Task ReportAsync(CancellationToken token) } private void ProcessChildProcs( - ref List> allAppEphemeralPortsData, + ref List> fruds, ref List childProcessTelemetryDataList, ReplicaOrInstanceMonitoringInfo repOrInst, ApplicationInfo app, @@ -348,7 +363,7 @@ private void ProcessChildProcs( { string metric = parentFrud.Property; var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); - var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref allAppEphemeralPortsData, repOrInst, app, token); + var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref fruds, repOrInst, app, token); double sumAllValues = Sum + parentDataAvg; childProcInfo.Metric = metric; childProcInfo.Value = sumAllValues; @@ -415,7 +430,7 @@ private void ProcessChildProcs( if (IsEtwEnabled || IsTelemetryEnabled) { - if (j < 15) + if (j < MaxChildProcessesToMonitor) { var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = frud.AverageDataValue }; childProcessInfoData.ChildProcessInfo.Add(childProcInfo); @@ -507,6 +522,11 @@ private void ProcessChildProcs( { continue; } + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + { + ObserverLogger.LogWarning($"Error processing child processes:{Environment.NewLine}{e}"); + continue; + } } return (childProcessInfoData, sumValues); @@ -532,7 +552,7 @@ private async Task InitializeAsync() ObserverConstants.ObserverConfigurationPackageName)?.Settings, ConfigurationSectionName, "AppObserverDataFileName"); - + // Unit tests may have null path and filename, thus the null equivalence operations. 
var appObserverConfigFileName = Path.Combine(ConfigPackagePath ?? string.Empty, configSettings.AppObserverConfigFileName ?? string.Empty); @@ -598,7 +618,7 @@ private async Task InitializeAsync() apps.AddRange(appList.ToList()); // TODO: Add random wait (ms) impl, include cluster size in calc. - await Task.Delay(250, Token).ConfigureAwait(true); + await Task.Delay(250, Token); } for (int i = 0; i < apps.Count; ++i) @@ -746,11 +766,11 @@ private async Task InitializeAsync() if (!string.IsNullOrWhiteSpace(application.TargetAppType)) { - await SetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetAppType).ConfigureAwait(true); + await SetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetAppType); } else { - await SetDeployedApplicationReplicaOrInstanceListAsync(appUri).ConfigureAwait(true); + await SetDeployedApplicationReplicaOrInstanceListAsync(appUri); } } @@ -1120,7 +1140,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat if (applicationNameFilter != null) { - var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(true); + var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter); deployedApps.AddRange(app.ToList()); } else if (!string.IsNullOrWhiteSpace(applicationType)) @@ -1155,7 +1175,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat Token); deployedApps.AddRange(appList.ToList()); - await Task.Delay(250, Token).ConfigureAwait(true); + await Task.Delay(250, Token); } deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType).ToList(); @@ -1189,11 +1209,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } } - var replicasOrInstances = await GetDeployedPrimaryReplicaAsync( - deployedApp.ApplicationName, - filteredServiceList, - filterType, - 
applicationType).ConfigureAwait(true); + var replicasOrInstances = await GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType); ReplicaOrInstanceList.AddRange(replicasOrInstances); diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 9bc7787c..c421775f 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -143,6 +143,10 @@ the observer. The default value for capacity is 30 if you omit the ResourceUsageDataCapacity parameter or use an invalid value like 0 or a negative number (or omit the parameter altogether). --> + +
@@ -281,12 +285,12 @@ [ObserverName]Configuration. Example: SampleNewObserverConfiguration where SampleNewObserver is the type name of the observer plugin. See the SampleObserverPlugin project for a complete example of implementing an observer plugin. --> - +
--> diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index c3a4ee68..5110b294 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -18,7 +18,6 @@ - @@ -29,7 +28,6 @@ - @@ -40,12 +38,10 @@ - - @@ -56,13 +52,11 @@ - - @@ -73,27 +67,26 @@ - + - + + - - @@ -111,7 +104,6 @@ - @@ -134,10 +126,8 @@ - - @@ -162,6 +152,7 @@ +
diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 13ac023c..b2285489 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -455,11 +455,11 @@ public async Task Successful_NetworkObserver_Run_Cancellation_Via_ObserverManage _ = Task.Run(async () => { - await obsMgr.StartObserversAsync().ConfigureAwait(true); - }).ConfigureAwait(true); + await obsMgr.StartObserversAsync(); + }); - Assert.IsTrue(await WaitAsync(() => obsMgr.IsObserverRunning, 1).ConfigureAwait(true)); - await obsMgr.StopObserversAsync().ConfigureAwait(true); + Assert.IsTrue(await WaitAsync(() => obsMgr.IsObserverRunning, 1)); + await obsMgr.StopObserversAsync(); Assert.IsFalse(obsMgr.IsObserverRunning); } From c8e8f0d43a04ed58b3fab3e6e62da040b97c2aa8 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 7 Jul 2021 15:24:39 -0700 Subject: [PATCH 06/14] FO 3.1.15 --- FabricObserver/FabricObserver.csproj | 3 --- 1 file changed, 3 deletions(-) diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index e0716841..0166967c 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -97,9 +97,6 @@ PreserveNewest - - PreserveNewest - PreserveNewest From fe533ab0ae56554114a964942f6b79483357db51 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 7 Jul 2021 17:14:22 -0700 Subject: [PATCH 07/14] FO 3.1.15 - Doc updates. 
DO NOT MERGE --- Documentation/Observers.md | 11 ++++++++--- .../ProcessInfo/WindowsProcessInfoProvider.cs | 13 ++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Documentation/Observers.md b/Documentation/Observers.md index 6731003d..ff6bc2b0 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -19,7 +19,7 @@ Service Fabric Error Health Events can block upgrades and other important Fabric | Observer | Description | | :--- | :--- | -| [AppObserver](#appobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric Application services (processes). Alerts when user-supplied thresholds are reached. | +| [AppObserver](#appobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric Application services (processes) and their spawn (child processes). Alerts when user-supplied thresholds are reached. | | [CertificateObserver](#certificateobserver) | Monitors the expiration date of the cluster certificate and any other certificates provided by the user. Warns when close to expiration. | | [DiskObserver](#diskobserver) | Monitors, storage disk information like capacity and IO rates. Alerts when user-supplied thresholds are reached. | | [FabricSystemObserver](#fabricsystemobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric System services (compare to AppObserver) | @@ -42,8 +42,8 @@ For every other observer, it's XML as per usual. ``` ## AppObserver -Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application services (processes). This -observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. 
+Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application service processes and the child processes they spawn. If a service process creates child processes, then these processes will be monitored and their summed resource usage for some metric you are observing will be applied to the parent process (added) and a threshold breach will be determined based on the sum of children and parent resource usage. +This observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. ### Input JSON config file supplied by user, stored in @@ -77,6 +77,8 @@ All settings are optional, ***except target OR targetType***, and can be omitted | :--- | :--- | | **targetApp** | App URI string to observe. Optional (Required if targetType not specified). | | **targetAppType** | ApplicationType name (this is not a Uri format). FO will observe **all** app services belonging to it. Optional (Required if target not specified). | +| **appExcludeList** | This setting is only useful when targetApp is set to "*" or "All". A comma-separated list of app names (***URI format***) to ***exclude from observation***. Just omit the object or set value to "" to mean ***include all***. (excluding all does not make sense) | +| **appIncludeList** | This setting is only useful when targetApp is set to "*" or "All". A comma-separated list of app names (***URI format***) to ***include in observation***. Just omit the object or set value to "" to mean ***include all***. | | **serviceExcludeList** | A comma-separated list of service names (***not URI format***, just the service name as we already know the app name URI) to ***exclude from observation***. 
Just omit the object or set value to "" to mean ***include all***. (excluding all does not make sense) | | **serviceIncludeList** | A comma-separated list of service names (***not URI format***, just the service name as we already know the app name URI) to ***include in observation***. Just omit the object or set value to "" to mean ***include all***. | | **memoryErrorLimitMb** | Maximum service process private working set in Megabytes that should generate a Fabric Error (SFX and local log) | @@ -95,6 +97,9 @@ All settings are optional, ***except target OR targetType***, and can be omitted **Output** Log text(Error/Warning), Service Fabric Application Health Report (Error/Warning/Ok), ETW (EventSource), Telemetry (AppInsights/LogAnalytics) +AppObserver also supports non-JSON parameters for configuration unrelated to thresholds. As with all observers, these settings are located in ApplicationManifest.xml to support versionless configuration updates via application upgrade. +One of these settings is the maximum number of child processes to monitor (AppObserverMaxChildProcessesToMonitor). Note that there is a cap on generations: up to 4 generations of child processes will be monitored. The resulting set will be ordered by resource usage from high to low for a given metric. The size of this list is determined by this setting. 
+ Example SFX Output (Warning - Ephemeral Ports Usage): ![alt text](/Documentation/Images/AppObsWarn.png "AppObserver Warning output example.") diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index b7b4d2b5..bc05841e 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -136,7 +136,8 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return null; } - // Get descendent procs, max depth = 3. + // Get descendent procs, max depth = 4. *Not* an optimal algo... This is fine. It is much better than increased StackOverflow exception potential + // due to recursive calls which are FailFast and will take FO down. Most services will never reach c3, let alone c4, anyway... for (int i = 0; i < childProcesses.Count; ++i) { List<(string procName, int pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].pid); @@ -160,6 +161,16 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService if (c3?.Count > 0) { childProcesses.AddRange(c3); + + for (int l = 0; l < c3.Count; ++l) + { + List<(string procName, int pid)> c4 = TupleGetChildProcessInfo(c3[l].pid); + + if (c4?.Count > 0) + { + childProcesses.AddRange(c4); + } + } } } } From 7e82536b8aa4199f3f20df56dacdf28d2df0c6ab Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Fri, 9 Jul 2021 17:00:01 -0700 Subject: [PATCH 08/14] FO 3.1.15 RC --- .../ReplicaOrInstanceMonitoringInfo.cs | 2 +- FabricObserver.Extensibility/ObserverBase.cs | 163 +++++++++++------- .../CpuUtilization/CpuUtilizationProvider.cs | 10 +- .../Utilities/ObserverConstants.cs | 2 +- .../ProcessInfo/WindowsProcessInfoProvider.cs | 42 +++-- FabricObserver.nuspec.template | 4 + FabricObserver/Observers/AppObserver.cs | 86 +++++---- 
.../Observers/FabricSystemObserver.cs | 14 +- FabricObserver/Observers/NodeObserver.cs | 53 +++--- .../PackageRoot/Config/Settings.xml | 10 +- .../ApplicationManifest.xml | 26 +-- 11 files changed, 245 insertions(+), 167 deletions(-) diff --git a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs index c3b126b5..63406013 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs @@ -51,7 +51,7 @@ public string ServicePackageActivationId get; set; } - public List<(string procName, int Pid)> ChildProcessInfo + public List<(string procName, int Pid)> ChildProcesses { get; set; } diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 5a6f69d4..c8fdbcff 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -29,9 +29,9 @@ public abstract class ObserverBase : IObserver private const int TtlAddMinutes = 5; private const string FabricSystemAppName = "fabric:/System"; private const int MaxDumps = 5; - private Dictionary serviceDumpCountDictionary; + private Dictionary serviceDumpCountDictionary; private string SFLogRoot; - private string dumpsPath; + private string SFDumpsPath; private bool disposed; public string ObserverName @@ -310,7 +310,7 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel string logFolderBasePath; string observerLogPath = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.ObserverLogPathParameter); - if (!string.IsNullOrEmpty(observerLogPath)) + if (!string.IsNullOrWhiteSpace(observerLogPath)) { logFolderBasePath = observerLogPath; } @@ -325,9 +325,10 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext 
statel EnableETWLogging = IsEtwProviderEnabled }; - if (string.IsNullOrEmpty(dumpsPath)) + // Only supported on Windows (dump on error). + if (string.IsNullOrWhiteSpace(SFDumpsPath) && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - SetDefaultSfDumpPath(); + SetDefaultSFWindowsDumpPath(); } ConfigurationSettings = new ConfigSettings( @@ -401,7 +402,7 @@ public void WriteToLogWithLevel(string property, string description, LogLevel le /// parameter value. public string GetSettingParameterValue(string sectionName, string parameterName, string defaultValue = null) { - if (string.IsNullOrEmpty(sectionName) || string.IsNullOrEmpty(parameterName)) + if (string.IsNullOrWhiteSpace(sectionName) || string.IsNullOrWhiteSpace(parameterName)) { return null; } @@ -427,7 +428,7 @@ public string GetSettingParameterValue(string sectionName, string parameterName, string setting = serviceConfiguration.Settings.Sections[sectionName].Parameters[parameterName]?.Value; - if (string.IsNullOrEmpty(setting) && defaultValue != null) + if (string.IsNullOrWhiteSpace(setting) && defaultValue != null) { return defaultValue; } @@ -454,23 +455,22 @@ public void Dispose() /// /// Process id of the target process to dump. /// Optional: The type of dump to generate. Default is DumpType.Full. - /// Optional: The full path to store dump file. Default is %SFLogRoot%\CrashDumps + /// Optional: The full path to store dump file. Default is %SFLogRoot%\CrashDumps /// true or false if the operation succeeded. 
- public bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpType.Full, string filePath = null) + private bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpType.Full, string folderPath = null, string fileName = null) { if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { return false; } - if (string.IsNullOrEmpty(dumpsPath) && string.IsNullOrEmpty(filePath)) + if (string.IsNullOrWhiteSpace(SFDumpsPath) && string.IsNullOrWhiteSpace(folderPath)) { return false; } - string path = !string.IsNullOrEmpty(filePath) ? filePath : dumpsPath; - string processName = string.Empty; - + string path = !string.IsNullOrWhiteSpace(folderPath) ? folderPath : SFDumpsPath; + string processName = !string.IsNullOrWhiteSpace(fileName) ? fileName : string.Empty; NativeMethods.MINIDUMP_TYPE miniDumpType; switch (dumpType) @@ -503,33 +503,43 @@ public bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpTyp try { - // This is to ensure friendly-name of resulting dmp file. using (Process process = Process.GetProcessById(processId)) { - processName = process.ProcessName; - - if (string.IsNullOrEmpty(processName)) + if (processName == string.Empty) { - return false; + processName = process.ProcessName; } - processName += $"_{DateTime.Now:ddMMyyyyHHmmss}.dmp"; IntPtr processHandle = process.Handle; + processName += $"_{DateTime.Now:ddMMyyyyHHmmss}.dmp"; // Check disk space availability before writing dump file. 
string driveName = path.Substring(0, 2); - + if (DiskUsage.GetCurrentDiskSpaceUsedPercent(driveName) > 90) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Not enough disk space available for dump file creation."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Not enough disk space available for dump file creation."); return false; } + if (!Directory.Exists(path)) + { + try + { + Directory.CreateDirectory(path); + } + catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) + { + // Can't create directory in SF dumps folder, so dump into top level directory.. + path = SFDumpsPath; + } + } + using (FileStream file = File.Create(Path.Combine(path, processName))) { if (!NativeMethods.MiniDumpWriteDump( @@ -655,22 +665,24 @@ public void ProcessResourceDataReportHealth( } // Container - if (!string.IsNullOrEmpty(replicaOrInstance?.ContainerId)) + if (!string.IsNullOrWhiteSpace(replicaOrInstance?.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } - // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. + // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. If the process has children, then don't emit this raw data since it will already + // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to send data to ApplicationInsights or LogAnalytics for each resource usage observation it makes per specified metric. 
- if (IsTelemetryEnabled) + if (IsTelemetryEnabled && replicaOrInstance.ChildProcesses == null) { - _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token).ConfigureAwait(true); + _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token).ConfigureAwait(true); } - // ETW - This is informational, per reading EventSource tracing, healthstate is irrelevant here. + // ETW - This is informational, per reading EventSource tracing, healthstate is irrelevant here. If the process has children, then don't emit this raw data since it will already + // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to log etw (which can then be read by some agent that will send it to some endpoint) // for each resource usage observation it makes per specified metric. - if (IsEtwEnabled) + if (IsEtwEnabled && replicaOrInstance.ChildProcesses == null) { ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, @@ -745,13 +757,13 @@ public void ProcessResourceDataReportHealth( warningOrError = true; healthState = HealthState.Error; - // **Windows-only**. This is primarily useful for AppObserver, but makes sense to be - // part of the base class for future use, like for FSO. + // **Windows-only**. This is used by AppObserver, but makes sense to be + // part of the base class for future use, like for plugins that manage service processes. 
if (replicaOrInstance != null && dumpOnError && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { if (serviceDumpCountDictionary == null) { - serviceDumpCountDictionary = new Dictionary(5); + serviceDumpCountDictionary = new Dictionary(5); } try @@ -761,21 +773,33 @@ public void ProcessResourceDataReportHealth( using (var proc = Process.GetProcessById(pid)) { string procName = proc?.ProcessName; - - if (!serviceDumpCountDictionary.ContainsKey(procName)) + StringBuilder sb = new StringBuilder(data.Property); + string metricName = sb.Replace(" ", string.Empty) + .Replace("Total", string.Empty) + .Replace("MB", string.Empty) + .Replace("%", string.Empty) + .Replace("Active", string.Empty) + .Replace("TCP", string.Empty).ToString(); + sb.Clear(); + string dumpKey = $"{procName}_{metricName}"; + + if (!serviceDumpCountDictionary.ContainsKey(dumpKey)) { - serviceDumpCountDictionary.Add(procName, 0); + serviceDumpCountDictionary.Add(dumpKey, (0, DateTime.UtcNow)); + } + else if (DateTime.UtcNow.Subtract(serviceDumpCountDictionary[dumpKey].LastDumpDate) >= TimeSpan.FromDays(1)) + { + serviceDumpCountDictionary[dumpKey] = (0, DateTime.UtcNow); } - if (serviceDumpCountDictionary[procName] < MaxDumps) + if (serviceDumpCountDictionary[dumpKey].DumpCount < MaxDumps) { - // DumpServiceProcess defaults to a Full dump with - // process memory, handles and thread data. - bool success = DumpServiceProcessWindows(pid); + // DumpServiceProcess defaults to a Full dump with process memory, handles and thread data. 
+ bool success = DumpServiceProcessWindows(pid, DumpType.Full, Path.Combine(SFDumpsPath, procName), dumpKey); if (success) { - serviceDumpCountDictionary[procName]++; + serviceDumpCountDictionary[dumpKey] = (serviceDumpCountDictionary[dumpKey].DumpCount + 1, DateTime.UtcNow); } } } @@ -889,9 +913,9 @@ public void ProcessResourceDataReportHealth( var healthMessage = new StringBuilder(); string childProcMsg = string.Empty; - if (replicaOrInstance != null && replicaOrInstance.ChildProcessInfo != null) + if (replicaOrInstance != null && replicaOrInstance.ChildProcesses != null) { - childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcessInfo.Count}). " + + childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcesses.Count}). " + $"Their cumulative impact on {name}'s resource usage has been applied."; } @@ -910,7 +934,7 @@ public void ProcessResourceDataReportHealth( telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; - if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) + if (replicaOrInstance != null && !string.IsNullOrWhiteSpace(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } @@ -992,7 +1016,7 @@ public void ProcessResourceDataReportHealth( telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = FOErrorWarningCodes.Ok; - if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) + if (replicaOrInstance != null && !string.IsNullOrWhiteSpace(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } @@ -1114,38 +1138,47 @@ protected virtual void Dispose(bool disposing) } } - private void SetDefaultSfDumpPath() + private void SetDefaultSFWindowsDumpPath() { // This only needs to be set once. 
- if (string.IsNullOrEmpty(dumpsPath)) + if (string.IsNullOrWhiteSpace(SFDumpsPath)) { SFLogRoot = ServiceFabricConfiguration.Instance.FabricLogRoot; - if (!string.IsNullOrEmpty(SFLogRoot)) + if (string.IsNullOrWhiteSpace(SFLogRoot)) { - dumpsPath = Path.Combine(SFLogRoot, "CrashDumps"); + SFDumpsPath = null; + return; } } - if (Directory.Exists(dumpsPath)) + SFDumpsPath = Path.Combine(SFLogRoot, "ApplicationCrashDumps"); + + if (Directory.Exists(SFDumpsPath)) { return; } - try + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"Unable to locate dump directory {SFDumpsPath}. Trying another one..."); + + SFDumpsPath = Path.Combine(SFLogRoot, "CrashDumps"); + + if (Directory.Exists(SFDumpsPath)) { - _ = Directory.CreateDirectory(dumpsPath); + return; } - catch (IOException e) - { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Unable to create dumps directory:{Environment.NewLine}{e}"); - dumpsPath = null; - } + SFDumpsPath = null; + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"Unable to locate dump directory {SFDumpsPath}. Aborting. 
Will not generate application service dumps."); + return; } private void SetObserverConfiguration() @@ -1175,7 +1208,7 @@ private void SetObserverConfiguration() string telemetryProviderType = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.TelemetryProviderType); - if (string.IsNullOrEmpty(telemetryProviderType)) + if (string.IsNullOrWhiteSpace(telemetryProviderType)) { IsTelemetryProviderEnabled = false; @@ -1202,7 +1235,7 @@ private void SetObserverConfiguration() string logAnalyticsWorkspaceId = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.LogAnalyticsWorkspaceIdParameter); - if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) + if (string.IsNullOrWhiteSpace(logAnalyticsWorkspaceId) || string.IsNullOrWhiteSpace(logAnalyticsSharedKey)) { IsTelemetryProviderEnabled = false; return; @@ -1222,7 +1255,7 @@ private void SetObserverConfiguration() string aiKey = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.AiKey); - if (string.IsNullOrEmpty(aiKey)) + if (string.IsNullOrWhiteSpace(aiKey)) { IsTelemetryProviderEnabled = false; return; @@ -1266,7 +1299,7 @@ private void InitializeCsvLogger() string dataLogPath = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.DataLogPathParameter); - CsvFileLogger.BaseDataLogFolderPath = !string.IsNullOrEmpty(dataLogPath) ? Path.Combine(dataLogPath, ObserverName) : Path.Combine(Environment.CurrentDirectory, "fabric_observer_csvdata", ObserverName); + CsvFileLogger.BaseDataLogFolderPath = !string.IsNullOrWhiteSpace(dataLogPath) ? 
Path.Combine(dataLogPath, ObserverName) : Path.Combine(Environment.CurrentDirectory, "fabric_observer_csvdata", ObserverName); } private bool IsObserverWebApiAppInstalled() diff --git a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs index 8bb3ad5f..7bd3d904 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs @@ -13,11 +13,6 @@ public abstract class CpuUtilizationProvider : IDisposable { public abstract Task NextValueAsync(); - public void Dispose() - { - Dispose(disposing: true); - } - public static CpuUtilizationProvider Create() { if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) @@ -28,6 +23,11 @@ public static CpuUtilizationProvider Create() return new LinuxCpuUtilizationProvider(); } + public void Dispose() + { + Dispose(disposing: true); + } + protected abstract void Dispose(bool disposing); } } diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index e4b62b1f..41af9981 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -55,7 +55,7 @@ public sealed class ObserverConstants // AppObserver. 
public const string AppObserverName = "AppObserver"; public const string AppObserverConfigurationSectionName = "AppObserverConfiguration"; - public const string MaxChildProcessesToMonitorParameter = "MaxChildProcessesToMonitor"; + public const string MaxChildProcTelemetryDataCountParameter = "MaxChildProcTelemetryDataCount"; // Certificate Observer public const string CertificateObserverName = "CertificateObserver"; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index bc05841e..203563ff 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -8,6 +8,7 @@ using System.ComponentModel; using System.Diagnostics; using System.Fabric; +using System.Linq; using System.Management; namespace FabricObserver.Observers.Utilities @@ -19,6 +20,7 @@ public class WindowsProcessInfoProvider : ProcessInfoProvider const string FileHandlesCounterName = "Handle Count"; private readonly object memPerfCounterLock = new object(); private readonly object fileHandlesPerfCounterLock = new object(); + private const int MaxDescendants = 50; private PerformanceCounter memProcessPrivateWorkingSetCounter = new PerformanceCounter { CategoryName = ProcessCategoryName, @@ -47,7 +49,6 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." 
- Logger.LogWarning($"Handled Exception in GetProcessPrivateWorkingSetInMB: {e.Message}"); return 0F; } @@ -60,15 +61,12 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) } catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); - // Don't throw. return 0F; } catch (Exception e) { Logger.LogError($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); - throw; } } @@ -94,7 +92,6 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." - Logger.LogWarning($"Handled Exception in GetProcessAllocatedHandles: {e.Message}"); return -1F; } @@ -107,15 +104,12 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService } catch (Exception e) when (e is InvalidOperationException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{ProcessCategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); - // Don't throw. return -1F; } catch (Exception e) { Logger.LogError($"{ProcessCategoryName} {FileHandlesCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); - throw; } } @@ -136,8 +130,12 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return null; } - // Get descendent procs, max depth = 4. *Not* an optimal algo... This is fine. It is much better than increased StackOverflow exception potential - // due to recursive calls which are FailFast and will take FO down. Most services will never reach c3, let alone c4, anyway... 
+ if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + // Get descendant proc at max depth = 5 and max number of descendants = 50. for (int i = 0; i < childProcesses.Count; ++i) { List<(string procName, int pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].pid); @@ -146,7 +144,12 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService { childProcesses.AddRange(c1); - for (int j = 0; j < c1.Count; ++j) + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int j = 0; j < c1.Count; ++j) { List<(string procName, int pid)> c2 = TupleGetChildProcessInfo(c1[j].pid); @@ -154,6 +157,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService { childProcesses.AddRange(c2); + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + for (int k = 0; k < c2.Count; ++k) { List<(string procName, int pid)> c3 = TupleGetChildProcessInfo(c2[k].pid); @@ -162,6 +170,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService { childProcesses.AddRange(c3); + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + for (int l = 0; l < c3.Count; ++l) { List<(string procName, int pid)> c4 = TupleGetChildProcessInfo(c3[l].pid); @@ -169,6 +182,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService if (c4?.Count > 0) { childProcesses.AddRange(c4); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } } } } @@ -197,8 +215,6 @@ public float GetProcessPrivateWorkingSetInMB(string processName) } catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter handled 
error:{Environment.NewLine}{e}"); - // Don't throw. return 0F; } diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index a01a096b..4da22041 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -4,6 +4,10 @@ %PACKAGE_ID% 3.1.15 + This release adds support for process tree monitoring by AppObserver: + Any child process (and descendants at max depth = 4) launched by a service process that is being monitored by AppObserver will also be monitored and its resource usage will be added to the parent's for use in threshold violation checks for an observed (configured) metric. + Added support for new child process monitoring data in ETW, AppInsights and LogAnalytics telemetry provider impls. + Minor bug fix in AppObserver monitor duration logic. Microsoft MIT diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 2ab09808..c43c8756 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -13,6 +13,7 @@ using System.Fabric.Query; using System.IO; using System.Linq; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.MachineInfoModel; @@ -45,10 +46,10 @@ public class AppObserver : ObserverBase private string fileName; private readonly Stopwatch stopwatch; - public int MaxChildProcessesToMonitor + public int MaxChildProcTelemetryDataCount { get; set; - } = 15; + } public List ReplicaOrInstanceList { @@ -99,13 +100,13 @@ public override async Task ObserveAsync(CancellationToken token) return; } - // For child process monitoring. + // For child process reporting via telemetry. 
if (int.TryParse( GetSettingParameterValue( ConfigurationSectionName, - ObserverConstants.MaxChildProcessesToMonitorParameter), out int maxChildProcs)) + ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) { - MaxChildProcessesToMonitor = maxChildProcs; + MaxChildProcTelemetryDataCount = maxChildProcs; } await MonitorDeployedAppsAsync(token); @@ -144,7 +145,8 @@ public override Task ReportAsync(CancellationToken token) string processName = null; int processId = 0; ApplicationInfo app = null; - bool hasChildProcs = repOrInst.ChildProcessInfo != null; + bool hasChildProcs = repOrInst.ChildProcesses != null && MaxChildProcTelemetryDataCount > 0; + if (hasChildProcs) { childProcessTelemetryDataList = new List(); @@ -327,8 +329,8 @@ public override Task ReportAsync(CancellationToken token) app.DumpProcessOnError); } - // Child proc info telemetry.. - if (IsEtwEnabled && childProcessTelemetryDataList != null) + // Child proc info telemetry. + if (IsEtwEnabled && hasChildProcs) { var data = new { @@ -338,7 +340,7 @@ public override Task ReportAsync(CancellationToken token) ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); } - if (IsTelemetryEnabled && childProcessTelemetryDataList != null) + if (IsTelemetryEnabled && hasChildProcs) { _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); } @@ -383,7 +385,7 @@ private void ProcessChildProcs( ApplicationInfo app, CancellationToken token) where T : struct { - var childProcs = repOrInst.ChildProcessInfo; + var childProcs = repOrInst.ChildProcesses; if (childProcs == null || childProcs.Count == 0 || token.IsCancellationRequested) { @@ -418,23 +420,18 @@ private void ProcessChildProcs( var childFruds = fruds.Where(x => x.Id.Contains(childProcName)).ToList(); metric = childFruds[0].Property; - // re-order the list by data value so we can emit raw telemetry for the top 10. 
- var childFrudsOrdered = childFruds.OrderByDescending(x => x.AverageDataValue).ToList(); - - for (int j = 0; j < childFrudsOrdered.Count; ++j) + for (int j = 0; j < childFruds.Count; ++j) { token.ThrowIfCancellationRequested(); - var frud = childFrudsOrdered[j]; - sumValues += Math.Round(frud.AverageDataValue, 0); + var frud = childFruds[j]; + double value = frud.AverageDataValue; + sumValues += Math.Round(value, 0); if (IsEtwEnabled || IsTelemetryEnabled) { - if (j < MaxChildProcessesToMonitor) - { - var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = frud.AverageDataValue }; - childProcessInfoData.ChildProcessInfo.Add(childProcInfo); - } + var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = value }; + childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } if (frud.IsUnhealthy(app.CpuWarningLimitPercent) ||frud.IsUnhealthy(app.MemoryWarningLimitMb) || @@ -512,8 +509,6 @@ private void ProcessChildProcs( fruds.Remove(frud); } - childFrudsOrdered?.Clear(); - childFrudsOrdered = null; childFruds?.Clear(); childFruds = null; } @@ -529,6 +524,15 @@ private void ProcessChildProcs( } } + // Order List by Value descending. + childProcessInfoData.ChildProcessInfo = childProcessInfoData.ChildProcessInfo.OrderByDescending(v => v.Value).ToList(); + + // Cap size of List to MaxChildProcTelemetryDataCount. + if (childProcessInfoData.ChildProcessInfo.Count >= MaxChildProcTelemetryDataCount) + { + childProcessInfoData.ChildProcessInfo = childProcessInfoData.ChildProcessInfo.Take(MaxChildProcTelemetryDataCount).ToList(); + } + return (childProcessInfoData, sumValues); } @@ -668,7 +672,7 @@ private async Task InitializeAsync() existingAppConfig.NetworkWarningActivePorts = existingAppConfig.NetworkWarningActivePorts == 0 && application.NetworkWarningActivePorts > 0 ? 
application.NetworkWarningActivePorts : existingAppConfig.NetworkWarningActivePorts; existingAppConfig.NetworkErrorEphemeralPorts = existingAppConfig.NetworkErrorEphemeralPorts == 0 && application.NetworkErrorEphemeralPorts > 0 ? application.NetworkErrorEphemeralPorts : existingAppConfig.NetworkErrorEphemeralPorts; existingAppConfig.NetworkWarningEphemeralPorts = existingAppConfig.NetworkWarningEphemeralPorts == 0 && application.NetworkWarningEphemeralPorts > 0 ? application.NetworkWarningEphemeralPorts : existingAppConfig.NetworkWarningEphemeralPorts; - existingAppConfig.DumpProcessOnError = application.DumpProcessOnError != existingAppConfig.DumpProcessOnError ? application.DumpProcessOnError : existingAppConfig.DumpProcessOnError; + existingAppConfig.DumpProcessOnError = application.DumpProcessOnError == existingAppConfig.DumpProcessOnError ? application.DumpProcessOnError : existingAppConfig.DumpProcessOnError; existingAppConfig.ErrorOpenFileHandles = existingAppConfig.ErrorOpenFileHandles == 0 && application.ErrorOpenFileHandles > 0 ? application.ErrorOpenFileHandles : existingAppConfig.ErrorOpenFileHandles; existingAppConfig.WarningOpenFileHandles = existingAppConfig.WarningOpenFileHandles == 0 && application.WarningOpenFileHandles > 0 ? application.WarningOpenFileHandles : existingAppConfig.WarningOpenFileHandles; } @@ -751,7 +755,7 @@ private async Task InitializeAsync() ObserverName, HealthState.Warning, $"InitializeAsync() | {application.TargetApp}: Invalid TargetApp value. 
" + - $"Value must be a valid Uri string of format \"fabric:/MyApp\", for example."); + $"Value must be a valid Uri string of format \"fabric:/MyApp\" OR just \"MyApp\""); settingsFail++; continue; @@ -795,6 +799,7 @@ private async Task InitializeAsync() } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { + } } @@ -928,18 +933,15 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) (parentProc.ProcessName, parentProc.Id) }; - if (repOrInst.ChildProcessInfo != null && repOrInst.ChildProcessInfo.Count > 0) + if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) { - procTree.AddRange(repOrInst.ChildProcessInfo); + procTree.AddRange(repOrInst.ChildProcesses); } for (int j = 0; j < procTree.Count; ++j) { int procId = procTree[j].Pid; string procName = procTree[j].procName; - - - TimeSpan duration = TimeSpan.FromSeconds(1); if (MonitorDuration > TimeSpan.MinValue) @@ -953,21 +955,30 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) continue; } - /* Warm up counters. */ + /* Warm up Windows perf counters. */ if (checkCpu) { - _ = cpuUsage.GetCpuUsagePercentageProcess(procId); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = cpuUsage.GetCpuUsagePercentageProcess(procId); + } } if (checkHandles) { - _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + } } if (checkMemMb || checkMemPct) { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + } } // Monitor Duration applies to the code below. 
@@ -1179,6 +1190,9 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType).ToList(); + + appList.Clear(); + appList = null; } for (int i = 0; i < deployedApps.Count; ++i) @@ -1292,7 +1306,7 @@ private void SetInstanceOrReplicaMonitoringList( if (childPids != null && childPids.Count > 0) { - replicaInfo.ChildProcessInfo = childPids; + replicaInfo.ChildProcesses = childPids; } break; } @@ -1324,7 +1338,7 @@ private void SetInstanceOrReplicaMonitoringList( if (childProcs != null && childProcs.Count > 0) { - replicaInfo.ChildProcessInfo = childProcs; + replicaInfo.ChildProcesses = childProcs; } break; } diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index b32df6bd..919d5d45 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -825,9 +825,10 @@ private async Task GetProcessInfoAsync(string procName) if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) { // Warm up the perf counters. 
- _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); - float mem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); - allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + } } // Allocated Handles @@ -859,6 +860,13 @@ private async Task GetProcessInfoAsync(string procName) allCpuData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(cpu); } + // Mem + if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) + { + float mem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); + } + await Task.Delay(250, Token).ConfigureAwait(true); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 6eb1c8ed..b3e7da80 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -616,19 +616,6 @@ private async Task GetSystemCpuMemoryValuesAsync(CancellationToken token) try { - // Ports. - if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) - { - int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); - ActivePortsData.Data.Add(activePortCountTotal); - } - - if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) - { - int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); - EphemeralPortsData.Data.Add(ephemeralPortCountTotal); - } - // Firewall rules. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && FirewallData != null) { @@ -692,29 +679,44 @@ error on these conditions. 
} } - if (MemDataCommittedBytes != null && (MemErrorUsageThresholdMb > 0 || MemWarningUsageThresholdMb > 0)) - { - float committedMegaBytes = MemoryUsageProvider.Instance.GetCommittedBytes() / 1048576.0f; - MemDataCommittedBytes.Data.Add(committedMegaBytes); - } - - if (MemDataPercentUsed != null && (MemoryErrorLimitPercent > 0 || MemoryWarningLimitPercent > 0)) - { - MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); - } - timer.Start(); while (timer.Elapsed <= duration) { token.ThrowIfCancellationRequested(); + // CPU if (CpuTimeData != null && (CpuErrorUsageThresholdPct > 0 || CpuWarningUsageThresholdPct > 0)) { CpuTimeData.Data.Add(await cpuUtilizationProvider.NextValueAsync()); } - await Task.Delay(500, Token).ConfigureAwait(true); + // Memory + if (MemDataCommittedBytes != null && (MemErrorUsageThresholdMb > 0 || MemWarningUsageThresholdMb > 0)) + { + float committedMegaBytes = MemoryUsageProvider.Instance.GetCommittedBytes() / 1048576.0f; + MemDataCommittedBytes.Data.Add(committedMegaBytes); + } + + if (MemDataPercentUsed != null && (MemoryErrorLimitPercent > 0 || MemoryWarningLimitPercent > 0)) + { + MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); + } + + // Ports. + if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) + { + int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); + ActivePortsData.Data.Add(activePortCountTotal); + } + + if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) + { + int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); + EphemeralPortsData.Data.Add(ephemeralPortCountTotal); + } + + await Task.Delay(250, Token).ConfigureAwait(true); } timer.Stop(); @@ -733,6 +735,7 @@ error on these conditions. 
finally { cpuUtilizationProvider?.Dispose(); + cpuUtilizationProvider = null; } } diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index c421775f..77261f0b 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -143,10 +143,11 @@ the observer. The default value for capacity is 30 if you omit the ResourceUsageDataCapacity parameter or use an invalid value like 0 or a negative number (or omit the parameter altogether). --> - - + +
@@ -173,7 +174,6 @@ - diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 5110b294..a017de10 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -52,11 +52,10 @@ - - - + + - + @@ -74,8 +73,10 @@ - - + + @@ -152,7 +153,7 @@ - +
@@ -171,7 +172,6 @@ - @@ -265,9 +265,9 @@ - + + + + \ No newline at end of file From d4312c007110efa3f2cea340d91235d1d8216893 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Fri, 9 Jul 2021 17:03:11 -0700 Subject: [PATCH 09/14] FO 3.1.15 RC --- .../ApplicationPackageRoot/ApplicationManifest.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index a017de10..4f96a536 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -265,9 +265,9 @@ - + \ No newline at end of file From 52ab2314e7d82f346907d38c0a2e1a331df17ec5 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 12 Jul 2021 16:02:17 -0700 Subject: [PATCH 10/14] FO 3.1.15 --- Build-COSFPkgs.ps1 | 8 +-- ClusterObserver.nuspec.template | 4 +- ClusterObserver/ClusterObserver.cs | 17 +---- .../PackageRoot/ServiceManifest.xml | 6 +- .../Telemetry/AppInsightsTelemetry.cs | 13 +--- .../Utilities/Telemetry/TelemetryData.cs | 6 +- .../ApplicationManifest.xml | 4 +- Documentation/Observers.md | 33 +++++---- FabricObserver.Extensibility/ObserverBase.cs | 13 ++-- .../Utilities/ObserverConstants.cs | 1 + .../Telemetry/AppInsightsTelemetry.cs | 26 ++----- .../Telemetry/ChildProcessTelemetryData.cs | 2 +- .../Utilities/Telemetry/TelemetryData.cs | 6 +- FabricObserver.nuspec.template | 1 + FabricObserver/Observers/AppObserver.cs | 70 ++++++++++++------- .../Observers/CertificateObserver.cs | 1 - FabricObserver/Observers/OSObserver.cs | 1 - .../PackageRoot/Config/Settings.xml | 11 +-- .../ApplicationManifest.xml | 24 +++---- 19 files changed, 124 insertions(+), 123 deletions(-) diff --git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index 57e4c4ec..1d86e1ad 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location 
$scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.9" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.9" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.9" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.9" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.10" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index bf07f519..5d576d76 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,9 +2,9 @@ %PACKAGE_ID% - 2.1.9 + 2.1.10 - Code improvements. + Updated TelemetryData and ApplicationInsights impl to match FO 3.1.15's impls. 
Microsoft MIT diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index bc39da9c..e05d4275 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -402,8 +402,6 @@ private async Task ProcessApplicationHealthAsync(IList a // ETW. if (EtwEnabled) { - double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? val : -1; - Logger.EtwLogger?.Write( ObserverConstants.ClusterObserverETWEventName, new @@ -420,7 +418,7 @@ private async Task ProcessApplicationHealthAsync(IList a foTelemetryData.ProcessId, foTelemetryData.ReplicaId, foTelemetryData.SystemServiceProcessName, - Value = value + foTelemetryData.Value }); } @@ -544,7 +542,7 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt Metric = metric ?? "AggregatedClusterHealth", ObserverName = sourceObserver ?? string.Empty, Source = foStats != null ? foStats.Source : ObserverName, - Value = foStats != null ? foStats.Value : string.Empty + Value = foStats != null ? foStats.Value : 0 }; // Telemetry. @@ -557,13 +555,6 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt continue; } - double value = 0; - - if (foStats != null) - { - value = double.TryParse(foStats.Value?.ToString(), out double val) ? val : -1; - } - Logger.EtwLogger?.Write( ObserverConstants.ClusterObserverETWEventName, new @@ -576,7 +567,7 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt Metric = metric ?? "AggregatedClusterHealth", ObserverName = sourceObserver ?? string.Empty, Source = foStats != null ? foStats.Source : ObserverName, - Value = value + Value = foStats != null ? 
foStats.Value : 0 }); } } @@ -642,7 +633,6 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = $"{nodeDictItem.Key} is now Up.", Metric = "NodeStatus", NodeName = nodeDictItem.Key, - Value = "Up", Source = ObserverName }; @@ -716,7 +706,6 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = message, Metric = "NodeStatus", NodeName = kvp.Key, - Value = $"{kvp.Value.NodeStatus}", Source = ObserverName }; diff --git a/ClusterObserver/PackageRoot/ServiceManifest.xml b/ClusterObserver/PackageRoot/ServiceManifest.xml index afc53c50..8795ad20 100644 --- a/ClusterObserver/PackageRoot/ServiceManifest.xml +++ b/ClusterObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index 42566027..2e66eec3 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -122,13 +122,6 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { cancellationToken.ThrowIfCancellationRequested(); - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - Dictionary properties = new Dictionary { { "ClusterId", telemetryData.ClusterId ?? string.Empty }, @@ -136,13 +129,13 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "Application", telemetryData.ApplicationName ?? string.Empty }, { "Service", telemetryData.ServiceName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? 
string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "Partition", telemetryData.PartitionId }, - { "Replica", telemetryData.ReplicaId }, + { "Replica", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? string.Empty } diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs index c641e710..f79bc3e8 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs @@ -67,12 +67,12 @@ public string PartitionId get; set; } - public string ProcessId + public int ProcessId { get; set; } - public string ReplicaId + public long ReplicaId { get; set; } @@ -92,7 +92,7 @@ public string SystemServiceProcessName get; set; } - public object Value + public double Value { get; set; } diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index f26c8c7c..974750e2 100644 --- a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -15,7 +15,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. --> - + diff --git a/Documentation/Observers.md b/Documentation/Observers.md index ff6bc2b0..b0197180 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -43,19 +43,29 @@ For every other observer, it's XML as per usual. ## AppObserver Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application service processes and the child processes they spawn. 
If a service process creates child processes, then these processes will be monitored and their summed resource usage for some metric you are observing will be applied to the parent process (added) and a threshold breach will be determined based on the sum of children and parent resource usage. -This observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. +This observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. + +#### A note on child process monitoring + +AppObserver (FO version >= 3.1.15) will automatically monitor up to 50 process descendants of your primary service process (50 is extreme. You should not design services that own that many descendant processes.). If your services launch child processes, then AppObserver will automatically monitor them for the same metrics and thresholds you supply for the containing Application. +Their cumulative impact on some monitored metric will be added to that of the parent process (your service process) and this combined (sum) value will be used to determine health state based on supplied threshold for the related metric. + +You can disable this feature (you shouldn't if you **do** launch child processes from your service and they run for a while or for the lifetime of your service and compute (use resources)) by setting AppObserverEnableChildProcessMonitoring to false.
For telemetry, you can control how many offspring are present in the event data by setting AppObserverMaxChildProcTelemetryDataCount (default is 5). Both of these settings are located in ApplicationManifest.xml. +The AppObserverMaxChildProcTelemetryDataCount setting determines the size of the list used in family tree process data telemetry transmission, which corresponds to the size of the telemetry data event. You should keep this below 10. AppObserver will order the list of ChildProcessInfo (a member of ChildProcessTelemetryData) by resource usage value, from highest to lowest. + +In the vast majority of cases, your services are not going to launch 50 descendant processes, but FO is designed to support such an extreme edge case scenario, which frankly should not be in your service design playbook. Also note that if you do spawn a lot of child processes and +you have AppObserverMonitorDuration set to, say, 10 seconds, then you will be running AppObserver for n * 10 seconds, where n is the number of descendant processes plus the parent service process that owns them for each metric for each service with descendants. Please keep this in mind as you design your configuration. + +Finally, you can ignore this feature if you do not launch child processes from your services. Just disable it. This is important because, while the feature is enabled, AppObserver will run code that checks to see if some process id has children. If you know this is not the case, then save CPU cycles and disable the feature. + ### Input -JSON config file supplied by user, stored in -PackageRoot/Observers.Data folder. This data contains JSON arrays -objects which constitute Service Fabric Apps (identified by service -URI's). Users supply Error/Warning thresholds for CPU use, Memory use and Disk -IO, ports. Memory values are supplied as number of megabytes... CPU and -Disk Space values are provided as percentages (integers: so, 80 = 80%...)... +JSON config file supplied by user, stored in PackageRoot/Observers.Data folder.
This data contains JSON arrays +objects which constitute Service Fabric Apps (identified by service URI's). Users supply Error/Warning thresholds for CPU use, Memory use and Disk +IO, ports. Memory values are supplied as number of megabytes... CPU and Disk Space values are provided as percentages (integers: so, 80 = 80%...)... **Please note that you can omit any of these properties. You can also supply 0 as the value, which means that threshold will be ignored (they are not omitted below so you can see what a fully specified object looks like). -We recommend you omit all Error thresholds until you become more -comfortable with the behavior of your services and the side effects they have on machine resources**. +We recommend you omit all Error thresholds until you become more comfortable with the behavior of your services and the side effects they have on machine resources**. Example JSON config file located in **PackageRoot\\Config** folder (AppObserver.config.json). This is an example of a configuration that applies to all Service Fabric user (non-System) application service processes running on the virtual machine. @@ -87,7 +97,7 @@ All settings are optional, ***except target OR targetType***, and can be omitted | **memoryWarningLimitPercent** | Minimum percentage of memory used by an App's service process (integer) that should generate a Fabric Warning (SFX and local log) | | **cpuErrorLimitPercent** | Maximum CPU percentage that should generate a Fabric Error | | **cpuWarningLimitPercent** | Minimum CPU percentage that should generate a Fabric Warning | -| **dumpProcessOnError** | Instructs whether or not FabricObserver should dump your service process when service health is detected to be in an Error (critical) state... | +| **dumpProcessOnError** | Instructs whether or not FabricObserver should dump your service process when service health is detected to be in an Error (critical) state... 
| | **networkErrorActivePorts** | Maximum number of established TCP ports in use by app process that will generate a Fabric Error. | | **networkWarningActivePorts** | Minimum number of established TCP ports in use by app process that will generate a Fabric Warning. | | **networkErrorEphemeralPorts** | Maximum number of ephemeral TCP ports (within a dynamic port range) in use by app process that will generate a Fabric Error. | @@ -98,9 +108,8 @@ All settings are optional, ***except target OR targetType***, and can be omitted **Output** Log text(Error/Warning), Service Fabric Application Health Report (Error/Warning/Ok), ETW (EventSource), Telemetry (AppInsights/LogAnalytics) AppObserver also supports non-JSON parameters for configuration unrelated to thresholds. Like all observers these settings are located in ApplicationManifest.xml to support versionless configuration updates via application upgrade. -One of these settings is the maximum number of child processes to monitor (AppObserverMaxChildProcessesToMonitor). Note that there is a cap on generations. Up to 4 generations of child processes will be monitored. The resulting set will be ordered by resource usage from high to low for a given metric. This list size is determined by this setting. 
-Example SFX Output (Warning - Ephemeral Ports Usage): +Example AppObserver Output (Warning - Ephemeral Ports Usage): ![alt text](/Documentation/Images/AppObsWarn.png "AppObserver Warning output example.") diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index c8fdbcff..1b8ccf44 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -606,7 +606,8 @@ public void ProcessResourceDataReportHealth( string thresholdName = "Minimum"; bool warningOrError = false; - string name = string.Empty, id, drive = string.Empty, procId = string.Empty; + string name = string.Empty, id, drive = string.Empty; + int procId = 0; T threshold = thresholdWarning; HealthState healthState = HealthState.Ok; Uri appName = null; @@ -621,7 +622,7 @@ public void ProcessResourceDataReportHealth( appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = serviceName.OriginalString.Replace($"{appName.OriginalString}/", string.Empty); - procId = replicaOrInstance.HostProcessId.ToString(); + procId = (int)replicaOrInstance.HostProcessId; } else // System service report from FabricSystemObserver. { @@ -630,9 +631,9 @@ public void ProcessResourceDataReportHealth( try { - procId = Process.GetProcessesByName(name).First()?.Id.ToString(); + procId = (int)Process.GetProcessesByName(name).First()?.Id; } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is PlatformNotSupportedException) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is PlatformNotSupportedException || e is Win32Exception) { } @@ -652,7 +653,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ProcessId = procId, - ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), + ReplicaId = replicaOrInstance != null ? 
replicaOrInstance.ReplicaOrInstanceId : 0, ServiceName = serviceName?.OriginalString ?? string.Empty, SystemServiceProcessName = appName?.OriginalString == FabricSystemAppName ? name : string.Empty, Source = ObserverConstants.FabricObserverName @@ -695,7 +696,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ProcessId = procId, - ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), + ReplicaId = replicaOrInstance?.ReplicaOrInstanceId, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, SystemServiceProcessName = appName?.OriginalString == FabricSystemAppName ? name : string.Empty diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 41af9981..c55ad358 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -55,6 +55,7 @@ public sealed class ObserverConstants // AppObserver. 
public const string AppObserverName = "AppObserver"; public const string AppObserverConfigurationSectionName = "AppObserverConfiguration"; + public const string EnableChildProcessMonitoring = "EnableChildProcessMonitoring"; public const string MaxChildProcTelemetryDataCountParameter = "MaxChildProcTelemetryDataCount"; // Certificate Observer diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index de385434..550d607c 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -174,13 +174,6 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { cancellationToken.ThrowIfCancellationRequested(); - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - var properties = new Dictionary { { "ClusterId", telemetryData.ClusterId ?? string.Empty }, @@ -188,13 +181,13 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "PartitionId", telemetryData.PartitionId ?? string.Empty }, - { "ReplicaId", telemetryData.ReplicaId ?? 
string.Empty }, + { "ReplicaId", telemetryData.ReplicaId.ToString() }, { "ObserverName", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? string.Empty } @@ -249,13 +242,6 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can return Task.CompletedTask; } - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - try { var properties = new Dictionary @@ -263,12 +249,12 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "PartitionId", telemetryData.PartitionId }, - { "ReplicaId", telemetryData.ReplicaId }, + { "ReplicaId", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? 
string.Empty } diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs index 93767c75..56f3fd1e 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs @@ -12,7 +12,7 @@ public class ChildProcessTelemetryData public string ServiceName; public string Metric; public double Value; - public long ProcessId; + public int ProcessId; public string PartitionId; public string ReplicaId; public string NodeName; diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index 743cd065..0330a0c2 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -69,12 +69,12 @@ public string PartitionId get; set; } - public string ProcessId + public int ProcessId { get; set; } - public string ReplicaId + public long ReplicaId { get; set; } @@ -94,7 +94,7 @@ public string SystemServiceProcessName get; set; } - public object Value + public double Value { get; set; } diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 4da22041..4f0baada 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -8,6 +8,7 @@ Any child process (and descendants at max depth = 4) launched by a service process that is being monitored by AppObserver will also be monitored and its resource usage will be added to the parent's for use in threshold violation checks for an observed (configured) metric. Added support for new child process monitoring data in ETW, AppInsights and LogAnalytics telemetry provider impls. Minor bug fix in AppObserver monitor duration logic. 
+ Added lifetime management to DumpOnError feature (max 5 dumps per process per metric for 24 hour period. Then, new cycle) Microsoft MIT diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index c43c8756..60be4826 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -51,6 +51,11 @@ public int MaxChildProcTelemetryDataCount get; set; } + public bool EnableChildProcessMonitoring + { + get; set; + } + public List ReplicaOrInstanceList { get; set; @@ -100,15 +105,6 @@ public override async Task ObserveAsync(CancellationToken token) return; } - // For child process reporting via telemetry. - if (int.TryParse( - GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) - { - MaxChildProcTelemetryDataCount = maxChildProcs; - } - await MonitorDeployedAppsAsync(token); await ReportAsync(token); @@ -133,7 +129,7 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - // For use in family tree monitoring. + // For use in process family tree monitoring. List childProcessTelemetryDataList = null; TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); @@ -145,7 +141,7 @@ public override Task ReportAsync(CancellationToken token) string processName = null; int processId = 0; ApplicationInfo app = null; - bool hasChildProcs = repOrInst.ChildProcesses != null && MaxChildProcTelemetryDataCount > 0; + bool hasChildProcs = EnableChildProcessMonitoring && repOrInst.ChildProcesses != null; if (hasChildProcs) { @@ -330,7 +326,7 @@ public override Task ReportAsync(CancellationToken token) } // Child proc info telemetry. 
- if (IsEtwEnabled && hasChildProcs) + if (IsEtwEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) { var data = new { @@ -340,7 +336,7 @@ public override Task ReportAsync(CancellationToken token) ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); } - if (IsTelemetryEnabled && hasChildProcs) + if (IsTelemetryEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) { _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); } @@ -399,7 +395,7 @@ private void ProcessChildProcs( ApplicationName = repOrInst.ApplicationName.OriginalString, ServiceName = repOrInst.ServiceName.OriginalString, NodeName = NodeName, - ProcessId = repOrInst.HostProcessId, + ProcessId = (int)repOrInst.HostProcessId, PartitionId = repOrInst.PartitionId.ToString(), ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), ChildProcessCount = childProcs.Count, @@ -472,8 +468,8 @@ private void ProcessChildProcs( NodeName = NodeName, ObserverName = ObserverName, PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid.ToString(), - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ProcessId = childPid, + ReplicaId = repOrInst.ReplicaOrInstanceId, ServiceName = repOrInst.ServiceName.OriginalString, Source = ObserverConstants.FabricObserverName, Value = frud.AverageDataValue @@ -551,6 +547,24 @@ private async Task InitializeAsync() userTargetList = new List(); deployedTargetList = new List(); + /* For descendant proc monitoring */ + if (bool.TryParse( + GetSettingParameterValue( + ConfigurationSectionName, + ObserverConstants.EnableChildProcessMonitoring), out bool enableDescendantMonitoring)) + { + EnableChildProcessMonitoring = enableDescendantMonitoring; + } + + if (int.TryParse( + GetSettingParameterValue( + ConfigurationSectionName, + ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) + { + MaxChildProcTelemetryDataCount = maxChildProcs; + } + /* End descendant proc monitoring */ + 
configSettings.Initialize( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( ObserverConstants.ObserverConfigurationPackageName)?.Settings, @@ -1302,12 +1316,16 @@ private void SetInstanceOrReplicaMonitoringList( ServiceName = statefulReplica.ServiceName }; - var childPids = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statefulReplica.HostProcessId); - - if (childPids != null && childPids.Count > 0) + if (EnableChildProcessMonitoring) { - replicaInfo.ChildProcesses = childPids; + var childPids = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statefulReplica.HostProcessId); + + if (childPids != null && childPids.Count > 0) + { + replicaInfo.ChildProcesses = childPids; + } } + break; } case DeployedStatelessServiceInstance statelessInstance: @@ -1334,12 +1352,16 @@ private void SetInstanceOrReplicaMonitoringList( ServiceName = statelessInstance.ServiceName }; - var childProcs = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statelessInstance.HostProcessId); - - if (childProcs != null && childProcs.Count > 0) + if (EnableChildProcessMonitoring) { - replicaInfo.ChildProcesses = childProcs; + var childProcs = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statelessInstance.HostProcessId); + + if (childProcs != null && childProcs.Count > 0) + { + replicaInfo.ChildProcesses = childProcs; + } } + break; } } diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 1e2b687d..0d44aeac 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -250,7 +250,6 @@ public override async Task ReportAsync(CancellationToken token) ObserverName = ObserverName, OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 
"Windows" : "Linux", Source = ObserverConstants.FabricObserverName, - Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration) }; await TelemetryClient.ReportHealthAsync(telemetryData, Token); diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 9ec1748c..8f5151b8 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -254,7 +254,6 @@ public override async Task ReportAsync(CancellationToken token) Description = auServiceEnabledMessage, HealthState = "Warning", Metric = "WUAutoDownloadEnabled", - Value = isAUAutomaticDownloadEnabled, NodeName = NodeName, ObserverName = ObserverName, Source = ObserverConstants.FabricObserverName diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 77261f0b..50ada035 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -143,10 +143,11 @@ the observer. The default value for capacity is 30 if you omit the ResourceUsageDataCapacity parameter or use an invalid value like 0 or a negative number (or omit the parameter altogether). --> - + + +
@@ -285,7 +286,7 @@ [ObserverName]Configuration. Example: SampleNewObserverConfiguration where SampleNewObserver is the type name of the observer plugin. See the SampleObserverPlugin project for a complete example of implementing an observer plugin. --> - + - + @@ -37,7 +37,7 @@ - + @@ -51,7 +51,7 @@ - + @@ -65,7 +65,7 @@ - + @@ -73,9 +73,8 @@ - + + @@ -153,6 +152,7 @@ +
@@ -265,9 +265,9 @@ - + + \ No newline at end of file From ea77ec208de20991e41e305fbd3e821d370881f6 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 12 Jul 2021 16:02:56 -0700 Subject: [PATCH 11/14] FO 3.1.15 --- .../ApplicationPackageRoot/ApplicationManifest.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 963ea559..45ed329b 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -29,7 +29,7 @@ - + From a00ab33e22c0b86d9a7c7349903e8640e0bd5a93 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 12 Jul 2021 19:18:49 -0700 Subject: [PATCH 12/14] FO 3.1.15 RC2 (bug fix) --- FabricObserver/Observers/AppObserver.cs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 60be4826..7ef23485 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -430,11 +430,14 @@ private void ProcessChildProcs( childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } - if (frud.IsUnhealthy(app.CpuWarningLimitPercent) ||frud.IsUnhealthy(app.MemoryWarningLimitMb) || - frud.IsUnhealthy(app.MemoryWarningLimitPercent) || frud.IsUnhealthy(app.NetworkWarningEphemeralPorts) || - frud.IsUnhealthy(app.WarningOpenFileHandles)) + if ((frud.Property == ErrorWarningProperty.TotalCpuTime && frud.IsUnhealthy(app.CpuWarningLimitPercent)) || + (frud.Property == ErrorWarningProperty.TotalMemoryConsumptionMb && frud.IsUnhealthy(app.MemoryWarningLimitMb)) || + (frud.Property == ErrorWarningProperty.TotalMemoryConsumptionPct && frud.IsUnhealthy(app.MemoryWarningLimitPercent)) || + (frud.Property == ErrorWarningProperty.TotalActivePorts && frud.IsUnhealthy(app.NetworkWarningActivePorts)) 
|| + (frud.Property == ErrorWarningProperty.TotalEphemeralPorts && frud.IsUnhealthy(app.NetworkWarningEphemeralPorts)) || + (frud.Property == ErrorWarningProperty.TotalFileHandles && frud.IsUnhealthy(app.WarningOpenFileHandles))) { - if (IsEtwEnabled) + /*if (IsEtwEnabled) { var warningdata = new { @@ -476,7 +479,7 @@ private void ProcessChildProcs( }; _ = TelemetryClient?.ReportHealthAsync(telemWarnData, token); - } + }*/ // This provides information in SFX to help you understand that your App is in Warning because one of its services' child processes // is misbehaving. Now you know exactly which one you need to fix. @@ -486,7 +489,7 @@ private void ProcessChildProcs( Code = FOErrorWarningCodes.Ok, EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, HealthMessage = $"Note that service {repOrInst.ServiceName.OriginalString} spawned a child process, {childProcName}({childPid}), " + - $"that has exceeded your supplied threshold for {frud.Property} for Application {repOrInst.ApplicationName.OriginalString}.", + $"that has exceeded your supplied Warning or Error threshold for {frud.Property} for Application {repOrInst.ApplicationName.OriginalString}.", HealthReportTimeToLive = GetHealthReportTimeToLive(), ReportType = HealthReportType.Application, State = HealthState.Ok, From 2d4065df2e417d27f2b6bf19edb97ed60cde0075 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Jul 2021 12:11:49 -0700 Subject: [PATCH 13/14] FO 3.1.15 RTW --- FabricObserver/Observers/AppObserver.cs | 74 ------------------------- 1 file changed, 74 deletions(-) diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 7ef23485..640776c3 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -430,80 +430,6 @@ private void ProcessChildProcs( childProcessInfoData.ChildProcessInfo.Add(childProcInfo); } - if ((frud.Property == ErrorWarningProperty.TotalCpuTime && 
frud.IsUnhealthy(app.CpuWarningLimitPercent)) || - (frud.Property == ErrorWarningProperty.TotalMemoryConsumptionMb && frud.IsUnhealthy(app.MemoryWarningLimitMb)) || - (frud.Property == ErrorWarningProperty.TotalMemoryConsumptionPct && frud.IsUnhealthy(app.MemoryWarningLimitPercent)) || - (frud.Property == ErrorWarningProperty.TotalActivePorts && frud.IsUnhealthy(app.NetworkWarningActivePorts)) || - (frud.Property == ErrorWarningProperty.TotalEphemeralPorts && frud.IsUnhealthy(app.NetworkWarningEphemeralPorts)) || - (frud.Property == ErrorWarningProperty.TotalFileHandles && frud.IsUnhealthy(app.WarningOpenFileHandles))) - { - /*if (IsEtwEnabled) - { - var warningdata = new - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = "", - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", - HealthState = "Warning", - Metric = frud.Property, - NodeName, - ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid, - ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - Value = frud.AverageDataValue - }; - - ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, warningdata); - } - - if (IsTelemetryEnabled) - { - var telemWarnData = new TelemetryData(FabricClientInstance, token) - { - ApplicationName = repOrInst.ApplicationName.OriginalString, - Code = "", - Description = $"{repOrInst.ServiceName.OriginalString}: child process {childProcName} has exceeded supplied threshold for {frud.Property}.", - HealthState = "Warning", - Metric = frud.Property, - NodeName = NodeName, - ObserverName = ObserverName, - PartitionId = repOrInst.PartitionId.ToString(), - ProcessId = childPid, - ReplicaId = repOrInst.ReplicaOrInstanceId, - ServiceName = repOrInst.ServiceName.OriginalString, - Source = ObserverConstants.FabricObserverName, - 
Value = frud.AverageDataValue - }; - - _ = TelemetryClient?.ReportHealthAsync(telemWarnData, token); - }*/ - - // This provides information in SFX to help you understand that your App is in Warning because one of its services' child processes - // is misbehaving. Now you know exactly which one you need to fix. - var healthReport = new Utilities.HealthReport - { - AppName = repOrInst.ApplicationName, - Code = FOErrorWarningCodes.Ok, - EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, - HealthMessage = $"Note that service {repOrInst.ServiceName.OriginalString} spawned a child process, {childProcName}({childPid}), " + - $"that has exceeded your supplied Warning or Error threshold for {frud.Property} for Application {repOrInst.ApplicationName.OriginalString}.", - HealthReportTimeToLive = GetHealthReportTimeToLive(), - ReportType = HealthReportType.Application, - State = HealthState.Ok, - NodeName = NodeName, - Observer = ObserverName, - Property = $"{NodeName}_{frud.Id.Split(':')[0]}_{childProcName}", - ResourceUsageDataProperty = frud.Property, - SourceId = $"{ObserverName}({FOErrorWarningCodes.Ok})" - }; - - // Generate a Service Fabric Health Report. - HealthReporter.ReportHealthToServiceFabric(healthReport); - } - // Remove child FRUD from ref FRUD. fruds.Remove(frud); } From be367d1d8d4e8b4e6160e1685f2cde6a703d4879 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 13 Jul 2021 12:22:54 -0700 Subject: [PATCH 14/14] FO 3.1.15 RTW --- .../ApplicationPackageRoot/ApplicationManifest.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 45ed329b..c35b3467 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -73,7 +73,7 @@ - + @@ -265,9 +265,9 @@ - + \ No newline at end of file