diff --git a/.gitignore b/.gitignore index 42a0c818..226269ea 100644 --- a/.gitignore +++ b/.gitignore @@ -336,3 +336,6 @@ ASALocalRun/ /FabricObserver/observer_logs /FabricObserver/PackageRoot/Data/Plugins/SampleNewObserver.dll /nuget.exe +/FabricObserver/PackageRoot/Data/Plugins/ContainerObserver +/FabricObserver/PackageRoot/Data/Plugins/FabricObserverMdm +/FabricObserver/PackageRoot/Config/containerobserver.config.json diff --git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index 57e4c4ec..1d86e1ad 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.9" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.9" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.10" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.9" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.9" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.10" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.10" 
"$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index c450c32f..677e2c8f 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.14" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.14" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.14" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.14" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.15" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index bf07f519..5d576d76 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,9 +2,9 @@ %PACKAGE_ID% - 2.1.9 + 2.1.10 - Code 
improvements. + Updated TelemetryData and ApplicationInsights impl to match FO 3.1.15's impls. Microsoft MIT diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index fd2410e7..e05d4275 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -372,7 +372,7 @@ private async Task ProcessApplicationHealthAsync(IList a udText = $"in UD {udsInAppUpgrade.First(ud => ud > -1 && ud < int.MaxValue)}"; } - telemetryDescription += $"{appName} is upgrading {udText}.{Environment.NewLine}"; + telemetryDescription += $" Note: {appName} is upgrading {udText}.{Environment.NewLine}"; } } @@ -390,17 +390,18 @@ private async Task ProcessApplicationHealthAsync(IList a // From FabricObserver? if (foTelemetryData != null) { + foTelemetryData.Description += telemetryDescription; + // Telemetry. if (TelemetryEnabled && ObserverTelemetryClient != null) { + await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); } // ETW. if (EtwEnabled) { - double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? val : -1; - Logger.EtwLogger?.Write( ObserverConstants.ClusterObserverETWEventName, new @@ -417,7 +418,7 @@ private async Task ProcessApplicationHealthAsync(IList a foTelemetryData.ProcessId, foTelemetryData.ReplicaId, foTelemetryData.SystemServiceProcessName, - Value = value + foTelemetryData.Value }); } @@ -541,7 +542,7 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt Metric = metric ?? "AggregatedClusterHealth", ObserverName = sourceObserver ?? string.Empty, Source = foStats != null ? foStats.Source : ObserverName, - Value = foStats != null ? foStats.Value : string.Empty + Value = foStats != null ? foStats.Value : 0 }; // Telemetry. @@ -554,13 +555,6 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt continue; } - double value = 0; - - if (foStats != null) - { - value = double.TryParse(foStats.Value?.ToString(), out double val) ? 
val : -1; - } - Logger.EtwLogger?.Write( ObserverConstants.ClusterObserverETWEventName, new @@ -573,7 +567,7 @@ private async Task ProcessNodeHealthAsync(IEnumerable nodeHealt Metric = metric ?? "AggregatedClusterHealth", ObserverName = sourceObserver ?? string.Empty, Source = foStats != null ? foStats.Source : ObserverName, - Value = value + Value = foStats != null ? foStats.Value : 0 }); } } @@ -639,7 +633,6 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = $"{nodeDictItem.Key} is now Up.", Metric = "NodeStatus", NodeName = nodeDictItem.Key, - Value = "Up", Source = ObserverName }; @@ -713,7 +706,6 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) Description = message, Metric = "NodeStatus", NodeName = kvp.Key, - Value = $"{kvp.Value.NodeStatus}", Source = ObserverName }; diff --git a/ClusterObserver/PackageRoot/ServiceManifest.xml b/ClusterObserver/PackageRoot/ServiceManifest.xml index afc53c50..8795ad20 100644 --- a/ClusterObserver/PackageRoot/ServiceManifest.xml +++ b/ClusterObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index 42566027..2e66eec3 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -122,13 +122,6 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { cancellationToken.ThrowIfCancellationRequested(); - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - Dictionary properties = new Dictionary { { "ClusterId", telemetryData.ClusterId ?? string.Empty }, @@ -136,13 +129,13 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "Application", telemetryData.ApplicationName ?? 
string.Empty }, { "Service", telemetryData.ServiceName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "Partition", telemetryData.PartitionId }, - { "Replica", telemetryData.ReplicaId }, + { "Replica", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? string.Empty } diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs index 370ffe53..f79bc3e8 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs @@ -67,12 +67,12 @@ public string PartitionId get; set; } - public string ProcessId + public int ProcessId { get; set; } - public string ReplicaId + public long ReplicaId { get; set; } @@ -92,7 +92,7 @@ public string SystemServiceProcessName get; set; } - public object Value + public double Value { get; set; } @@ -100,9 +100,10 @@ public object Value [JsonConstructor] public TelemetryData() { + } - public TelemetryData( FabricClient fabricClient, CancellationToken cancellationToken) + public TelemetryData(FabricClient fabricClient, CancellationToken cancellationToken) { var (clusterId, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, cancellationToken).Result; ClusterId = clusterId; diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index f26c8c7c..974750e2 100644 --- 
a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -15,7 +15,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. --> - + diff --git a/Documentation/Observers.md b/Documentation/Observers.md index 6731003d..b0197180 100644 --- a/Documentation/Observers.md +++ b/Documentation/Observers.md @@ -19,7 +19,7 @@ Service Fabric Error Health Events can block upgrades and other important Fabric | Observer | Description | | :--- | :--- | -| [AppObserver](#appobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric Application services (processes). Alerts when user-supplied thresholds are reached. | +| [AppObserver](#appobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric Application services (processes) and their spawn (child processes). Alerts when user-supplied thresholds are reached. | | [CertificateObserver](#certificateobserver) | Monitors the expiration date of the cluster certificate and any other certificates provided by the user. Warns when close to expiration. | | [DiskObserver](#diskobserver) | Monitors, storage disk information like capacity and IO rates. Alerts when user-supplied thresholds are reached. | | [FabricSystemObserver](#fabricsystemobserver) | Monitors CPU usage, Memory use, and Disk space availability for Service Fabric System services (compare to AppObserver) | @@ -42,20 +42,30 @@ For every other observer, it's XML as per usual. ``` ## AppObserver -Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application services (processes). This -observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. 
Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. +Observer that monitors CPU usage, Memory use, and Port use for Service Fabric Application service processes and the child processes they spawn. If a service process creates child processes, then these processes will be monitored and their summed resource usage for some metric you are observing will be applied to the parent process (added) and a threshold breach will be determined based on the sum of children and parent resource usage. +This observer will alert (SF Health event) when user-supplied thresholds are reached. **Please note that this observer should not be used to monitor docker container applications. It is not designed for this task. Instead, please consider employing [ContainerObserver](https://github.com/GitTorre/ContainerObserver), which is designed specifically for container monitoring**. + +#### A note on child process monitoring + +AppObserver (FO version >= 3.1.15) will automatically monitor up to 50 process descendants of your primary service process (50 is extreme. You should not design services that own that many descendant processes..). If your services launch child processes, then AppObserver will automatically monitor them for the same metrics and thresholds you supply for the containing Application. +Their cumulative impact on some monitored metric will be added to that of the parent process (your service process) and this combined (sum) value will be used to determine health state based on supplied threshold for the related metric. + +You can disable this feature (you shouldn't if you **do** launch child processes from your service and they run for a while or for the lifetime of your service and compute (use resources)) by setting AppObserverEnableChildProcessMonitoring to false. 
For telemetry, you can control how many offspring are present in the event data by setting AppObserverMaxChildProcTelemetryDataCount (default is 5). Both of these settings are located in ApplicationManifest.xml. +The AppObserverMaxChildProcTelemetryDataCount setting determines the size of the list used in family tree process data telemetry transmission, which corresponds to the size of the telemetry data event. You should keep this below 10. AppObserver will order the list of ChildProcessInfo (a member of ChildProcessTelemetryData) by resource usage value, from highest to lowest. + +In the vast majority of cases, your services are not going to launch 50 descendant processes, but FO is designed to support such an extreme edge case scenario, which frankly should not be in your service design playbook. Also note that if you do spawn a lot of child processes and +you have AppObserverMonitorDuration set to, say, 10 seconds, then you will be running AppObserver for n * 10 seconds, where n is the number of descendant processes plus the parent service process that owns them for each metric for each service with descendants. Please keep this in mind as you design your configuration. + +Finally, you can ignore this feature if you do not launch child processes from your services. Just disable it. This is important because AppObserver will run code that checks to see if some process id has children. If you know this is not the case, then save CPU cycles and disable the feature. + ### Input -JSON config file supplied by user, stored in -PackageRoot/Observers.Data folder. This data contains JSON arrays -objects which constitute Service Fabric Apps (identified by service -URI's). Users supply Error/Warning thresholds for CPU use, Memory use and Disk -IO, ports. Memory values are supplied as number of megabytes... CPU and -Disk Space values are provided as percentages (integers: so, 80 = 80%...)... +JSON config file supplied by user, stored in PackageRoot/Observers.Data folder. 
This data contains JSON arrays +objects which constitute Service Fabric Apps (identified by service URI's). Users supply Error/Warning thresholds for CPU use, Memory use and Disk +IO, ports. Memory values are supplied as number of megabytes... CPU and Disk Space values are provided as percentages (integers: so, 80 = 80%...)... **Please note that you can omit any of these properties. You can also supply 0 as the value, which means that threshold will be ignored (they are not omitted below so you can see what a fully specified object looks like). -We recommend you omit all Error thresholds until you become more -comfortable with the behavior of your services and the side effects they have on machine resources**. +We recommend you omit all Error thresholds until you become more comfortable with the behavior of your services and the side effects they have on machine resources**. Example JSON config file located in **PackageRoot\\Config** folder (AppObserver.config.json). This is an example of a configuration that applies to all Service Fabric user (non-System) application service processes running on the virtual machine. @@ -77,6 +87,8 @@ All settings are optional, ***except target OR targetType***, and can be omitted | :--- | :--- | | **targetApp** | App URI string to observe. Optional (Required if targetType not specified). | | **targetAppType** | ApplicationType name (this is not a Uri format). FO will observe **all** app services belonging to it. Optional (Required if target not specified). | +| **appExcludeList** | This setting is only useful when targetApp is set to "*" or "All". A comma-separated list of app names (***URI format***) to ***exclude from observation***. Just omit the object or set value to "" to mean ***include all***. (excluding all does not make sense) | +| **appIncludeList** | This setting is only useful when targetApp is set to "*" or "All". A comma-separated list of app names (***URI format***) to ***include in observation***. 
Just omit the object or set value to "" to mean ***include all***. | | **serviceExcludeList** | A comma-separated list of service names (***not URI format***, just the service name as we already know the app name URI) to ***exclude from observation***. Just omit the object or set value to "" to mean ***include all***. (excluding all does not make sense) | | **serviceIncludeList** | A comma-separated list of service names (***not URI format***, just the service name as we already know the app name URI) to ***include in observation***. Just omit the object or set value to "" to mean ***include all***. | | **memoryErrorLimitMb** | Maximum service process private working set in Megabytes that should generate a Fabric Error (SFX and local log) | @@ -85,7 +97,7 @@ All settings are optional, ***except target OR targetType***, and can be omitted | **memoryWarningLimitPercent** | Minimum percentage of memory used by an App's service process (integer) that should generate a Fabric Warning (SFX and local log) | | **cpuErrorLimitPercent** | Maximum CPU percentage that should generate a Fabric Error | | **cpuWarningLimitPercent** | Minimum CPU percentage that should generate a Fabric Warning | -| **dumpProcessOnError** | Instructs whether or not FabricObserver should dump your service process when service health is detected to be in an Error (critical) state... | +| **dumpProcessOnError** | Instructs whether or not FabricObserver should dump your service process when service health is detected to be in an Error (critical) state... | | **networkErrorActivePorts** | Maximum number of established TCP ports in use by app process that will generate a Fabric Error. | | **networkWarningActivePorts** | Minimum number of established TCP ports in use by app process that will generate a Fabric Warning. | | **networkErrorEphemeralPorts** | Maximum number of ephemeral TCP ports (within a dynamic port range) in use by app process that will generate a Fabric Error. 
| @@ -95,7 +107,9 @@ All settings are optional, ***except target OR targetType***, and can be omitted **Output** Log text(Error/Warning), Service Fabric Application Health Report (Error/Warning/Ok), ETW (EventSource), Telemetry (AppInsights/LogAnalytics) -Example SFX Output (Warning - Ephemeral Ports Usage): +AppObserver also supports non-JSON parameters for configuration unrelated to thresholds. Like all observers these settings are located in ApplicationManifest.xml to support versionless configuration updates via application upgrade. + +Example AppObserver Output (Warning - Ephemeral Ports Usage): ![alt text](/Documentation/Images/AppObsWarn.png "AppObserver Warning output example.") diff --git a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs index f2ea5499..4da1ef3b 100644 --- a/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs +++ b/FabricObserver.Extensibility/Interfaces/ITelemetryProvider.cs @@ -113,6 +113,15 @@ Task ReportMetricAsync( MachineTelemetryData telemetryData, CancellationToken cancellationToken); + /// + /// Calls telemetry provider to report a metric. + /// + /// List of ChildProcessTelemetry. + /// CancellationToken instance. + Task ReportMetricAsync( + List telemetryData, + CancellationToken cancellationToken); + /// /// Calls telemetry provider to report a metric. 
/// diff --git a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs index a6b125a1..63406013 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ReplicaOrInstanceMonitoringInfo.cs @@ -4,6 +4,8 @@ // ------------------------------------------------------------ using System; +using System.Collections.Generic; +using System.Diagnostics; namespace FabricObserver.Observers.MachineInfoModel { @@ -48,5 +50,10 @@ public string ServicePackageActivationId { get; set; } + + public List<(string procName, int Pid)> ChildProcesses + { + get; set; + } } } diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 882d64d4..1b8ccf44 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -29,9 +29,9 @@ public abstract class ObserverBase : IObserver private const int TtlAddMinutes = 5; private const string FabricSystemAppName = "fabric:/System"; private const int MaxDumps = 5; - private Dictionary serviceDumpCountDictionary; + private Dictionary serviceDumpCountDictionary; private string SFLogRoot; - private string dumpsPath; + private string SFDumpsPath; private bool disposed; public string ObserverName @@ -310,7 +310,7 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel string logFolderBasePath; string observerLogPath = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.ObserverLogPathParameter); - if (!string.IsNullOrEmpty(observerLogPath)) + if (!string.IsNullOrWhiteSpace(observerLogPath)) { logFolderBasePath = observerLogPath; } @@ -325,9 +325,10 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel EnableETWLogging = IsEtwProviderEnabled }; - if 
(string.IsNullOrEmpty(dumpsPath)) + // Only supported on Windows (dump on error). + if (string.IsNullOrWhiteSpace(SFDumpsPath) && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - SetDefaultSfDumpPath(); + SetDefaultSFWindowsDumpPath(); } ConfigurationSettings = new ConfigSettings( @@ -401,7 +402,7 @@ public void WriteToLogWithLevel(string property, string description, LogLevel le /// parameter value. public string GetSettingParameterValue(string sectionName, string parameterName, string defaultValue = null) { - if (string.IsNullOrEmpty(sectionName) || string.IsNullOrEmpty(parameterName)) + if (string.IsNullOrWhiteSpace(sectionName) || string.IsNullOrWhiteSpace(parameterName)) { return null; } @@ -427,7 +428,7 @@ public string GetSettingParameterValue(string sectionName, string parameterName, string setting = serviceConfiguration.Settings.Sections[sectionName].Parameters[parameterName]?.Value; - if (string.IsNullOrEmpty(setting) && defaultValue != null) + if (string.IsNullOrWhiteSpace(setting) && defaultValue != null) { return defaultValue; } @@ -454,23 +455,22 @@ public void Dispose() /// /// Process id of the target process to dump. /// Optional: The type of dump to generate. Default is DumpType.Full. - /// Optional: The full path to store dump file. Default is %SFLogRoot%\CrashDumps + /// Optional: The full path to store dump file. Default is %SFLogRoot%\CrashDumps /// true or false if the operation succeeded. 
- public bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpType.Full, string filePath = null) + private bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpType.Full, string folderPath = null, string fileName = null) { if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { return false; } - if (string.IsNullOrEmpty(dumpsPath) && string.IsNullOrEmpty(filePath)) + if (string.IsNullOrWhiteSpace(SFDumpsPath) && string.IsNullOrWhiteSpace(folderPath)) { return false; } - string path = !string.IsNullOrEmpty(filePath) ? filePath : dumpsPath; - string processName = string.Empty; - + string path = !string.IsNullOrWhiteSpace(folderPath) ? folderPath : SFDumpsPath; + string processName = !string.IsNullOrWhiteSpace(fileName) ? fileName : string.Empty; NativeMethods.MINIDUMP_TYPE miniDumpType; switch (dumpType) @@ -503,33 +503,43 @@ public bool DumpServiceProcessWindows(int processId, DumpType dumpType = DumpTyp try { - // This is to ensure friendly-name of resulting dmp file. using (Process process = Process.GetProcessById(processId)) { - processName = process.ProcessName; - - if (string.IsNullOrEmpty(processName)) + if (processName == string.Empty) { - return false; + processName = process.ProcessName; } - processName += $"_{DateTime.Now:ddMMyyyyHHmmss}.dmp"; IntPtr processHandle = process.Handle; + processName += $"_{DateTime.Now:ddMMyyyyHHmmss}.dmp"; // Check disk space availability before writing dump file. 
string driveName = path.Substring(0, 2); - + if (DiskUsage.GetCurrentDiskSpaceUsedPercent(driveName) > 90) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Not enough disk space available for dump file creation."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Not enough disk space available for dump file creation."); return false; } + if (!Directory.Exists(path)) + { + try + { + Directory.CreateDirectory(path); + } + catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) + { + // Can't create directory in SF dumps folder, so dump into top level directory.. + path = SFDumpsPath; + } + } + using (FileStream file = File.Create(Path.Combine(path, processName))) { if (!NativeMethods.MiniDumpWriteDump( @@ -596,7 +606,8 @@ public void ProcessResourceDataReportHealth( string thresholdName = "Minimum"; bool warningOrError = false; - string name = string.Empty, id, drive = string.Empty, procId = string.Empty; + string name = string.Empty, id, drive = string.Empty; + int procId = 0; T threshold = thresholdWarning; HealthState healthState = HealthState.Ok; Uri appName = null; @@ -611,7 +622,7 @@ public void ProcessResourceDataReportHealth( appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; name = serviceName.OriginalString.Replace($"{appName.OriginalString}/", string.Empty); - procId = replicaOrInstance.HostProcessId.ToString(); + procId = (int)replicaOrInstance.HostProcessId; } else // System service report from FabricSystemObserver. 
{ @@ -620,9 +631,9 @@ public void ProcessResourceDataReportHealth( try { - procId = Process.GetProcessesByName(name).First()?.Id.ToString(); + procId = (int)Process.GetProcessesByName(name).First()?.Id; } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is PlatformNotSupportedException) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is PlatformNotSupportedException || e is Win32Exception) { } @@ -642,7 +653,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ProcessId = procId, - ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), + ReplicaId = replicaOrInstance != null ? replicaOrInstance.ReplicaOrInstanceId : 0, ServiceName = serviceName?.OriginalString ?? string.Empty, SystemServiceProcessName = appName?.OriginalString == FabricSystemAppName ? name : string.Empty, Source = ObserverConstants.FabricObserverName @@ -655,22 +666,24 @@ public void ProcessResourceDataReportHealth( } // Container - if (!string.IsNullOrEmpty(replicaOrInstance?.ContainerId)) + if (!string.IsNullOrWhiteSpace(replicaOrInstance?.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } - // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. + // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. If the process has children, then don't emit this raw data since it will already + // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to send data to ApplicationInsights or LogAnalytics for each resource usage observation it makes per specified metric. 
- if (IsTelemetryEnabled) + if (IsTelemetryEnabled && replicaOrInstance.ChildProcesses == null) { - _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token).ConfigureAwait(true); + _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token).ConfigureAwait(true); } - // ETW - This is informational, per reading EventSource tracing, healthstate is irrelevant here. + // ETW - This is informational, per reading EventSource tracing, healthstate is irrelevant here. If the process has children, then don't emit this raw data since it will already + // be contained in the ChildProcessTelemetry data instances and AppObserver will have already emitted it. // Enable this for your observer if you want to log etw (which can then be read by some agent that will send it to some endpoint) // for each resource usage observation it makes per specified metric. - if (IsEtwEnabled) + if (IsEtwEnabled && replicaOrInstance.ChildProcesses == null) { ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, @@ -683,7 +696,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ProcessId = procId, - ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), + ReplicaId = replicaOrInstance?.ReplicaOrInstanceId, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, SystemServiceProcessName = appName?.OriginalString == FabricSystemAppName ? name : string.Empty @@ -745,13 +758,13 @@ public void ProcessResourceDataReportHealth( warningOrError = true; healthState = HealthState.Error; - // **Windows-only**. This is primarily useful for AppObserver, but makes sense to be - // part of the base class for future use, like for FSO. + // **Windows-only**. This is used by AppObserver, but makes sense to be + // part of the base class for future use, like for plugins that manage service processes. 
if (replicaOrInstance != null && dumpOnError && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { if (serviceDumpCountDictionary == null) { - serviceDumpCountDictionary = new Dictionary(5); + serviceDumpCountDictionary = new Dictionary(5); } try @@ -761,21 +774,33 @@ public void ProcessResourceDataReportHealth( using (var proc = Process.GetProcessById(pid)) { string procName = proc?.ProcessName; - - if (!serviceDumpCountDictionary.ContainsKey(procName)) + StringBuilder sb = new StringBuilder(data.Property); + string metricName = sb.Replace(" ", string.Empty) + .Replace("Total", string.Empty) + .Replace("MB", string.Empty) + .Replace("%", string.Empty) + .Replace("Active", string.Empty) + .Replace("TCP", string.Empty).ToString(); + sb.Clear(); + string dumpKey = $"{procName}_{metricName}"; + + if (!serviceDumpCountDictionary.ContainsKey(dumpKey)) + { + serviceDumpCountDictionary.Add(dumpKey, (0, DateTime.UtcNow)); + } + else if (DateTime.UtcNow.Subtract(serviceDumpCountDictionary[dumpKey].LastDumpDate) >= TimeSpan.FromDays(1)) { - serviceDumpCountDictionary.Add(procName, 0); + serviceDumpCountDictionary[dumpKey] = (0, DateTime.UtcNow); } - if (serviceDumpCountDictionary[procName] < MaxDumps) + if (serviceDumpCountDictionary[dumpKey].DumpCount < MaxDumps) { - // DumpServiceProcess defaults to a Full dump with - // process memory, handles and thread data. - bool success = DumpServiceProcessWindows(pid); + // DumpServiceProcess defaults to a Full dump with process memory, handles and thread data. 
+ bool success = DumpServiceProcessWindows(pid, DumpType.Full, Path.Combine(SFDumpsPath, procName), dumpKey); if (success) { - serviceDumpCountDictionary[procName]++; + serviceDumpCountDictionary[dumpKey] = (serviceDumpCountDictionary[dumpKey].DumpCount + 1, DateTime.UtcNow); } } } @@ -887,9 +912,22 @@ public void ProcessResourceDataReportHealth( } var healthMessage = new StringBuilder(); + string childProcMsg = string.Empty; + + if (replicaOrInstance != null && replicaOrInstance.ChildProcesses != null) + { + childProcMsg = $"Note that {serviceName.OriginalString} has spawned one or more child processes ({replicaOrInstance.ChildProcesses.Count}). " + + $"Their cumulative impact on {name}'s resource usage has been applied."; + } _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); - _ = healthMessage.Append($" - {data.Property}: {Math.Round(data.AverageDataValue, 0)}{data.Units}"); + _ = healthMessage.Append($" - {data.Property}: {Math.Round(data.AverageDataValue, 0)}{data.Units} "); + + if (childProcMsg != string.Empty) + { + _ = healthMessage.AppendLine(); + _ = healthMessage.AppendLine(childProcMsg); + } // The health event description will be a serialized instance of telemetryData, // so it should be completely constructed (filled with data) regardless @@ -897,7 +935,7 @@ public void ProcessResourceDataReportHealth( telemetryData.ApplicationName = appName?.OriginalString ?? string.Empty; telemetryData.Code = errorWarningCode; - if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) + if (replicaOrInstance != null && !string.IsNullOrWhiteSpace(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } @@ -979,7 +1017,7 @@ public void ProcessResourceDataReportHealth( telemetryData.ApplicationName = appName?.OriginalString ?? 
string.Empty; telemetryData.Code = FOErrorWarningCodes.Ok; - if (replicaOrInstance != null && !string.IsNullOrEmpty(replicaOrInstance.ContainerId)) + if (replicaOrInstance != null && !string.IsNullOrWhiteSpace(replicaOrInstance.ContainerId)) { telemetryData.ContainerId = replicaOrInstance.ContainerId; } @@ -1101,38 +1139,47 @@ protected virtual void Dispose(bool disposing) } } - private void SetDefaultSfDumpPath() + private void SetDefaultSFWindowsDumpPath() { // This only needs to be set once. - if (string.IsNullOrEmpty(dumpsPath)) + if (string.IsNullOrWhiteSpace(SFDumpsPath)) { SFLogRoot = ServiceFabricConfiguration.Instance.FabricLogRoot; - if (!string.IsNullOrEmpty(SFLogRoot)) + if (string.IsNullOrWhiteSpace(SFLogRoot)) { - dumpsPath = Path.Combine(SFLogRoot, "CrashDumps"); + SFDumpsPath = null; + return; } } - if (Directory.Exists(dumpsPath)) + SFDumpsPath = Path.Combine(SFLogRoot, "ApplicationCrashDumps"); + + if (Directory.Exists(SFDumpsPath)) { return; } - try + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"Unable to locate dump directory {SFDumpsPath}. Trying another one..."); + + SFDumpsPath = Path.Combine(SFLogRoot, "CrashDumps"); + + if (Directory.Exists(SFDumpsPath)) { - _ = Directory.CreateDirectory(dumpsPath); + return; } - catch (IOException e) - { - HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Unable to create dumps directory:{Environment.NewLine}{e}"); - dumpsPath = null; - } + SFDumpsPath = null; + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"Unable to locate dump directory {SFDumpsPath}. Aborting. 
Will not generate application service dumps."); + return; } private void SetObserverConfiguration() @@ -1162,7 +1209,7 @@ private void SetObserverConfiguration() string telemetryProviderType = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.TelemetryProviderType); - if (string.IsNullOrEmpty(telemetryProviderType)) + if (string.IsNullOrWhiteSpace(telemetryProviderType)) { IsTelemetryProviderEnabled = false; @@ -1189,7 +1236,7 @@ private void SetObserverConfiguration() string logAnalyticsWorkspaceId = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.LogAnalyticsWorkspaceIdParameter); - if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) + if (string.IsNullOrWhiteSpace(logAnalyticsWorkspaceId) || string.IsNullOrWhiteSpace(logAnalyticsSharedKey)) { IsTelemetryProviderEnabled = false; return; @@ -1209,7 +1256,7 @@ private void SetObserverConfiguration() string aiKey = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.AiKey); - if (string.IsNullOrEmpty(aiKey)) + if (string.IsNullOrWhiteSpace(aiKey)) { IsTelemetryProviderEnabled = false; return; @@ -1253,7 +1300,7 @@ private void InitializeCsvLogger() string dataLogPath = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.DataLogPathParameter); - CsvFileLogger.BaseDataLogFolderPath = !string.IsNullOrEmpty(dataLogPath) ? Path.Combine(dataLogPath, ObserverName) : Path.Combine(Environment.CurrentDirectory, "fabric_observer_csvdata", ObserverName); + CsvFileLogger.BaseDataLogFolderPath = !string.IsNullOrWhiteSpace(dataLogPath) ? 
Path.Combine(dataLogPath, ObserverName) : Path.Combine(Environment.CurrentDirectory, "fabric_observer_csvdata", ObserverName); } private bool IsObserverWebApiAppInstalled() diff --git a/FabricObserver.Extensibility/Utilities/CpuUsage.cs b/FabricObserver.Extensibility/Utilities/CpuUsage.cs index f2587211..e7fa565c 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUsage.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUsage.cs @@ -4,6 +4,7 @@ // ------------------------------------------------------------ using System; +using System.ComponentModel; using System.Diagnostics; namespace FabricObserver.Observers.Utilities @@ -21,28 +22,38 @@ public class CpuUsage /// /// Target Process object /// CPU percentage in use as double value - public double GetCpuUsagePercentageProcess(Process p) + public double GetCpuUsagePercentageProcess(int procId) { - if (p == null || p.HasExited) + try { - return 0; - } + using (Process p = Process.GetProcessById(procId)) + { + if (p.HasExited) + { + return 0.0; + } - if (prevTime == DateTime.MinValue) - { - prevTime = DateTime.Now; - prevTotalProcessorTime = p.TotalProcessorTime; + if (prevTime == DateTime.MinValue) + { + prevTime = DateTime.Now; + prevTotalProcessorTime = p.TotalProcessorTime; + } + else + { + currentTimeTime = DateTime.Now; + currentTotalProcessorTime = p.TotalProcessorTime; + double currentUsage = (currentTotalProcessorTime.TotalMilliseconds - prevTotalProcessorTime.TotalMilliseconds) / currentTimeTime.Subtract(prevTime).TotalMilliseconds; + double cpuUsage = currentUsage / Environment.ProcessorCount; + prevTime = currentTimeTime; + prevTotalProcessorTime = currentTotalProcessorTime; + + return cpuUsage * 100.0; + } + } } - else + catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException || e is NotSupportedException) { - currentTimeTime = DateTime.Now; - currentTotalProcessorTime = p.TotalProcessorTime; - double currentUsage = 
(currentTotalProcessorTime.TotalMilliseconds - prevTotalProcessorTime.TotalMilliseconds) / currentTimeTime.Subtract(prevTime).TotalMilliseconds; - double cpuUsage = currentUsage / Environment.ProcessorCount; - prevTime = currentTimeTime; - prevTotalProcessorTime = currentTotalProcessorTime; - return cpuUsage * 100.0; } return 0.0; diff --git a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs index 8bb3ad5f..7bd3d904 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUtilization/CpuUtilizationProvider.cs @@ -13,11 +13,6 @@ public abstract class CpuUtilizationProvider : IDisposable { public abstract Task NextValueAsync(); - public void Dispose() - { - Dispose(disposing: true); - } - public static CpuUtilizationProvider Create() { if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) @@ -28,6 +23,11 @@ public static CpuUtilizationProvider Create() return new LinuxCpuUtilizationProvider(); } + public void Dispose() + { + Dispose(disposing: true); + } + protected abstract void Dispose(bool disposing); } } diff --git a/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs b/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs index 728fdfda..15ca2de9 100644 --- a/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs +++ b/FabricObserver.Extensibility/Utilities/ErrorWarningProperty.cs @@ -31,5 +31,8 @@ public sealed class ErrorWarningProperty // File Handles public const string TotalFileHandles = "Total Allocated File Handles"; public const string TotalFileHandlesPct = "Total Allocated File Handles %"; + + // Child procs + public const string ChildProcessCount = "Child Process Count"; } } diff --git a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs index 
e4afae77..8eda8028 100644 --- a/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs +++ b/FabricObserver.Extensibility/Utilities/FabricResourceUsageData.cs @@ -64,27 +64,27 @@ public FabricResourceUsageData( } /// - /// Gets the name of the machine resource property this instance represents. + /// Gets or sets the name of the machine resource property this instance represents. /// public string Property { - get; + get; set; } /// - /// Gets the unique Id of this instance. + /// Gets or sets the unique Id of this instance. /// public string Id { - get; + get; set; } /// - /// Gets the unit of measure for the data (%, MB/GB, etc). + /// Gets or sets the unit of measure for the data (%, MB/GB, etc). /// public string Units { - get; + get; set; } /// @@ -93,7 +93,7 @@ public string Units /// public IList Data { - get; + get; set; } private bool isInWarningState; diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 49e826f2..c55ad358 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -55,6 +55,8 @@ public sealed class ObserverConstants // AppObserver. 
public const string AppObserverName = "AppObserver"; public const string AppObserverConfigurationSectionName = "AppObserverConfiguration"; + public const string EnableChildProcessMonitoring = "EnableChildProcessMonitoring"; + public const string MaxChildProcTelemetryDataCountParameter = "MaxChildProcTelemetryDataCount"; // Certificate Observer public const string CertificateObserverName = "CertificateObserver"; diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs index cec02061..e33240ed 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs @@ -288,7 +288,7 @@ public override int GetTotalAllocatedFileHandlesCount() } // https://loune.net/2017/06/running-shell-bash-commands-in-net-core/ - internal static class LinuxShellHelper + public static class LinuxShellHelper { /// /// This string extension will run a supplied linux bash command and return the console output. 
diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 69bced84..396ccf51 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -39,28 +39,21 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - PropertyDataCollection.PropertyDataEnumerator propEnumerator = mObj.Properties.GetEnumerator(); + object visibleTotalObj = mObj.Properties["TotalVisibleMemorySize"].Value; + object freePhysicalObj = mObj.Properties["FreePhysicalMemory"].Value; - while (propEnumerator.MoveNext()) + if (visibleTotalObj == null || freePhysicalObj == null) { - PropertyData prop = propEnumerator.Current; - string name = prop.Name; - string value = prop.Value.ToString(); - - if (name.Contains("TotalVisible")) - { - visibleTotal = !string.IsNullOrWhiteSpace(value) ? long.Parse(value) : -1L; - } - else - { - freePhysical = !string.IsNullOrWhiteSpace(value) ? 
long.Parse(value) : -1L; - } + continue; } + + visibleTotal = Convert.ToInt64(visibleTotalObj); + freePhysical = Convert.ToInt64(freePhysicalObj); } } } - if (visibleTotal == -1L || freePhysical == -1L) + if (visibleTotal < 1) { return (-1L, -1); } @@ -197,7 +190,7 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) { win32OsInfo = new ManagementObjectSearcher( "SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory," + - "TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); + "TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); results = win32OsInfo.Get(); @@ -209,80 +202,48 @@ public override Task GetOSInfoAsync(CancellationToken cancellationToken) { using (ManagementObject mObj = (ManagementObject)enumerator.Current) { - PropertyDataCollection.PropertyDataEnumerator propEnumerator = mObj.Properties.GetEnumerator(); - - while (propEnumerator.MoveNext()) + object captionObj = mObj.Properties["Caption"].Value; + object versionObj = mObj.Properties["Version"].Value; + object statusObj = mObj.Properties["Status"].Value; + object osLanguageObj = mObj.Properties["OSLanguage"].Value; + object numProcsObj = mObj.Properties["NumberOfProcesses"].Value; + object freePhysicalObj = mObj.Properties["FreePhysicalMemory"].Value; + object freeVirtualTotalObj = mObj.Properties["FreeVirtualMemory"].Value; + object totalVirtualObj = mObj.Properties["TotalVirtualMemorySize"].Value; + object totalVisibleObj = mObj.Properties["TotalVisibleMemorySize"].Value; + object installDateObj = mObj.Properties["InstallDate"].Value; + object lastBootDateObj = mObj.Properties["LastBootUpTime"].Value; + + osInfo.Name = captionObj?.ToString(); + + if (int.TryParse(numProcsObj?.ToString(), out int numProcesses)) { - PropertyData prop = propEnumerator.Current; - string name = prop.Name; - string value = prop.Value?.ToString(); - - if 
(string.IsNullOrWhiteSpace(name) || string.IsNullOrWhiteSpace(value)) - { - continue; - } - - switch (name.ToLowerInvariant()) - { - case "caption": - osInfo.Name = value; - break; - - case "numberofprocesses": - if (int.TryParse(value, out int numProcesses)) - { - osInfo.NumberOfProcesses = numProcesses; - } - else - { - osInfo.NumberOfProcesses = -1; - } - - break; - - case "status": - osInfo.Status = value; - break; - - case "oslanguage": - osInfo.Language = value; - break; - - case "version": - osInfo.Version = value; - break; - - case "installdate": - osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - - case "lastbootuptime": - osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - - case "freephysicalmemory": - osInfo.FreePhysicalMemoryKB = ulong.Parse(value); - break; - - case "freevirtualmemory": - osInfo.FreeVirtualMemoryKB = ulong.Parse(value); - break; - - case "totalvirtualmemorysize": - osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); - break; - - case "totalvisiblememorysize": - osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); - break; - } + osInfo.NumberOfProcesses = numProcesses; } - } + else + { + osInfo.NumberOfProcesses = -1; + } + + osInfo.Status = statusObj?.ToString(); + osInfo.Language = osLanguageObj?.ToString(); + osInfo.Version = versionObj?.ToString(); + osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(installDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(lastBootDateObj?.ToString()).ToUniversalTime().ToString("o"); + osInfo.FreePhysicalMemoryKB = ulong.TryParse(freePhysicalObj?.ToString(), out ulong freePhysical) ? freePhysical : 0; + osInfo.FreeVirtualMemoryKB = ulong.TryParse(freeVirtualTotalObj?.ToString(), out ulong freeVirtual) ? 
freeVirtual : 0; + osInfo.TotalVirtualMemorySizeKB = ulong.TryParse(totalVirtualObj?.ToString(), out ulong totalVirtual) ? totalVirtual : 0; + osInfo.TotalVisibleMemorySizeKB = ulong.TryParse(totalVisibleObj?.ToString(), out ulong totalVisible) ? totalVisible : 0; + } } catch (ManagementException me) { Logger.LogInfo($"Handled ManagementException in GetOSInfoAsync retrieval:{Environment.NewLine}{me}"); } + catch (Exception e) + { + Logger.LogInfo($"Bug? => Exception in GetOSInfoAsync:{Environment.NewLine}{e}"); + } } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index 61e4ea43..41812b61 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -3,6 +3,8 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System.Collections.Generic; +using System.Diagnostics; using System.Fabric; namespace FabricObserver.Observers.Utilities @@ -18,5 +20,14 @@ public interface IProcessInfoProvider /// StatelessServiceContext instance. /// float value representing number of allocated file handles for the process. float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); + + /// + /// Returns a list of Process objects that are active descendants (e.g., children and grandchildren) of the provided Process object. 
+ /// + /// + /// + List<(string ProcName, int Pid)> GetChildProcessInfo(int processId); + + void Dispose(); } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs index ac8f9ec6..d9665905 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcessInfoProvider.cs @@ -3,6 +3,7 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System.Collections.Generic; using System.Diagnostics; using System.Fabric; @@ -64,5 +65,36 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return result; } + + public override List<(string ProcName, int Pid)> GetChildProcessInfo(int processId) + { + string pidCmdResult = $"ps -o pid= --ppid {processId}".Bash(); + string procNameCmdResult = $"ps -o comm= --ppid {processId}".Bash(); + List<(string procName, int Pid)> childProcesses = new List<(string procName, int Pid)>(); + + if (!string.IsNullOrWhiteSpace(pidCmdResult) && !string.IsNullOrWhiteSpace(procNameCmdResult)) + { + var sPids = pidCmdResult.Trim().Split('\n'); + var sProcNames = procNameCmdResult.Trim().Split('\n'); + + if (sPids?.Length > 0 && sProcNames.Length > 0) + { + for (int i = 0; i < sPids.Length; ++i) + { + if (int.TryParse(sPids[i], out int childProcId)) + { + childProcesses.Add((sProcNames[i], childProcId)); + } + } + } + } + + return childProcesses; + } + + protected override void Dispose(bool disposing) + { + // nothing to do here. 
+ } } } \ No newline at end of file diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index ca24a183..d5b63341 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -3,12 +3,14 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ +using System; +using System.Collections.Generic; using System.Fabric; using System.Runtime.InteropServices; namespace FabricObserver.Observers.Utilities { - public abstract class ProcessInfoProvider : IProcessInfoProvider + public abstract class ProcessInfoProvider : IProcessInfoProvider, IDisposable { private static IProcessInfoProvider instance; private static readonly object lockObj = new object(); @@ -39,6 +41,12 @@ public static IProcessInfoProvider Instance } } + public void Dispose() + { + Dispose(disposing: true); + instance = null; + } + protected Logger Logger { get; @@ -47,5 +55,9 @@ protected Logger Logger public abstract float GetProcessPrivateWorkingSetInMB(int processId); public abstract float GetProcessAllocatedHandles(int processId, StatelessServiceContext context); + + public abstract List<(string ProcName, int Pid)> GetChildProcessInfo(int processId); + + protected abstract void Dispose(bool disposing); } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index 099799ce..203563ff 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -4,21 +4,39 @@ // ------------------------------------------------------------ using System; +using 
System.Collections.Generic; using System.ComponentModel; using System.Diagnostics; using System.Fabric; +using System.Linq; +using System.Management; namespace FabricObserver.Observers.Utilities { public class WindowsProcessInfoProvider : ProcessInfoProvider { - const string CategoryName = "Process"; + const string ProcessCategoryName = "Process"; + const string WorkingSetCounterName = "Working Set - Private"; + const string FileHandlesCounterName = "Handle Count"; private readonly object memPerfCounterLock = new object(); private readonly object fileHandlesPerfCounterLock = new object(); + private const int MaxDescendants = 50; + private PerformanceCounter memProcessPrivateWorkingSetCounter = new PerformanceCounter + { + CategoryName = ProcessCategoryName, + CounterName = WorkingSetCounterName, + ReadOnly = true + }; + + private PerformanceCounter processFileHandleCounter = new PerformanceCounter + { + CategoryName = ProcessCategoryName, + CounterName = FileHandlesCounterName, + ReadOnly = true + }; public override float GetProcessPrivateWorkingSetInMB(int processId) { - const string WorkingSetCounterName = "Working Set - Private"; string processName; try @@ -31,43 +49,26 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." 
- Logger.LogWarning($"Handled Exception in GetProcessPrivateWorkingSetInMB: {e.Message}"); return 0F; } lock (memPerfCounterLock) { - PerformanceCounter memProcessPrivateWorkingSetCounter = null; - try { - memProcessPrivateWorkingSetCounter = new PerformanceCounter - { - CategoryName = CategoryName, - CounterName = WorkingSetCounterName, - InstanceName = processName - }; - + memProcessPrivateWorkingSetCounter.InstanceName = processName; return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); } catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); - // Don't throw. return 0F; } catch (Exception e) { - Logger.LogError($"{CategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); - + Logger.LogError($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); throw; } - finally - { - memProcessPrivateWorkingSetCounter?.Dispose(); - memProcessPrivateWorkingSetCounter = null; - } } } @@ -79,7 +80,6 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService return -1F; } - const string FileHandlesCounterName = "Handle Count"; string processName; try @@ -92,44 +92,207 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." 
- Logger.LogWarning($"Handled Exception in GetProcessAllocatedHandles: {e.Message}"); return -1F; } lock (fileHandlesPerfCounterLock) { - PerformanceCounter processFileHandleCounter = null; - try { - processFileHandleCounter = new PerformanceCounter - { - CategoryName = CategoryName, - CounterName = FileHandlesCounterName, - InstanceName = processName - }; - + processFileHandleCounter.InstanceName = processName; return processFileHandleCounter.NextValue(); } catch (Exception e) when (e is InvalidOperationException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogWarning($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); - // Don't throw. return -1F; } catch (Exception e) { - Logger.LogError($"{CategoryName} {FileHandlesCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); + Logger.LogError($"{ProcessCategoryName} {FileHandlesCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); + throw; + } + } + } + + public override List<(string ProcName, int Pid)> GetChildProcessInfo(int processId) + { + if (processId < 1) + { + return null; + } + + // Get child procs. + List<(string procName, int pid)> childProcesses = TupleGetChildProcessInfo(processId); + + if (childProcesses == null) + { + return null; + } + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + // Get descendant proc at max depth = 5 and max number of descendants = 50. 
+ for (int i = 0; i < childProcesses.Count; ++i) + { + List<(string procName, int pid)> c1 = TupleGetChildProcessInfo(childProcesses[i].pid); + + if (c1?.Count > 0) + { + childProcesses.AddRange(c1); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int j = 0; j < c1.Count; ++j) + { + List<(string procName, int pid)> c2 = TupleGetChildProcessInfo(c1[j].pid); + + if (c2?.Count > 0) + { + childProcesses.AddRange(c2); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int k = 0; k < c2.Count; ++k) + { + List<(string procName, int pid)> c3 = TupleGetChildProcessInfo(c2[k].pid); + + if (c3?.Count > 0) + { + childProcesses.AddRange(c3); + + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + + for (int l = 0; l < c3.Count; ++l) + { + List<(string procName, int pid)> c4 = TupleGetChildProcessInfo(c3[l].pid); + + if (c4?.Count > 0) + { + childProcesses.AddRange(c4); + if (childProcesses.Count >= MaxDescendants) + { + return childProcesses.Take(MaxDescendants).ToList(); + } + } + } + } + } + } + } + } + } + + return childProcesses; + } + + public float GetProcessPrivateWorkingSetInMB(string processName) + { + if (string.IsNullOrWhiteSpace(processName)) + { + return 0F; + } + + lock (memPerfCounterLock) + { + try + { + memProcessPrivateWorkingSetCounter.InstanceName = processName; + return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); + } + catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) + { + // Don't throw. 
+ return 0F; + } + catch (Exception e) + { + Logger.LogError($"{ProcessCategoryName} {WorkingSetCounterName} PerfCounter unhandled error:{Environment.NewLine}{e}"); throw; } - finally + } + } + + private List<(string procName, int pid)> TupleGetChildProcessInfo(int processId) + { + List<(string procName, int pid)> childProcesses = null; + string query = $"select caption,processid from win32_process where parentprocessid = {processId}"; + + try + { + using (var searcher = new ManagementObjectSearcher(query)) { - processFileHandleCounter?.Dispose(); - processFileHandleCounter = null; + var results = searcher.Get(); + + using (ManagementObjectCollection.ManagementObjectEnumerator enumerator = results.GetEnumerator()) + { + while (enumerator.MoveNext()) + { + try + { + using (ManagementObject mObj = (ManagementObject)enumerator.Current) + { + object childProcessIdObj = mObj.Properties["processid"].Value; + object childProcessNameObj = mObj.Properties["caption"].Value; + + if (childProcessIdObj == null || childProcessNameObj == null) + { + continue; + } + + if (childProcessNameObj.ToString() == "conhost.exe") + { + continue; + } + + if (childProcesses == null) + { + childProcesses = new List<(string procName, int pid)>(); + } + + int childProcessId = Convert.ToInt32(childProcessIdObj); + string procName = childProcessNameObj.ToString(); + + childProcesses.Add((procName, childProcessId)); + } + } + catch (Exception e) when (e is ArgumentException || e is ManagementException) + { + Logger.LogWarning($"[Inner try-catch (enumeration)] Handled Exception in GetChildProcesses: {e}"); + continue; + } + } + } } } + catch (ManagementException me) + { + Logger.LogWarning($"[Containing try-catch] Handled Exception in GetChildProcesses: {me}"); + } + + return childProcesses; + } + + protected override void Dispose(bool disposing) + { + this.memProcessPrivateWorkingSetCounter?.Dispose(); + this.memProcessPrivateWorkingSetCounter = null; + + this.processFileHandleCounter?.Dispose(); + 
this.processFileHandleCounter = null; } } } \ No newline at end of file diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index ddbca6b4..550d607c 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; +using System.Fabric; using System.Fabric.Health; using System.Globalization; using System.Runtime.InteropServices; @@ -14,6 +15,8 @@ using Microsoft.ApplicationInsights; using Microsoft.ApplicationInsights.DataContracts; using Microsoft.ApplicationInsights.Extensibility; +using Microsoft.ServiceFabric.TelemetryLib; +using Newtonsoft.Json; namespace FabricObserver.Observers.Utilities.Telemetry { @@ -171,13 +174,6 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { cancellationToken.ThrowIfCancellationRequested(); - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - var properties = new Dictionary { { "ClusterId", telemetryData.ClusterId ?? string.Empty }, @@ -185,13 +181,13 @@ public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken can { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "PartitionId", telemetryData.PartitionId ?? 
string.Empty }, - { "ReplicaId", telemetryData.ReplicaId ?? string.Empty }, + { "ReplicaId", telemetryData.ReplicaId.ToString() }, { "ObserverName", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? string.Empty } @@ -246,13 +242,6 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can return Task.CompletedTask; } - string value = null; - - if (telemetryData.Value != null) - { - value = telemetryData.Value.ToString(); - } - try { var properties = new Dictionary @@ -260,12 +249,12 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ApplicationName", telemetryData.ApplicationName ?? string.Empty }, { "ServiceName", telemetryData.ServiceName ?? string.Empty }, - { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId.ToString() }, { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "Value", value ?? string.Empty }, + { "Value", telemetryData.Value.ToString() }, { "PartitionId", telemetryData.PartitionId }, - { "ReplicaId", telemetryData.ReplicaId }, + { "ReplicaId", telemetryData.ReplicaId.ToString() }, { "Source", telemetryData.ObserverName }, { "NodeName", telemetryData.NodeName ?? string.Empty }, { "OS", telemetryData.OS ?? string.Empty } @@ -281,6 +270,61 @@ public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken can return Task.CompletedTask; } + /// + /// Reports metric for a collection (List) of ChildProcessTelemetryData instances. + /// + /// List of ChildProcessTelemetryData. 
+ /// Cancellation Token + /// + public Task ReportMetricAsync(List telemetryDataList, CancellationToken cancellationToken) + { + if (telemetryDataList == null || cancellationToken.IsCancellationRequested) + { + return Task.CompletedTask; + } + + string clusterid = string.Empty; + + using (FabricClient fabClient = new FabricClient()) + { + var (clusterId, _, _) = ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabClient, cancellationToken).Result; + clusterid = clusterId; + } + + string OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux"; + + foreach (var telemData in telemetryDataList) + { + try + { + var properties = new Dictionary + { + { "ClusterId", clusterid }, + { "ApplicationName", telemData.ApplicationName ?? string.Empty }, + { "ServiceName", telemData.ServiceName ?? string.Empty }, + { "ProcessId", telemData.ProcessId.ToString() }, + { "Metric", telemData.Metric ?? string.Empty }, + { "Value", telemData.Value.ToString() }, + { "ChildProcessCount", telemData.ChildProcessCount.ToString() }, + { "ChildProcessInfo", JsonConvert.SerializeObject(telemData.ChildProcessInfo) }, + { "PartitionId", telemData.PartitionId }, + { "ReplicaId", telemData.ReplicaId }, + { "Source", ObserverConstants.AppObserverName }, + { "NodeName", telemData.NodeName }, + { "OS", OS } + }; + + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } + } + + return Task.CompletedTask; + } + /// /// Reports a metric to a telemetry service. 
/// diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs new file mode 100644 index 00000000..66f86353 --- /dev/null +++ b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessInfo.cs @@ -0,0 +1,13 @@ +using Newtonsoft.Json; +using System.Diagnostics.Tracing; + +namespace FabricObserver.Observers.Utilities.Telemetry +{ + [EventData] + [JsonObject] + public class ChildProcessInfo + { + public string ProcessName; + public double Value; + } +} diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs new file mode 100644 index 00000000..56f3fd1e --- /dev/null +++ b/FabricObserver.Extensibility/Utilities/Telemetry/ChildProcessTelemetryData.cs @@ -0,0 +1,22 @@ +using Newtonsoft.Json; +using System.Collections.Generic; +using System.Diagnostics.Tracing; + +namespace FabricObserver.Observers.Utilities.Telemetry +{ + [EventData] + [JsonObject] + public class ChildProcessTelemetryData + { + public string ApplicationName; + public string ServiceName; + public string Metric; + public double Value; + public int ProcessId; + public string PartitionId; + public string ReplicaId; + public string NodeName; + public int ChildProcessCount; + public List ChildProcessInfo; + } +} diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs index fc72de27..3c1409e7 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -8,6 +8,7 @@ using System.Fabric; using System.Fabric.Health; using System.Net; +using System.Net.Sockets; using System.Runtime.InteropServices; using System.Security.Cryptography; using System.Text; @@ -26,7 +27,6 @@ public class 
LogAnalyticsTelemetry : ITelemetryProvider private readonly FabricClient fabricClient; private readonly CancellationToken token; private readonly Logger logger; - private int retries; private string WorkspaceId { @@ -100,7 +100,7 @@ public async Task ReportHealthAsync( public async Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (telemetryData == null) + if (telemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -111,7 +111,18 @@ public async Task ReportHealthAsync(TelemetryData telemetryData, CancellationTok public async Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (telemetryData == null) + if (telemetryData == null || cancellationToken.IsCancellationRequested) + { + return; + } + + string jsonPayload = JsonConvert.SerializeObject(telemetryData); + await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(true); + } + + public async Task ReportMetricAsync(List telemetryData, CancellationToken cancellationToken) + { + if (telemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -122,7 +133,7 @@ public async Task ReportMetricAsync(TelemetryData telemetryData, CancellationTok public async Task ReportMetricAsync(MachineTelemetryData machineTelemetryData, CancellationToken cancellationToken) { - if (machineTelemetryData == null) + if (machineTelemetryData == null || cancellationToken.IsCancellationRequested) { return; } @@ -226,11 +237,16 @@ public Task ReportMetricAsync( /// A completed task or task containing exception info. 
private async Task SendTelemetryAsync(string payload, CancellationToken cancellationToken) { - var requestUri = new Uri($"https://{WorkspaceId}.ods.opinsights.azure.com/api/logs?api-version={ApiVersion}"); + if (string.IsNullOrWhiteSpace(payload) || cancellationToken.IsCancellationRequested) + { + return; + } + + Uri requestUri = new Uri($"https://{WorkspaceId}.ods.opinsights.azure.com/api/logs?api-version={ApiVersion}"); string date = DateTime.UtcNow.ToString("r"); string signature = GetSignature("POST", payload.Length, "application/json", date, "/api/logs"); - - var request = (HttpWebRequest)WebRequest.Create(requestUri); + + HttpWebRequest request = (HttpWebRequest)WebRequest.Create(requestUri); request.ContentType = "application/json"; request.Method = "POST"; request.Headers["Log-Type"] = LogType; @@ -256,10 +272,8 @@ private async Task SendTelemetryAsync(string payload, CancellationToken cancella return; } - if (responseAsync != null && (responseAsync.StatusCode == HttpStatusCode.OK || - responseAsync.StatusCode == HttpStatusCode.Accepted)) + if (responseAsync != null && (responseAsync.StatusCode == HttpStatusCode.OK || responseAsync.StatusCode == HttpStatusCode.Accepted)) { - retries = 0; return; } @@ -271,37 +285,24 @@ private async Task SendTelemetryAsync(string payload, CancellationToken cancella } } } - catch (Exception e) + catch (Exception e) when (e is SocketException || e is WebException) { - // An Exception during telemetry data submission should never take down FO process. Log it. 
- logger.LogWarning($"Handled Exception in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{e}"); + logger.LogInfo($"Exception sending telemetry to LogAnalytics service:{Environment.NewLine}{e}"); } - - if (retries < MaxRetries) - { - if (cancellationToken.IsCancellationRequested) - { - return; - } - - retries++; - await Task.Delay(1000).ConfigureAwait(true); - await SendTelemetryAsync(payload, cancellationToken).ConfigureAwait(true); - } - else + catch (Exception e) { - // Exhausted retries. Reset counter. - logger.LogWarning($"Exhausted request retries in LogAnalyticsTelemetry.SendTelemetryAsync: {MaxRetries}. See logs for error details."); - retries = 0; + // Do not take down FO with a telemetry fault. Log it. Warning level will always log. + // This means there is either a bug in this code or something else that needs your attention.. + logger.LogWarning($"Unhandled exception sending telemetry to LogAnalytics service:{Environment.NewLine}{e}"); } } private string GetSignature( - string method, - int contentLength, - string contentType, - string date, - string resource) + string method, + int contentLength, + string contentType, + string date, + string resource) { string message = $"{method}\n{contentLength}\n{contentType}\nx-ms-date:{date}\n{resource}"; byte[] bytes = Encoding.UTF8.GetBytes(message); diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index 743cd065..0330a0c2 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -69,12 +69,12 @@ public string PartitionId get; set; } - public string ProcessId + public int ProcessId { get; set; } - public string ReplicaId + public long ReplicaId { get; set; } @@ -94,7 +94,7 @@ public string SystemServiceProcessName get; set; } - public object Value + public double Value { get; set; } diff --git 
a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 7d9ed2de..4f0baada 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,17 +2,13 @@ %PACKAGE_ID% - 3.1.14 + 3.1.15 - - Bug fix in FO plugin loader: native dependencies will not crash loader (and thus FO). - - For Plugin authors: You MUST place all plugin dependency libraries in the same folder as your plugin dll. - - Plugins (and their dependencies) can now live in child folders in the Plugins directory, which will keep things cleaner for folks with multiple plugins. - The Plugins folder/file structure MUST be: - Config/Data/Plugins/MyPlugin/MyPlugin.dll (required), MyPlugin.pdb (optional), [ALL of MyPlugin.dll's private dependencies] (required) - OR Config/Data/Plugins/MyPlugin.dll (required), MyPlugin.pdb(optional), [ALL of MyPlugin.dll's private dependencies] (required). - A private plugin dependency is any package that you reference in your plugin project that is not already referenced by FabricObserver. - So, things like Nuget packages or Project References or COM References that are only used by your plugin. It is important to stress that if a dependency dll has dependencies, - then you MUST also place those in the plugin's directory. + This release adds support for process tree monitoring by AppObserver: + Any child process (and descendants at max depth = 4) launched by a service process that is being monitored by AppObserver will also be monitored and its resource usage will be added to the parent's for use in threshold violation checks for an observed (configured) metric. + Added support for new child process monitoring data in ETW, AppInsights and LogAnalytics telemetry provider impls. + Minor bug fix in AppObserver monitor duration logic. + Added lifetime management to DumpOnError feature (max 5 dumps per process per metric for 24 hour period. 
Then, new cycle) Microsoft MIT diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index 88aff4ec..0c00d336 100644 --- a/FabricObserver/FabricObserver.cs +++ b/FabricObserver/FabricObserver.cs @@ -4,11 +4,11 @@ // ------------------------------------------------------------ using System; -using System.Collections.Generic; using System.Fabric; using System.IO; using System.Linq; using System.Reflection; +using System.Runtime.Loader; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers; @@ -89,19 +89,19 @@ private void LoadObserversFromPlugins(IServiceCollection services) return; } - var pluginLoaders = new List(pluginDlls.Length); + PluginLoader[] pluginLoaders = new PluginLoader[pluginDlls.Length]; Type[] sharedTypes = { typeof(FabricObserverStartupAttribute), typeof(IFabricObserverStartup), typeof(IServiceCollection) }; - foreach (string dll in pluginDlls) + for (int i = 0; i < pluginDlls.Length; ++i) { - // This does not create an Assembly. It creates a PluginLoader instance for each dll in the Plugins folder. - // TODO: Figure out how to only load the plugin dll in an efficient way. For now, this is fine. This is not resource intensive. - PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes); - pluginLoaders.Add(loader); + string dll = pluginDlls[i]; + PluginLoader loader = PluginLoader.CreateFromAssemblyFile(dll, sharedTypes, a => a.IsUnloadable = true); + pluginLoaders[i] = loader; } - foreach (PluginLoader pluginLoader in pluginLoaders) + for (int i = 0; i < pluginLoaders.Length; ++i) { + var pluginLoader = pluginLoaders[i]; Assembly pluginAssembly; try @@ -109,12 +109,11 @@ private void LoadObserversFromPlugins(IServiceCollection services) // If your plugin has native library dependencies (that's fine), then we will land in the catch (BadImageFormatException). // This is by design. The Managed FO plugin assembly will successfully load, of course. 
pluginAssembly = pluginLoader.LoadDefaultAssembly(); - FabricObserverStartupAttribute[] startupAttributes = pluginAssembly.GetCustomAttributes().ToArray(); - for (int i = 0; i < startupAttributes.Length; ++i) + for (int j = 0; j < startupAttributes.Length; ++j) { - object startupObject = Activator.CreateInstance(startupAttributes[i].StartupType); + object startupObject = Activator.CreateInstance(startupAttributes[j].StartupType); if (startupObject is IFabricObserverStartup fabricObserverStartup) { @@ -123,17 +122,14 @@ private void LoadObserversFromPlugins(IServiceCollection services) else { // This will bring down FO, which it should: This means your plugin is not supported. Fix your bug. - throw new InvalidOperationException($"{startupAttributes[i].StartupType.FullName} must implement IFabricObserverStartup."); + throw new InvalidOperationException($"{startupAttributes[j].StartupType.FullName} must implement IFabricObserverStartup."); } } } - catch (BadImageFormatException) - { - continue; - } - finally + catch (Exception e) when (e is ArgumentException || e is BadImageFormatException || e is IOException) { pluginLoader?.Dispose(); + continue; } } } diff --git a/FabricObserver/FabricObserver.csproj b/FabricObserver/FabricObserver.csproj index 75df3c28..0166967c 100644 --- a/FabricObserver/FabricObserver.csproj +++ b/FabricObserver/FabricObserver.csproj @@ -12,11 +12,11 @@ linux-x64;win-x64 - 3.1.11.0 + 3.1.15.0 Copyright © 2020 FabricObserver Service Fabric Observer - 3.1.11 + 3.1.15 true true FabricObserver.Program diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index efb826fd..640776c3 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -13,18 +13,19 @@ using System.Fabric.Query; using System.IO; using System.Linq; +using System.Runtime.InteropServices; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.MachineInfoModel; using 
FabricObserver.Observers.Utilities; +using FabricObserver.Observers.Utilities.Telemetry; +using Newtonsoft.Json; using ConfigSettings = FabricObserver.Observers.MachineInfoModel.ConfigSettings; namespace FabricObserver.Observers { - // This observer monitors the behavior of user SF service processes - // and signals Warning and Error based on user-supplied resource thresholds - // in AppObserver.config.json - // Health Report processor will also emit ETW telemetry if configured in Settings.xml. + // This observer monitors the behavior of user SF service processes (and their children) and signals Warning and Error based on user-supplied resource thresholds + // in AppObserver.config.json. This observer will also emit telemetry (ETW, LogAnalytics/AppInsights) if enabled in Settings.xml (ObserverManagerConfiguration) and ApplicationManifest.xml (AppObserverEnableEtw). public class AppObserver : ObserverBase { // Health Report data containers - For use in analysis to determine health state. @@ -35,7 +36,6 @@ public class AppObserver : ObserverBase private List> AllAppTotalActivePortsData; private List> AllAppEphemeralPortsData; private List> AllAppHandlesData; - private readonly Stopwatch stopwatch; // userTargetList is the list of ApplicationInfo objects representing app/app types supplied in configuration. 
private List userTargetList; @@ -44,6 +44,17 @@ public class AppObserver : ObserverBase private List deployedTargetList; private readonly ConfigSettings configSettings; private string fileName; + private readonly Stopwatch stopwatch; + + public int MaxChildProcTelemetryDataCount + { + get; set; + } + + public bool EnableChildProcessMonitoring + { + get; set; + } public List ReplicaOrInstanceList { @@ -85,7 +96,8 @@ public override async Task ObserveAsync(CancellationToken token) FabricServiceContext.ServiceName.OriginalString, ObserverName, HealthState.Warning, - "This observer was unable to initialize correctly due to missing configuration info."); + "AppObserver was unable to initialize correctly due to misconfiguration. " + + "Please check your AppObserver configuration settings."); stopwatch.Stop(); stopwatch.Reset(); @@ -93,192 +105,360 @@ public override async Task ObserveAsync(CancellationToken token) return; } - await MonitorDeployedAppsAsync(token).ConfigureAwait(true); - await ReportAsync(token).ConfigureAwait(true); + await MonitorDeployedAppsAsync(token); + await ReportAsync(token); // The time it took to run this observer. stopwatch.Stop(); CleanUp(); RunDuration = stopwatch.Elapsed; - + if (EnableVerboseLogging) { ObserverLogger.LogInfo($"Run Duration: {RunDuration}"); } stopwatch.Reset(); - LastRunDateTime = DateTime.Now; } public override Task ReportAsync(CancellationToken token) { - token.ThrowIfCancellationRequested(); - if (deployedTargetList.Count == 0) { return Task.CompletedTask; } - var healthReportTimeToLive = GetHealthReportTimeToLive(); + // For use in process family tree monitoring. + List childProcessTelemetryDataList = null; + TimeSpan healthReportTimeToLive = GetHealthReportTimeToLive(); - // App-specific reporting. - foreach (var app in deployedTargetList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { token.ThrowIfCancellationRequested(); - // Process data for reporting. 
- foreach (var repOrInst in ReplicaOrInstanceList) + var repOrInst = ReplicaOrInstanceList[i]; + string processName = null; + int processId = 0; + ApplicationInfo app = null; + bool hasChildProcs = EnableChildProcessMonitoring && repOrInst.ChildProcesses != null; + + if (hasChildProcs) + { + childProcessTelemetryDataList = new List(); + } + + try { - token.ThrowIfCancellationRequested(); + app = deployedTargetList.Find( + a => a.TargetApp == repOrInst.ApplicationName.OriginalString || a.TargetAppType == repOrInst.ApplicationTypeName); + + using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); - if (!string.IsNullOrWhiteSpace(app.TargetAppType) - && !string.Equals( - repOrInst.ApplicationTypeName, - app.TargetAppType, - StringComparison.CurrentCultureIgnoreCase)) + // If the process is no longer running, then don't report on it. + if (p.HasExited) { continue; } - if (!string.IsNullOrWhiteSpace(app.TargetApp) - && !string.Equals( - repOrInst.ApplicationName.OriginalString, - app.TargetApp, - StringComparison.CurrentCultureIgnoreCase)) + processName = p.ProcessName; + processId = p.Id; + } + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + ObserverLogger.LogWarning($"Handled Exception in ReportAsync:{Environment.NewLine}{e}"); + continue; + } + + string appNameOrType = GetAppNameOrType(repOrInst); + var id = $"{appNameOrType}:{processName}"; + + // Locally Log (csv) CPU/Mem/FileHandles/Ports per app service process. + if (EnableCsvLogging) + { + // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. + // Please use ContainerObserver for SF container app service monitoring. + if (processName == "Fabric") { continue; } - string processName = null; - int processId = 0; + fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? 
"_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - try + // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. + // This means a new folder will be added to the base path. + if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) { - using Process p = Process.GetProcessById((int)repOrInst.HostProcessId); + CsvFileLogger.DataLogFolder = processName; + } - // If the process is no longer running, then don't report on it. - if (p.HasExited) - { - continue; - } + // Log pid.. + CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); - processName = p.ProcessName; - processId = p.Id; - } - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + // Log resource usage data to CSV files. + LogAllAppResourceDataToCsv(id); + } + + // CPU - Parent process + if (AllAppCpuData.Any(x => x.Id == id)) + { + var parentFrud = AllAppCpuData.FirstOrDefault(x => x.Id == id); + + if (hasChildProcs) { - continue; + ProcessChildProcs(ref AllAppCpuData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } - string appNameOrType = GetAppNameOrType(repOrInst); + // Parent's and aggregated (summed) spawned process data (if any). + ProcessResourceDataReportHealth( + parentFrud, + app.CpuErrorLimitPercent, + app.CpuWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } - var id = $"{appNameOrType}:{processName}"; + // Memory MB - Parent process + if (AllAppMemDataMb.Any(x => x.Id == id)) + { + var parentFrud = AllAppMemDataMb.FirstOrDefault(x => x.Id == id); - // Locally Log (csv) CPU/Mem/FileHandles/Ports per app service process. - if (EnableCsvLogging) + if (hasChildProcs) { - // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. - // Please use ContainerObserver for SF container app service monitoring. 
- if (processName == "Fabric") - { - continue; - } - - fileName = $"{processName}{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? "_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; - - // BaseLogDataLogFolderPath is set in ObserverBase or a default one is created by CsvFileLogger. - // This means a new folder will be added to the base path. - if (CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) - { - CsvFileLogger.DataLogFolder = processName; - } + ProcessChildProcs(ref AllAppMemDataMb, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); + } - // Log pid.. - CsvFileLogger.LogData(fileName, id, "ProcessId", "", processId); + // Parent's and aggregated (summed) spawned process data (if any). + ProcessResourceDataReportHealth( + parentFrud, + app.MemoryErrorLimitMb, + app.MemoryWarningLimitMb, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } - // Log resource usage data to CSV files. - LogAllAppResourceDataToCsv(id); - } + // Memory Percent - Parent process + if (AllAppMemDataPercent.Any(x => x.Id == id)) + { + var parentFrud = AllAppMemDataPercent.FirstOrDefault(x => x.Id == id); - // CPU - if (AllAppCpuData.Any(x => x.Id == id)) + if (hasChildProcs) { - ProcessResourceDataReportHealth( - AllAppCpuData.FirstOrDefault(x => x.Id == id), - app.CpuErrorLimitPercent, - app.CpuWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + ProcessChildProcs(ref AllAppMemDataPercent, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } - // Memory MB - if (AllAppMemDataMb.Any(x => x.Id == id)) + // Parent's and aggregated (summed) spawned process data (if any). 
+ ProcessResourceDataReportHealth( + parentFrud, + app.MemoryErrorLimitPercent, + app.MemoryWarningLimitPercent, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // TCP Ports - Active - Parent process + if (AllAppTotalActivePortsData.Any(x => x.Id == id)) + { + var parentFrud = AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id); + + if (hasChildProcs) { - ProcessResourceDataReportHealth( - AllAppMemDataMb.FirstOrDefault(x => x.Id == id), - app.MemoryErrorLimitMb, - app.MemoryWarningLimitMb, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + ProcessChildProcs(ref AllAppTotalActivePortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } - // Memory Percent - if (AllAppMemDataPercent.Any(x => x.Id == id)) + // Parent's and aggregated (summed) spawned process data (if any). + ProcessResourceDataReportHealth( + parentFrud, + app.NetworkErrorActivePorts, + app.NetworkWarningActivePorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - Parent process + if (AllAppEphemeralPortsData.Any(x => x.Id == id)) + { + var parentFrud = AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id); + + if (hasChildProcs) { - ProcessResourceDataReportHealth( - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id), - app.MemoryErrorLimitPercent, - app.MemoryWarningLimitPercent, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst, - app.DumpProcessOnError); + ProcessChildProcs(ref AllAppEphemeralPortsData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } - // TCP Ports - Active - if (AllAppTotalActivePortsData.Any(x => x.Id == id)) + // Parent's and aggregated (summed) spawned process data (if any). 
+ ProcessResourceDataReportHealth( + parentFrud, + app.NetworkErrorEphemeralPorts, + app.NetworkWarningEphemeralPorts, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // Allocated (in use) Handles - Parent process + if (AllAppHandlesData.Any(x => x.Id == id)) + { + var parentFrud = AllAppHandlesData.FirstOrDefault(x => x.Id == id); + + if (hasChildProcs) { - ProcessResourceDataReportHealth( - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id), - app.NetworkErrorActivePorts, - app.NetworkWarningActivePorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst); + ProcessChildProcs(ref AllAppHandlesData, ref childProcessTelemetryDataList, repOrInst, app, ref parentFrud, token); } - // TCP Ports - Ephemeral (port numbers fall in the dynamic range) - if (AllAppEphemeralPortsData.Any(x => x.Id == id)) + // Parent's and aggregated (summed) spawned process data (if any). + ProcessResourceDataReportHealth( + parentFrud, + app.ErrorOpenFileHandles, + app.WarningOpenFileHandles, + healthReportTimeToLive, + HealthReportType.Application, + repOrInst, + app.DumpProcessOnError); + } + + // Child proc info telemetry. 
+ if (IsEtwEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) + { + var data = new { - ProcessResourceDataReportHealth( - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id), - app.NetworkErrorEphemeralPorts, - app.NetworkWarningEphemeralPorts, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst); - } + ChildProcessTelemetryData = JsonConvert.SerializeObject(childProcessTelemetryDataList) + }; + + ObserverLogger.LogEtw(ObserverConstants.FabricObserverETWEventName, data); + } + + if (IsTelemetryEnabled && hasChildProcs && MaxChildProcTelemetryDataCount > 0) + { + _ = TelemetryClient?.ReportMetricAsync(childProcessTelemetryDataList, token); + } + + childProcessTelemetryDataList = null; + } - // Allocated (in use) Handles - if (AllAppHandlesData.Any(x => x.Id == id)) + return Task.CompletedTask; + } + + private void ProcessChildProcs( + ref List> fruds, + ref List childProcessTelemetryDataList, + ReplicaOrInstanceMonitoringInfo repOrInst, + ApplicationInfo app, + ref FabricResourceUsageData parentFrud, + CancellationToken token) where T : struct + { + token.ThrowIfCancellationRequested(); + + try + { + string metric = parentFrud.Property; + var parentDataAvg = Math.Round(parentFrud.AverageDataValue, 0); + var (childProcInfo, Sum) = ProcessChildFrudsGetDataSum(ref fruds, repOrInst, app, token); + double sumAllValues = Sum + parentDataAvg; + childProcInfo.Metric = metric; + childProcInfo.Value = sumAllValues; + childProcessTelemetryDataList.Add(childProcInfo); + parentFrud.Data.Clear(); + parentFrud.Data.Add((T)Convert.ChangeType(sumAllValues, typeof(T))); + } + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + { + ObserverLogger.LogWarning($"Error processing child processes:{Environment.NewLine}{e}"); + } + } + + private (ChildProcessTelemetryData childProcInfo, double Sum) ProcessChildFrudsGetDataSum( + ref List> fruds, + ReplicaOrInstanceMonitoringInfo repOrInst, + ApplicationInfo app, + 
CancellationToken token) where T : struct + { + var childProcs = repOrInst.ChildProcesses; + + if (childProcs == null || childProcs.Count == 0 || token.IsCancellationRequested) + { + return (null, 0); + } + + double sumValues = 0; + string metric = string.Empty; + var childProcessInfoData = new ChildProcessTelemetryData + { + ApplicationName = repOrInst.ApplicationName.OriginalString, + ServiceName = repOrInst.ServiceName.OriginalString, + NodeName = NodeName, + ProcessId = (int)repOrInst.HostProcessId, + PartitionId = repOrInst.PartitionId.ToString(), + ReplicaId = repOrInst.ReplicaOrInstanceId.ToString(), + ChildProcessCount = childProcs.Count, + ChildProcessInfo = new List() + }; + + for (int i = 0; i < childProcs.Count; ++i) + { + token.ThrowIfCancellationRequested(); + + int childPid = childProcs[i].Pid; + string childProcName = childProcs[i].procName; + + try + { + if (fruds.Any(x => x.Id.Contains(childProcName))) { - ProcessResourceDataReportHealth( - AllAppHandlesData.FirstOrDefault(x => x.Id == id), - app.ErrorOpenFileHandles, - app.WarningOpenFileHandles, - healthReportTimeToLive, - HealthReportType.Application, - repOrInst); + var childFruds = fruds.Where(x => x.Id.Contains(childProcName)).ToList(); + metric = childFruds[0].Property; + + for (int j = 0; j < childFruds.Count; ++j) + { + token.ThrowIfCancellationRequested(); + + var frud = childFruds[j]; + double value = frud.AverageDataValue; + sumValues += Math.Round(value, 0); + + if (IsEtwEnabled || IsTelemetryEnabled) + { + var childProcInfo = new ChildProcessInfo { ProcessName = childProcName, Value = value }; + childProcessInfoData.ChildProcessInfo.Add(childProcInfo); + } + + // Remove child FRUD from ref FRUD. 
+ fruds.Remove(frud); + } + + childFruds?.Clear(); + childFruds = null; } } + catch (Exception e) when (e is ArgumentException || e is Win32Exception || e is InvalidOperationException) + { + continue; + } + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) + { + ObserverLogger.LogWarning($"Error processing child processes:{Environment.NewLine}{e}"); + continue; + } } - return Task.CompletedTask; + // Order List by Value descending. + childProcessInfoData.ChildProcessInfo = childProcessInfoData.ChildProcessInfo.OrderByDescending(v => v.Value).ToList(); + + // Cap size of List to MaxChildProcTelemetryDataCount. + if (childProcessInfoData.ChildProcessInfo.Count >= MaxChildProcTelemetryDataCount) + { + childProcessInfoData.ChildProcessInfo = childProcessInfoData.ChildProcessInfo.Take(MaxChildProcTelemetryDataCount).ToList(); + } + + return (childProcessInfoData, sumValues); } private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst) @@ -292,26 +472,40 @@ private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst // be up to date across observer loop iterations. 
private async Task InitializeAsync() { - ReplicaOrInstanceList ??= new List(); - userTargetList ??= new List(); - deployedTargetList ??= new List(); + ReplicaOrInstanceList = new List(); + userTargetList = new List(); + deployedTargetList = new List(); + + /* For descendant proc monitoring */ + if (bool.TryParse( + GetSettingParameterValue( + ConfigurationSectionName, + ObserverConstants.EnableChildProcessMonitoring), out bool enableDescendantMonitoring)) + { + EnableChildProcessMonitoring = enableDescendantMonitoring; + } + + if (int.TryParse( + GetSettingParameterValue( + ConfigurationSectionName, + ObserverConstants.MaxChildProcTelemetryDataCountParameter), out int maxChildProcs)) + { + MaxChildProcTelemetryDataCount = maxChildProcs; + } + /* End descendant proc monitoring */ configSettings.Initialize( FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( ObserverConstants.ObserverConfigurationPackageName)?.Settings, ConfigurationSectionName, "AppObserverDataFileName"); - + // Unit tests may have null path and filename, thus the null equivalence operations. var appObserverConfigFileName = Path.Combine(ConfigPackagePath ?? string.Empty, configSettings.AppObserverConfigFileName ?? string.Empty); if (!File.Exists(appObserverConfigFileName)) { - WriteToLogWithLevel( - ObserverName, - $"Will not observe resource consumption as no configuration parameters have been supplied. | {NodeName}", - LogLevel.Information); - + ObserverLogger.LogWarning($"Will not observe resource consumption on node {NodeName} as no configuration file has been supplied."); return false; } @@ -325,11 +519,7 @@ private async Task InitializeAsync() // Are any of the config-supplied apps deployed?. if (userTargetList.Count == 0) { - WriteToLogWithLevel( - ObserverName, - $"Will not observe resource consumption as no configuration parameters have been supplied. 
| {NodeName}", - LogLevel.Information); - + ObserverLogger.LogWarning($"Will not observe service resource consumption on node {NodeName} as no configuration parameters have been supplied."); return false; } @@ -375,13 +565,15 @@ private async Task InitializeAsync() apps.AddRange(appList.ToList()); // TODO: Add random wait (ms) impl, include cluster size in calc. - await Task.Delay(250, Token).ConfigureAwait(true); + await Task.Delay(250, Token); } - foreach (var app in apps) + for (int i = 0; i < apps.Count; ++i) { Token.ThrowIfCancellationRequested(); - + + var app = apps[i]; + if (app.ApplicationName.OriginalString == "fabric:/System") { continue; @@ -423,7 +615,7 @@ private async Task InitializeAsync() existingAppConfig.NetworkWarningActivePorts = existingAppConfig.NetworkWarningActivePorts == 0 && application.NetworkWarningActivePorts > 0 ? application.NetworkWarningActivePorts : existingAppConfig.NetworkWarningActivePorts; existingAppConfig.NetworkErrorEphemeralPorts = existingAppConfig.NetworkErrorEphemeralPorts == 0 && application.NetworkErrorEphemeralPorts > 0 ? application.NetworkErrorEphemeralPorts : existingAppConfig.NetworkErrorEphemeralPorts; existingAppConfig.NetworkWarningEphemeralPorts = existingAppConfig.NetworkWarningEphemeralPorts == 0 && application.NetworkWarningEphemeralPorts > 0 ? application.NetworkWarningEphemeralPorts : existingAppConfig.NetworkWarningEphemeralPorts; - existingAppConfig.DumpProcessOnError = application.DumpProcessOnError != existingAppConfig.DumpProcessOnError ? application.DumpProcessOnError : existingAppConfig.DumpProcessOnError; + existingAppConfig.DumpProcessOnError = application.DumpProcessOnError == existingAppConfig.DumpProcessOnError ? application.DumpProcessOnError : existingAppConfig.DumpProcessOnError; existingAppConfig.ErrorOpenFileHandles = existingAppConfig.ErrorOpenFileHandles == 0 && application.ErrorOpenFileHandles > 0 ? 
application.ErrorOpenFileHandles : existingAppConfig.ErrorOpenFileHandles; existingAppConfig.WarningOpenFileHandles = existingAppConfig.WarningOpenFileHandles == 0 && application.WarningOpenFileHandles > 0 ? application.WarningOpenFileHandles : existingAppConfig.WarningOpenFileHandles; } @@ -464,10 +656,11 @@ private async Task InitializeAsync() int settingsFail = 0; - foreach (var application in userTargetList) + for (int i = 0; i < userTargetList.Count; ++i) { Token.ThrowIfCancellationRequested(); + var application = userTargetList[i]; Uri appUri = null; if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) @@ -505,7 +698,7 @@ private async Task InitializeAsync() ObserverName, HealthState.Warning, $"InitializeAsync() | {application.TargetApp}: Invalid TargetApp value. " + - $"Value must be a valid Uri string of format \"fabric:/MyApp\", for example."); + $"Value must be a valid Uri string of format \"fabric:/MyApp\" OR just \"MyApp\""); settingsFail++; continue; @@ -520,18 +713,20 @@ private async Task InitializeAsync() if (!string.IsNullOrWhiteSpace(application.TargetAppType)) { - await SetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetAppType).ConfigureAwait(true); + await SetDeployedApplicationReplicaOrInstanceListAsync(null, application.TargetAppType); } else { - await SetDeployedApplicationReplicaOrInstanceListAsync(appUri).ConfigureAwait(true); + await SetDeployedApplicationReplicaOrInstanceListAsync(appUri); } } - foreach (var rep in ReplicaOrInstanceList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { Token.ThrowIfCancellationRequested(); - + + var rep = ReplicaOrInstanceList[i]; + try { // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. 
@@ -543,10 +738,11 @@ private async Task InitializeAsync() continue; } - ObserverLogger.LogInfo($"Will observe resource consumption by {rep.ServiceName?.OriginalString}({rep.HostProcessId}) on Node {NodeName}."); + ObserverLogger.LogInfo($"Will observe resource consumption by {rep.ServiceName?.OriginalString}({rep.HostProcessId}) (and child procs, if any) on Node {NodeName}."); } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { + } } @@ -555,7 +751,6 @@ private async Task InitializeAsync() private async Task MonitorDeployedAppsAsync(CancellationToken token) { - Process currentProcess = null; int capacity = ReplicaOrInstanceList.Count; AllAppCpuData ??= new List>(capacity); AllAppMemDataMb ??= new List>(capacity); @@ -564,19 +759,22 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) AllAppEphemeralPortsData ??= new List>(capacity); AllAppHandlesData ??= new List>(capacity); - foreach (var repOrInst in ReplicaOrInstanceList) + for (int i = 0; i < ReplicaOrInstanceList.Count; ++i) { token.ThrowIfCancellationRequested(); + var repOrInst = ReplicaOrInstanceList[i]; var timer = new Stopwatch(); - int processId = (int)repOrInst.HostProcessId; + int parentPid = (int)repOrInst.HostProcessId; var cpuUsage = new CpuUsage(); bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false; var application = deployedTargetList?.Find( app => app?.TargetApp?.ToLower() == repOrInst.ApplicationName?.OriginalString.ToLower() || !string.IsNullOrWhiteSpace(app?.TargetAppType) && app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); - + + List<(string procName, int Pid)> procTree = null; + if (application?.TargetApp == null && application?.TargetAppType == null) { continue; @@ -584,19 +782,18 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) try { - // App level. 
- currentProcess = Process.GetProcessById(processId); - string procName = currentProcess.ProcessName; + using Process parentProc = Process.GetProcessById(parentPid); + string parentProcName = parentProc.ProcessName; // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. // Please use ContainerObserver for SF container app service monitoring. - if (procName == "Fabric") + if (parentProcName == "Fabric") { continue; } string appNameOrType = GetAppNameOrType(repOrInst); - string id = $"{appNameOrType}:{procName}"; + string id = $"{appNameOrType}:{parentProcName}"; if (UseCircularBuffer) { @@ -658,17 +855,6 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) checkEphemeralPorts = true; } - // Measure Total and Ephemeral ports. - if (checkAllPorts) - { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(currentProcess.Id, FabricServiceContext)); - } - - if (checkEphemeralPorts) - { - AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(currentProcess.Id, FabricServiceContext)); - } - // File Handles (FD on linux) if (AllAppHandlesData.All(list => list.Id != id) && (application.ErrorOpenFileHandles > 0 || application.WarningOpenFileHandles > 0)) { @@ -680,105 +866,216 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) checkHandles = true; } - // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. - if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) + // Get list of child processes of parentProc should they exist. + // In order to provide accurate resource usage of an SF service process we need to also account for + // any processes (children) that the service process (parent) created/spawned. 
+ procTree = new List<(string procName, int Pid)> { - continue; - } - - /* CPU and Memory Usage */ + // Add parent to the process tree list since we want to monitor all processes in the family. If there are no child processes, + // then only the parent process will be in this list. + (parentProc.ProcessName, parentProc.Id) + }; - TimeSpan duration = TimeSpan.FromSeconds(1); - - if (MonitorDuration > TimeSpan.MinValue) - { - duration = MonitorDuration; - } - - /* Warm up counters. */ - - if (checkCpu) + if (repOrInst.ChildProcesses != null && repOrInst.ChildProcesses.Count > 0) { - _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); + procTree.AddRange(repOrInst.ChildProcesses); } - if (checkHandles) + for (int j = 0; j < procTree.Count; ++j) { - _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); - } + int procId = procTree[j].Pid; + string procName = procTree[j].procName; + TimeSpan duration = TimeSpan.FromSeconds(1); - if (checkMemMb || checkMemPct) - { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); - } + if (MonitorDuration > TimeSpan.MinValue) + { + duration = MonitorDuration; + } - timer.Start(); + // No need to proceed further if no cpu/mem/file handles thresholds are specified in configuration. + if (!checkCpu && !checkMemMb && !checkMemPct && !checkHandles) + { + continue; + } - while (!currentProcess.HasExited && timer.Elapsed.Seconds <= duration.Seconds) - { - token.ThrowIfCancellationRequested(); + /* Warm up Windows perf counters. */ if (checkCpu) { - // CPU (all cores). 
- double cpu = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); - - if (cpu >= 0) + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - if (cpu > 100) - { - cpu = 100; - } - - AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + _ = cpuUsage.GetCpuUsagePercentageProcess(procId); } } - float processMem = 0; + if (checkHandles) + { + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + } + } if (checkMemMb || checkMemPct) { - processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + } } - if (checkMemMb) + // Monitor Duration applies to the code below. + timer.Start(); + + while (timer.Elapsed.Seconds <= duration.Seconds) { + token.ThrowIfCancellationRequested(); + + float processMem = 0; + + if (checkCpu) + { + // CPU (all cores). + double cpu = cpuUsage.GetCpuUsagePercentageProcess(procId); + + if (cpu >= 0) + { + if (cpu > 100) + { + cpu = 100; + } + + if (procId == parentPid) + { + AllAppCpuData.FirstOrDefault(x => x.Id == id).Data.Add(cpu); + } + else + { + if (!AllAppCpuData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppCpuData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalCpuTime, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppCpuData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(cpu); + } + } + } + // Memory (private working set (process)). 
- AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); - } + if (checkMemMb) + { + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + + if (procId == parentPid) + { + AllAppMemDataMb.FirstOrDefault(x => x.Id == id).Data.Add(processMem); + } + else + { + if (!AllAppMemDataMb.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppMemDataMb.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionMb, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppMemDataMb.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(processMem); + } + } - if (checkMemPct) - { // Memory (percent in use (total)). - var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + if (checkMemPct) + { + if (processMem == 0) + { + processMem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(procId); + } + + var (TotalMemory, _) = OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse(); + + if (TotalMemory > 0) + { + double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); + + if (procId == parentPid) + { + AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + } + else + { + if (!AllAppMemDataPercent.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppMemDataPercent.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalMemoryConsumptionPct, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppMemDataPercent.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(Math.Round(usedPct, 1)); + } + } + } - if (TotalMemory > 0) + if (checkHandles) { - double usedPct = Math.Round((double)(processMem * 100) / (TotalMemory * 1024), 2); - AllAppMemDataPercent.FirstOrDefault(x => x.Id == id).Data.Add(Math.Round(usedPct, 1)); + float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(procId, FabricServiceContext); + + if (handles > -1) + { + if (procId 
== parentPid) + { + AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + } + else + { + if (!AllAppHandlesData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppHandlesData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalFileHandles, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppHandlesData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(handles); + } + } } - } - if (checkHandles) - { - float handles = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); + // Total TCP ports usage + if (checkAllPorts) + { + // Parent process (the service process). + if (procId == parentPid) + { + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + else + { + // Child procs spawned by the parent service process. + if (!AllAppTotalActivePortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppTotalActivePortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalActivePorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == $"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId, FabricServiceContext)); + } + } - if (handles > -1) + // Ephemeral TCP ports usage + if (checkEphemeralPorts) { - AllAppHandlesData.FirstOrDefault(x => x.Id == id).Data.Add(handles); + if (procId == parentPid) + { + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } + else + { + if (!AllAppEphemeralPortsData.Any(x => x.Id == $"{id}:{procName}")) + { + AllAppEphemeralPortsData.Add(new FabricResourceUsageData(ErrorWarningProperty.TotalEphemeralPorts, $"{id}:{procName}", capacity, UseCircularBuffer)); + } + AllAppEphemeralPortsData.FirstOrDefault(x => x.Id == 
$"{id}:{procName}").Data.Add(OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId, FabricServiceContext)); + } } + + await Task.Delay(250, Token); } - await Task.Delay(250, Token); + timer.Stop(); + timer.Reset(); } - - timer.Stop(); - timer.Reset(); } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { ObserverLogger.LogWarning( - $"Handled exception in MonitorDeployedAppsAsync: process {processId} is not running or it's running at a higher privilege than FabricObserver.{Environment.NewLine}" + + $"Handled exception in MonitorDeployedAppsAsync: Process {parentPid} is not running or it's running at a higher privilege than FabricObserver.{Environment.NewLine}" + $"ServiceName: {repOrInst.ServiceName?.OriginalString ?? "unknown"}{Environment.NewLine}Error message: {e.Message}"); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) @@ -788,12 +1085,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Fix the bug.. 
throw; } - finally - { - currentProcess?.Dispose(); - currentProcess = null; - } - } + } } private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) @@ -802,7 +1094,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat if (applicationNameFilter != null) { - var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(true); + var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter); deployedApps.AddRange(app.ToList()); } else if (!string.IsNullOrWhiteSpace(applicationType)) @@ -837,16 +1129,20 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat Token); deployedApps.AddRange(appList.ToList()); - await Task.Delay(250, Token).ConfigureAwait(true); + await Task.Delay(250, Token); } deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType).ToList(); + + appList.Clear(); + appList = null; } - foreach (var deployedApp in deployedApps) + for (int i = 0; i < deployedApps.Count; ++i) { Token.ThrowIfCancellationRequested(); + var deployedApp = deployedApps[i]; string[] filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. 
@@ -870,11 +1166,7 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat } } - var replicasOrInstances = await GetDeployedPrimaryReplicaAsync( - deployedApp.ApplicationName, - filteredServiceList, - filterType, - applicationType).ConfigureAwait(true); + var replicasOrInstances = await GetDeployedPrimaryReplicaAsync(deployedApp.ApplicationName, filteredServiceList, filterType, applicationType); ReplicaOrInstanceList.AddRange(replicasOrInstances); @@ -919,15 +1211,17 @@ private void SetInstanceOrReplicaMonitoringList( DeployedServiceReplicaList deployedReplicaList, ref List replicaMonitoringList) { - foreach (var deployedReplica in deployedReplicaList) + for (int i = 0; i < deployedReplicaList.Count; ++i) { Token.ThrowIfCancellationRequested(); + var deployedReplica = deployedReplicaList[i]; ReplicaOrInstanceMonitoringInfo replicaInfo = null; switch (deployedReplica) { - case DeployedStatefulServiceReplica {ReplicaRole: ReplicaRole.Primary} statefulReplica: + case DeployedStatefulServiceReplica statefulReplica when statefulReplica.ReplicaRole == ReplicaRole.Primary || + statefulReplica.ReplicaRole == ReplicaRole.ActiveSecondary: { if (filterList != null && filterType != ServiceFilterType.None) { @@ -950,6 +1244,17 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statefulReplica.Partitionid, ServiceName = statefulReplica.ServiceName }; + + if (EnableChildProcessMonitoring) + { + var childPids = ProcessInfoProvider.Instance.GetChildProcessInfo((int)statefulReplica.HostProcessId); + + if (childPids != null && childPids.Count > 0) + { + replicaInfo.ChildProcesses = childPids; + } + } + break; } case DeployedStatelessServiceInstance statelessInstance: @@ -975,6 +1280,17 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statelessInstance.Partitionid, ServiceName = statelessInstance.ServiceName }; + + if (EnableChildProcessMonitoring) + { + var childProcs = 
ProcessInfoProvider.Instance.GetChildProcessInfo((int)statelessInstance.HostProcessId); + + if (childProcs != null && childProcs.Count > 0) + { + replicaInfo.ChildProcesses = childProcs; + } + } + break; } } @@ -997,37 +1313,37 @@ private void CleanUp() ReplicaOrInstanceList?.Clear(); ReplicaOrInstanceList = null; - if (AllAppCpuData != null && !AllAppCpuData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppCpuData != null && AllAppCpuData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppCpuData?.Clear(); AllAppCpuData = null; } - if (AllAppEphemeralPortsData != null && !AllAppEphemeralPortsData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppEphemeralPortsData != null && AllAppEphemeralPortsData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppEphemeralPortsData?.Clear(); AllAppEphemeralPortsData = null; } - if (AllAppHandlesData != null && !AllAppHandlesData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppHandlesData != null && AllAppHandlesData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppHandlesData?.Clear(); AllAppHandlesData = null; } - if (AllAppMemDataMb != null && !AllAppMemDataMb.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppMemDataMb != null && AllAppMemDataMb.All(frud => !frud.ActiveErrorOrWarning)) { AllAppMemDataMb?.Clear(); AllAppMemDataMb = null; } - if (AllAppMemDataPercent != null && !AllAppMemDataPercent.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppMemDataPercent != null && AllAppMemDataPercent.All(frud => !frud.ActiveErrorOrWarning)) { AllAppMemDataPercent?.Clear(); AllAppMemDataPercent = null; } - if (AllAppTotalActivePortsData != null && !AllAppTotalActivePortsData.Any(frud => frud.ActiveErrorOrWarning)) + if (AllAppTotalActivePortsData != null && AllAppTotalActivePortsData.All(frud => !frud.ActiveErrorOrWarning)) { AllAppTotalActivePortsData?.Clear(); AllAppTotalActivePortsData = null; diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 
1e2b687d..0d44aeac 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -250,7 +250,6 @@ public override async Task ReportAsync(CancellationToken token) ObserverName = ObserverName, OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", Source = ObserverConstants.FabricObserverName, - Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration) }; await TelemetryClient.ReportHealthAsync(telemetryData, Token); diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index b34aa6e2..ba83152c 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -218,9 +218,10 @@ public override Task ReportAsync(CancellationToken token) var timeToLiveWarning = GetHealthReportTimeToLive(); // User-supplied Disk Space Usage % thresholds from ApplicationManifest.xml. - foreach (var data in DiskSpaceUsagePercentageData) + for (int i = 0; i < DiskSpaceUsagePercentageData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceUsagePercentageData[i]; ProcessResourceDataReportHealth( data, @@ -232,9 +233,10 @@ public override Task ReportAsync(CancellationToken token) // User-supplied Average disk queue length thresholds from ApplicationManifest.xml. Windows only. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - foreach (var data in DiskAverageQueueLengthData) + for (int i = 0; i < DiskAverageQueueLengthData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskAverageQueueLengthData[i]; ProcessResourceDataReportHealth( data, @@ -249,27 +251,21 @@ in FabricObserver.Extensibility project. 
*/ if (IsEtwEnabled) { // Disk Space Available - foreach (var data in DiskSpaceAvailableMbData) + for (int i = 0; i < DiskSpaceAvailableMbData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceAvailableMbData[i]; - ProcessResourceDataReportHealth( - data, - 0, - 0, - timeToLiveWarning); + ProcessResourceDataReportHealth(data, 0, 0, timeToLiveWarning); } // Disk Space Total - foreach (var data in DiskSpaceTotalMbData) + for (int i = 0; i < DiskSpaceTotalMbData.Count; ++i) { token.ThrowIfCancellationRequested(); + var data = DiskSpaceTotalMbData[i]; - ProcessResourceDataReportHealth( - data, - 0, - 0, - timeToLiveWarning); + ProcessResourceDataReportHealth(data, 0, 0, timeToLiveWarning); } } diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index be4456e9..919d5d45 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -29,7 +29,7 @@ namespace FabricObserver.Observers // As with all observers, you should first determine the good (normal) states across resource usage before you set thresholds for the bad ones. public class FabricSystemObserver : ObserverBase { - private List processWatchList; + private string[] processWatchList; private Stopwatch stopwatch; // Health Report data container - For use in analysis to determine health state. @@ -40,7 +40,7 @@ public class FabricSystemObserver : ObserverBase private List> allHandlesData; // Windows only. (EventLog). - private List evtRecordList; + private List evtRecordList = null; private bool monitorWinEventLog; /// @@ -117,11 +117,6 @@ public int AllocatedHandlesError get; set; } - public string ErrorOrWarningKind - { - get; set; - } = null; - public override async Task ObserveAsync(CancellationToken token) { // If set, this observer will only run during the supplied interval. 
@@ -141,11 +136,14 @@ public override async Task ObserveAsync(CancellationToken token) { Initialize(); - foreach (var procName in processWatchList) + for (int i = 0; i < processWatchList.Length; ++i) { + Token.ThrowIfCancellationRequested(); + + string procName = processWatchList[i]; + try { - Token.ThrowIfCancellationRequested(); string dotnet = string.Empty; if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux) && procName.EndsWith(".dll")) @@ -157,15 +155,13 @@ public override async Task ObserveAsync(CancellationToken token) } catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { + } } } catch (Exception e) when (!(e is OperationCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError( $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -350,10 +346,7 @@ public override Task ReportAsync(CancellationToken token) } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in ReportAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError($"Unhandled exception in ReportAsync:{Environment.NewLine}{e}"); // Fix the bug.. 
throw; @@ -446,10 +439,12 @@ private Process[] GetDotnetLinuxProcessesByFirstArgument(string argument) var result = new List(); var processes = Process.GetProcessesByName("dotnet"); - foreach (var process in processes) + for (int i = 0; i < processes.Length; ++i) { Token.ThrowIfCancellationRequested(); + Process process = processes[i]; + try { string cmdline = File.ReadAllText($"/proc/{process.Id}/cmdline"); @@ -491,7 +486,7 @@ private void Initialize() // Linux if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) { - processWatchList = new List + processWatchList = new [] { "Fabric", "FabricDCA.dll", @@ -508,7 +503,7 @@ private void Initialize() else { // Windows - processWatchList = new List + processWatchList = new [] { "Fabric", "FabricApplicationGateway", @@ -522,7 +517,7 @@ private void Initialize() }; } - int listcapacity = processWatchList.Count; + int listcapacity = processWatchList.Length; int frudCapacity = 4; if (UseCircularBuffer) @@ -780,12 +775,14 @@ private async Task GetProcessInfoAsync(string procName) Stopwatch timer = new Stopwatch(); - foreach (var process in processes) + for (int i = 0; i < processes.Length; ++i) { - try - { - Token.ThrowIfCancellationRequested(); + Token.ThrowIfCancellationRequested(); + + Process process = processes[i]; + try + { // Ports - Active TCP All int activePortCount = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(process.Id, FabricServiceContext); @@ -824,10 +821,21 @@ private async Task GetProcessInfoAsync(string procName) CpuUsage cpuUsage = new CpuUsage(); - // Warm up the perf counters. + // Mem if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + // Warm up the perf counters. 
+ if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); + } + } + + // Allocated Handles + if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) + { + float handleCount = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); + allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handleCount); } TimeSpan duration = TimeSpan.FromSeconds(1); @@ -848,32 +856,22 @@ private async Task GetProcessInfoAsync(string procName) // CPU Time for service process. if (CpuErrorUsageThresholdPct > 0 || CpuWarnUsageThresholdPct > 0) { - int cpu = (int)cpuUsage.GetCpuUsagePercentageProcess(process); + int cpu = (int)cpuUsage.GetCpuUsagePercentageProcess(process.Id); allCpuData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(cpu); } - // Private Working Set for service process. + // Mem if (MemErrorUsageThresholdMb > 0 || MemWarnUsageThresholdMb > 0) { float mem = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(process.Id); allMemData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(mem); } - // Allocated Handles - if (AllocatedHandlesError > 0 || AllocatedHandlesWarning > 0) - { - float handleCount = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(process.Id, FabricServiceContext); - allHandlesData.FirstOrDefault(x => x.Id == dotnetArg).Data.Add(handleCount); - } - await Task.Delay(250, Token).ConfigureAwait(true); } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}", - LogLevel.Warning); + ObserverLogger.LogWarning($"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -886,19 +884,13 @@ private async Task GetProcessInfoAsync(string procName) // It's OK. 
Just means that the elevated process (like FabricHost.exe) won't be observed. // It is generally *not* worth running FO process as a Windows elevated user just for this scenario. On Linux, FO always should be run as normal user, not root. #if DEBUG - WriteToLogWithLevel( - ObserverName, - $"Can't observe {procName} due to it's privilege level. FabricObserver must be running as System or Admin on Windows for this specific task.", - LogLevel.Warning); + ObserverLogger.LogWarning($"Can't observe {procName} due to it's privilege level. FabricObserver must be running as System or Admin on Windows for this specific task."); #endif continue; } catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - WriteToLogWithLevel( - ObserverName, - $"Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}", - LogLevel.Error); + ObserverLogger.LogError($"Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}"); // Fix the bug.. throw; @@ -914,7 +906,7 @@ private async Task GetProcessInfoAsync(string procName) } private void ProcessResourceDataList( - IReadOnlyCollection> data, + List> data, T thresholdError, T thresholdWarning) where T : struct @@ -926,10 +918,12 @@ private void ProcessResourceDataList( fileName = $"FabricSystemServices{(CsvWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives ? 
"_" + DateTime.UtcNow.ToString("o") : string.Empty)}"; } - foreach (var dataItem in data) + for (int i = 0; i < data.Count; ++i) { Token.ThrowIfCancellationRequested(); + var dataItem = data[i]; + if (dataItem.Data.Count == 0 || dataItem.AverageDataValue <= 0) { continue; @@ -990,7 +984,6 @@ private void ProcessResourceDataList( private void CleanUp() { - processWatchList.Clear(); processWatchList = null; if (allCpuData != null && !allCpuData.Any(frud => frud.ActiveErrorOrWarning)) @@ -999,6 +992,12 @@ private void CleanUp() allCpuData = null; } + if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) + { + allEphemeralTcpPortData?.Clear(); + allEphemeralTcpPortData = null; + } + if (allHandlesData != null && !allHandlesData.Any(frud => frud.ActiveErrorOrWarning)) { allHandlesData?.Clear(); @@ -1011,12 +1010,6 @@ private void CleanUp() allMemData = null; } - if (allEphemeralTcpPortData != null && !allEphemeralTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) - { - allEphemeralTcpPortData?.Clear(); - allEphemeralTcpPortData = null; - } - if (allActiveTcpPortData != null && !allActiveTcpPortData.Any(frud => frud.ActiveErrorOrWarning)) { allActiveTcpPortData?.Clear(); diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 7c98dd14..b3e7da80 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -22,48 +22,24 @@ public class NodeObserver : ObserverBase private readonly Stopwatch stopwatch; // These are public properties because they are used in unit tests. 
- public FabricResourceUsageData MemDataCommittedBytes - { - get; set; - } + public FabricResourceUsageData MemDataCommittedBytes; - public FabricResourceUsageData FirewallData - { - get; set; - } + public FabricResourceUsageData FirewallData; - public FabricResourceUsageData ActivePortsData - { - get; set; - } + public FabricResourceUsageData ActivePortsData; - public FabricResourceUsageData EphemeralPortsData - { - get; set; - } + public FabricResourceUsageData EphemeralPortsData; - public FabricResourceUsageData MemDataPercentUsed - { - get; set; - } + public FabricResourceUsageData MemDataPercentUsed; - public FabricResourceUsageData CpuTimeData - { - get; set; - } + public FabricResourceUsageData CpuTimeData; // These are only useful for Linux.\\ // Holds data for percentage of total configured file descriptors that are in use. - public FabricResourceUsageData LinuxFileHandlesDataPercentAllocated - { - get; set; - } + public FabricResourceUsageData LinuxFileHandlesDataPercentAllocated; - public FabricResourceUsageData LinuxFileHandlesDataTotalAllocated - { - get; set; - } + public FabricResourceUsageData LinuxFileHandlesDataTotalAllocated; public float CpuErrorUsageThresholdPct { @@ -640,19 +616,6 @@ private async Task GetSystemCpuMemoryValuesAsync(CancellationToken token) try { - // Ports. - if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) - { - int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); - ActivePortsData.Data.Add(activePortCountTotal); - } - - if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) - { - int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); - EphemeralPortsData.Data.Add(ephemeralPortCountTotal); - } - // Firewall rules. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && FirewallData != null) { @@ -722,11 +685,13 @@ error on these conditions. 
{ token.ThrowIfCancellationRequested(); + // CPU if (CpuTimeData != null && (CpuErrorUsageThresholdPct > 0 || CpuWarningUsageThresholdPct > 0)) { CpuTimeData.Data.Add(await cpuUtilizationProvider.NextValueAsync()); } + // Memory if (MemDataCommittedBytes != null && (MemErrorUsageThresholdMb > 0 || MemWarningUsageThresholdMb > 0)) { float committedMegaBytes = MemoryUsageProvider.Instance.GetCommittedBytes() / 1048576.0f; @@ -738,6 +703,19 @@ error on these conditions. MemDataPercentUsed.Data.Add(OperatingSystemInfoProvider.Instance.TupleGetTotalPhysicalMemorySizeAndPercentInUse().PercentInUse); } + // Ports. + if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) + { + int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); + ActivePortsData.Data.Add(activePortCountTotal); + } + + if (EphemeralPortsData != null && (EphemeralPortsErrorThreshold > 0 || EphemeralPortsWarningThreshold > 0)) + { + int ephemeralPortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); + EphemeralPortsData.Data.Add(ephemeralPortCountTotal); + } + await Task.Delay(250, Token).ConfigureAwait(true); } @@ -757,6 +735,7 @@ error on these conditions. 
finally { cpuUtilizationProvider?.Dispose(); + cpuUtilizationProvider = null; } } diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 0e0283e9..8f5151b8 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -15,7 +15,6 @@ using System.Runtime.InteropServices; using System.Security; using System.Text; -using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; using FabricObserver.Observers.Utilities; @@ -255,7 +254,6 @@ public override async Task ReportAsync(CancellationToken token) Description = auServiceEnabledMessage, HealthState = "Warning", Metric = "WUAutoDownloadEnabled", - Value = isAUAutomaticDownloadEnabled, NodeName = NodeName, ObserverName = ObserverName, Source = ObserverConstants.FabricObserverName @@ -670,9 +668,6 @@ await TelemetryClient.ReportMetricAsync( ObserverName, HealthState.Error, $"Unhandled Exception processing OS information:{Environment.NewLine}{e}"); - - // Fix the bug.. - throw; } } } diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 026b2d34..78b6c03f 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -300,7 +300,7 @@ make that connection. You should generally not have to call GC.Collect from user catch (Exception e) { var message = - $"Unhanded Exception in {ObserverConstants.ObserverManagerName} on node " + + $"Unhandled Exception in {ObserverConstants.ObserverManagerName} on node " + $"{nodeName}. Taking down FO process. 
" + $"Error info:{Environment.NewLine}{e}"; @@ -379,6 +379,13 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf var appHealth = await FabricClientInstance.HealthManager.GetApplicationHealthAsync(appName).ConfigureAwait(true); var fabricObserverAppHealthEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + if (isConfigurationUpdateInProgress) + { + fabricObserverAppHealthEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && s.HealthInformation.HealthState == HealthState.Warning + || s.HealthInformation.HealthState == HealthState.Error); + } + foreach (var evt in fabricObserverAppHealthEvents) { healthReport.AppName = appName; @@ -405,6 +412,14 @@ public async Task StopObserversAsync(bool isShutdownSignaled = true, bool isConf { var nodeHealth = await FabricClientInstance.HealthManager.GetNodeHealthAsync(obs.NodeName).ConfigureAwait(true); var fabricObserverNodeHealthEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + + if (isConfigurationUpdateInProgress) + { + fabricObserverNodeHealthEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && s.HealthInformation.HealthState == HealthState.Warning + || s.HealthInformation.HealthState == HealthState.Error); + } + healthReport.ReportType = HealthReportType.Node; foreach (var evt in fabricObserverNodeHealthEvents) @@ -762,8 +777,10 @@ private async Task RunObserversAsync() var exceptionBuilder = new StringBuilder(); bool allExecuted = true; - foreach (var observer in observers) + for (int i = 0; i < observers.Count(); ++i) { + var observer = observers[i]; + if (isConfigurationUpdateInProgress) { return true; @@ -865,6 +882,7 @@ await File.WriteAllLinesAsync( } catch (IOException) { + } } } @@ -894,7 +912,6 @@ ex.InnerException is OperationCanceledException || if 
(isConfigurationUpdateInProgress) { IsObserverRunning = false; - return true; } @@ -904,10 +921,10 @@ ex.InnerException is OperationCanceledException || catch (Exception e) { HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Error, - $"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); + ObserverConstants.ObserverManagerName, + ApplicationName, + HealthState.Error, + $"Unhandled Exception from {observer.ObserverName}:{Environment.NewLine}{e}"); allExecuted = false; } @@ -921,15 +938,12 @@ ex.InnerException is OperationCanceledException || } else { - if (Logger.EnableVerboseLogging) - { - HealthReporter.ReportFabricObserverServiceHealth( - ObserverConstants.ObserverManagerName, - ApplicationName, - HealthState.Warning, - exceptionBuilder.ToString()); - } - + HealthReporter.ReportFabricObserverServiceHealth( + ObserverConstants.ObserverManagerName, + ApplicationName, + HealthState.Warning, + exceptionBuilder.ToString()); + _ = exceptionBuilder.Clear(); } diff --git a/FabricObserver/PackageRoot/Config/NetworkObserver.config.json b/FabricObserver/PackageRoot/Config/NetworkObserver.config.json index 4b71e127..03113376 100644 --- a/FabricObserver/PackageRoot/Config/NetworkObserver.config.json +++ b/FabricObserver/PackageRoot/Config/NetworkObserver.config.json @@ -8,9 +8,9 @@ "protocol": "http" }, { - "hostname": "somesqlservername.database.windows.net", - "port": 1433, - "protocol": "tcp" + "hostname": "https://mycosmosdb.documents.azure.com/dbs/mydb", + "port": 443, + "protocol": "http" } ] }, diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index 9bc7787c..50ada035 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -143,6 +143,12 @@ the observer. 
The default value for capacity is 30 if you omit the ResourceUsageDataCapacity parameter or use an invalid value like 0 or a negative number (or omit the parameter altogether). --> + + + +
@@ -169,7 +175,6 @@ - @@ -288,5 +293,5 @@ -
--> + --> diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index bc25de37..0dae5929 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,10 +27,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 12b9eb8c..52abdf89 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,10 +21,10 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 3366420e..c35b3467 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + - @@ -28,8 +27,7 @@ - - + @@ -39,13 +37,11 @@ - - + - @@ -55,14 +51,11 @@ - - - - - + + + - - + @@ -72,28 +65,28 @@ - - + + - + + + - - @@ -111,7 +104,6 @@ - @@ -134,10 +126,8 @@ - - @@ -145,7 +135,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. --> - + @@ -162,6 +152,8 @@ + +
@@ -180,7 +172,6 @@ - @@ -274,6 +265,9 @@ + \ No newline at end of file diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index 13ac023c..b2285489 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -455,11 +455,11 @@ public async Task Successful_NetworkObserver_Run_Cancellation_Via_ObserverManage _ = Task.Run(async () => { - await obsMgr.StartObserversAsync().ConfigureAwait(true); - }).ConfigureAwait(true); + await obsMgr.StartObserversAsync(); + }); - Assert.IsTrue(await WaitAsync(() => obsMgr.IsObserverRunning, 1).ConfigureAwait(true)); - await obsMgr.StopObserversAsync().ConfigureAwait(true); + Assert.IsTrue(await WaitAsync(() => obsMgr.IsObserverRunning, 1)); + await obsMgr.StopObserversAsync(); Assert.IsFalse(obsMgr.IsObserverRunning); } diff --git a/README.md b/README.md index 1ad32b68..60b3da43 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.14 +# FabricObserver 3.1.15 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of machine resources that tend to be very important to all Service Fabric applications, like disk space consumption, CPU use, memory use, endpoint availability, ephemeral TCP port use, and app/cluster certificate health out-of-the-box.