diff --git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index d2eef423..7da23613 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.4" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.4" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.5" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.5" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.4" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.4" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.5" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.5" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index 4c6ab60d..2593603c 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.5" 
"$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.5" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.6" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.6" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.5" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.5" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.6" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.6" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index 758a1290..2632ff2c 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 2.1.4 - FO 3.1.5 support. Bug fixes. Significant refactoring. + 2.1.5 + FO 3.1.6 support. 
Microsoft MIT false diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index b7eb3c23..8244a227 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -219,7 +219,7 @@ private async Task ReportClusterHealthAsync(CancellationToken token) var telemetry = new TelemetryData(FabricClientInstance, token) { HealthState = "Ok", - HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", + Description = "Cluster has recovered from previous Error/Warning state.", Metric = "AggregatedClusterHealth", Source = ObserverName, }; @@ -335,7 +335,7 @@ e is TimeoutException || var telemetryData = new TelemetryData(FabricClientInstance, token) { HealthState = "Warning", - HealthEventDescription = msg, + Description = msg, }; await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); @@ -435,7 +435,7 @@ private async Task ProcessApplicationHealthAsync(IList a { ApplicationName = appName.OriginalString, HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, + Description = telemetryDescription, Source = ObserverName, }; @@ -486,7 +486,7 @@ private async Task ProcessApplicationHealthAsync(IList a { foTelemetryData.ApplicationName, foTelemetryData.HealthState, - foTelemetryData.HealthEventDescription, + foTelemetryData.Description, foTelemetryData.Metric, foTelemetryData.ObserverName, foTelemetryData.NodeName, @@ -519,7 +519,7 @@ private async Task ProcessApplicationHealthAsync(IList a { ApplicationName = appName.OriginalString, HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, + Description = telemetryDescription, Source = ObserverName, }; @@ -596,7 +596,7 @@ private async Task ProcessNodeHealthAsync(IList nodeHealthState // From FO? 
if (foStats != null) { - telemetryDescription += foStats.HealthEventDescription; + telemetryDescription += foStats.Description; sourceObserver = foStats.ObserverName; metric = foStats.Metric; } @@ -626,7 +626,7 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( { NodeName = node.NodeName, HealthState = Enum.GetName(typeof(HealthState), node.AggregatedHealthState), - HealthEventDescription = $"{telemetryDescription}{Environment.NewLine}Node Status: {(targetNode != null ? Enum.GetName(typeof(NodeStatus), targetNode.NodeStatus) : string.Empty)}", + Description = $"{telemetryDescription}{Environment.NewLine}Node Status: {(targetNode != null ? Enum.GetName(typeof(NodeStatus), targetNode.NodeStatus) : string.Empty)}", Metric = metric ?? "AggregatedClusterHealth", ObserverName = sourceObserver ?? string.Empty, Source = ObserverName, @@ -675,7 +675,7 @@ private async Task ProcessGenericEntityHealthAsync(HealthEvaluation evaluation, var telemetryData = new TelemetryData(FabricClientInstance, token) { - HealthEventDescription = telemetryDescription, + Description = telemetryDescription, HealthState = healthState, Source = ObserverName, }; @@ -728,7 +728,7 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( var telemetry = new TelemetryData(FabricClientInstance, token) { HealthState = "Ok", - HealthEventDescription = $"{nodeDictItem.Key} is now Up.", + Description = $"{nodeDictItem.Key} is now Up.", Metric = "NodeStatus", NodeName = nodeDictItem.Key, Value = "Up", @@ -807,7 +807,7 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( var telemetry = new TelemetryData(FabricClientInstance, token) { HealthState = "Warning", - HealthEventDescription = message, + Description = message, Metric = "NodeStatus", NodeName = kvp.Key, Value = $"{kvp.Value.NodeStatus}", diff --git a/ClusterObserver/PackageRoot/ServiceManifest.xml b/ClusterObserver/PackageRoot/ServiceManifest.xml index 1626b3aa..9cf726f2 100644 --- 
a/ClusterObserver/PackageRoot/ServiceManifest.xml +++ b/ClusterObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index e3dcf8e7..beef5084 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -144,7 +144,7 @@ public Task ReportHealthAsync( { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.HealthEventDescription ?? string.Empty }, + { "Description", telemetryData.Description ?? string.Empty }, { "HealthState", telemetryData.HealthState ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, { "NodeName", telemetryData.NodeName ?? string.Empty }, diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs index 649f74c2..2841f3d7 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryData.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryData.cs @@ -32,7 +32,7 @@ public string ContainerId get; set; } - public string HealthEventDescription + public string Description { get; set; } diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index 1662cafe..60bb626f 100644 --- a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -14,7 +14,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. 
--> - + diff --git a/Documentation/Using.md b/Documentation/Using.md index a242cb81..cc89a15b 100644 --- a/Documentation/Using.md +++ b/Documentation/Using.md @@ -111,7 +111,7 @@ Example Output in SFX: ![alt text](/Documentation/Images/DiskWarnDescriptionNode.jpg "Logo Title Text 1") -**Memory Usage** +**Memory Usage - Private Working Set as Percentage of Total Physical Memory** ***Problem:*** I want to know how much memory some or all of my services are using and warn when they hit some meaningful percent-used thresold. @@ -137,6 +137,32 @@ The third one scopes to all services _but_ 3 and asks AppObserver to warn when a } ``` +**Memory Usage - Private Working Set - MB in Use** + +***Problem:*** I want to know how much memory some or all of my services are using and warn when they hit some meaningful megabytes-in-use threshold. + +***Solution:*** AppObserver is your friend. + +The first two JSON objects below tell AppObserver to warn when any of the services under the MyApp and AnotherApp apps reach 300 MB and 500 MB of memory use, respectively. + +The third one scopes to all services _but_ 3 and asks AppObserver to warn when any of them hit 600 MB of memory use. + +```JSON + { + "targetApp": "fabric:/MyApp", + "memoryWarningLimitMb": 300 + }, + { + "targetApp": "fabric:/AnotherApp", + "memoryWarningLimitMb": 500 + }, + { + "targetApp": "fabric:/SomeOtherApp", + "serviceExcludeList": "WhoNeedsMemoryService, NoMemoryNoProblemService, Service42", + "memoryWarningLimitMb": 600 + } +``` + **Different thresholds for different services belonging to the same app** @@ -187,6 +213,70 @@ The following configuration tells AppObserver to monitor and report Warnings for } ``` +**All App Monitoring** + +***Problem:*** I don't care what the app is, I just want to monitor all app services deployed to any node. + +***Solution:*** AppObserver is your friend. Note, you can specify all app targets using either "*" or "All" (case doesn't matter). 
+The configuration below specifies that AppObserver is to monitor and report threshold breaches for a collection of metrics on all services that belong to any app that is deployed to the node. +Note that AppObserver does not (and will not) monitor fabric:/System app services. Also, individual targetApp configuration items will override the global configuration when the same metrics are supplied. +So, in the example below, the setting for cpuWarningLimitPercent for fabric:/MyApp will override the same setting specified in the all-inclusive config item. fabric:/MyApp will still be monitored for the other global metrics. + +```JSON +[ + { + "targetApp": "*", + "cpuWarningLimitPercent": 75, + "memoryWarningLimitMb" : 500, + "networkWarningActivePorts": 2000, + "networkWarningEphemeralPorts": 1500 + }, + { + "targetApp": "fabric:/MyApp", + "cpuWarningLimitPercent": 50 + } +] +``` +***Problem:*** I don't care what the app is, I just want to monitor all app services deployed to any node, except for fabric:/MyApp, where I only care about raw memory use (MB) by any of its services. + +***Solution:*** AppObserver is your friend. Note, you can specify all app targets using either "*", "All", or "Any" (case doesn't matter). +The configuration below specifies that AppObserver is to monitor and report threshold breaches for a collection of metrics on all services that belong to any app that is deployed to the node, except for fabric:/MyApp. + +```JSON +[ + { + "targetApp": "*", + "appExcludeList": "fabric:/MyApp", + "cpuWarningLimitPercent": 75, + "memoryWarningLimitPercent" : 40, + "networkWarningActivePorts": 2000, + "networkWarningEphemeralPorts": 1500 + }, + { + "targetApp": "fabric:/MyApp", + "memoryWarningLimitMb": 600 + } +] +``` +***Problem:*** I want to monitor the same resource metrics used by 3 apps and I don't like writing JSON. + +***Solution:*** AppObserver is your friend. Note, you can specify all app targets using either "*", "All", or "Any" (case doesn't matter). 
+The configuration below specifies that AppObserver is to monitor and report threshold breaches for a collection of metrics on all services that belong to the apps supplied in appIncludeList. + +```JSON +[ + { + "targetApp": "*", + "appIncludeList": "fabric:/MyApp, fabric:/MyApp2, fabric:/MyApp3", + "cpuWarningLimitPercent": 75, + "memoryWarningLimitPercent" : 40, + "networkWarningActivePorts": 2000, + "networkWarningEphemeralPorts": 1500 + } +] + +``` + > You can learn all about the currently implemeted Observers and their supported resource properties [***here***](/Documentation/Observers.md). diff --git a/FabricObserver.Extensibility/MachineInfoModel/ApplicationInfo.cs b/FabricObserver.Extensibility/MachineInfoModel/ApplicationInfo.cs index 6a95711f..ad7e17e4 100644 --- a/FabricObserver.Extensibility/MachineInfoModel/ApplicationInfo.cs +++ b/FabricObserver.Extensibility/MachineInfoModel/ApplicationInfo.cs @@ -19,6 +19,16 @@ public string TargetAppType get; set; } + public string AppExcludeList + { + get; set; + } + + public string AppIncludeList + { + get; set; + } + public string ServiceExcludeList { get; set; @@ -84,11 +94,6 @@ public int NetworkErrorFirewallRules get; set; } - public int NetworkWarningFirewallRules - { - get; set; - } - public bool DumpProcessOnError { get; set; @@ -106,6 +111,8 @@ public int WarningOpenFileHandles public override string ToString() => $"ApplicationName: {TargetApp ?? string.Empty}{Environment.NewLine}" + $"ApplicationTypeName: {TargetAppType ?? string.Empty}{Environment.NewLine}" + + $"AppExcludeList: {AppExcludeList ?? string.Empty}{Environment.NewLine}" + + $"AppIncludeList: {AppIncludeList ?? string.Empty}{Environment.NewLine}" + $"ServiceExcludeList: {ServiceExcludeList ?? string.Empty}{Environment.NewLine}" + $"ServiceIncludeList: {ServiceIncludeList ?? 
string.Empty}{Environment.NewLine}" + $"MemoryWarningLimitMB: {MemoryWarningLimitMb}{Environment.NewLine}" + @@ -118,8 +125,6 @@ public override string ToString() => $"ApplicationName: {TargetApp ?? string.Emp $"NetworkWarningActivePorts: {NetworkWarningActivePorts}{Environment.NewLine}" + $"NetworkErrorEphemeralPorts: {NetworkErrorEphemeralPorts}{Environment.NewLine}" + $"NetworkWarningEphemeralPorts: {NetworkWarningEphemeralPorts}{Environment.NewLine}" + - $"NetworkErrorFirewallRules: {NetworkErrorFirewallRules}{Environment.NewLine}" + - $"NetworkWarningFirewallRules: {NetworkWarningFirewallRules}{Environment.NewLine}" + $"DumpProcessOnError: {DumpProcessOnError}{Environment.NewLine}" + $"ErrorOpenFileHandles: {ErrorOpenFileHandles}{Environment.NewLine}" + $"WarningOpenFileHandles: {WarningOpenFileHandles}{Environment.NewLine}"; diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 624fb181..ec7757e9 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -31,6 +31,7 @@ public abstract class ObserverBase : IObserver private readonly Dictionary serviceDumpCountDictionary = new Dictionary(); private string SFLogRoot; private string dumpsPath; + private bool disposedValue; public string ObserverName { @@ -94,24 +95,29 @@ public bool IsEnabled } } - public bool IsObserverTelemetryEnabled + public bool IsTelemetryEnabled { get { if (ConfigurationSettings != null) { - return ConfigurationSettings.IsObserverTelemetryEnabled; + return IsTelemetryProviderEnabled && ConfigurationSettings.IsObserverTelemetryEnabled; } return false; } + } - set + public bool IsEtwEnabled + { + get { if (ConfigurationSettings != null) { - ConfigurationSettings.IsObserverTelemetryEnabled = value; + return ObserverLogger.EnableETWLogging && ConfigurationSettings.IsObserverEtwEnabled; } + + return false; } } @@ -162,16 +168,6 @@ public bool HasActiveFabricErrorOrWarning get; set; } - 
public List HealthReportSourceIds - { - get; set; - } = new List(); - - public List HealthReportProperties - { - get; set; - } = new List(); - public List AppNames { get; set; @@ -276,11 +272,9 @@ protected ITelemetryProvider TelemetryClient get; set; } - protected bool IsEtwEnabled + protected bool IsEtwProviderEnabled { - get => bool.TryParse(GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.EnableEventSourceProvider), out etwEnabled) && etwEnabled; - - set => etwEnabled = value; + get; set; } protected FabricClient FabricClientInstance @@ -306,7 +300,6 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel FabricServiceContext = statelessServiceContext; NodeName = FabricServiceContext.NodeContext.NodeName; NodeType = FabricServiceContext.NodeContext.NodeType; - ConfigurationSectionName = ObserverName + "Configuration"; SetConfiguration(); // Observer Logger setup. @@ -325,7 +318,10 @@ protected ObserverBase(FabricClient fabricClient, StatelessServiceContext statel logFolderBasePath = logFolderBase; } - ObserverLogger = new Logger(ObserverName, logFolderBasePath); + ObserverLogger = new Logger(ObserverName, logFolderBasePath) + { + EnableETWLogging = IsEtwProviderEnabled, + }; if (string.IsNullOrEmpty(dumpsPath)) { @@ -414,8 +410,6 @@ public void WriteToLogWithLevel( case LogLevel.Error: ObserverLogger.LogError("{0} logged at level {1}: {2}", property, level, description); break; - default: - throw new ArgumentOutOfRangeException(nameof(level), level, null); } Logger.Flush(); @@ -626,7 +620,7 @@ public void ProcessResourceDataReportHealth( string thresholdName = "Minimum"; bool warningOrError = false; - string repPartitionId = null, repOrInstanceId = null, name = null, id = null, procName = null; + string name = string.Empty, id = string.Empty, drive = string.Empty; T threshold = thresholdWarning; HealthState healthState = HealthState.Ok; Uri appName = null; @@ -637,13 +631,10 @@ 
public void ProcessResourceDataReportHealth( { if (replicaOrInstance != null) { - repPartitionId = $"Partition: {replicaOrInstance.PartitionId}"; - repOrInstanceId = $"Replica: {replicaOrInstance.ReplicaOrInstanceId}"; - // Create a unique id which will be used for health Warnings and OKs (clears). appName = replicaOrInstance.ApplicationName; serviceName = replicaOrInstance.ServiceName; - name = appName.OriginalString.Replace("fabric:/", string.Empty); + name = serviceName.OriginalString.Replace($"{appName.OriginalString}/", string.Empty); } else // System service report from FabricSystemObserver. { @@ -659,9 +650,9 @@ public void ProcessResourceDataReportHealth( telemetryData = new TelemetryData(FabricClientInstance, Token) { ApplicationName = appName?.OriginalString ?? string.Empty, - Code = FOErrorWarningCodes.Ok, - HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName = NodeName, + Code = string.Empty, + HealthState = string.Empty, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 0), @@ -674,73 +665,66 @@ public void ProcessResourceDataReportHealth( // If the source issue is from FSO, then set the SystemServiceProcessName on TD instance. if (appName != null && appName.OriginalString == "fabric:/System") { - telemetryData.SystemServiceProcessName = data.Id; + telemetryData.SystemServiceProcessName = name; } - try + // Container + if (!string.IsNullOrEmpty(replicaOrInstance?.ContainerId)) { - if (replicaOrInstance != null && replicaOrInstance.HostProcessId > 0) - { - using (Process process = Process.GetProcessById((int)replicaOrInstance.HostProcessId)) - { - procName = process.ProcessName; - } - } - else - { - // The name of the target service process is always the id for data containers coming from FSO. 
- procName = data.Id; - } - - telemetryData.ServiceName = procName; + telemetryData.ContainerId = replicaOrInstance.ContainerId; + } - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + try + { + // Telemetry - This is informational, per reading telemetry, healthstate is irrelevant here. + // Enable this for your observer if you want to send data to ApplicationInsights or LogAnalytics for each resource usage observation it makes per specified metric. + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, Token).ConfigureAwait(false); } + // ETW - This is informational, per reading EventSource tracing, healthstate is irrelevant here. + // Enable this for your observer if you want to log etw (which can then be read by some agent that will send it to some endpoint) + // for each resource usage observation it makes per specified metric. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { ApplicationName = appName?.OriginalString ?? string.Empty, - Code = FOErrorWarningCodes.Ok, - HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), NodeName, ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 0), PartitionId = replicaOrInstance?.PartitionId.ToString(), ReplicaId = replicaOrInstance?.ReplicaOrInstanceId.ToString(), - ServiceName = procName, + ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, + SystemServiceProcessName = appName?.OriginalString == "fabric:/system" ? name : string.Empty, }); } } - catch (ArgumentException) - { - return; - } - catch (InvalidOperationException) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { + // Process no longer exists. Do not report on it. 
return; } } else { - string drive = string.Empty; + drive = string.Empty; + id = data.Id; if (ObserverName == ObserverConstants.DiskObserverName) { - drive = $"{data.Id}: "; + drive = $"{id}: "; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - drive = $"{data.Id.Remove(1, 2)}: "; + drive = $"{id.Remove(1, 2)}: "; } } @@ -749,8 +733,8 @@ public void ProcessResourceDataReportHealth( // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { - Code = FOErrorWarningCodes.Ok, - HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), + Code = string.Empty, + HealthState = string.Empty, NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", @@ -758,7 +742,7 @@ public void ProcessResourceDataReportHealth( Value = Math.Round(data.AverageDataValue, 0), }; - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, @@ -767,12 +751,12 @@ public void ProcessResourceDataReportHealth( if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { - Code = FOErrorWarningCodes.Ok, - HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), + Code = string.Empty, + HealthState = string.Empty, NodeName, ObserverName, Metric = $"{drive}{data.Property}", @@ -798,27 +782,32 @@ public void ProcessResourceDataReportHealth( { int procId = (int)replicaOrInstance.HostProcessId; - if (!serviceDumpCountDictionary.ContainsKey(procName)) + using (var proc = Process.GetProcessById(procId)) { - serviceDumpCountDictionary.Add(procName, 0); - } + string procName = proc?.ProcessName; - if (serviceDumpCountDictionary[procName] < maxDumps) - { - // DumpServiceProcess defaults to a Full dump with - // process memory, handles and thread data. 
- bool success = DumpServiceProcessWindows(procId); + if (!serviceDumpCountDictionary.ContainsKey(procName)) + { + serviceDumpCountDictionary.Add(procName, 0); + } - if (success) + if (serviceDumpCountDictionary[procName] < maxDumps) { - serviceDumpCountDictionary[procName]++; + // DumpServiceProcess defaults to a Full dump with + // process memory, handles and thread data. + bool success = DumpServiceProcessWindows(procId); + + if (success) + { + serviceDumpCountDictionary[procName]++; + } } } } // Ignore these, it just means no dmp will be created.This is not // critical to FO. Log as info, not warning. - catch (Exception e) when (e is ArgumentException || e is InvalidOperationException) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { ObserverLogger.LogInfo($"Unable to generate dmp file:{Environment.NewLine}{e}"); } @@ -925,18 +914,6 @@ public void ProcessResourceDataReportHealth( StringBuilder healthMessage = new StringBuilder(); - string drive = string.Empty; - - if (ObserverName == ObserverConstants.DiskObserverName) - { - drive = $"{data.Id}: "; - - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - drive = $"{data.Id.Remove(1, 2)}: "; - } - } - _ = healthMessage.Append($"{drive}{data.Property} is at or above the specified {thresholdName} limit ({threshold}{data.Units})"); _ = healthMessage.Append($" - {data.Property}: {Math.Round(data.AverageDataValue, 0)}{data.Units}"); @@ -952,14 +929,10 @@ public void ProcessResourceDataReportHealth( } telemetryData.HealthState = Enum.GetName(typeof(HealthState), healthState); - telemetryData.HealthEventDescription = healthMessage.ToString(); - telemetryData.Metric = $"{drive}{data.Property}"; - telemetryData.ServiceName = serviceName?.OriginalString ?? 
string.Empty; - telemetryData.Source = ObserverConstants.FabricObserverName; - telemetryData.Value = Math.Round(data.AverageDataValue, 0); + telemetryData.Description = healthMessage.ToString(); // Send Health Report as Telemetry event (perhaps it signals an Alert from App Insights, for example.). - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( telemetryData, @@ -969,7 +942,7 @@ public void ProcessResourceDataReportHealth( // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { @@ -977,12 +950,13 @@ public void ProcessResourceDataReportHealth( Code = errorWarningCode, ContainerId = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty, HealthState = Enum.GetName(typeof(HealthState), healthState), - HealthEventDescription = healthMessage.ToString(), + Description = healthMessage.ToString(), Metric = $"{drive}{data.Property}", Node = NodeName, ServiceName = serviceName?.OriginalString ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 0), + SystemServiceProcessName = appName?.OriginalString == "fabric:/system" ? name : string.Empty, }); } @@ -1006,53 +980,8 @@ public void ProcessResourceDataReportHealth( AppNames.Add(appName?.OriginalString); } - // From FSO. 
- if (replicaOrInstance == null && healthReportType == HealthReportType.Application) - { - HealthReportProperties.Add(id); - } - else - { - if (HealthReportProperties.Count == 0) - { - switch(ObserverName) - { - case ObserverConstants.AppObserverName: - HealthReportProperties.Add("ApplicationHealth"); - break; - case ObserverConstants.CertificateObserverName: - HealthReportProperties.Add("SecurityHealth"); - break; - case ObserverConstants.DiskObserverName: - HealthReportProperties.Add("DiskHealth"); - break; - case ObserverConstants.FabricSystemObserverName: - HealthReportProperties.Add("FabricSystemServiceHealth"); - break; - case ObserverConstants.NetworkObserverName: - HealthReportProperties.Add("NetworkHealth"); - break; - case ObserverConstants.OSObserverName: - HealthReportProperties.Add("MachineInformation"); - break; - case ObserverConstants.NodeObserverName: - HealthReportProperties.Add("MachineResourceHealth"); - break; - default: - HealthReportProperties.Add($"{data.Property}"); - break; - } - } - } - - healthReport.Property = HealthReportProperties.Last(); - - if (HealthReportSourceIds.Count == 0) - { - HealthReportSourceIds.Add($"{ObserverName}({errorWarningCode})"); - } - - healthReport.SourceId = HealthReportSourceIds.Last(); + healthReport.Property = id; + healthReport.SourceId = $"{ObserverName}({errorWarningCode})"; // Generate a Service Fabric Health Report. 
HealthReporter.ReportHealthToServiceFabric(healthReport); @@ -1083,13 +1012,13 @@ public void ProcessResourceDataReportHealth( } telemetryData.HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok); - telemetryData.HealthEventDescription = $"{data.Property} is now within normal/expected range."; + telemetryData.Description = $"{data.Property} is now within normal/expected range."; telemetryData.Metric = data.Property; telemetryData.Source = ObserverConstants.FabricObserverName; telemetryData.Value = Math.Round(data.AverageDataValue, 0); // Telemetry - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, @@ -1099,7 +1028,7 @@ public void ProcessResourceDataReportHealth( // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { @@ -1107,12 +1036,13 @@ public void ProcessResourceDataReportHealth( Code = data.ActiveErrorOrWarningCode, ContainerId = replicaOrInstance != null ? replicaOrInstance.ContainerId ?? string.Empty : string.Empty, HealthState = Enum.GetName(typeof(HealthState), HealthState.Ok), - HealthEventDescription = $"{data.Property} is now within normal/expected range.", + Description = $"{data.Property} is now within normal/expected range.", Metric = data.Property, Node = NodeName, ServiceName = name ?? string.Empty, Source = ObserverConstants.FabricObserverName, Value = Math.Round(data.AverageDataValue, 0), + SystemServiceProcessName = appName?.OriginalString == "fabric:/system" ? name : string.Empty, }); } @@ -1131,58 +1061,8 @@ public void ProcessResourceDataReportHealth( ResourceUsageDataProperty = data.Property, }; - if (!AppNames.Any(a => a == appName?.OriginalString)) - { - AppNames.Add(appName?.OriginalString); - } - - // From FSO. 
- if (replicaOrInstance == null && healthReportType == HealthReportType.Application) - { - HealthReportProperties.Add(id); - } - else - { - if (HealthReportProperties.Count == 0) - { - switch (ObserverName) - { - case ObserverConstants.AppObserverName: - HealthReportProperties.Add("ApplicationHealth"); - break; - case ObserverConstants.CertificateObserverName: - HealthReportProperties.Add("SecurityHealth"); - break; - case ObserverConstants.DiskObserverName: - HealthReportProperties.Add("DiskHealth"); - break; - case ObserverConstants.FabricSystemObserverName: - HealthReportProperties.Add("FabricSystemServiceHealth"); - break; - case ObserverConstants.NetworkObserverName: - HealthReportProperties.Add("NetworkHealth"); - break; - case ObserverConstants.OSObserverName: - HealthReportProperties.Add("MachineInformation"); - break; - case ObserverConstants.NodeObserverName: - HealthReportProperties.Add("MachineResourceHealth"); - break; - default: - HealthReportProperties.Add($"{data.Property}"); - break; - } - } - } - - healthReport.Property = HealthReportProperties.Last(); - - if (HealthReportSourceIds.Count == 0) - { - HealthReportSourceIds.Add($"{ObserverName}({data.ActiveErrorOrWarningCode})"); - } - - healthReport.SourceId = HealthReportSourceIds.Last(); + healthReport.Property = id; + healthReport.SourceId = $"{ObserverName}({data.ActiveErrorOrWarningCode})"; // Emit an Ok Health Report to clear Fabric Health warning. HealthReporter.ReportHealthToServiceFabric(healthReport); @@ -1190,8 +1070,6 @@ public void ProcessResourceDataReportHealth( // Reset health states. 
data.ActiveErrorOrWarning = false; data.ActiveErrorOrWarningCode = FOErrorWarningCodes.Ok; - HealthReportProperties.Clear(); - HealthReportSourceIds.Clear(); HasActiveFabricErrorOrWarning = false; } } @@ -1211,34 +1089,33 @@ public void ProcessResourceDataReportHealth( } /// - /// Sets TTL for observer health reports based on how long it takes an observer to run, - /// plus more time to guarantee a health report will remain active until the next time the observer runs. + /// Computes TTL for an observer's health reports based on how long it takes an observer to run, time differential between runs, + /// observer loop sleep time, plus a little more time to guarantee that a health report will remain active until the next time the observer runs. + /// Note that if you set a RunInterval on an observer, that will be reflected here and the Warning will last for that amount of time at least. /// /// TimeSpan that contains the TTL value. - public TimeSpan SetHealthReportTimeToLive() + public TimeSpan GetHealthReportTimeToLive() { - int obsSleepTime = ObserverConstants.ObserverRunLoopSleepTimeSeconds; + _ = int.TryParse( + GetSettingParameterValue( + ObserverConstants.ObserverManagerConfigurationSectionName, + ObserverConstants.ObserverLoopSleepTimeSeconds), + out int obsSleepTime); // First run. if (LastRunDateTime == DateTime.MinValue) { - _ = int.TryParse(GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.ObserverLoopSleepTimeSeconds), out obsSleepTime); - return TimeSpan.FromSeconds(obsSleepTime) - .Add(TimeSpan.FromMinutes(TtlAddMinutes)); + .Add(TimeSpan.FromMinutes(TtlAddMinutes)) + .Add(RunInterval > TimeSpan.MinValue ? RunInterval : TimeSpan.Zero); } return DateTime.Now.Subtract(LastRunDateTime) - .Add(TimeSpan.FromSeconds(RunDuration > TimeSpan.MinValue ? RunDuration.TotalSeconds : 0)) - .Add(TimeSpan.FromSeconds(obsSleepTime)) - .Add(RunInterval > TimeSpan.MinValue ? 
RunInterval : TimeSpan.Zero); + .Add(TimeSpan.FromSeconds(RunDuration > TimeSpan.MinValue ? RunDuration.TotalSeconds : 0)) + .Add(TimeSpan.FromSeconds(obsSleepTime)); } // This is here so each Observer doesn't have to implement IDisposable. - // If an Observer needs to dispose, then override this non-impl. - private bool disposedValue; - private bool etwEnabled; - protected virtual void Dispose(bool disposing) { if (disposedValue) @@ -1246,15 +1123,6 @@ protected virtual void Dispose(bool disposing) return; } - if (disposing) - { - if (FabricClientInstance != null) - { - FabricClientInstance.Dispose(); - FabricClientInstance = null; - } - } - disposedValue = true; } @@ -1300,6 +1168,12 @@ private void SetConfiguration() IsTelemetryProviderEnabled = telemEnabled; } + // ETW + if (bool.TryParse(GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.EnableETWProvider), out bool etwProviderEnabled)) + { + IsEtwProviderEnabled = etwProviderEnabled; + } + if (IsTelemetryProviderEnabled) { string telemetryProviderType = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.TelemetryProviderType); diff --git a/FabricObserver.Extensibility/Utilities/ConfigSettings.cs b/FabricObserver.Extensibility/Utilities/ConfigSettings.cs index 932ee40e..27f56ab9 100644 --- a/FabricObserver.Extensibility/Utilities/ConfigSettings.cs +++ b/FabricObserver.Extensibility/Utilities/ConfigSettings.cs @@ -64,6 +64,11 @@ public bool UseCircularBuffer get; set; } + public bool IsObserverEtwEnabled + { + get; set; + } + public ConfigSettings(ConfigurationSettings settings, string observerConfiguration) { Settings = settings; @@ -72,8 +77,7 @@ public ConfigSettings(ConfigurationSettings settings, string observerConfigurati UpdateConfigSettings(); } - public void UpdateConfigSettings( - ConfigurationSettings settings = null) + public void UpdateConfigSettings(ConfigurationSettings settings = null) { if 
(settings != null) { @@ -82,72 +86,82 @@ public void UpdateConfigSettings( // Observer enabled? if (bool.TryParse( - GetConfigSettingValue( - ObserverConstants.ObserverEnabledParameter), - out bool enabled)) + GetConfigSettingValue( + ObserverConstants.ObserverEnabledParameter), + out bool enabled)) { IsEnabled = enabled; } // Observer telemetry enabled? if (bool.TryParse( - GetConfigSettingValue( - ObserverConstants.ObserverTelemetryEnabledParameter), - out bool telemetryEnabled)) + GetConfigSettingValue( + ObserverConstants.ObserverTelemetryEnabledParameter), + out bool telemetryEnabled)) { IsObserverTelemetryEnabled = telemetryEnabled; } + + // Observer etw enabled? + if (bool.TryParse( + GetConfigSettingValue( + ObserverConstants.ObserverEtwEnabledParameter), + out bool etwEnabled)) + { + IsObserverEtwEnabled = etwEnabled; + } + // Verbose logging? if (bool.TryParse( - GetConfigSettingValue( - ObserverConstants.EnableVerboseLoggingParameter), - out bool enableVerboseLogging)) + GetConfigSettingValue( + ObserverConstants.EnableVerboseLoggingParameter), + out bool enableVerboseLogging)) { EnableVerboseLogging = enableVerboseLogging; } // RunInterval? if (TimeSpan.TryParse( - GetConfigSettingValue( - ObserverConstants.ObserverRunIntervalParameter), - out TimeSpan runInterval)) + GetConfigSettingValue( + ObserverConstants.ObserverRunIntervalParameter), + out TimeSpan runInterval)) { RunInterval = runInterval; } // Monitor duration. if (TimeSpan.TryParse( - GetConfigSettingValue( - ObserverConstants.MonitorDurationParameter), - out TimeSpan monitorDuration)) + GetConfigSettingValue( + ObserverConstants.MonitorDurationParameter), + out TimeSpan monitorDuration)) { MonitorDuration = monitorDuration; } // Async cluster operation timeout setting.. 
if (int.TryParse( - GetConfigSettingValue( - ObserverConstants.AsyncClusterOperationTimeoutSeconds), - out int asyncOpTimeoutSeconds)) + GetConfigSettingValue( + ObserverConstants.AsyncClusterOperationTimeoutSeconds), + out int asyncOpTimeoutSeconds)) { AsyncTimeout = TimeSpan.FromSeconds(asyncOpTimeoutSeconds); } // Resource usage data collection item capacity. if (int.TryParse( - GetConfigSettingValue( - ObserverConstants.DataCapacityParameter), - out int dataCapacity)) + GetConfigSettingValue( + ObserverConstants.DataCapacityParameter), + out int dataCapacity)) { DataCapacity = dataCapacity; } // Resource usage data collection type. if (bool.TryParse( - GetConfigSettingValue( - ObserverConstants.UseCircularBufferParameter), - out bool useCircularBuffer)) + GetConfigSettingValue( + ObserverConstants.UseCircularBufferParameter), + out bool useCircularBuffer)) { UseCircularBuffer = useCircularBuffer; } diff --git a/FabricObserver.Extensibility/Utilities/CpuUtilization/WindowsCpuUtilizationProvider.cs b/FabricObserver.Extensibility/Utilities/CpuUtilization/WindowsCpuUtilizationProvider.cs index 7635cf3c..f1627d47 100644 --- a/FabricObserver.Extensibility/Utilities/CpuUtilization/WindowsCpuUtilizationProvider.cs +++ b/FabricObserver.Extensibility/Utilities/CpuUtilization/WindowsCpuUtilizationProvider.cs @@ -11,11 +11,13 @@ namespace FabricObserver.Observers.Utilities { public class WindowsCpuUtilizationProvider : CpuUtilizationProvider { - private PerformanceCounter performanceCounter = new PerformanceCounter( - categoryName: "Processor", - counterName: "% Processor Time", - instanceName: "_Total", - readOnly: true); + private PerformanceCounter performanceCounter = new PerformanceCounter() + { + CategoryName = "Processor", + CounterName = "% Processor Time", + InstanceName = "_Total", + ReadOnly = true, + }; public override Task NextValueAsync() { @@ -26,7 +28,11 @@ public override Task NextValueAsync() throw new 
ObjectDisposedException(nameof(WindowsCpuUtilizationProvider)); } + // warm up counter. + _ = perfCounter.NextValue(); + float result = perfCounter.NextValue(); + return Task.FromResult(result); } diff --git a/FabricObserver.Extensibility/Utilities/DiskUsage.cs b/FabricObserver.Extensibility/Utilities/DiskUsage.cs index 888c898a..403d3cda 100644 --- a/FabricObserver.Extensibility/Utilities/DiskUsage.cs +++ b/FabricObserver.Extensibility/Utilities/DiskUsage.cs @@ -14,9 +14,6 @@ namespace FabricObserver.Observers.Utilities { public static class DiskUsage { - private static PerformanceCounter diskAverageQueueLengthCounter = - new PerformanceCounter(categoryName: "LogicalDisk", counterName: "Avg. Disk Queue Length", readOnly: true); - public static bool ShouldCheckDrive(DriveInfo driveInfo) { if (!driveInfo.IsReady) @@ -99,33 +96,49 @@ public static double GetUsedDiskSpace(string driveName, SizeUnit sizeUnit = Size public static float GetAverageDiskQueueLength(string instance) { - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + // We do not support this on Linux for now. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) { - try - { - DiskUsage.diskAverageQueueLengthCounter.InstanceName = instance; - return DiskUsage.diskAverageQueueLengthCounter.NextValue(); - } - catch (Exception e) + return 0F; + } + + PerformanceCounter diskAverageQueueLengthCounter = null; + + try + { + diskAverageQueueLengthCounter = new PerformanceCounter() { - Logger logger = new Logger("Utilities"); + InstanceName = instance, + CategoryName = "LogicalDisk", + CounterName = "Avg. 
Disk Queue Length", + ReadOnly = true, + }; - if (e is ArgumentNullException || e is PlatformNotSupportedException - || e is System.ComponentModel.Win32Exception || e is UnauthorizedAccessException) - { - logger.LogError($"{DiskUsage.diskAverageQueueLengthCounter.CategoryName} {DiskUsage.diskAverageQueueLengthCounter.CounterName} PerfCounter handled exception: " + e); + // Warm up counter + _ = diskAverageQueueLengthCounter.NextValue(); - // Don't throw. - return 0F; - } + return diskAverageQueueLengthCounter.NextValue(); + } + catch (Exception e) + { + Logger logger = new Logger("Utilities"); + + if (e is ArgumentNullException || e is PlatformNotSupportedException + || e is System.ComponentModel.Win32Exception || e is UnauthorizedAccessException) + { + logger.LogError($"{diskAverageQueueLengthCounter.CategoryName} {diskAverageQueueLengthCounter.CounterName} PerfCounter handled exception: " + e); - logger.LogError($"{DiskUsage.diskAverageQueueLengthCounter.CategoryName} {DiskUsage.diskAverageQueueLengthCounter.CounterName} PerfCounter unhandled exception: " + e); - throw; + // Don't throw. + return 0F; } - } - // We do not support this on Linux for now - return 0F; + logger.LogError($"{diskAverageQueueLengthCounter.CategoryName} {diskAverageQueueLengthCounter.CounterName} PerfCounter unhandled exception: " + e); + throw; + } + finally + { + diskAverageQueueLengthCounter?.Dispose(); + } } private static double ConvertToSizeUnits(double amount, SizeUnit sizeUnit) diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index b27ed69e..428aaa0e 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -22,6 +22,7 @@ namespace FabricObserver.Observers.Utilities public sealed class Logger : IObserverLogger { private const int Retries = 5; + private EventSource etwLogger = null; // Text file logger for observers - info/warn/error. 
private ILogger OLogger @@ -31,11 +32,24 @@ private ILogger OLogger private readonly string loggerName; - public static EventSource EtwLogger + private EventSource EtwLogger { - get; private set; + get + { + if (EnableETWLogging && etwLogger == null) + { + etwLogger = new EventSource(ObserverConstants.EventSourceProviderName); + } + + return etwLogger; + } } + public bool EnableETWLogging + { + get; set; + } = false; + public bool EnableVerboseLogging { get; set; @@ -61,14 +75,6 @@ public string Filename get; } - static Logger() - { - if (EtwLogger == null) - { - EtwLogger = new EventSource(ObserverConstants.EventSourceProviderName); - } - } - /// /// Initializes a new instance of the class. /// @@ -123,6 +129,11 @@ public void LogWarning(string format, params object[] parameters) OLogger.Warn(format, parameters); } + public void LogEtw(string eventName, T data) + { + EtwLogger?.Write(eventName, data); + } + public bool TryWriteLogFile(string path, string content) { if (string.IsNullOrEmpty(content)) diff --git a/FabricObserver.Extensibility/Utilities/MemoryUsage/WindowsMemoryUsageProvider.cs b/FabricObserver.Extensibility/Utilities/MemoryUsage/WindowsMemoryUsageProvider.cs index 672ee057..21613856 100644 --- a/FabricObserver.Extensibility/Utilities/MemoryUsage/WindowsMemoryUsageProvider.cs +++ b/FabricObserver.Extensibility/Utilities/MemoryUsage/WindowsMemoryUsageProvider.cs @@ -9,14 +9,28 @@ namespace FabricObserver.Observers.Utilities { public class WindowsMemoryUsageProvider : MemoryUsageProvider { - private static readonly PerformanceCounter MemCommittedBytesPerfCounter = new PerformanceCounter( - categoryName: "Memory", - counterName: "Committed Bytes", - readOnly: true); - public override ulong GetCommittedBytes() { - return (ulong)MemCommittedBytesPerfCounter.NextValue(); + PerformanceCounter memCommittedBytesPerfCounter = null; + + try + { + memCommittedBytesPerfCounter = new PerformanceCounter() + { + CategoryName = "Memory", + CounterName = "Committed 
Bytes", + ReadOnly = true, + }; + + // warm up counter. + _ = memCommittedBytesPerfCounter.NextValue(); + + return (ulong)memCommittedBytesPerfCounter.NextValue(); + } + finally + { + memCommittedBytesPerfCounter?.Dispose(); + } } } } \ No newline at end of file diff --git a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs index 1623953a..651e144a 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverConstants.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverConstants.cs @@ -12,7 +12,7 @@ public sealed class ObserverConstants public const string ObserverWebApiAppDeployed = "ObserverWebApiEnabled"; public const string EnableLongRunningCsvLogging = "EnableLongRunningCSVLogging"; public const string Fqdn = "FQDN"; - public const string EnableEventSourceProvider = "EnableEventSourceProvider"; + public const string EnableETWProvider = "EnableETWProvider"; public const string EventSourceProviderName = "FabricObserverETWProvider"; public const string FabricObserverTelemetryEnabled = "EnableFabricObserverDiagnosticTelemetry"; public const string AsyncClusterOperationTimeoutSeconds = "ClusterOperationTimeoutSeconds"; @@ -42,6 +42,7 @@ public sealed class ObserverConstants public const string ObserverRunIntervalParameter = "RunInterval"; public const string ObserverEnabledParameter = "Enabled"; public const string ObserverTelemetryEnabledParameter = "EnableTelemetry"; + public const string ObserverEtwEnabledParameter = "EnableEtw"; public const string EnableVerboseLoggingParameter = "EnableVerboseLogging"; public const string DataCapacityParameter = "ResourceUsageDataCapacity"; public const string UseCircularBufferParameter = "UseCircularBuffer"; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index a599e945..f4721dc5 100644 --- 
a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -10,15 +10,9 @@ namespace FabricObserver.Observers.Utilities { - // Since we only create a single instance of WindowsProcessInfoProvider, it is OK - // to not dispose counters. -#pragma warning disable CA1001 // Types that own disposable fields should be disposable public class WindowsProcessInfoProvider : ProcessInfoProvider { const string CategoryName = "Process"; - - private readonly PerformanceCounter memProcessPrivateWorkingSetCounter = new PerformanceCounter(); - private readonly PerformanceCounter processFileHandleCounter = new PerformanceCounter(); private readonly object memPerfCounterLock = new object(); private readonly object fileHandlesPerfCounterLock = new object(); @@ -43,11 +37,19 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) lock (memPerfCounterLock) { + PerformanceCounter memProcessPrivateWorkingSetCounter = null; + try { - memProcessPrivateWorkingSetCounter.CategoryName = CategoryName; - memProcessPrivateWorkingSetCounter.CounterName = WorkingSetCounterName; - memProcessPrivateWorkingSetCounter.InstanceName = processName; + memProcessPrivateWorkingSetCounter = new PerformanceCounter + { + CategoryName = CategoryName, + CounterName = WorkingSetCounterName, + InstanceName = processName + }; + + // warm up counter. 
+ _ = memProcessPrivateWorkingSetCounter.NextValue(); return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); } @@ -65,6 +67,11 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) throw; } + finally + { + memProcessPrivateWorkingSetCounter?.Dispose(); + memProcessPrivateWorkingSetCounter = null; + } } } @@ -95,11 +102,19 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService lock (fileHandlesPerfCounterLock) { + PerformanceCounter processFileHandleCounter = null; + try { - processFileHandleCounter.CategoryName = CategoryName; - processFileHandleCounter.CounterName = FileHandlesCounterName; - processFileHandleCounter.InstanceName = processName; + processFileHandleCounter = new PerformanceCounter + { + CategoryName = CategoryName, + CounterName = FileHandlesCounterName, + InstanceName = processName + }; + + // warm up counter. + _ = processFileHandleCounter.NextValue(); return processFileHandleCounter.NextValue(); } @@ -117,6 +132,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService throw; } + finally + { + processFileHandleCounter?.Dispose(); + processFileHandleCounter = null; + } } } } diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 8f73c814..637fc1a1 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -186,7 +186,7 @@ public Task ReportHealthAsync( { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.HealthEventDescription ?? string.Empty }, + { "HealthEventDescription", telemetryData.Description ?? string.Empty }, { "HealthState", telemetryData.HealthState ?? 
string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, { "NodeName", telemetryData.NodeName ?? string.Empty }, @@ -259,7 +259,7 @@ public Task ReportMetricAsync( { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.HealthEventDescription ?? string.Empty }, + { "HealthEventDescription", telemetryData.Description ?? string.Empty }, { "HealthState", telemetryData.HealthState ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, { "NodeName", telemetryData.NodeName ?? string.Empty }, diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs index a0509433..dab3044d 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryData.cs @@ -33,7 +33,7 @@ public string ContainerId get; set; } - public string HealthEventDescription + public string Description { get; set; } diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 4b39204a..0ff60e44 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,8 +2,8 @@ %PACKAGE_ID% - 3.1.5 - Updated TelemetryData type with new member, SystemServiceProcessName. Updated ObserverBase's ProcessDataReportHealth function with support for new TD member in health report output. Minor bug fixes. + 3.1.6 + A few *important* bug fixes in AppObserver and Observerbase. Changes to Logger (ETW). Naming changes. **Please upgrade as soon as you can**. Changes: AppObserver will now monitor All applications for specified metric thresholds when you supply "All" or "*" for the targetApp setting in AppObserver's configuration file. See project site for details. Fixed minor bug in ObserverManager's StopObserversAsync function. 
BREAKING CHANGES: You have to now enable/disable ETW on each observer that you want to generate EventSource traces. Also, in your plugins or forked observers, you will need to replace Logger.EtwLogger.Write() calls with ObserverLogger.LogEtw(). ObserverBase's HealthReportProperties and HealthReportSourceIds are gone. Just supply string values for sourceid/property if you generate your own health events. Microsoft MIT false diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index 4111071d..b15edd49 100644 --- a/FabricObserver/FabricObserver.cs +++ b/FabricObserver/FabricObserver.cs @@ -23,8 +23,7 @@ namespace FabricObserver /// internal sealed class FabricObserver : StatelessService { - private ObserverManager observerManager; - private FabricClient fabricClient; + private readonly FabricClient fabricClient; /// /// Initializes a new instance of the class. @@ -47,7 +46,8 @@ protected override async Task RunAsync(CancellationToken cancellationToken) ConfigureServices(services); using ServiceProvider serviceProvider = services.BuildServiceProvider(); - observerManager = new ObserverManager(serviceProvider, fabricClient, cancellationToken); + using ObserverManager observerManager = new ObserverManager(serviceProvider, fabricClient, cancellationToken); + await observerManager.StartObserversAsync().ConfigureAwait(false); } diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index 9c32d9ac..ddc7e869 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -40,7 +40,6 @@ public class AppObserver : ObserverBase // deployedTargetList is the list of ApplicationInfo objects representing currently deployed applications in the user-supplied list. 
private readonly List deployedTargetList; - private bool disposed; private readonly MachineInfoModel.ConfigSettings configSettings; public List ReplicaOrInstanceList @@ -109,6 +108,7 @@ public override async Task ObserveAsync(CancellationToken token) stopwatch.Reset(); await ReportAsync(token).ConfigureAwait(true); + LastRunDateTime = DateTime.Now; } @@ -123,7 +123,7 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - var healthReportTimeToLive = SetHealthReportTimeToLive(); + var healthReportTimeToLive = GetHealthReportTimeToLive(); // App-specific reporting. foreach (var app in deployedTargetList) @@ -135,7 +135,7 @@ public override Task ReportAsync(CancellationToken token) { token.ThrowIfCancellationRequested(); - if (!string.IsNullOrEmpty(app.TargetAppType) + if (!string.IsNullOrWhiteSpace(app.TargetAppType) && !string.Equals( repOrInst.ApplicationTypeName, app.TargetAppType, @@ -144,7 +144,7 @@ public override Task ReportAsync(CancellationToken token) continue; } - if (!string.IsNullOrEmpty(app.TargetApp) + if (!string.IsNullOrWhiteSpace(app.TargetApp) && !string.Equals( repOrInst.ApplicationName.OriginalString, app.TargetApp, @@ -272,16 +272,6 @@ public override Task ReportAsync(CancellationToken token) } } - protected override void Dispose(bool disposing) - { - if (disposed || !disposing) - { - return; - } - - disposed = true; - } - private static string GetAppNameOrType(ReplicaOrInstanceMonitoringInfo repOrInst) { // targetType specified as TargetAppType name, which means monitor all apps of specified type. @@ -355,10 +345,104 @@ private async Task InitializeAsync() return false; } + // Support for specifying single configuration item for any or all or * applications. 
+ if (userTargetList != null && userTargetList.Any(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*")) + { + ApplicationInfo application = userTargetList.Find(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); + + var appList = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync( + NodeName, + null, + ConfigurationSettings.AsyncTimeout, + Token).ConfigureAwait(false); + + foreach (var app in appList) + { + Token.ThrowIfCancellationRequested(); + + if (app.ApplicationName.OriginalString == "fabric:/System") + { + continue; + } + + // App filtering: AppExludeList, AppIncludeList. This is only useful when you are observing All/* applications for a range of thresholds. + if (!string.IsNullOrWhiteSpace(application.AppExcludeList) && application.AppExcludeList.Contains(app.ApplicationName.OriginalString)) + { + continue; + } + else if(!string.IsNullOrWhiteSpace(application.AppIncludeList) && !application.AppIncludeList.Contains(app.ApplicationName.OriginalString)) + { + continue; + } + + // Don't create a brand new entry for an existing (specified in configuration) app target/type. Just update the appConfig instance with data supplied in the All//* apps config entry. + // Note that if you supply a conflicting setting (where you specify a threshold for a specific app target config item and also in a global config item), then the target-specific setting will be used. + // E.g., if you supply a memoryWarningLimitMb threshold for an app named fabric:/MyApp and also supply a memoryWarningLimitMb threshold for all apps ("targetApp" : "All"), + // then the threshold specified for fabric:/MyApp will remain in place for that app target. So, target specificity overrides any global setting. 
+ if (userTargetList.Any(a => a.TargetApp == app.ApplicationName.OriginalString || a.TargetAppType == app.ApplicationTypeName)) + { + var existingAppConfig = userTargetList.Find(a => a.TargetApp == app.ApplicationName.OriginalString || a.TargetAppType == app.ApplicationTypeName); + + if (existingAppConfig == null) + { + continue; + } + + existingAppConfig.ServiceExcludeList = string.IsNullOrWhiteSpace(existingAppConfig.ServiceExcludeList) && !string.IsNullOrWhiteSpace(application.ServiceExcludeList) ? application.ServiceExcludeList : existingAppConfig.ServiceExcludeList; + existingAppConfig.ServiceIncludeList = string.IsNullOrWhiteSpace(existingAppConfig.ServiceExcludeList) && !string.IsNullOrWhiteSpace(application.ServiceIncludeList) ? application.ServiceIncludeList : existingAppConfig.ServiceIncludeList; + existingAppConfig.MemoryWarningLimitMb = existingAppConfig.MemoryWarningLimitMb == 0 && application.MemoryWarningLimitMb > 0 ? application.MemoryWarningLimitMb : existingAppConfig.MemoryWarningLimitMb; + existingAppConfig.MemoryErrorLimitMb = existingAppConfig.MemoryErrorLimitMb == 0 && application.MemoryErrorLimitMb > 0 ? application.MemoryErrorLimitMb : existingAppConfig.MemoryErrorLimitMb; + existingAppConfig.MemoryWarningLimitPercent = existingAppConfig.MemoryWarningLimitPercent == 0 && application.MemoryWarningLimitPercent > 0 ? application.MemoryWarningLimitPercent : existingAppConfig.MemoryWarningLimitPercent; + existingAppConfig.MemoryErrorLimitPercent = existingAppConfig.MemoryErrorLimitPercent == 0 && application.MemoryErrorLimitPercent > 0 ? application.MemoryErrorLimitPercent : existingAppConfig.MemoryErrorLimitPercent; + existingAppConfig.CpuErrorLimitPercent = existingAppConfig.CpuErrorLimitPercent == 0 && application.CpuErrorLimitPercent > 0 ? 
application.CpuErrorLimitPercent : existingAppConfig.CpuErrorLimitPercent; + existingAppConfig.CpuWarningLimitPercent = existingAppConfig.CpuWarningLimitPercent == 0 && application.CpuWarningLimitPercent > 0 ? application.CpuWarningLimitPercent : existingAppConfig.CpuWarningLimitPercent; + existingAppConfig.NetworkErrorActivePorts = existingAppConfig.NetworkErrorActivePorts == 0 && application.NetworkErrorActivePorts > 0 ? application.NetworkErrorActivePorts : existingAppConfig.NetworkErrorActivePorts; + existingAppConfig.NetworkWarningActivePorts = existingAppConfig.NetworkWarningActivePorts == 0 && application.NetworkWarningActivePorts > 0 ? application.NetworkWarningActivePorts : existingAppConfig.NetworkWarningActivePorts; + existingAppConfig.NetworkErrorEphemeralPorts = existingAppConfig.NetworkErrorEphemeralPorts == 0 && application.NetworkErrorEphemeralPorts > 0 ? application.NetworkErrorEphemeralPorts : existingAppConfig.NetworkErrorEphemeralPorts; + existingAppConfig.NetworkWarningEphemeralPorts = existingAppConfig.NetworkWarningEphemeralPorts == 0 && application.NetworkWarningEphemeralPorts > 0 ? application.NetworkWarningEphemeralPorts : existingAppConfig.NetworkWarningEphemeralPorts; + existingAppConfig.DumpProcessOnError = application.DumpProcessOnError != existingAppConfig.DumpProcessOnError ? application.DumpProcessOnError : existingAppConfig.DumpProcessOnError; + existingAppConfig.ErrorOpenFileHandles = existingAppConfig.ErrorOpenFileHandles == 0 && application.ErrorOpenFileHandles > 0 ? application.ErrorOpenFileHandles : existingAppConfig.ErrorOpenFileHandles; + existingAppConfig.WarningOpenFileHandles = existingAppConfig.WarningOpenFileHandles == 0 && application.WarningOpenFileHandles > 0 ? 
application.WarningOpenFileHandles : existingAppConfig.WarningOpenFileHandles; + } + else + { + ApplicationInfo appConfig = new ApplicationInfo + { + TargetApp = app.ApplicationName.OriginalString, + TargetAppType = null, + AppExcludeList = application.AppExcludeList, + AppIncludeList = application.AppIncludeList, + ServiceExcludeList = application.ServiceExcludeList, + ServiceIncludeList = application.ServiceIncludeList, + MemoryWarningLimitMb = application.MemoryWarningLimitMb, + MemoryErrorLimitMb = application.MemoryErrorLimitMb, + MemoryWarningLimitPercent = application.MemoryWarningLimitPercent, + MemoryErrorLimitPercent = application.MemoryErrorLimitPercent, + CpuErrorLimitPercent = application.CpuErrorLimitPercent, + CpuWarningLimitPercent = application.CpuWarningLimitPercent, + NetworkErrorActivePorts = application.NetworkErrorActivePorts, + NetworkWarningActivePorts = application.NetworkWarningActivePorts, + NetworkErrorEphemeralPorts = application.NetworkErrorEphemeralPorts, + NetworkWarningEphemeralPorts = application.NetworkWarningEphemeralPorts, + DumpProcessOnError = application.DumpProcessOnError, + ErrorOpenFileHandles = application.ErrorOpenFileHandles, + WarningOpenFileHandles = application.WarningOpenFileHandles, + }; + + userTargetList.Add(appConfig); + } + } + + // Remove the All, Any, * config item. 
+ userTargetList.Remove(application); + } + int settingSFail = 0; foreach (var application in userTargetList) { + Token.ThrowIfCancellationRequested(); + if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) { @@ -379,7 +463,7 @@ private async Task InitializeAsync() return false; } - if (!string.IsNullOrEmpty(application.TargetAppType)) + if (!string.IsNullOrWhiteSpace(application.TargetAppType)) { await SetDeployedApplicationReplicaOrInstanceListAsync( null, @@ -387,15 +471,16 @@ await SetDeployedApplicationReplicaOrInstanceListAsync( } else { - await SetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.TargetApp)) - .ConfigureAwait(false); + await SetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.TargetApp)).ConfigureAwait(false); } } - foreach (var repOrInst in ReplicaOrInstanceList) + foreach (var app in deployedTargetList) { + Token.ThrowIfCancellationRequested(); + ObserverLogger.LogInfo( - $"Will observe resource consumption by {repOrInst.ApplicationName?.OriginalString} " + + $"Will observe resource consumption by {app.TargetApp} " + $"on Node {NodeName}."); } @@ -414,9 +499,10 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) int processId = (int)repOrInst.HostProcessId; var cpuUsage = new CpuUsage(); bool checkCpu = false, checkMemMb = false, checkMemPct = false, checkAllPorts = false, checkEphemeralPorts = false, checkHandles = false; - var application = deployedTargetList?.FirstOrDefault( + var application = deployedTargetList?.Find( app => app?.TargetApp?.ToLower() == repOrInst.ApplicationName?.OriginalString?.ToLower() || - app?.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); + !string.IsNullOrWhiteSpace(app?.TargetAppType) && + app.TargetAppType?.ToLower() == repOrInst.ApplicationTypeName?.ToLower()); if (application?.TargetApp == null && application?.TargetAppType == null) { @@ -529,11 +615,6 @@ private async Task 
MonitorDeployedAppsAsync(CancellationToken token) _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); } - if (checkMemMb || checkMemPct) - { - _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); - } - timer.Start(); while (!currentProcess.HasExited && timer.Elapsed.Seconds <= duration.Seconds) @@ -619,7 +700,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) { WriteToLogWithLevel( ObserverName, - $"MonitorAsync failed to find current service process for {repOrInst.ApplicationName?.OriginalString ?? repOrInst.ApplicationTypeName}{Environment.NewLine}{e}", + $"MonitorDeployedAppsAsync: failed to find current service process or target process is running at a higher privilege than FO for {repOrInst.ApplicationName?.OriginalString ?? repOrInst.ApplicationTypeName}{Environment.NewLine}{e}", LogLevel.Information); } else @@ -628,7 +709,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) { WriteToLogWithLevel( ObserverName, - $"Unhandled exception in MonitorAsync:{Environment.NewLine}{e}", + $"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}", LogLevel.Warning); } @@ -657,10 +738,12 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync( { deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName).ConfigureAwait(true); - if (deployedApps.Count > 0 && !string.IsNullOrEmpty(applicationType)) + if (deployedApps.Count > 0 && !string.IsNullOrWhiteSpace(applicationType)) { for (int i = 0; i < deployedApps.Count; i++) { + Token.ThrowIfCancellationRequested(); + if (deployedApps[i].ApplicationTypeName == applicationType) { continue; @@ -676,43 +759,47 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync( foreach (var deployedApp in deployedApps) { - List filteredServiceList = null; + Token.ThrowIfCancellationRequested(); - var appFilter = userTargetList.Where(x => (x.TargetApp != null || x.TargetAppType 
!= null) - && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) - && (!string.IsNullOrEmpty(x.ServiceExcludeList) - || !string.IsNullOrEmpty(x.ServiceIncludeList)))?.FirstOrDefault(); + List filteredServiceList = null; - // Filter service list if include/exclude service(s) config setting is supplied. - var filterType = ServiceFilterType.None; + // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. + var serviceFilter = userTargetList.Find(x => (x.TargetApp != null || x.TargetAppType != null) + && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) + && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); - if (appFilter != null) + ServiceFilterType filterType = ServiceFilterType.None; + + if (serviceFilter != null) { - if (!string.IsNullOrEmpty(appFilter.ServiceExcludeList)) + if (!string.IsNullOrWhiteSpace(serviceFilter.ServiceExcludeList)) { - filteredServiceList = appFilter.ServiceExcludeList.Split(',').ToList(); + filteredServiceList = serviceFilter.ServiceExcludeList.Split(',').ToList(); filterType = ServiceFilterType.Exclude; } - else if (!string.IsNullOrEmpty(appFilter.ServiceIncludeList)) + else if (!string.IsNullOrWhiteSpace(serviceFilter.ServiceIncludeList)) { - filteredServiceList = appFilter.ServiceIncludeList.Split(',').ToList(); + filteredServiceList = serviceFilter.ServiceIncludeList.Split(',').ToList(); filterType = ServiceFilterType.Include; } } var replicasOrInstances = await GetDeployedPrimaryReplicaAsync( - deployedApp.ApplicationName, - filteredServiceList, - filterType, - applicationType).ConfigureAwait(true); + deployedApp.ApplicationName, + filteredServiceList, + filterType, + applicationType).ConfigureAwait(true); 
ReplicaOrInstanceList.AddRange(replicasOrInstances); deployedTargetList.AddRange(userTargetList.Where( - x => (x.TargetApp != null || x.TargetAppType != null) - && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()))); + x => + { + return (x.TargetApp != null || x.TargetAppType != null) + && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()); + })); } } @@ -738,7 +825,7 @@ private async Task> GetDeployedPrimaryRepl private void SetInstanceOrReplicaMonitoringList( Uri appName, - List serviceFilterList, + List filterList, ServiceFilterType filterType, string appTypeName, DeployedServiceReplicaList deployedReplicaList, @@ -746,6 +833,8 @@ private void SetInstanceOrReplicaMonitoringList( { foreach (var deployedReplica in deployedReplicaList) { + Token.ThrowIfCancellationRequested(); + ReplicaOrInstanceMonitoringInfo replicaInfo = null; if (deployedReplica is DeployedStatefulServiceReplica statefulReplica @@ -761,10 +850,10 @@ private void SetInstanceOrReplicaMonitoringList( ServiceName = statefulReplica.ServiceName, }; - if (serviceFilterList != null + if (filterList != null && filterType != ServiceFilterType.None) { - bool isInFilterList = serviceFilterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); switch (filterType) { @@ -786,10 +875,10 @@ private void SetInstanceOrReplicaMonitoringList( ServiceName = statelessInstance.ServiceName, }; - if (serviceFilterList != null + if (filterList != null && filterType != ServiceFilterType.None) { - bool isInFilterList = serviceFilterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + bool isInFilterList = 
filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); switch (filterType) { diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 0750a31a..38ee28d8 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -211,7 +211,7 @@ public override Task ReportAsync(CancellationToken token) HasActiveFabricErrorOrWarning = true; - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { TelemetryData telemetryData = new TelemetryData(FabricClientInstance, token) { @@ -219,7 +219,7 @@ public override Task ReportAsync(CancellationToken token) HealthState = "Warning", NodeName = NodeName, Metric = ErrorWarningProperty.CertificateExpiration, - HealthEventDescription = healthMessage, + Description = healthMessage, ObserverName = ObserverName, OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", Source = ObserverConstants.FabricObserverName, @@ -233,7 +233,7 @@ public override Task ReportAsync(CancellationToken token) if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index f06d0dff..d03159cb 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -61,7 +61,12 @@ public DiskObserver(FabricClient fabricClient, StatelessServiceContext context) DiskSpaceUsagePercentageData = new List>(); DiskSpaceAvailableMbData = new List>(); DiskSpaceTotalMbData = new List>(); - DiskAverageQueueLengthData = new List>(); + + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + DiskAverageQueueLengthData = new List>(); + } + stopWatch = new Stopwatch(); } @@ -119,8 +124,8 @@ public override async Task ObserveAsync(CancellationToken token) 
DiskSpaceUsagePercentageData.Add(new FabricResourceUsageData(ErrorWarningProperty.DiskSpaceUsagePercentage, id, DataCapacity)); } - // Current disk queue length. - if (DiskAverageQueueLengthData.All(data => data.Id != id) && (AverageQueueLengthErrorThreshold > 0 || AverageQueueLengthWarningThreshold > 0)) + // Current disk queue length. Windows only. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && DiskAverageQueueLengthData.All(data => data.Id != id) && (AverageQueueLengthErrorThreshold > 0 || AverageQueueLengthWarningThreshold > 0)) { DiskAverageQueueLengthData.Add(new FabricResourceUsageData(ErrorWarningProperty.DiskAverageQueueLength, id, DataCapacity)); } @@ -180,7 +185,7 @@ public override Task ReportAsync(CancellationToken token) { try { - var timeToLiveWarning = SetHealthReportTimeToLive(); + var timeToLiveWarning = GetHealthReportTimeToLive(); // User-supplied Disk Space Usage % thresholds from Settings.xml. foreach (var data in DiskSpaceUsagePercentageData) @@ -193,15 +198,18 @@ public override Task ReportAsync(CancellationToken token) timeToLiveWarning); } - // User-supplied Average disk queue length thresholds from Settings.xml. - foreach (var data in DiskAverageQueueLengthData) + // User-supplied Average disk queue length thresholds from Settings.xml. Windows only. + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - token.ThrowIfCancellationRequested(); - ProcessResourceDataReportHealth( - data, - AverageQueueLengthErrorThreshold, - AverageQueueLengthWarningThreshold, - timeToLiveWarning); + foreach (var data in DiskAverageQueueLengthData) + { + token.ThrowIfCancellationRequested(); + ProcessResourceDataReportHealth( + data, + AverageQueueLengthErrorThreshold, + AverageQueueLengthWarningThreshold, + timeToLiveWarning); + } } /* For ETW Only - These calls will just produce ETW (note the thresholds). 
*/ diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index 3c261b57..9ea3a90e 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -31,7 +31,6 @@ public class FabricSystemObserver : ObserverBase { private readonly List processWatchList; private Stopwatch stopwatch; - private bool disposed; // Health Report data container - For use in analysis to determine health state. private List> allCpuData; @@ -246,7 +245,7 @@ public override Task ReportAsync(CancellationToken token) } // Informational report. - TimeSpan timeToLiveWarning = SetHealthReportTimeToLive(); + TimeSpan timeToLiveWarning = GetHealthReportTimeToLive(); HealthReport informationReport = new HealthReport { Observer = ObserverName, @@ -464,19 +463,6 @@ public void ReadServiceFabricWindowsEventLog() } } - protected override void Dispose(bool disposing) - { - if (disposed) - { - return; - } - - if (disposing) - { - disposed = true; - } - } - private Process[] GetDotnetProcessesByFirstArgument(string argument) { List result = new List(); @@ -484,6 +470,8 @@ private Process[] GetDotnetProcessesByFirstArgument(string argument) for (int i = 0; i < processes.Length; ++i) { + Token.ThrowIfCancellationRequested(); + Process p = processes[i]; try @@ -974,7 +962,7 @@ private void ProcessResourceDataList( dataItem, thresholdError, thresholdWarning, - SetHealthReportTimeToLive(), + GetHealthReportTimeToLive(), HealthReportType.Application); } } diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 35c328a0..2b3cb385 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -122,7 +122,7 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { - var timeToLiveWarning = SetHealthReportTimeToLive(); + var 
timeToLiveWarning = GetHealthReportTimeToLive(); // Report on connection state. foreach (var config in userConfig) @@ -147,13 +147,13 @@ public override Task ReportAsync(CancellationToken token) ApplicationName = conn.TargetApp, Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, HealthState = "Warning", - HealthEventDescription = healthMessage, + Description = healthMessage, ObserverName = ObserverName, Metric = ErrorWarningProperty.InternetConnectionFailure, NodeName = NodeName, }; - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportMetricAsync( telemetryData, @@ -184,7 +184,7 @@ public override Task ReportAsync(CancellationToken token) // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { @@ -227,14 +227,14 @@ public override Task ReportAsync(CancellationToken token) HealthReporter.ReportHealthToServiceFabric(report); // Telemetry. - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { var telemetryData = new TelemetryData(FabricClientInstance, token) { ApplicationName = conn.TargetApp, Code = FOErrorWarningCodes.Ok, HealthState = "Ok", - HealthEventDescription = healthMessage, + Description = healthMessage, ObserverName = ObserverName, Metric = "Internet Connection State", NodeName = NodeName, @@ -248,7 +248,7 @@ public override Task ReportAsync(CancellationToken token) // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 40d94359..8be4c346 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -138,13 +138,11 @@ public double LinuxFileHandlesWarningPercent get; set; } - // TODO.. 
public int LinuxFileHandlesErrorTotalAllocated { get; set; } - // TODO.. public int LinuxFileHandlesWarningTotalAllocated { get; set; @@ -292,7 +290,7 @@ public override Task ReportAsync(CancellationToken token) // Report on the global health state (system-wide (node) metrics). // User-configurable in NodeObserver.config.json - var timeToLiveWarning = SetHealthReportTimeToLive(); + var timeToLiveWarning = GetHealthReportTimeToLive(); // CPU if (CpuTimeData != null && (CpuErrorUsageThresholdPct > 0 || CpuWarningUsageThresholdPct > 0)) @@ -680,9 +678,6 @@ error on these conditions. */ if (CpuTimeData != null && (CpuErrorUsageThresholdPct > 0 || CpuWarningUsageThresholdPct > 0)) { cpuUtilizationProvider = CpuUtilizationProvider.Create(); - - // Warm up the counter. - _ = await cpuUtilizationProvider.NextValueAsync(); } // OS-level file handle monitoring only makes sense for Linux, where the Maximum system-wide number of handles the kernel will allocate is a user-configurable setting. diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 791d7ee6..f6c64c6b 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -4,9 +4,11 @@ // ------------------------------------------------------------ using System; +using System.Collections.Generic; using System.ComponentModel; using System.Fabric; using System.Fabric.Health; +using System.Fabric.Query; using System.IO; using System.Linq; using System.Management; @@ -32,8 +34,12 @@ public class OSObserver : ObserverBase private string osReport; private string osStatus; private bool auStateUnknown; - private bool isWindowsUpdateAutoDownloadEnabled; - private bool isWUADSettingEnabled; + private bool isAUAutomaticDownloadEnabled; + + public bool IsAUCheckSettingEnabled + { + get; set; + } public string ClusterManifestPath { @@ -62,27 +68,17 @@ public override async Task ObserveAsync(CancellationToken token) // This only makes sense for Windows and 
only for non-dev clusters. if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - var nodes = FabricClientInstance.QueryManager.GetNodeListAsync( - null, - AsyncClusterOperationTimeoutSeconds, - Token).GetAwaiter().GetResult(); - - if (nodes.Count > 1 && bool.TryParse( - GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.EnableWindowsAutoUpdateCheck), - out isWUADSettingEnabled)) + await InitializeAUCheckAsync(); + + if (IsAUCheckSettingEnabled) { - if (isWUADSettingEnabled) - { - await CheckWuAutoDownloadEnabledAsync(token).ConfigureAwait(false); - } + await CheckWuAutoDownloadEnabledAsync(token).ConfigureAwait(false); } } await GetComputerInfoAsync(token).ConfigureAwait(false); - await ReportAsync(token).ConfigureAwait(false); + LastRunDateTime = DateTime.Now; } @@ -102,7 +98,7 @@ public override Task ReportAsync(CancellationToken token) NodeName = NodeName, HealthMessage = healthMessage, State = HealthState.Error, - HealthReportTimeToLive = SetHealthReportTimeToLive(), + HealthReportTimeToLive = GetHealthReportTimeToLive(), }; HealthReporter.ReportHealthToServiceFabric(healthReport); @@ -111,7 +107,7 @@ public override Task ReportAsync(CancellationToken token) HasActiveFabricErrorOrWarning = true; // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.). - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( HealthScope.Application, @@ -163,13 +159,13 @@ public override Task ReportAsync(CancellationToken token) HealthMessage = osReport, State = HealthState.Ok, NodeName = NodeName, - HealthReportTimeToLive = SetHealthReportTimeToLive(), + HealthReportTimeToLive = GetHealthReportTimeToLive(), }; HealthReporter.ReportHealthToServiceFabric(report); // Windows Update automatic download enabled? 
- if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && isWindowsUpdateAutoDownloadEnabled) + if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) && isAUAutomaticDownloadEnabled) { string linkText = $"{Environment.NewLine}For clusters of Silver durability or above, " + @@ -187,22 +183,22 @@ public override Task ReportAsync(CancellationToken token) HealthMessage = auServiceEnabledMessage, State = HealthState.Warning, NodeName = NodeName, - HealthReportTimeToLive = SetHealthReportTimeToLive(), + HealthReportTimeToLive = GetHealthReportTimeToLive(), }; HealthReporter.ReportHealthToServiceFabric(report); if (IsTelemetryProviderEnabled - && IsObserverTelemetryEnabled + && IsTelemetryEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.). var telemetryData = new TelemetryData(FabricClientInstance, token) { - HealthEventDescription = auServiceEnabledMessage, + Description = auServiceEnabledMessage, HealthState = "Warning", Metric = "WUAutoDownloadEnabled", - Value = isWindowsUpdateAutoDownloadEnabled, + Value = isAUAutomaticDownloadEnabled, NodeName = NodeName, ObserverName = ObserverName, Source = ObserverConstants.FabricObserverName, @@ -214,9 +210,9 @@ public override Task ReportAsync(CancellationToken token) } // ETW. 
- if (IsEtwEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + if (IsEtwProviderEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { @@ -224,20 +220,12 @@ public override Task ReportAsync(CancellationToken token) HealthEventDescription = auServiceEnabledMessage, ObserverName, Metric = "WUAutoDownloadEnabled", - Value = isWindowsUpdateAutoDownloadEnabled, + Value = isAUAutomaticDownloadEnabled, NodeName, }); } } - if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) - { - // reset au globals for fresh detection during next observer run. - isWindowsUpdateAutoDownloadEnabled = false; - auStateUnknown = false; - isWUADSettingEnabled = false; - } - return Task.CompletedTask; } catch (Exception e) @@ -252,6 +240,42 @@ public override Task ReportAsync(CancellationToken token) } } + private async Task InitializeAUCheckAsync() + { + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + return; + } + + var checkAU = GetSettingParameterValue(ConfigurationSectionName, ObserverConstants.EnableWindowsAutoUpdateCheck); + var infraServices = await GetInfrastructureServiceInstancesAsync().ConfigureAwait(false); + + if (!string.IsNullOrEmpty(checkAU) && bool.TryParse(checkAU, out bool auChk) && infraServices?.Count() > 0) + { + IsAUCheckSettingEnabled = auChk; + } + } + + private async Task> GetInfrastructureServiceInstancesAsync() + { + var allSystemServices = + await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => + FabricClientInstance.QueryManager.GetServiceListAsync( + new Uri("fabric:/System"), + null, + ConfigurationSettings.AsyncTimeout, + Token), + Token).ConfigureAwait(false); + + var infraInstances = allSystemServices.Where( + i => i.ServiceTypeName.Equals( + "InfrastructureServiceType", + StringComparison.InvariantCultureIgnoreCase)); + + return infraInstances; + } + private static string 
GetWindowsHotFixes(CancellationToken token, bool generateUrl = true) { ManagementObjectSearcher searcher = null; @@ -317,7 +341,7 @@ private Task CheckWuAutoDownloadEnabledAsync(CancellationToken token) try { var wuLibAutoUpdates = new AutomaticUpdatesClass(); - isWindowsUpdateAutoDownloadEnabled = + isAUAutomaticDownloadEnabled = wuLibAutoUpdates.ServiceEnabled && wuLibAutoUpdates.Settings.NotificationLevel == AutomaticUpdatesNotificationLevel.aunlScheduledInstallation; } @@ -380,9 +404,8 @@ private async Task GetComputerInfoAsync(CancellationToken token) if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { - // WU AutoUpdate - Download enabled. - // If the config setting EnableWindowsAutoUpdateCheck is set to false, then don't add this info to sb. - if (isWUADSettingEnabled) + // WU AutoUpdate - Automatic Download enabled. + if (IsAUCheckSettingEnabled) { string auMessage = "WindowsUpdateAutoDownloadEnabled: "; @@ -392,7 +415,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) } else { - auMessage += isWindowsUpdateAutoDownloadEnabled; + auMessage += isAUAutomaticDownloadEnabled; } _ = sb.AppendLine(auMessage); } @@ -453,6 +476,8 @@ private async Task GetComputerInfoAsync(CancellationToken token) foreach (var (driveName, diskSize, percentConsumed) in drivesInformationTuple) { + Token.ThrowIfCancellationRequested(); + string drvSize; if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) @@ -503,7 +528,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) hotFixes = GetWindowsHotFixes(token, generateUrl: false).Replace("\r\n", ", ").TrimEnd(','); } - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { @@ -513,9 +538,9 @@ private async Task GetComputerInfoAsync(CancellationToken token) OS = osInfo.Name, OSVersion = osInfo.Version, OSInstallDate = osInfo.InstallDate, - AutoUpdateEnabled = auStateUnknown ? 
"Unknown" : isWindowsUpdateAutoDownloadEnabled.ToString(), + AutoUpdateEnabled = auStateUnknown ? "Unknown" : isAUAutomaticDownloadEnabled.ToString(), osInfo.LastBootUpTime, - WindowsAutoUpdateEnabled = isWindowsUpdateAutoDownloadEnabled, + WindowsAutoUpdateEnabled = isAUAutomaticDownloadEnabled, TotalMemorySizeGB = (int)(osInfo.TotalVisibleMemorySizeKB / 1048576), AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), @@ -533,7 +558,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) } // Telemetry - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { if (string.IsNullOrEmpty(hotFixes) && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { @@ -550,7 +575,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) OSVersion = osInfo.Version, OSInstallDate = osInfo.InstallDate, LastBootUpTime = osInfo.LastBootUpTime, - WindowsUpdateAutoDownloadEnabled = isWindowsUpdateAutoDownloadEnabled, + WindowsUpdateAutoDownloadEnabled = isAUAutomaticDownloadEnabled, TotalMemorySizeGB = (int)osInfo.TotalVisibleMemorySizeKB / 1048576, AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index a236ddad..8bb619e1 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -31,7 +31,6 @@ public class ObserverManager : IDisposable public readonly string nodeName; private readonly TelemetryEvents telemetryEvents; private List observers; - private EventWaitHandle globalShutdownEventHandle; private volatile bool shutdownSignaled; private TimeSpan observerExecTimeout = TimeSpan.FromMinutes(30); private CancellationToken token; @@ -41,6 +40,71 @@ 
public class ObserverManager : IDisposable private IEnumerable serviceCollection; private bool isConfigurationUpdateInProgess; + public static FabricClient FabricClientInstance + { + get; set; + } + + public static int ObserverExecutionLoopSleepSeconds + { + get; private set; + } = ObserverConstants.ObserverRunLoopSleepTimeSeconds; + + public static StatelessServiceContext FabricServiceContext + { + get; set; + } + + public static ITelemetryProvider TelemetryClient + { + get; set; + } + + public static bool TelemetryEnabled + { + get; set; + } + + public static bool FabricObserverInternalTelemetryEnabled + { get; set; } = true; + + public static bool ObserverWebAppDeployed + { + get; set; + } + + public static bool EtwEnabled + { + get => bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableETWProvider), out etwEnabled) && etwEnabled; + + set => etwEnabled = value; + } + + public string ApplicationName + { + get; set; + } + + public bool IsObserverRunning + { + get; set; + } + + private ObserverHealthReporter HealthReporter + { + get; set; + } + + private string Fqdn + { + get; set; + } + + private Logger Logger + { + get; set; + } + /// /// Initializes a new instance of the class. /// This is used for unit testing. 
@@ -142,71 +206,6 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie } } - public static FabricClient FabricClientInstance - { - get; set; - } - - public static int ObserverExecutionLoopSleepSeconds - { - get; private set; - } = ObserverConstants.ObserverRunLoopSleepTimeSeconds; - - public static StatelessServiceContext FabricServiceContext - { - get; set; - } - - public static ITelemetryProvider TelemetryClient - { - get; set; - } - - public static bool TelemetryEnabled - { - get; set; - } - - public static bool FabricObserverInternalTelemetryEnabled - { get; set; } = true; - - public static bool ObserverWebAppDeployed - { - get; set; - } - - public static bool EtwEnabled - { - get => bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableEventSourceProvider), out etwEnabled) && etwEnabled; - - set => etwEnabled = value; - } - - public string ApplicationName - { - get; set; - } - - public bool IsObserverRunning - { - get; set; - } - - private ObserverHealthReporter HealthReporter - { - get; set; - } - - private string Fqdn - { - get; set; - } - - private Logger Logger - { - get; set; - } - public async Task StartObserversAsync() { try @@ -217,9 +216,6 @@ public async Task StartObserversAsync() return; } - // Create Global Shutdown event handler - globalShutdownEventHandle = new EventWaitHandle(false, EventResetMode.ManualReset); - // Continue running until a shutdown signal is sent Logger.LogInfo("Starting Observers loop."); @@ -228,10 +224,7 @@ public async Task StartObserversAsync() { if (!isConfigurationUpdateInProgess && (shutdownSignaled || token.IsCancellationRequested)) { - _ = globalShutdownEventHandle.Set(); - Logger.LogWarning("Shutdown signaled. 
Stopping."); await ShutDownAsync().ConfigureAwait(false); - break; } @@ -242,8 +235,11 @@ public async Task StartObserversAsync() if (ObserverExecutionLoopSleepSeconds > 0) { - Logger.LogInfo($"Sleeping for {ObserverExecutionLoopSleepSeconds} seconds before running again."); - ThreadSleep(globalShutdownEventHandle, TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds)); + await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds), token); + } + else if (observers.Count == 1) + { + await Task.Delay(TimeSpan.FromSeconds(15), token); } } } @@ -275,14 +271,15 @@ public async Task StartObserversAsync() // ETW. if (EtwEnabled) { - Logger.EtwLogger?.Write( - $"FabricObserverServiceCriticalHealthEvent", + Logger.LogEtw( + ObserverConstants.FabricObserverETWEventName, new { - Level = 2, // Error + HealthScope = "Application", + HealthState = "Warning", Node = nodeName, Observer = ObserverConstants.ObserverManagerName, - Value = message, + Description = message, }); } @@ -320,7 +317,7 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig NodeName = obs.NodeName, }; - if (obs.AppNames.Count > 0 && obs.AppNames.All(a => !string.IsNullOrEmpty(a) && a.Contains("fabric:/"))) + if (obs.AppNames.Count > 0 && obs.AppNames.All(a => !string.IsNullOrWhiteSpace(a) && a.Contains("fabric:/"))) { foreach (var app in obs.AppNames) { @@ -328,33 +325,26 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig { Uri appName = new Uri(app); var appHealth = await FabricClientInstance.HealthManager.GetApplicationHealthAsync(appName).ConfigureAwait(false); - int? 
unhealthyEventsCount = appHealth.HealthEvents?.Count(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + var unhealthyFOAppEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)); - if (unhealthyEventsCount != null && unhealthyEventsCount > 0) + foreach (var evt in unhealthyFOAppEvents) { - foreach (var evt in appHealth.HealthEvents) - { - if (!evt.HealthInformation.SourceId.Contains(obs.ObserverName) || evt.HealthInformation.HealthState == HealthState.Ok) - { - continue; - } - - healthReport.AppName = appName; - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; - - var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); - healthReporter.ReportHealthToServiceFabric(healthReport); - - Thread.Sleep(250); - } + healthReport.AppName = appName; + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + + var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); + healthReporter.ReportHealthToServiceFabric(healthReport); + + await Task.Delay(250).ConfigureAwait(false); } } catch (Exception) { } - await Task.Delay(250, token).ConfigureAwait(false); + await Task.Delay(250).ConfigureAwait(false); } } else @@ -363,32 +353,29 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig { var nodeHealth = await FabricClientInstance.HealthManager.GetNodeHealthAsync(obs.NodeName).ConfigureAwait(false); - int? 
unhealthyEventsCount = nodeHealth.HealthEvents?.Count(s => s.HealthInformation.SourceId.Contains(obs.ObserverName)); + var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where( + s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)); healthReport.ReportType = HealthReportType.Node; - if (unhealthyEventsCount != null && unhealthyEventsCount > 0) + foreach (var evt in unhealthyFONodeEvents) { - foreach (var evt in nodeHealth.HealthEvents) - { - if (!evt.HealthInformation.SourceId.Contains(obs.ObserverName) || evt.HealthInformation.HealthState == HealthState.Ok) - { - continue; - } + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; + var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); + healthReporter.ReportHealthToServiceFabric(healthReport); - var healthReporter = new ObserverHealthReporter(Logger, FabricClientInstance); - healthReporter.ReportHealthToServiceFabric(healthReport); - - Thread.Sleep(250); - } + await Task.Delay(250).ConfigureAwait(false); } + } - catch (Exception) + catch (FabricException) { } + + await Task.Delay(250).ConfigureAwait(false); } obs.HasActiveFabricErrorOrWarning = false; @@ -477,8 +464,6 @@ private async Task ShutDownAsync() cts = null; } - globalShutdownEventHandle?.Dispose(); - // Flush and Dispose all NLog targets. No more logging. 
Logger.Flush(); DataTableFileLogger.Flush(); @@ -502,40 +487,6 @@ private string GetFabricObserverInternalConfiguration() return ret; } - // This impl is to ensure FO exits if shutdown is requested while the over loop is sleeping - // So, instead of blocking with a Thread.Sleep, for example, ThreadSleep is used to ensure - // we can receive signals and act accordingly during thread sleep state. - private void ThreadSleep(WaitHandle ewh, TimeSpan timeout) - { - // if timeout is <= 0, return. 0 is infinite, and negative is not valid - if (timeout.TotalMilliseconds <= 0) - { - return; - } - - var elapsedTime = new TimeSpan(0, 0, 0); - var stopwatch = new Stopwatch(); - - while (!shutdownSignaled && - !token.IsCancellationRequested && - timeout > elapsedTime) - { - stopwatch.Start(); - - // the event can be signaled by CtrlC, - // Exit ASAP when the program terminates (i.e., shutdown/abort is signalled.) - _ = ewh.WaitOne(timeout.Subtract(elapsedTime)); - stopwatch.Stop(); - - elapsedTime = stopwatch.Elapsed; - } - - if (stopwatch.IsRunning) - { - stopwatch.Stop(); - } - } - /// /// Event handler for application parameter updates (Un-versioned application parameter-only Application Upgrades). /// @@ -543,7 +494,7 @@ private void ThreadSleep(WaitHandle ewh, TimeSpan timeout) /// Contains the information necessary for setting new config params from updated package. 
private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) { - Logger.LogInfo("Application Parameter upgrade started..."); + Logger.LogWarning("Application Parameter upgrade started..."); try { @@ -567,16 +518,21 @@ private async void CodePackageActivationContext_ConfigurationPackageModifiedEven foreach (var observer in serviceCollection) { + if (token.IsCancellationRequested) + { + return; + } + observer.ConfigurationSettings = new ConfigSettings(e.NewPackage.Settings, $"{observer.ObserverName}Configuration"); if (observer.ConfigurationSettings.IsEnabled) { // The ObserverLogger instance (member of each observer type) checks its EnableVerboseLogging setting before writing Info events (it won't write if this setting is false, thus non-verbose). // So, we set it here in case the parameter update includes a change to this config setting. - // This is the only update-able setting that requires we do this as part of the config update event handling. - string oldVerboseLoggingSetting = e.NewPackage.Settings.Sections[$"{observer.ObserverName}Configuration"].Parameters[ObserverConstants.EnableVerboseLoggingParameter]?.Value.ToLower(); - string newVerboseLoggingSetting = e.OldPackage.Settings.Sections[$"{observer.ObserverName}Configuration"].Parameters[ObserverConstants.EnableVerboseLoggingParameter]?.Value.ToLower(); - + // This is the only updatable setting that requires we do this as part of the config update event handling. 
+ string newVerboseLoggingSetting = e.NewPackage.Settings.Sections[$"{observer.ObserverName}Configuration"].Parameters[ObserverConstants.EnableVerboseLoggingParameter]?.Value.ToLower(); + string oldVerboseLoggingSetting = e.OldPackage.Settings.Sections[$"{observer.ObserverName}Configuration"].Parameters[ObserverConstants.EnableVerboseLoggingParameter]?.Value.ToLower(); + if (newVerboseLoggingSetting != oldVerboseLoggingSetting) { observer.ObserverLogger.EnableVerboseLogging = observer.ConfigurationSettings.EnableVerboseLogging; @@ -749,7 +705,16 @@ private void SetPropertieSFromConfigurationParameters() private void SignalAbortToRunningObserver() { Logger.LogInfo("Signalling task cancellation to currently running Observer."); - cts?.Cancel(); + + try + { + cts?.Cancel(); + } + catch (ObjectDisposedException) + { + + } + Logger.LogInfo("Successfully signaled cancellation to currently running Observer."); } @@ -775,8 +740,8 @@ private async Task RunObserversAsync() { // Shutdown/cancellation signaled, so stop. bool taskCancelled = linkedSFRuntimeObserverTokenSource != null ? - linkedSFRuntimeObserverTokenSource.Token.IsCancellationRequested : - token.IsCancellationRequested; + linkedSFRuntimeObserverTokenSource.Token.IsCancellationRequested : + token.IsCancellationRequested; if (taskCancelled || shutdownSignaled) { @@ -801,12 +766,13 @@ private async Task RunObserversAsync() // Currently, this observer will not run again for the lifetime of this FO service instance. if (!isCompleted) { - string observerHealthWarning = observer.ObserverName + $" has exceeded its specified run time of {observerExecTimeout.TotalSeconds} seconds. " + + string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified run time of {observerExecTimeout.TotalSeconds} seconds. " + $"This means something is wrong with {observer.ObserverName}. It will not be run again. Look into it."; Logger.LogError(observerHealthWarning); observer.IsUnhealthy = true; + // Telemetry. 
if (TelemetryEnabled) { await (TelemetryClient?.ReportHealthAsync( @@ -818,6 +784,20 @@ private async Task RunObserversAsync() token)).ConfigureAwait(false); } + // ETW. + if (EtwEnabled) + { + Logger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + Scope = HealthScope.Application, + Source = ObserverConstants.ObserverManagerName, + HealthState = "Error", + Message = observerHealthWarning, + }); + } + continue; } diff --git a/FabricObserver/Observers/SFConfigurationObserver.cs b/FabricObserver/Observers/SFConfigurationObserver.cs index 181fa091..7dbbcb8f 100644 --- a/FabricObserver/Observers/SFConfigurationObserver.cs +++ b/FabricObserver/Observers/SFConfigurationObserver.cs @@ -360,7 +360,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 87d1d148..29333fdb 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -1,39 +1,19 @@ [ { - "targetApp": "fabric:/ClusterObserver", - "cpuWarningLimitPercent": 30, - "memoryWarningLimitPercent": 30, - "networkWarningActivePorts": 80, - "networkWarningEphemeralPorts": 40, - "warningOpenFileHandles": 2000 + "targetApp": "*", + "appExcludeList": "fabric:/App1, fabric:/App13", + "cpuWarningLimitPercent": 65, + "memoryWarningLimitMb": 500, + "networkWarningActivePorts": 2000, + "networkWarningEphemeralPorts": 1500 }, { - "targetApp": "fabric:/FabricObserver", - "cpuWarningLimitPercent": 35, - "memoryWarningLimitPercent": 35, - "networkWarningActivePorts": 50, - "networkWarningEphemeralPorts": 40, - "warningOpenFileHandles": 2000 + "targetAppType": "SomeAppType42", + "memoryWarningLimitMb": 200, + "networkWarningActivePorts": 500 }, { - 
"targetAppType": "CpuStressType", - "cpuWarningLimitPercent": 20.75, - "memoryWarningLimitMb": 150, - "warningOpenFileHandles": 2000 - }, - { - "targetAppType": "PatchOrchestrationApplicationType", - "cpuWarningLimitPercent": 45, - "memoryWarningLimitPercent": 30, - "networkWarningActivePorts": 80, - "networkWarningEphemeralPorts": 40 - }, - { - "targetApp": "fabric:/MyApp42", - "cpuWarningLimitPercent": 60, - "memoryWarningLimitPercent": 50, - "networkWarningActivePorts": 600, - "networkWarningEphemeralPorts": 300, - "warningOpenFileHandles": 1500 + "targetApp": "fabric:/MyApp", + "cpuWarningLimitPercent": 15 } ] \ No newline at end of file diff --git a/FabricObserver/PackageRoot/Config/Settings.xml b/FabricObserver/PackageRoot/Config/Settings.xml index a4a6377e..5f3cc31c 100644 --- a/FabricObserver/PackageRoot/Config/Settings.xml +++ b/FabricObserver/PackageRoot/Config/Settings.xml @@ -4,7 +4,7 @@ - + - - + + + + @@ -102,6 +104,8 @@
+ + diff --git a/FabricObserver/PackageRoot/Config/containerobserver.config.json b/FabricObserver/PackageRoot/Config/containerobserver.config.json deleted file mode 100644 index f15cfbf4..00000000 --- a/FabricObserver/PackageRoot/Config/containerobserver.config.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "targetApp": "fabric:/ContainerFoo", - "cpuWarningLimitPercent": 10, - "memoryWarningLimitMB": 50 - }, - { - "targetApp": "fabric:/ContainerFoo2", - "cpuWarningLimitPercent": 10, - "memoryWarningLimitMB": 50 - } -] \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index f1b1ddad..f1812107 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,11 +27,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 02f37281..5e16cd38 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,11 +21,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index b1488d16..eff52fc3 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + - - + + + + + + + + + + + @@ -37,8 +47,8 @@ - + @@ -50,23 +60,23 @@ - + - + - + - + @@ -84,7 +94,7 @@ - + @@ -97,7 +107,7 @@ - + @@ -120,13 +130,14 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. --> - +
+ @@ -139,6 +150,7 @@
+ @@ -150,6 +162,7 @@
+ @@ -161,6 +174,7 @@
+ @@ -185,6 +199,7 @@
+ @@ -192,6 +207,7 @@
+ @@ -218,6 +234,7 @@
+ @@ -225,12 +242,14 @@
+
diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index a5d30ff8..6b4e188f 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -285,27 +285,20 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() ReplicaOrInstanceList = new List(), }; - obs.ReplicaOrInstanceList.Add(new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = new Uri("fabric:/TestApp"), - PartitionId = Guid.NewGuid(), - HostProcessId = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 0 : 1, - ReplicaOrInstanceId = default(long), - }); - await obs.ObserveAsync(token).ConfigureAwait(true); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); - // observer detected no error conditions. + // observer detected no warning conditions. Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); // observer did not have any internal errors during run. Assert.IsFalse(obs.IsUnhealthy); + + await CleanupTestHealthReportsAsync(obs); obs.Dispose(); - } /// @@ -332,25 +325,18 @@ public async Task AppObserver_ObserveAsync_TargetAppType_Successful_Observer_IsH ReplicaOrInstanceList = new List(), }; - obs.ReplicaOrInstanceList.Add(new ReplicaOrInstanceMonitoringInfo - { - ApplicationName = new Uri("fabric:/TestApp"), - ApplicationTypeName = "TestAppType", - PartitionId = Guid.NewGuid(), - HostProcessId = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? 0 : 1, - ReplicaOrInstanceId = default(long), - }); - await obs.ObserveAsync(token).ConfigureAwait(true); // observer ran to completion with no errors. Assert.IsTrue(obs.LastRunDateTime > startDateTime); - // observer detected no error conditions. + // observer detected no warning conditions. Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); // observer did not have any internal errors during run. 
Assert.IsFalse(obs.IsUnhealthy); + + await CleanupTestHealthReportsAsync(obs); obs.Dispose(); } @@ -1043,7 +1029,7 @@ public async Task DiskObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin obs.Dispose(); obsMgr.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } /// @@ -1176,7 +1162,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); obs.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } /// @@ -1201,7 +1187,6 @@ public async Task SFConfigurationObserver_ObserveAsync_Successful_Observer_IsHea { IsEnabled = true, ClusterManifestPath = Path.Combine(Environment.CurrentDirectory, "clusterManifest.xml"), - IsObserverTelemetryEnabled = false, }; await obs.ObserveAsync(token).ConfigureAwait(true); @@ -1331,7 +1316,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth obs.Dispose(); obsMgr.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } @@ -1388,7 +1373,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth obs.Dispose(); obsMgr.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } /// @@ -1445,7 +1430,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth obs.Dispose(); obsMgr.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } /// @@ -1502,7 +1487,7 @@ public async Task FabricSystemObserver_ObserveAsync_Successful_Observer_IsHealth obs.Dispose(); obsMgr.Dispose(); - CleanupTestHealthReports(); + await CleanupTestHealthReportsAsync(); } /// @@ -1622,9 +1607,9 @@ private async Task WaitAsync(Func predicate, int timeoutInSeconds) } } - private void CleanupTestHealthReports() + private async Task CleanupTestHealthReportsAsync(ObserverBase obs = null) { - // Clear any existing node or fabric:/System app Test Health Reports. 
+ // Clear any existing user app, node or fabric:/System app Test Health Reports. try { FabricObserver.Observers.Utilities.HealthReport healthReport = new FabricObserver.Observers.Utilities.HealthReport @@ -1637,13 +1622,46 @@ private void CleanupTestHealthReports() }; var logger = new Logger("TestCleanUp"); - var fabricClient = new FabricClient(FabricClientRole.Admin); - // System apps reports - var appHealth = fabricClient.HealthManager.GetApplicationHealthAsync(new Uri("fabric:/System")).GetAwaiter().GetResult(); + // App reports + if (obs != null && obs.HasActiveFabricErrorOrWarning && + obs.ObserverName != ObserverConstants.NetworkObserverName) + { + if (obs.AppNames.Count > 0 && obs.AppNames.All(a => !string.IsNullOrEmpty(a) && a.Contains("fabric:/"))) + { + foreach (var app in obs.AppNames) + { + try + { + Uri appName = new Uri(app); + var appHealth = await fabricClient.HealthManager.GetApplicationHealthAsync(appName); + var unhealthyEvents = appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains(obs.ObserverName) + && (s.HealthInformation.HealthState == HealthState.Error || s.HealthInformation.HealthState == HealthState.Warning)); + + foreach (var evt in unhealthyEvents) + { + healthReport.AppName = appName; + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; + + var healthReporter = new ObserverHealthReporter(logger, fabricClient); + healthReporter.ReportHealthToServiceFabric(healthReport); + + Thread.Sleep(250); + } + } + catch (Exception) + { + } + } + } + } + + // System reports + var sysAppHealth = await fabricClient.HealthManager.GetApplicationHealthAsync(new Uri("fabric:/System")); - foreach (var evt in appHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("FabricSystemObserver"))) + foreach (var evt in sysAppHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("FabricSystemObserver"))) { if (evt.HealthInformation.HealthState == 
HealthState.Ok) { @@ -1658,29 +1676,28 @@ private void CleanupTestHealthReports() healthReporter.ReportHealthToServiceFabric(healthReport); Thread.Sleep(250); - } - + } // Node reports - var nodeHealth = fabricClient.HealthManager.GetNodeHealthAsync("_Node_0").GetAwaiter().GetResult(); - healthReport.ReportType = HealthReportType.Node; + var nodeHealth = await fabricClient.HealthManager.GetNodeHealthAsync(this.context.NodeContext.NodeName); - foreach (var evt in nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("NodeObserver") - || s.HealthInformation.SourceId.Contains("DiskObserver"))) - { - if (evt.HealthInformation.HealthState == HealthState.Ok) - { - continue; - } + var unhealthyFONodeEvents = nodeHealth.HealthEvents?.Where(s => s.HealthInformation.SourceId.Contains("NodeObserver") + || s.HealthInformation.SourceId.Contains("DiskObserver") + && (s.HealthInformation.HealthState == HealthState.Error + || s.HealthInformation.HealthState == HealthState.Warning)); + + healthReport.ReportType = HealthReportType.Node; - healthReport.Property = evt.HealthInformation.Property; - healthReport.SourceId = evt.HealthInformation.SourceId; + foreach (var evt in unhealthyFONodeEvents) + { + healthReport.Property = evt.HealthInformation.Property; + healthReport.SourceId = evt.HealthInformation.SourceId; - var healthReporter = new ObserverHealthReporter(logger, fabricClient); - healthReporter.ReportHealthToServiceFabric(healthReport); + var healthReporter = new ObserverHealthReporter(logger, fabricClient); + healthReporter.ReportHealthToServiceFabric(healthReport); - Thread.Sleep(250); - } + Thread.Sleep(250); + } } catch (FabricException) { diff --git a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json index 397f9c0a..d1db115a 100644 --- a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json @@ -1,22 
+1,19 @@ [ { - "targetAppType": "AppTestType", - "cpuWarningLimitPercent": 20, + "targetApp": "*", + "cpuWarningLimitPercent": 60, + "networkWarningActivePorts": 1800, + "networkWarningEphemeralPorts": 1400, "warningOpenFileHandles": 5000 }, { - "targetApp": "fabric:/MyApp", + "targetAppType": "MyAppType", "cpuWarningLimitPercent": 30, - "memoryWarningLimitPercent": 30, - "networkWarningActivePorts": 80, - "networkWarningEphemeralPorts": 40, - "warningOpenFileHandles": 10 + "memoryWarningLimitPercent": 20 }, { - "targetApp": "fabric:/MyApp42", - "cpuWarningLimitPercent": 60, - "memoryWarningLimitPercent": 50, - "networkWarningActivePorts": 600, - "networkWarningEphemeralPorts": 300 + "targetApp": "fabric:/MyHardWorkingApp42", + "cpuWarningLimitPercent": 90, + "memoryWarningLimitPercent": 60 } ] \ No newline at end of file diff --git a/README.md b/README.md index 134a0fd4..1af89e79 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.5 +# FabricObserver 3.1.6 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of resources that tend to be important to all Service Fabric applications, like disk, CPU, memory, networking, and cluster certificates out-of-the-box. 
diff --git a/SampleObserverPlugin/ReadMe.md b/SampleObserverPlugin/ReadMe.md index 44ecc470..42f70d17 100644 --- a/SampleObserverPlugin/ReadMe.md +++ b/SampleObserverPlugin/ReadMe.md @@ -126,7 +126,7 @@ You can deploy using the contents of your build out directory - just remove the * Create new instance of FO, which will contain your observer plugin ```Powershell $path = "[sourcedir]\MyObserverPlugin\bin\release\netstandard2.0\[target os platform, e.g., win-x64 or linux-x64]" -Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV315 -TimeoutSec 1800 -Register-ServiceFabricApplicationType -ApplicationPathInImageStore FabricObserverV315 -New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.5 +Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV316 -TimeoutSec 1800 +Register-ServiceFabricApplicationType -ApplicationPathInImageStore FabricObserverV316 +New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.6 ``` diff --git a/SampleObserverPlugin/SampleNewObserver.cs b/SampleObserverPlugin/SampleNewObserver.cs index 5a4c22f2..4ce49a17 100644 --- a/SampleObserverPlugin/SampleNewObserver.cs +++ b/SampleObserverPlugin/SampleNewObserver.cs @@ -110,13 +110,6 @@ public override Task ReportAsync(CancellationToken token) /* Report to Fabric */ - // These values will be preserved across observer runs and are useful for clearing warnings - // by reporting Ok health state health events with the same property and sourceid values - // as the error/warning health events when FO is safely taken down (e.g., app is being uninstalled, - // safe restart of fabric node it's running on, etc.). 
- HealthReportProperties.Add("SomePropertyName"); - HealthReportSourceIds.Add($"{ObserverName}_SomethingUniqueToThisReport"); - var healthReporter = new ObserverHealthReporter(ObserverLogger, FabricClientInstance); var healthReport = new Utilities.HealthReport { @@ -124,7 +117,7 @@ public override Task ReportAsync(CancellationToken token) HealthMessage = message.ToString(), NodeName = NodeName, Observer = ObserverName, - Property = HealthReportProperties[HealthReportProperties.Count - 1], + Property = "SomeUniquePropertyForMyHealthEvent", ReportType = HealthReportType.Node, State = HealthState.Ok, }; @@ -135,14 +128,14 @@ public override Task ReportAsync(CancellationToken token) var telemetryData = new TelemetryData(FabricClientInstance, Token) { Code = FOErrorWarningCodes.Ok, - HealthEventDescription = message.ToString(), + Description = message.ToString(), HealthState = "Ok", NodeName = NodeName, ObserverName = ObserverName, Source = ObserverConstants.FabricObserverName, }; - if (IsTelemetryProviderEnabled && IsObserverTelemetryEnabled) + if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( telemetryData, @@ -152,7 +145,7 @@ public override Task ReportAsync(CancellationToken token) // ETW. if (IsEtwEnabled) { - Logger.EtwLogger?.Write( + ObserverLogger.LogEtw( ObserverConstants.FabricObserverETWEventName, new { diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index cf440d02..30ae95cc 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -8,8 +8,8 @@ AnyCPU;x64 - - + +