diff --git a/Build-COSFPkgs.ps1 b/Build-COSFPkgs.ps1 index 1875d66a..f2c42ac9 100644 --- a/Build-COSFPkgs.ps1 +++ b/Build-COSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.6" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.6" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.SelfContained.2.1.7" "$scriptPath\bin\release\ClusterObserver\linux-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Linux.FrameworkDependent.2.1.7" "$scriptPath\bin\release\ClusterObserver\linux-x64\framework-dependent\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.6" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.6" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.SelfContained.2.1.7" "$scriptPath\bin\release\ClusterObserver\win-x64\self-contained\ClusterObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.ClusterObserver.Windows.FrameworkDependent.2.1.7" "$scriptPath\bin\release\ClusterObserver\win-x64\framework-dependent\ClusterObserverType" } finally { Pop-Location diff --git a/Build-FOWSFPkgs.ps1 b/Build-FOWSFPkgs.ps1 index 51c39465..128897d4 100644 --- a/Build-FOWSFPkgs.ps1 +++ b/Build-FOWSFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "FabricObserverWeb.Linux.SelfContained.2.0.1" 
"$scriptPath\bin\release\FabricObserverWeb\linux-x64\self-contained\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Linux.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\framework-dependent\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Linux.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\self-contained\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Linux.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\linux-x64\framework-dependent\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Windows.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\self-contained\FabricObserverWebApiType" - Build-SFPkg "FabricObserverWeb.Windows.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\framework-dependent\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Windows.SelfContained.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\self-contained\FabricObserverWebApiType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserverWeb.Windows.FrameworkDependent.2.0.1" "$scriptPath\bin\release\FabricObserverWeb\win-x64\framework-dependent\FabricObserverWebApiType" } finally { Pop-Location diff --git a/Build-SFPkgs.ps1 b/Build-SFPkgs.ps1 index dd79d5e3..cf32cd67 100644 --- a/Build-SFPkgs.ps1 +++ b/Build-SFPkgs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.8" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.8" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.1.9" 
"$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.1.9" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.8" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.8" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.1.9" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.1.9" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType" } finally { Pop-Location diff --git a/ClusterObserver.nuspec.template b/ClusterObserver.nuspec.template index d6c6b081..9e789f12 100644 --- a/ClusterObserver.nuspec.template +++ b/ClusterObserver.nuspec.template @@ -2,8 +2,12 @@ %PACKAGE_ID% - 2.1.6 - FO 3.1.8 support. Crashing bug fix. + 2.1.7 + + - Fixed important bug in Application health query processor. + - ApplicationInsights TelemetryProvider impl update. + - Improvements in service close cleanup code. 
+ Microsoft MIT false diff --git a/ClusterObserver/ApplicationInsights.config b/ClusterObserver/ApplicationInsights.config index f64501d6..bba013fa 100644 --- a/ClusterObserver/ApplicationInsights.config +++ b/ClusterObserver/ApplicationInsights.config @@ -1,7 +1,6 @@  - + diff --git a/ClusterObserver/ClusterObserver.cs b/ClusterObserver/ClusterObserver.cs index 96dce491..43ae9428 100644 --- a/ClusterObserver/ClusterObserver.cs +++ b/ClusterObserver/ClusterObserver.cs @@ -37,7 +37,7 @@ private HealthState LastKnownClusterHealthState private Dictionary NodeStatusDictionary { get; - } = new Dictionary(); + } protected bool TelemetryEnabled => ClusterObserverManager.TelemetryEnabled; @@ -50,11 +50,6 @@ public ConfigSettings ConfigSettings get; set; } - public bool IsTestRun - { - get; set; - } = false; - public string ObserverName { get; set; @@ -134,6 +129,7 @@ public ClusterObserver(ConfigurationSettings settings = null) FabricServiceContext = ClusterObserverManager.FabricServiceContext; NodeName = FabricServiceContext.NodeContext.NodeName; NodeType = FabricServiceContext.NodeContext.NodeType; + NodeStatusDictionary = new Dictionary(); if (settings == null) { @@ -191,13 +187,10 @@ private async Task ReportClusterHealthAsync(CancellationToken token) foreach (var repair in repairsInProgress) { - ids += $"TaskId: {repair.TaskId}{Environment.NewLine}" + - $"State: {repair.State}{Environment.NewLine}"; + ids += $"TaskId: {repair.TaskId}{Environment.NewLine}State: {repair.State}{Environment.NewLine}"; } - telemetryDescription += - $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}" + - $"{ids}"; + telemetryDescription += $"Note: There are currently one or more Repair Tasks processing in the cluster.{Environment.NewLine}{ids}"; } int udInClusterUpgrade = await UpgradeChecker.GetUdsWhereFabricUpgradeInProgressAsync(FabricClientInstance, token); @@ -231,15 +224,15 @@ private async Task 
ReportClusterHealthAsync(CancellationToken token) if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Cluster", - HealthState = "Ok", - HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", - Metric = "AggregatedClusterHealth", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Cluster", + HealthState = "Ok", + HealthEventDescription = "Cluster has recovered from previous Error/Warning state.", + Metric = "AggregatedClusterHealth", + Source = ObserverName, + }); } } else @@ -274,12 +267,9 @@ private async Task ReportClusterHealthAsync(CancellationToken token) { await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } else if (evaluation.Kind == HealthEvaluationKind.Application @@ -290,12 +280,9 @@ e is OperationCanceledException || { await ProcessApplicationHealthAsync(clusterHealth.ApplicationHealthStates, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is OperationCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } else @@ -304,12 +291,9 @@ e is OperationCanceledException || { await ProcessGenericEntityHealthAsync(evaluation, token).ConfigureAwait(false); } - catch (Exception e) when - (e is FabricException || - e is TimeoutException || - e is OperationCanceledException) + catch (Exception e) when (e is FabricException || e is 
TimeoutException) { - ObserverLogger.LogWarning($"Handled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"); + continue; } } } @@ -318,11 +302,14 @@ e is TimeoutException || LastKnownClusterHealthState = clusterHealth.AggregatedHealthState; } } - catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException || e is TimeoutException) + catch (FabricException fe) // This can happen when running CO unit test. In production, this is very rare. { - // Handled by ignoring. + string msg = $"Handled transient FabricException in ReportClusterHealthAsync:{Environment.NewLine}{fe}"; + + // Log it locally. + ObserverLogger.LogWarning(msg); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { string msg = $"Unhandled exception in ReportClusterHealthAsync:{Environment.NewLine}{e}"; @@ -345,12 +332,12 @@ e is TimeoutException || if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthState = "Warning", - HealthEventDescription = msg, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthState = "Warning", + HealthEventDescription = msg, + }); } // Fix the bug. @@ -386,18 +373,13 @@ private async Task ProcessApplicationHealthAsync(IList a // Check upgrade status of unhealthy application. Note, this doesn't apply to System applications as they update as part of a platform update. 
if (appName.OriginalString != "fabric:/System") { - var appUpgradeStatus = - await FabricClientInstance.ApplicationManager.GetApplicationUpgradeProgressAsync(appName); + var appUpgradeStatus = await FabricClientInstance.ApplicationManager.GetApplicationUpgradeProgressAsync(appName); if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending) { - var udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync( - FabricClientInstance, - token, - appName); - + List udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(FabricClientInstance, token, appName); string udText = string.Empty; // -1 means no upgrade in progress for application @@ -412,139 +394,98 @@ private async Task ProcessApplicationHealthAsync(IList a } } - var appHealthEvents = - appHealth.HealthEvents.Where(e => e.HealthInformation.HealthState == HealthState.Error || e.HealthInformation.HealthState == HealthState.Warning); + var appHealthEvents = appHealth.HealthEvents.Where(e => e.HealthInformation.HealthState == HealthState.Error || e.HealthInformation.HealthState == HealthState.Warning); if (appHealthEvents.Count() == 0) { - var evals = appHealth.UnhealthyEvaluations; + continue; + } - if (evals.Count == 0) + foreach (HealthEvent healthEvent in appHealthEvents.OrderByDescending(f => f.SourceUtcTimestamp)) + { + var foTelemetryData = TryGetFOHealthStateEventData(healthEvent, HealthScope.Application); + + // From FabricObserver? + if (foTelemetryData != null) { - continue; - } - - var eval = appHealth.UnhealthyEvaluations[0]; + // Telemetry. + if (TelemetryEnabled && ObserverTelemetryClient != null) + { + await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); + } - telemetryDescription += eval.Description; - - // Telemetry. 
- if (TelemetryEnabled && ObserverTelemetryClient != null) - { - var telemetryData = new TelemetryData(FabricClientInstance, token) + // ETW. + if (etwEnabled) { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - Description = telemetryDescription, - Source = ObserverName, - }; + double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? val : -1; - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); - } + Logger.EtwLogger?.Write( + ObserverConstants.ClusterObserverETWEventName, + new + { + foTelemetryData.ApplicationName, + foTelemetryData.ServiceName, + foTelemetryData.HealthState, + foTelemetryData.Description, + foTelemetryData.Metric, + foTelemetryData.ObserverName, + foTelemetryData.NodeName, + Source = ObserverName, + foTelemetryData.PartitionId, + foTelemetryData.ProcessId, + foTelemetryData.ReplicaId, + foTelemetryData.SystemServiceProcessName, + // 0 could be a real value, thus defaulting to -1 when tryparse returns false (see above).. + Value = value > -1 ? value : 0, + }); + } - // ETW. - if (etwEnabled) - { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName, - }); + // Reset + telemetryDescription = string.Empty; } - - // Reset - telemetryDescription = string.Empty; - } - else - { - // We only care about the latest (most recent) health event - there can be a very large number of events in the Health Event Store. - foreach (HealthEvent healthEvent in appHealthEvents.OrderByDescending(f => f.SourceUtcTimestamp).Take(1)) + else { - var foTelemetryData = TryGetFOHealthStateEventData(healthEvent, HealthScope.Application); - - // From FabricObserver? 
- if (foTelemetryData != null) + if (!string.IsNullOrWhiteSpace(healthEvent.HealthInformation.Description)) { - // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) - { - await ObserverTelemetryClient.ReportHealthAsync(foTelemetryData, token); - } - - // ETW. - if (etwEnabled) - { - double value = double.TryParse(foTelemetryData.Value?.ToString(), out double val) ? val : -1; - - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - foTelemetryData.ApplicationName, - foTelemetryData.HealthState, - foTelemetryData.Description, - foTelemetryData.Metric, - foTelemetryData.ObserverName, - foTelemetryData.NodeName, - Source = ObserverName, - foTelemetryData.PartitionId, - foTelemetryData.ReplicaId, - foTelemetryData.SystemServiceProcessName, - Value = value, - }); - } - - // Reset - telemetryDescription = string.Empty; + telemetryDescription += healthEvent.HealthInformation.Description; } else { - if (!string.IsNullOrWhiteSpace(healthEvent.HealthInformation.Description)) - { - telemetryDescription += healthEvent.HealthInformation.Description; - } - else - { - telemetryDescription += string.Join($"{Environment.NewLine}", appHealth.UnhealthyEvaluations); - } + telemetryDescription += string.Join($"{Environment.NewLine}", appHealth.UnhealthyEvaluations); + } - // Telemetry. - if (TelemetryEnabled && ObserverTelemetryClient != null) + // Telemetry. 
+ if (TelemetryEnabled && ObserverTelemetryClient != null) + { + var telemetryData = new TelemetryData(FabricClientInstance, token) { - var telemetryData = new TelemetryData(FabricClientInstance, token) - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - Description = telemetryDescription, - Source = ObserverName, - }; - - await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); - } + ApplicationName = appName.OriginalString, + HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), + Description = telemetryDescription, + Source = ObserverName, + }; - // ETW. - if (etwEnabled) - { - Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - ApplicationName = appName.OriginalString, - HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), - HealthEventDescription = telemetryDescription, - Source = ObserverName, - }); - } + await ObserverTelemetryClient.ReportHealthAsync(telemetryData, token); + } - // Reset - telemetryDescription = string.Empty; + // ETW. + if (etwEnabled) + { + Logger.EtwLogger?.Write( + ObserverConstants.ClusterObserverETWEventName, + new + { + ApplicationName = appName.OriginalString, + HealthState = Enum.GetName(typeof(HealthState), appHealth.AggregatedHealthState), + HealthEventDescription = telemetryDescription, + Source = ObserverName, + }); } + + // Reset + telemetryDescription = string.Empty; } - } + } } } @@ -552,9 +493,7 @@ private async Task ProcessNodeHealthAsync(IList nodeHealthState { // Check cluster upgrade status. 
int udInClusterUpgrade = await UpgradeChecker.GetUdsWhereFabricUpgradeInProgressAsync(FabricClientInstance, token).ConfigureAwait(false); - - var supportedNodeHealthStates = - nodeHealthStates.Where( a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); + var supportedNodeHealthStates = nodeHealthStates.Where( a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); foreach (var node in supportedNodeHealthStates) { @@ -680,13 +619,13 @@ private async Task ProcessGenericEntityHealthAsync(HealthEvaluation evaluation, if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthEventDescription = telemetryDescription, - HealthState = healthState, - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthEventDescription = telemetryDescription, + HealthState = healthState, + Source = ObserverName, + }); } } @@ -695,19 +634,14 @@ private async Task MonitorNodeStatusAsync(CancellationToken token) // If a node's NodeStatus is Disabling, Disabled, or Down // for at or above the specified maximum time (in Settings.xml), // then CO will emit a Warning signal. - var nodeList = - await FabricClientInstance.QueryManager.GetNodeListAsync( - null, - ConfigSettings.AsyncTimeout, - token).ConfigureAwait(true); + var nodeList = await FabricClientInstance.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, token).ConfigureAwait(true); // Are any of the nodes that were previously in non-Up status, now Up? 
if (NodeStatusDictionary.Count > 0) { foreach (var nodeDictItem in NodeStatusDictionary) { - if (!nodeList.Any(n => n.NodeName == nodeDictItem.Key - && n.NodeStatus == NodeStatus.Up)) + if (!nodeList.Any(n => n.NodeName == nodeDictItem.Key && n.NodeStatus == NodeStatus.Up)) { continue; } @@ -732,17 +666,17 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( if (etwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Node", - HealthState = "Ok", - HealthEventDescription = $"{nodeDictItem.Key} is now Up.", - Metric = "NodeStatus", - NodeName = nodeDictItem.Key, - NodeStatus = "Up", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Node", + HealthState = "Ok", + HealthEventDescription = $"{nodeDictItem.Key} is now Up.", + Metric = "NodeStatus", + NodeName = nodeDictItem.Key, + NodeStatus = "Up", + Source = ObserverName, + }); } // Clear dictionary entry. @@ -750,9 +684,7 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( } } - if (!nodeList.All( - n => - n.NodeStatus == NodeStatus.Up)) + if (!nodeList.All(n => n.NodeStatus == NodeStatus.Up)) { var filteredList = nodeList.Where( node => node.NodeStatus == NodeStatus.Disabled @@ -763,14 +695,11 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( { if (!NodeStatusDictionary.ContainsKey(node.NodeName)) { - NodeStatusDictionary.Add( - node.NodeName, - (node.NodeStatus, DateTime.Now, DateTime.Now)); + NodeStatusDictionary.Add(node.NodeName, (node.NodeStatus, DateTime.Now, DateTime.Now)); } else { - if (NodeStatusDictionary.TryGetValue( - node.NodeName, out var tuple)) + if (NodeStatusDictionary.TryGetValue(node.NodeName, out var tuple)) { NodeStatusDictionary[node.NodeName] = (node.NodeStatus, tuple.FirstDetectedTime, DateTime.Now); } @@ -811,17 +740,17 @@ await FabricClientInstance.QueryManager.GetNodeListAsync( if (etwEnabled) { Logger.EtwLogger?.Write( - 
ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Node", - HealthState = "Warning", - HealthEventDescription = message, - Metric = "NodeStatus", - NodeName = kvp.Key, - NodeStatus = $"{kvp.Value.NodeStatus}", - Source = ObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Node", + HealthState = "Warning", + HealthEventDescription = message, + Metric = "NodeStatus", + NodeName = kvp.Key, + NodeStatus = $"{kvp.Value.NodeStatus}", + Source = ObserverName, + }); } } } diff --git a/ClusterObserver/ClusterObserverManager.cs b/ClusterObserver/ClusterObserverManager.cs index 184b4969..32fc9151 100644 --- a/ClusterObserver/ClusterObserverManager.cs +++ b/ClusterObserver/ClusterObserverManager.cs @@ -5,11 +5,9 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Fabric; using System.Fabric.Health; using System.IO; -using System.Text; using System.Threading; using System.Threading.Tasks; using ClusterObserver.Interfaces; @@ -20,18 +18,17 @@ namespace ClusterObserver { public class ClusterObserverManager : IDisposable { + private static bool etwEnabled; private readonly string nodeName; + private readonly CancellationTokenSource linkedSFRuntimeObserverTokenSource; + private readonly CancellationToken token; private ClusterObserver observer; - private EventWaitHandle globalShutdownEventHandle; - private volatile bool shutdownSignaled; private int shutdownGracePeriodInSeconds = 2; private TimeSpan observerExecTimeout = TimeSpan.FromMinutes(30); - private CancellationToken token; private CancellationTokenSource cts; - private CancellationTokenSource linkedSFRuntimeObserverTokenSource; + private volatile bool shutdownSignaled; private bool hasDisposed; private bool appParamsUpdating; - private static bool etwEnabled; public bool IsObserverRunning { @@ -133,6 +130,7 @@ private static string GetConfigSettingValue(string parameterName) } catch (Exception e) when (e is 
KeyNotFoundException || e is FabricElementNotFoundException) { + } return null; @@ -145,84 +143,38 @@ private async void ShutdownHandler(object sender, ConsoleCancelEventArgs console return; } - Thread.Sleep(shutdownGracePeriodInSeconds * 1000); + await Task.Delay(TimeSpan.FromSeconds(shutdownGracePeriodInSeconds)).ConfigureAwait(false); shutdownSignaled = true; - _ = globalShutdownEventHandle?.Set(); await StopAsync(); } - // This impl is to ensure FCO exits if shutdown is requested while the over loop is sleeping - // So, instead of blocking with a Thread.Sleep, for example, ThreadSleep is used to ensure - // we can receive signals and act accordingly during thread sleep state. - private void ThreadSleep(EventWaitHandle ewh, TimeSpan timeout) - { - // if timeout is <= 0, return. 0 is infinite, and negative is not valid - if (timeout.TotalMilliseconds <= 0) - { - return; - } - - var elapsedTime = new TimeSpan(0, 0, 0); - var stopwatch = new Stopwatch(); - - while (!shutdownSignaled && - !token.IsCancellationRequested && - timeout > elapsedTime) - { - stopwatch.Start(); - - // The event can be signaled by CtrlC, - // Exit ASAP when the program terminates (i.e., shutdown/abort is signaled.) 
- _ = ewh.WaitOne(timeout.Subtract(elapsedTime)); - stopwatch.Stop(); - - elapsedTime = stopwatch.Elapsed; - } - - if (stopwatch.IsRunning) - { - stopwatch.Stop(); - } - } - private void SetPropertiesFromConfigurationParameters() { // Observer - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverExecutionTimeout), - out int result)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverExecutionTimeout), out int result)) { observerExecTimeout = TimeSpan.FromSeconds(result); } // Logger - if (bool.TryParse( - GetConfigSettingValue(ObserverConstants.EnableVerboseLoggingParameter), - out bool enableVerboseLogging)) + if (bool.TryParse(GetConfigSettingValue(ObserverConstants.EnableVerboseLoggingParameter), out bool enableVerboseLogging)) { Logger.EnableVerboseLogging = enableVerboseLogging; } - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverLoopSleepTimeSeconds), - out int execFrequency)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverLoopSleepTimeSeconds), out int execFrequency)) { ObserverExecutionLoopSleepSeconds = execFrequency; - - Logger.LogInfo($"ExecutionFrequency is {ObserverExecutionLoopSleepSeconds} Seconds"); } // Shutdown - if (int.TryParse( - GetConfigSettingValue(ObserverConstants.ObserverShutdownGracePeriodInSeconds), - out int gracePeriodInSeconds)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.ObserverShutdownGracePeriodInSeconds), out int gracePeriodInSeconds)) { shutdownGracePeriodInSeconds = gracePeriodInSeconds; } - if (int.TryParse(GetConfigSettingValue(ObserverConstants.AsyncOperationTimeoutSeconds), - out int asyncTimeout)) + if (int.TryParse(GetConfigSettingValue(ObserverConstants.AsyncOperationTimeoutSeconds), out int asyncTimeout)) { AsyncOperationTimeoutSeconds = asyncTimeout; } @@ -240,14 +192,12 @@ private void SetPropertiesFromConfigurationParameters() if (string.IsNullOrEmpty(telemetryProviderType)) { TelemetryEnabled = false; - return; } if 
(!Enum.TryParse(telemetryProviderType, out TelemetryProviderType telemetryProvider)) { TelemetryEnabled = false; - return; } @@ -255,29 +205,22 @@ private void SetPropertiesFromConfigurationParameters() { case TelemetryProviderType.AzureLogAnalytics: - var logAnalyticsLogType = - GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter) ?? "Application"; - - var logAnalyticsSharedKey = - GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter); + string logAnalyticsLogType = GetConfigSettingValue(ObserverConstants.LogAnalyticsLogTypeParameter) ?? "Application"; + string logAnalyticsSharedKey = GetConfigSettingValue(ObserverConstants.LogAnalyticsSharedKeyParameter); + string logAnalyticsWorkspaceId = GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter); - var logAnalyticsWorkspaceId = - GetConfigSettingValue(ObserverConstants.LogAnalyticsWorkspaceIdParameter); - - if (string.IsNullOrEmpty(logAnalyticsSharedKey) - || string.IsNullOrEmpty(logAnalyticsWorkspaceId)) + if (string.IsNullOrEmpty(logAnalyticsSharedKey) || string.IsNullOrEmpty(logAnalyticsWorkspaceId)) { TelemetryEnabled = false; - return; } TelemetryClient = new LogAnalyticsTelemetry( - logAnalyticsWorkspaceId, - logAnalyticsSharedKey, - logAnalyticsLogType, - FabricClientInstance, - token); + logAnalyticsWorkspaceId, + logAnalyticsSharedKey, + logAnalyticsLogType, + FabricClientInstance, + token); break; @@ -288,7 +231,6 @@ private void SetPropertiesFromConfigurationParameters() if (string.IsNullOrEmpty(aiKey)) { TelemetryEnabled = false; - return; } @@ -303,58 +245,57 @@ public async Task StartAsync() { try { - if (globalShutdownEventHandle == null) - { - globalShutdownEventHandle = new EventWaitHandle(false, EventResetMode.ManualReset); - } - while (true) { if (!appParamsUpdating && (shutdownSignaled || token.IsCancellationRequested)) { - _ = globalShutdownEventHandle.Set(); Logger.LogInfo("Shutdown signaled. 
Stopping."); + await StopAsync().ConfigureAwait(false); break; } await RunObserverAync().ConfigureAwait(false); - Logger.LogInfo($"Sleeping for {(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 10)} seconds before running again."); - ThreadSleep(globalShutdownEventHandle, TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 10)); + Logger.LogInfo($"Sleeping for {(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 30)} seconds before running again."); + await Task.Delay(TimeSpan.FromSeconds(ObserverExecutionLoopSleepSeconds > 0 ? ObserverExecutionLoopSleepSeconds : 30), token); Logger.Flush(); } } - catch (Exception ex) + catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException) { - var message = $"Unhanded Exception in ClusterObserverManager on node {nodeName}. Taking down CO process. Error info:{Environment.NewLine}{ex}"; + + } + catch (Exception e) + { + string message = $"Unhanded Exception in ClusterObserverManager on node {nodeName}. Taking down CO process. Error info:{Environment.NewLine}{e}"; Logger.LogError(message); // Telemetry. if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ClusterObserverServiceHealth", - HealthState.Warning, - message, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ClusterObserverServiceHealth", + HealthState.Warning, + message, + ObserverConstants.ObserverManagerName, + token); } // ETW. 
if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = message, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = message, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } // Don't swallow the unhandled exception. Fix the bug. @@ -388,27 +329,27 @@ private Task SignalAbortToRunningObserverAsync() if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ClusterObserverServiceHealth", - HealthState.Warning, - $"{e}", - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ClusterObserverServiceHealth", + HealthState.Warning, + $"{e}", + ObserverConstants.ObserverManagerName, + token); } // ETW. if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = $"{e}", - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = $"{e}", + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } } @@ -422,93 +363,81 @@ private async Task RunObserverAync() return; } - var exceptionBuilder = new StringBuilder(); - try { Logger.LogInfo($"Starting {observer.ObserverName}"); IsObserverRunning = true; // Synchronous call. - var isCompleted = observer.ObserveAsync( - linkedSFRuntimeObserverTokenSource != null ? 
linkedSFRuntimeObserverTokenSource.Token : token).Wait(observerExecTimeout); + var isCompleted = observer.ObserveAsync(linkedSFRuntimeObserverTokenSource != null ? linkedSFRuntimeObserverTokenSource.Token : token).Wait(observerExecTimeout); // The observer is taking too long (hung?) if (!isCompleted) { string observerHealthWarning = $"{observer.ObserverName} has exceeded its specified run time of {observerExecTimeout.TotalSeconds} seconds. Aborting."; - await SignalAbortToRunningObserverAsync(); + await SignalAbortToRunningObserverAsync().ConfigureAwait(false); Logger.LogWarning(observerHealthWarning); if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ObserverHealthReport", - HealthState.Warning, - observerHealthWarning, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ObserverHealthReport", + HealthState.Warning, + observerHealthWarning, + ObserverConstants.ObserverManagerName, + token); } if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = observerHealthWarning, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = observerHealthWarning, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } + // Create new instance of CO. 
observer = new ClusterObserver(); cts = new CancellationTokenSource(); } } - catch (AggregateException ex) when ( - ex.InnerException is OperationCanceledException || - ex.InnerException is TaskCanceledException || - ex.InnerException is TimeoutException) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - IsObserverRunning = false; - _ = exceptionBuilder.AppendLine($"Handled Exception from {observer.ObserverName}:{Environment.NewLine}{ex.InnerException}"); - Logger.LogError(exceptionBuilder.ToString()); - _ = exceptionBuilder.Clear(); - } - catch (Exception e) - { - string msg = $"Unhandled exception in ClusterObserverManager.Run(). Taking down process. Error info:{Environment.NewLine}{e}"; + string msg = $"Unhandled exception in ClusterObserverManager.RunObserverAync(). Taking down process. Error info:{Environment.NewLine}{e}"; Logger.LogError(msg); if (TelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - "ObserverHealthReport", - HealthState.Warning, - msg, - ObserverConstants.ObserverManagerName, - token); + HealthScope.Application, + "ObserverHealthReport", + HealthState.Warning, + msg, + ObserverConstants.ObserverManagerName, + token); } if (EtwEnabled) { Logger.EtwLogger?.Write( - ObserverConstants.ClusterObserverETWEventName, - new - { - HealthScope = "Application", - HealthState = "Warning", - HealthEventDescription = msg, - Metric = "ClusterObserverServiceHealth", - Source = ObserverConstants.ClusterObserverName, - }); + ObserverConstants.ClusterObserverETWEventName, + new + { + HealthScope = "Application", + HealthState = "Warning", + HealthEventDescription = msg, + Metric = "ClusterObserverServiceHealth", + Source = ObserverConstants.ClusterObserverName, + }); } throw; @@ -517,15 +446,21 @@ ex.InnerException is TaskCanceledException || IsObserverRunning = false; } - private void CodePackageActivationContext_ConfigurationPackageModifiedEvent( - object sender, - 
PackageModifiedEventArgs e) + /// + /// App parameter config update handler. This will recreate CO instance with new ConfigSettings applied. + /// + /// + /// + private async void CodePackageActivationContext_ConfigurationPackageModifiedEvent(object sender, PackageModifiedEventArgs e) { appParamsUpdating = true; Logger.LogInfo("Application Parameter upgrade started..."); - SignalAbortToRunningObserverAsync(); + + await SignalAbortToRunningObserverAsync(); + observer = new ClusterObserver(e.NewPackage.Settings); cts = new CancellationTokenSource(); + Logger.LogInfo("Application Parameter upgrade complete..."); appParamsUpdating = false; } @@ -547,8 +482,6 @@ protected virtual void Dispose(bool disposing) StopAsync().GetAwaiter().GetResult(); } - globalShutdownEventHandle?.Dispose(); - if (FabricClientInstance != null) { FabricClientInstance.Dispose(); @@ -564,9 +497,7 @@ protected virtual void Dispose(bool disposing) // Flush and Dispose all NLog targets. No more logging. Logger.Flush(); Logger.ShutDown(); - FabricServiceContext.CodePackageActivationContext.ConfigurationPackageModifiedEvent -= CodePackageActivationContext_ConfigurationPackageModifiedEvent; - hasDisposed = true; } diff --git a/ClusterObserver/FabricClusterObserver.cs b/ClusterObserver/FabricClusterObserver.cs index 74a61c2a..1d34b5d0 100644 --- a/ClusterObserver/FabricClusterObserver.cs +++ b/ClusterObserver/FabricClusterObserver.cs @@ -10,8 +10,6 @@ namespace ClusterObserver /// internal sealed class FabricClusterObserver : StatelessService { - private ClusterObserverManager observerManager; - public FabricClusterObserver(StatelessServiceContext context) : base(context) { @@ -24,20 +22,8 @@ public FabricClusterObserver(StatelessServiceContext context) /// Canceled when Service Fabric needs to shut down this service instance. 
protected override async Task RunAsync(CancellationToken cancellationToken) { - observerManager = new ClusterObserverManager(Context, cancellationToken); - + using var observerManager = new ClusterObserverManager(Context, cancellationToken); await observerManager.StartAsync().ConfigureAwait(true); } - - - protected override Task OnCloseAsync(CancellationToken cancellationToken) - { - if (observerManager != null) - { - observerManager.Dispose(); - } - - return base.OnCloseAsync(cancellationToken); - } } } diff --git a/ClusterObserver/PackageRoot/Config/Settings.xml b/ClusterObserver/PackageRoot/Config/Settings.xml index 9570a3c3..302cbdf8 100644 --- a/ClusterObserver/PackageRoot/Config/Settings.xml +++ b/ClusterObserver/PackageRoot/Config/Settings.xml @@ -1,12 +1,12 @@ 
- + - + - - + + - + @@ -32,7 +32,7 @@ - +
- + ClusterObserver @@ -21,7 +21,7 @@ - + diff --git a/ClusterObserver/Readme.md b/ClusterObserver/Readme.md index 1da6d140..7c30aa6f 100644 --- a/ClusterObserver/Readme.md +++ b/ClusterObserver/Readme.md @@ -1,4 +1,4 @@ -### ClusterObserver 2.1.0 +### ClusterObserver 2.1.7 ClusterObserver (CO) is a standalone SF singleton stateless service that runs on one node (1) and is independent from FabricObserver, which runs on all nodes (-1). CO observes cluster health (aggregated) and sends telemetry when cluster is in Error (and optionally in Warning). CO shares a very small subset of FabricObserver's (FO) code. It is designed to be completely independent from FO sources, diff --git a/ClusterObserver/Utilities/ConfigSettings.cs b/ClusterObserver/Utilities/ConfigSettings.cs index e9c61c99..e729eade 100644 --- a/ClusterObserver/Utilities/ConfigSettings.cs +++ b/ClusterObserver/Utilities/ConfigSettings.cs @@ -8,7 +8,6 @@ using System.Fabric; using System.Fabric.Description; using System.Linq; -using System.Threading; namespace ClusterObserver.Utilities { diff --git a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs index beef5084..7b479269 100644 --- a/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -19,7 +19,7 @@ namespace ClusterObserver.Utilities.Telemetry /// Abstracts the ApplicationInsights telemetry API calls allowing /// other telemetry providers to be plugged in. /// - public class AppInsightsTelemetry : ITelemetryProvider, IDisposable + public class AppInsightsTelemetry : ITelemetryProvider { /// /// ApplicationInsights telemetry client. 
@@ -57,14 +57,8 @@ public AppInsightsTelemetry(string key) /// public string Key { - get - { - return telemetryClient?.InstrumentationKey; - } - set - { - telemetryClient.InstrumentationKey = value; - } + get => telemetryClient?.InstrumentationKey; + set => telemetryClient.InstrumentationKey = value; } /// @@ -81,15 +75,15 @@ public string Key /// CancellationToken instance. /// A representing the asynchronous operation. public Task ReportAvailabilityAsync( - Uri serviceName, - string instance, - string testName, - DateTimeOffset captured, - TimeSpan duration, - string location, - bool success, - CancellationToken cancellationToken, - string message = null) + Uri serviceName, + string instance, + string testName, + DateTimeOffset captured, + TimeSpan duration, + string location, + bool success, + CancellationToken cancellationToken, + string message = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -117,13 +111,9 @@ public Task ReportAvailabilityAsync( /// TelemetryData instance. /// CancellationToken instance. /// a Task. - public Task ReportHealthAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (!IsEnabled - || cancellationToken.IsCancellationRequested - || telemetryData == null) + if (!IsEnabled || cancellationToken.IsCancellationRequested || telemetryData == null) { return Task.FromResult(1); } @@ -141,29 +131,28 @@ public Task ReportHealthAsync( Dictionary properties = new Dictionary { - { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, + { "HealthState", telemetryData.HealthState ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? 
string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? string.Empty }, { "ErrorCode", telemetryData.Code ?? string.Empty }, { "Description", telemetryData.Description ?? string.Empty }, - { "HealthState", telemetryData.HealthState ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", $"{telemetryData.PartitionId}" }, - { "Replica", $"{telemetryData.ReplicaId}" }, - { "Source", telemetryData.Source ?? ObserverConstants.ClusterObserverName }, { "Value", value ?? string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, }; telemetryClient.TrackEvent(ObserverConstants.ClusterObserverETWEventName, properties); } catch (Exception e) { - logger.LogWarning( - $"Unhandled exception in TelemetryClient.ReportHealthAsync:" + - $"{Environment.NewLine}{e}"); - - throw; + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); } return Task.FromResult(0); @@ -182,14 +171,14 @@ public Task ReportHealthAsync( /// Optional: TraceTelemetry context cloud instance name. /// A representing the asynchronous operation. 
public Task ReportHealthAsync( - HealthScope scope, - string propertyName, - HealthState state, - string unhealthyEvaluations, - string source, - CancellationToken cancellationToken, - string serviceName = null, - string instanceName = null) + HealthScope scope, + string propertyName, + HealthState state, + string unhealthyEvaluations, + string source, + CancellationToken cancellationToken, + string serviceName = null, + string instanceName = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -219,7 +208,6 @@ public Task ReportHealthAsync( catch (Exception e) { logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); - throw; } return Task.FromResult(0); @@ -232,10 +220,7 @@ public Task ReportHealthAsync( /// Value of the property. /// CancellationToken instance. /// Task of bool. - public Task ReportMetricAsync( - string name, - T value, - CancellationToken cancellationToken) + public Task ReportMetricAsync(string name, T value, CancellationToken cancellationToken) { if (!IsEnabled || cancellationToken.IsCancellationRequested || string.IsNullOrEmpty(name)) { @@ -317,17 +302,17 @@ public async Task ReportMetricAsync(string role, long id, string name, long valu /// CancellationToken instance. /// A representing the asynchronous operation. 
public Task ReportMetricAsync( - string roleName, - string instance, - string name, - long value, - int count, - long min, - long max, - long sum, - double deviation, - IDictionary properties, - CancellationToken cancellationToken) + string roleName, + string instance, + string name, + long value, + int count, + long min, + long max, + long sum, + double deviation, + IDictionary properties, + CancellationToken cancellationToken) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -359,38 +344,5 @@ public Task ReportMetricAsync( return Task.FromResult(0); } - - private bool disposedValue; // To detect redundant calls - - protected virtual void Dispose(bool disposing) - { - if (!disposedValue) - { - if (disposing) - { - } - - disposedValue = true; - } - } - - // TODO: override a finalizer only if Dispose(bool disposing) above has code to free unmanaged resources. - // ~AppInsightsTelemetry() - // { - // // Do not change this code. Put cleanup code in Dispose(bool disposing) above. - // Dispose(false); - // } - - // This code added to correctly implement the disposable pattern. - - - public void Dispose() - { - // Do not change this code. Put cleanup code in Dispose(bool disposing) above. - Dispose(true); - - // TODO: uncomment the following line if the finalizer is overridden above. 
- // GC.SuppressFinalize(this); - } } } \ No newline at end of file diff --git a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs index 31cdb896..a25ff4ed 100644 --- a/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs +++ b/ClusterObserver/Utilities/Telemetry/LogAnalyticsTelemetry.cs @@ -121,13 +121,11 @@ private async Task SendTelemetryAsync(string payload, CancellationToken token) logger.LogWarning($"Unexpected response from server in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{responseAsync.StatusCode}: {responseAsync.StatusDescription}"); } -#pragma warning disable CA1031 // Do not take down process due to unhandled exception during telemetry transmission. Log it, fix the bug. catch (Exception e) { // An Exception during telemetry data submission should never take down CO process. Log it. Don't throw it. Fix it. logger.LogWarning($"Handled Exception in LogAnalyticsTelemetry.SendTelemetryAsync:{Environment.NewLine}{e}"); } -#pragma warning restore CA1031 // It's ok here. if (retries < MaxRetries) { @@ -178,24 +176,23 @@ public async Task ReportHealthAsync( string serviceName = null, string instanceName = null) { - var (clusterId, _) = - await ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).ConfigureAwait(true); + var (clusterId, _) = await ClusterIdentificationUtility.TupleGetClusterIdAndTypeAsync(fabricClient, token).ConfigureAwait(true); string jsonPayload = JsonConvert.SerializeObject( - new - { - id = $"CO_{Guid.NewGuid()}", - datetime = DateTime.UtcNow, - clusterId = clusterId ?? string.Empty, - source = ObserverConstants.ClusterObserverName, - property = propertyName, - healthScope = scope.ToString(), - healthState = state.ToString(), - healthEvaluation = unhealthyEvaluations, - serviceName = serviceName ?? string.Empty, - instanceName = instanceName ?? 
string.Empty, - osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", - }); + new + { + id = $"CO_{Guid.NewGuid()}", + datetime = DateTime.UtcNow, + clusterId = clusterId ?? string.Empty, + source = ObserverConstants.ClusterObserverName, + property = propertyName, + healthScope = scope.ToString(), + healthState = state.ToString(), + healthEvaluation = unhealthyEvaluations, + serviceName = serviceName ?? string.Empty, + instanceName = instanceName ?? string.Empty, + osPlatform = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", + }); await SendTelemetryAsync(jsonPayload, cancellationToken).ConfigureAwait(false); } diff --git a/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs b/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs index 54dd37c0..93e15fcb 100644 --- a/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs +++ b/ClusterObserver/Utilities/Telemetry/TelemetryProdiverType.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ -using System; - namespace ClusterObserver.Utilities.Telemetry { public enum TelemetryProviderType diff --git a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index bcd7086c..12cc57f5 100644 --- a/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/ClusterObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -7,14 +7,14 @@ - + - + diff --git a/FabricObserver.Extensibility/ObserverBase.cs b/FabricObserver.Extensibility/ObserverBase.cs index 0f594485..f93c2a32 100644 --- a/FabricObserver.Extensibility/ObserverBase.cs +++ b/FabricObserver.Extensibility/ObserverBase.cs @@ -683,8 +683,6 @@ public void ProcessResourceDataReportHealth( { ApplicationName = appName?.OriginalString ?? 
string.Empty, NodeName = NodeName, - Code = string.Empty, - HealthState = string.Empty, ObserverName = ObserverName, Metric = data.Property, Value = Math.Round(data.AverageDataValue, 0), @@ -758,8 +756,6 @@ public void ProcessResourceDataReportHealth( // of user telemetry settings. telemetryData = new TelemetryData(FabricClientInstance, Token) { - Code = string.Empty, - HealthState = string.Empty, NodeName = NodeName, ObserverName = ObserverName, Metric = $"{drive}{data.Property}", @@ -778,8 +774,6 @@ public void ProcessResourceDataReportHealth( ObserverConstants.FabricObserverETWEventName, new { - Code = string.Empty, - HealthState = string.Empty, NodeName, ObserverName, Metric = $"{drive}{data.Property}", @@ -1042,7 +1036,7 @@ public void ProcessResourceDataReportHealth( // Telemetry if (IsTelemetryEnabled) { - _ = TelemetryClient?.ReportMetricAsync(telemetryData, Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. @@ -1195,7 +1189,7 @@ private void SetObserverConfiguration() IsEtwProviderEnabled = etwProviderEnabled; } - // (Assuming Diagnostics/Analytics cloud service implemented) Telemetry. + // Telemetry. 
if (bool.TryParse(GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.TelemetryEnabled), out bool telemEnabled)) { IsTelemetryProviderEnabled = telemEnabled; @@ -1232,11 +1226,9 @@ private void SetObserverConfiguration() string logAnalyticsWorkspaceId = GetSettingParameterValue(ObserverConstants.ObserverManagerConfigurationSectionName, ObserverConstants.LogAnalyticsWorkspaceIdParameter); - if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) - || string.IsNullOrEmpty(logAnalyticsSharedKey)) + if (string.IsNullOrEmpty(logAnalyticsWorkspaceId) || string.IsNullOrEmpty(logAnalyticsSharedKey)) { IsTelemetryProviderEnabled = false; - return; } @@ -1256,7 +1248,6 @@ private void SetObserverConfiguration() if (string.IsNullOrEmpty(aiKey)) { IsTelemetryProviderEnabled = false; - return; } @@ -1274,8 +1265,6 @@ private void SetObserverConfiguration() private void InitializeCsvLogger() { - // This could be called from app paramter-only update handler. - // You can turn CSV data logging on and off with app parameter updates for 3 observers: AppObserver, FabricSystemObserver and NodeObserver. 
if (CsvFileLogger != null) { return; @@ -1320,6 +1309,7 @@ private bool IsObserverWebApiAppInstalled() } catch (Exception e) when (e is FabricException || e is TimeoutException) { + } return false; diff --git a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs index e3c5fc52..e6053e66 100644 --- a/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs +++ b/FabricObserver.Extensibility/Utilities/DataTableFileLogger.cs @@ -4,9 +4,7 @@ // ------------------------------------------------------------ using System; -using System.Collections.Generic; using System.IO; -using System.Linq; using System.Runtime.InteropServices; using FabricObserver.Interfaces; using NLog; @@ -23,8 +21,6 @@ private static ILogger DataLogger get; set; } - private readonly Dictionary FolderCleanedState; - public string DataLogFolder { get; set; @@ -41,7 +37,7 @@ public CsvFileWriteFormat FileWriteFormat } /// - /// The maximum number of archive files that will be stored. + /// The maximum number of days that archive files will be stored. /// 0 means there is no limit set. /// public int MaxArchiveCsvFileLifetimeDays @@ -51,7 +47,7 @@ public int MaxArchiveCsvFileLifetimeDays public DataTableFileLogger() { - FolderCleanedState = new Dictionary(); + } public void ConfigureLogger(string filename) @@ -103,22 +99,10 @@ public void ConfigureLogger(string filename) var csvPath = Path.Combine(logFullPath, filename + ".csv"); - // Clean out old files. + // Clean out old files if written as MultipleFilesNoArchives. if (MaxArchiveCsvFileLifetimeDays > 0 && FileWriteFormat == CsvFileWriteFormat.MultipleFilesNoArchives) { - // Add folder path to state dictionary. - if (!FolderCleanedState.ContainsKey(logFullPath)) - { - FolderCleanedState.Add(logFullPath, DateTime.UtcNow); - } - else - { - // Only clean a folder that hasn't been cleaned for MaxArchiveCsvFileLifetimeDays days. 
- if (DateTime.UtcNow.Subtract(FolderCleanedState[logFullPath]) >= TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)) - { - CleanLogFolder(logFullPath, TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)); - } - } + TryCleanLogFolder(logFullPath, TimeSpan.FromDays(MaxArchiveCsvFileLifetimeDays)); } if (DataLogger == null) @@ -181,11 +165,9 @@ public static void Flush() LogManager.Flush(); } - private void CleanLogFolder(string folderPath, TimeSpan maxAge) + private void TryCleanLogFolder(string folderPath, TimeSpan maxAge) { - int count = 0; - - if (Directory.Exists(folderPath)) + if (Directory.Exists(folderPath) && DateTime.UtcNow.Subtract(Directory.GetLastWriteTimeUtc(folderPath)) >= maxAge) { string[] files = Directory.GetFiles(folderPath, "*", SearchOption.AllDirectories); @@ -196,7 +178,6 @@ private void CleanLogFolder(string folderPath, TimeSpan maxAge) if (DateTime.UtcNow.Subtract(File.GetCreationTime(file)) >= maxAge) { File.Delete(file); - count++; } } catch (Exception e) when (e is ArgumentException || e is IOException || e is UnauthorizedAccessException || e is PathTooLongException) @@ -204,12 +185,6 @@ private void CleanLogFolder(string folderPath, TimeSpan maxAge) } } - - if (count > 0) - { - // The dictionary will always contain the folderPath key. See calling code. 
- FolderCleanedState[folderPath] = DateTime.UtcNow; - } } } } diff --git a/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs b/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs index 209fb4d2..9d32655e 100644 --- a/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs +++ b/FabricObserver.Extensibility/Utilities/FabricClientRetryErrors.cs @@ -30,7 +30,7 @@ public class FabricClientRetryErrors public static readonly Lazy MoveSecondaryFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.AlreadySecondaryReplica); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.AlreadySecondaryReplica); retryErrors.RetryableFabricErrorCodes.Add(FabricErrorCode.PLBNotReady); return retryErrors; }); @@ -41,7 +41,7 @@ public class FabricClientRetryErrors public static readonly Lazy MovePrimaryFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.AlreadyPrimaryReplica); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.AlreadyPrimaryReplica); retryErrors.RetryableFabricErrorCodes.Add(FabricErrorCode.PLBNotReady); return retryErrors; }); @@ -92,7 +92,7 @@ public class FabricClientRetryErrors public static readonly Lazy ProvisionFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricVersionAlreadyExists); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricVersionAlreadyExists); return retryErrors; }); @@ -102,8 +102,8 @@ public class FabricClientRetryErrors public static readonly Lazy UpgradeFabricErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricUpgradeInProgress); - 
retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.FabricAlreadyInTargetVersion); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricUpgradeInProgress); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.FabricAlreadyInTargetVersion); return retryErrors; }); @@ -113,7 +113,7 @@ public class FabricClientRetryErrors public static readonly Lazy RemoveUnreliableTransportBehaviorErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.InternalRetrySuccesSFabricErrorCodes.Add(2147949808); + retryErrors.InternalRetrySuccessFabricErrorCodes.Add(2147949808); return retryErrors; }); @@ -123,7 +123,7 @@ public class FabricClientRetryErrors public static readonly Lazy CreateAppErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.ApplicationAlreadyExists); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.ApplicationAlreadyExists); return retryErrors; }); @@ -133,7 +133,7 @@ public class FabricClientRetryErrors public static readonly Lazy DeleteAppErrors = new Lazy(() => { var retryErrors = new FabricClientRetryErrors(); - retryErrors.RetrySuccesSFabricErrorCodes.Add(FabricErrorCode.ApplicationNotFound); + retryErrors.RetrySuccessFabricErrorCodes.Add(FabricErrorCode.ApplicationNotFound); return retryErrors; }); @@ -146,9 +146,8 @@ public FabricClientRetryErrors() RetryableExceptions = new List(); RetryableFabricErrorCodes = new List(); RetrySuccessExceptions = new List(); - RetrySuccesSFabricErrorCodes = new List(); - - InternalRetrySuccesSFabricErrorCodes = new List(); + RetrySuccessFabricErrorCodes = new List(); + InternalRetrySuccessFabricErrorCodes = new List(); PopulateDefaultValues(); } @@ -180,7 +179,7 @@ public IList RetrySuccessExceptions /// /// Gets list of success error codes that are retry-able. 
/// - public IList RetrySuccesSFabricErrorCodes + public IList RetrySuccessFabricErrorCodes { get; private set; } @@ -188,7 +187,7 @@ public IList RetrySuccesSFabricErrorCodes /// /// Gets list of internal success error codes that are retry-able. /// - internal IList InternalRetrySuccesSFabricErrorCodes + internal IList InternalRetrySuccessFabricErrorCodes { get; private set; } diff --git a/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs b/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs index ef2fda25..4711df9b 100644 --- a/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs +++ b/FabricObserver.Extensibility/Utilities/FabricClientRetryHelper.cs @@ -26,15 +26,13 @@ public static class FabricClientRetryHelper /// Action to be performed. /// Cancellation token for Async operation. /// Task object. - public static async Task ExecuteFabricActionWithRetryAsync( - Func> function, - CancellationToken cancellationToken) + public static async Task ExecuteFabricActionWithRetryAsync(Func> function, CancellationToken cancellationToken) { return await ExecuteFabricActionWithRetryAsync( - function, - new FabricClientRetryErrors(), - DefaultOperationTimeout, - cancellationToken).ConfigureAwait(false); + function, + new FabricClientRetryErrors(), + DefaultOperationTimeout, + cancellationToken).ConfigureAwait(false); } /// @@ -46,10 +44,10 @@ public static async Task ExecuteFabricActionWithRetryAsync( /// Cancellation token for Async operation. /// Task object. 
public static async Task ExecuteFabricActionWithRetryAsync( - Func> function, - FabricClientRetryErrors errors, - TimeSpan operationTimeout, - CancellationToken cancellationToken) + Func> function, + FabricClientRetryErrors errors, + TimeSpan operationTimeout, + CancellationToken cancellationToken) { bool needToWait = false; var watch = new Stopwatch(); @@ -77,16 +75,15 @@ public static async Task ExecuteFabricActionWithRetryAsync( if (retryElseSuccess) { - Logger.LogInfo( - $"ExecuteFabricActionWithRetryAsync: Retrying due to Exception: {e}"); + Logger.LogInfo($"ExecuteFabricActionWithRetryAsync: Retrying due to Exception: {e}"); if (watch.Elapsed > operationTimeout) { Logger.LogWarning( - "ExecuteFabricActionWithRetryAsync: Done Retrying. " + - $"Time Elapsed: {watch.Elapsed.TotalSeconds}, " + - $"Timeout: {operationTimeout.TotalSeconds}. " + - $"Throwing Exception: {e}"); + "ExecuteFabricActionWithRetryAsync: Done Retrying. " + + $"Time Elapsed: {watch.Elapsed.TotalSeconds}, " + + $"Timeout: {operationTimeout.TotalSeconds}. 
" + + $"Throwing Exception: {e}"); throw; } @@ -96,70 +93,58 @@ public static async Task ExecuteFabricActionWithRetryAsync( continue; } - Logger.LogInfo( - $"ExecuteFabricActionWithRetryAsync: Exception {e} Handled but No Retry."); + Logger.LogInfo($"ExecuteFabricActionWithRetryAsync: Exception {e} Handled but No Retry."); return default; } } } - private static bool HandleException( - Exception e, - FabricClientRetryErrors errors, - out bool retryElseSuccess) + private static bool HandleException(Exception e, FabricClientRetryErrors errors, out bool retryElseSuccess) { var fabricException = e as FabricException; if (errors.RetryableExceptions.Contains(e.GetType())) { retryElseSuccess = true /*retry*/; - return true; } if (fabricException != null && errors.RetryableFabricErrorCodes.Contains(fabricException.ErrorCode)) { retryElseSuccess = true /*retry*/; - return true; } if (errors.RetrySuccessExceptions.Contains(e.GetType())) { retryElseSuccess = false /*success*/; - return true; } if (fabricException != null - && errors.RetrySuccesSFabricErrorCodes.Contains(fabricException.ErrorCode)) + && errors.RetrySuccessFabricErrorCodes.Contains(fabricException.ErrorCode)) { retryElseSuccess = false /*success*/; - return true; } if (e.GetType() == typeof(FabricTransientException)) { retryElseSuccess = true /*retry*/; - return true; } if (fabricException?.InnerException != null) { - if (fabricException.InnerException is COMException ex - && errors.InternalRetrySuccesSFabricErrorCodes.Contains((uint)ex.ErrorCode)) + if (fabricException.InnerException is COMException ex && errors.InternalRetrySuccessFabricErrorCodes.Contains((uint)ex.ErrorCode)) { retryElseSuccess = false /*success*/; - return true; } } retryElseSuccess = false; - return false; } } diff --git a/FabricObserver.Extensibility/Utilities/Logger.cs b/FabricObserver.Extensibility/Utilities/Logger.cs index 7116b4c3..21543792 100644 --- a/FabricObserver.Extensibility/Utilities/Logger.cs +++ 
b/FabricObserver.Extensibility/Utilities/Logger.cs @@ -22,7 +22,9 @@ namespace FabricObserver.Observers.Utilities public sealed class Logger : IObserverLogger { private const int Retries = 5; - private EventSource etwLogger = null; + + // This needs to be static to prevent internal EventSource instantiation errors. + private static EventSource etwLogger = null; // Text file logger for observers - info/warn/error. private ILogger OLogger @@ -76,7 +78,7 @@ public string Filename } /// - /// The maximum number of archive files that will be stored. + /// The maximum number of days that archive files will be stored. /// 0 means there is no limit set. /// public int MaxArchiveFileLifetimeDays @@ -276,8 +278,8 @@ public void InitializeLoggers() FileName = file, Layout = "${longdate}--${uppercase:${level}}--${message}", OpenFileCacheTimeout = 5, - ArchiveNumbering = ArchiveNumberingMode.DateAndSequence, ArchiveEvery = FileArchivePeriod.Day, + ArchiveNumbering = ArchiveNumberingMode.DateAndSequence, MaxArchiveDays = MaxArchiveFileLifetimeDays <= 0 ? 7 : MaxArchiveFileLifetimeDays, AutoFlush = true, }; diff --git a/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs b/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs index 9f2fd5ee..c4401d97 100644 --- a/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs +++ b/FabricObserver.Extensibility/Utilities/MemInfoConstants.cs @@ -1,4 +1,7 @@ -using NLog.LayoutRenderers; +// ------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
+// ------------------------------------------------------------ namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs index 437f4ceb..7af5cd8f 100644 --- a/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs +++ b/FabricObserver.Extensibility/Utilities/ObserverHealthReporter.cs @@ -38,19 +38,21 @@ public ObserverHealthReporter(Logger logger, FabricClient fabricClient) /// Name of the health property. /// Health state (Ok, Error, etc). /// Description of the health condition. - public void ReportFabricObserverServiceHealth( - string serviceName, - string propertyName, - HealthState healthState, - string description) + public void ReportFabricObserverServiceHealth(string serviceName, string propertyName, HealthState healthState, string description) { + string msg = $"{propertyName} reporting {healthState}: {description}"; + if (healthState == HealthState.Error) { - logger.LogError("FabricObserver service health error: " + serviceName + " | " + propertyName + " | {0}", description); + logger.LogError(msg); } else if (healthState == HealthState.Warning) { - logger.LogWarning("FabricObserver service health warning: " + serviceName + " | " + propertyName + " | {0}", description); + logger.LogWarning(msg); + } + else if (logger.EnableVerboseLogging) + { + logger.LogInfo(msg); } } @@ -86,16 +88,14 @@ public void ReportHealthToServiceFabric(HealthReport healthReport) string errWarnPreamble = string.Empty; - if (healthReport.State == HealthState.Error - || healthReport.State == HealthState.Warning) + if (healthReport.State == HealthState.Error || healthReport.State == HealthState.Warning) { errWarnPreamble = $"{healthReport.Observer} detected " + $"{Enum.GetName(typeof(HealthState), healthReport.State)} threshold breach. 
"; // OSObserver does not monitor resources and therefore does not support related usage threshold configuration. - if (healthReport.Observer == ObserverConstants.OSObserverName - && healthReport.Property == "OSConfiguration") + if (healthReport.Observer == ObserverConstants.OSObserverName && healthReport.Property == "OSConfiguration") { errWarnPreamble = $"{ObserverConstants.OSObserverName} detected potential problem with OS configuration: "; } diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs index 71849c33..6a11a38d 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/LinuxInfoProvider.cs @@ -30,7 +30,7 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem return (totalMem, Math.Round(pctUsed, 2)); } - public override int GetActivePortCount(int processId = -1, ServiceContext context = null) + public override int GetActiveTcpPortCount(int processId = -1, ServiceContext context = null) { int count = GetPortCount(processId, predicate: (line) => true, context); return count; diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs index eda5b721..9c7cde60 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/OperatingSystemInfoProvider.cs @@ -53,7 +53,7 @@ protected Logger Logger public abstract (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMemorySizeAndPercentInUse(); - public abstract int GetActivePortCount(int processId = -1, ServiceContext context = null); + public abstract int GetActiveTcpPortCount(int processId = -1, ServiceContext context = 
null); public abstract int GetActiveEphemeralPortCount(int processId = -1, ServiceContext context = null); diff --git a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs index 53d19cef..31caba21 100644 --- a/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/OperatingSystemInfo/WindowsInfoProvider.cs @@ -63,11 +63,9 @@ public override (long TotalMemory, double PercentInUse) TupleGetTotalPhysicalMem return (visibleTotal / 1024 / 1024, Math.Round(usedPct, 2)); } } - catch (Exception e) when ( - e is FormatException - || e is InvalidCastException - || e is ManagementException) + catch (Exception e) when (e is FormatException || e is InvalidCastException || e is ManagementException) { + Logger.LogWarning($"Handled failure in TupleGetTotalPhysicalMemorySizeAndPercentInUse:{Environment.NewLine}{e}"); } finally { @@ -78,13 +76,208 @@ e is FormatException return (-1L, -1); } + public override (int LowPort, int HighPort) TupleGetDynamicPortRange() + { + using (var p = new Process()) + { + try + { + var ps = new ProcessStartInfo + { + Arguments = $"/c netsh int ipv4 show dynamicportrange {TcpProtocol} | find /i \"port\"", + FileName = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe", + UseShellExecute = false, + WindowStyle = ProcessWindowStyle.Hidden, + RedirectStandardInput = true, + RedirectStandardOutput = true, + }; + + p.StartInfo = ps; + _ = p.Start(); + + var stdOutput = p.StandardOutput; + string output = stdOutput.ReadToEnd(); + Match match = Regex.Match( + output, + @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", + RegexOptions.Singleline | RegexOptions.IgnoreCase); + + string startPort = match.Groups["startPort"].Value; + string portCount = match.Groups["numberOfPorts"].Value; + string exitStatus = p.ExitCode.ToString(); + 
stdOutput.Close(); + + if (exitStatus != "0") + { + return (-1, -1); + } + + int lowPortRange = int.Parse(startPort); + int highPortRange = lowPortRange + int.Parse(portCount); + + return (lowPortRange, highPortRange); + } + catch (Exception e) when ( + e is ArgumentException + || e is IOException + || e is InvalidOperationException + || e is RegexMatchTimeoutException + || e is Win32Exception) + { + } + } + + return (-1, -1); + } + /// - /// Compute count of active ports in dynamic range. + /// Compute count of active TCP ports in dynamic range. /// /// Optional: If supplied, then return the number of ephemeral ports in use by the process. - /// Optional (this is used by Linux callers only): If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. - /// + /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): + /// If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. + /// number of active Epehemeral TCP ports as int value public override int GetActiveEphemeralPortCount(int processId = -1, ServiceContext context = null) + { + int count; + + try + { + count = Retry.Do(() => GetEphemeralPortCount(processId), TimeSpan.FromSeconds(3), CancellationToken.None); + } + catch (AggregateException ae) + { + Logger.LogWarning($"Retry failed for GetActiveEphemeralPortCount:{Environment.NewLine}{ae}"); + count = -1; + } + + return count; + } + + /// + /// Compute count of active TCP ports. + /// + /// Optional: If supplied, then return the number of tcp ports in use by the process. + /// Optional (this is used by Linux callers only - see LinuxInfoProvider.cs): + /// If supplied, will use the ServiceContext to find the Linux Capabilities binary to run this command. 
+ /// number of active TCP ports as int value + public override int GetActiveTcpPortCount(int processId = -1, ServiceContext context = null) + { + int count; + + try + { + count = Retry.Do(() => GetTcpPortCount(processId), TimeSpan.FromSeconds(3), CancellationToken.None); + } + catch (AggregateException ae) + { + Logger.LogWarning($"Retry failed for GetActivePortCount:{Environment.NewLine}{ae}"); + count = -1; + } + + return count; + } + + public override Task GetOSInfoAsync(CancellationToken cancellationToken) + { + ManagementObjectSearcher win32OsInfo = null; + ManagementObjectCollection results = null; + + OSInfo osInfo = default; + + try + { + win32OsInfo = new ManagementObjectSearcher("SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory,TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); + results = win32OsInfo.Get(); + + foreach (var prop in results) + { + cancellationToken.ThrowIfCancellationRequested(); + + foreach (var p in prop.Properties) + { + cancellationToken.ThrowIfCancellationRequested(); + + string name = p.Name; + string value = p.Value.ToString(); + + if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(value)) + { + continue; + } + + switch (name.ToLowerInvariant()) + { + case "caption": + osInfo.Name = value; + break; + case "numberofprocesses": + if (int.TryParse(value, out int numProcesses)) + { + osInfo.NumberOfProcesses = numProcesses; + } + else + { + osInfo.NumberOfProcesses = -1; + } + + break; + case "status": + osInfo.Status = value; + break; + case "oslanguage": + osInfo.Language = value; + break; + case "version": + osInfo.Version = value; + break; + case "installdate": + osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); + break; + case "lastbootuptime": + osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); + break; + case 
"freephysicalmemory": + osInfo.AvailableMemoryKB = ulong.Parse(value); + break; + case "freevirtualmemory": + osInfo.FreeVirtualMemoryKB = ulong.Parse(value); + break; + case "totalvirtualmemorysize": + osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); + break; + case "totalvisiblememorysize": + osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); + break; + } + } + } + } + catch (ManagementException) + { + } + finally + { + results?.Dispose(); + win32OsInfo?.Dispose(); + } + + return Task.FromResult(osInfo); + } + + // Not implemented. No Windows support. + public override int GetMaximumConfiguredFileHandlesCount() + { + return -1; + } + + // Not implemented. No Windows support. + public override int GetTotalAllocatedFileHandlesCount() + { + return -1; + } + + private int GetEphemeralPortCount(int processId = -1) { try { @@ -113,7 +306,7 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte p.StartInfo = ps; _ = p.Start(); var stdOutput = p.StandardOutput; - + (int lowPortRange, int highPortRange) = TupleGetDynamicPortRange(); string portRow; while ((portRow = stdOutput.ReadLine()) != null) @@ -129,6 +322,8 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte // would artificially increase the count of ports that FO computes. if (processId > 0) { + /* A pid could be a subset of a port number, so make sure that we only match pid. 
*/ + List stats = portRow.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(); if (stats.Count != 5 || !int.TryParse(stats[4], out int pidPart)) @@ -169,77 +364,26 @@ public override int GetActiveEphemeralPortCount(int processId = -1, ServiceConte if (exitStatus != 0) { - return -1; - } - } - - return count; - } - catch (Exception e) when ( - e is ArgumentException - || e is InvalidOperationException - || e is Win32Exception) - { - } - - return -1; - } + string msg = $"netstat failure: {exitStatus}"; + Logger.LogWarning(msg); - public override (int LowPort, int HighPort) TupleGetDynamicPortRange() - { - using (var p = new Process()) - { - try - { - var ps = new ProcessStartInfo - { - Arguments = $"/c netsh int ipv4 show dynamicportrange {TcpProtocol} | find /i \"port\"", - FileName = $"{Environment.GetFolderPath(Environment.SpecialFolder.System)}\\cmd.exe", - UseShellExecute = false, - WindowStyle = ProcessWindowStyle.Hidden, - RedirectStandardInput = true, - RedirectStandardOutput = true, - }; - - p.StartInfo = ps; - _ = p.Start(); - - var stdOutput = p.StandardOutput; - string output = stdOutput.ReadToEnd(); - Match match = Regex.Match( - output, - @"Start Port\s+:\s+(?\d+).+?Number of Ports\s+:\s+(?\d+)", - RegexOptions.Singleline | RegexOptions.IgnoreCase); - - string startPort = match.Groups["startPort"].Value; - string portCount = match.Groups["numberOfPorts"].Value; - string exitStatus = p.ExitCode.ToString(); - stdOutput.Close(); - - if (exitStatus != "0") - { - return (-1, -1); + // this will be handled by Retry.Do(). 
+ throw new Exception(msg); } - int lowPortRange = int.Parse(startPort); - int highPortRange = lowPortRange + int.Parse(portCount); - - return (lowPortRange, highPortRange); - } - catch (Exception e) when ( - e is ArgumentException - || e is IOException - || e is InvalidOperationException - || e is RegexMatchTimeoutException - || e is Win32Exception) - { + return count; } } - - return (-1, -1); + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) + { + Logger.LogWarning($"Handled Exception in GetEphemeralPortCount:{Environment.NewLine}{e}"); + + // This will be handled by Retry.Do(). + throw; + } } - public override int GetActivePortCount(int processId = -1, ServiceContext context = null) + private int GetTcpPortCount(int processId = -1) { try { @@ -282,8 +426,10 @@ public override int GetActivePortCount(int processId = -1, ServiceContext contex // Only add unique pid (if supplied in call) and local port data to list. if (processId > 0) { + /* A pid could be a subset of a port number, so make sure that we only match pid. */ + List stats = portRow.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(); - + if (stats.Count != 5 || !int.TryParse(stats[4], out int pidPart)) { continue; @@ -310,114 +456,29 @@ public override int GetActivePortCount(int processId = -1, ServiceContext contex output = tempLocalPortData.Count; p.WaitForExit(); - string exitStatus = p.ExitCode.ToString(); + int exitStatus = p.ExitCode; stdOutput.Close(); tempLocalPortData.Clear(); - if (exitStatus != "0") + if (exitStatus != 0) { - return -1; - } + string msg = $"netstat failure: {exitStatus}"; + Logger.LogWarning(msg); + // this will be handled by Retry.Do(). 
+ throw new Exception(msg); + } + return output; } } - catch (Exception e) when ( - e is ArgumentException - || e is InvalidOperationException - || e is Win32Exception) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is Win32Exception) { + Logger.LogWarning($"Handled Exception in GetTcpPortCount:{Environment.NewLine}{e}"); + // This will be handled by Retry.Do(). + throw; } - - return -1; - } - - public override Task GetOSInfoAsync(CancellationToken cancellationToken) - { - ManagementObjectSearcher win32OsInfo = null; - ManagementObjectCollection results = null; - - OSInfo osInfo = default; - - try - { - win32OsInfo = new ManagementObjectSearcher("SELECT Caption,Version,Status,OSLanguage,NumberOfProcesses,FreePhysicalMemory,FreeVirtualMemory,TotalVirtualMemorySize,TotalVisibleMemorySize,InstallDate,LastBootUpTime FROM Win32_OperatingSystem"); - results = win32OsInfo.Get(); - - foreach (var prop in results) - { - cancellationToken.ThrowIfCancellationRequested(); - - foreach (var p in prop.Properties) - { - cancellationToken.ThrowIfCancellationRequested(); - - string name = p.Name; - string value = p.Value.ToString(); - - if (string.IsNullOrEmpty(name) || string.IsNullOrEmpty(value)) - { - continue; - } - - switch (name.ToLowerInvariant()) - { - case "caption": - osInfo.Name = value; - break; - case "numberofprocesses": - if (int.TryParse(value, out int numProcesses)) - { - osInfo.NumberOfProcesses = numProcesses; - } - else - { - osInfo.NumberOfProcesses = -1; - } - - break; - case "status": - osInfo.Status = value; - break; - case "oslanguage": - osInfo.Language = value; - break; - case "version": - osInfo.Version = value; - break; - case "installdate": - osInfo.InstallDate = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - case "lastbootuptime": - osInfo.LastBootUpTime = ManagementDateTimeConverter.ToDateTime(value).ToUniversalTime().ToString("o"); - break; - case 
"freephysicalmemory": - osInfo.AvailableMemoryKB = ulong.Parse(value); - break; - case "freevirtualmemory": - osInfo.FreeVirtualMemoryKB = ulong.Parse(value); - break; - case "totalvirtualmemorysize": - osInfo.TotalVirtualMemorySizeKB = ulong.Parse(value); - break; - case "totalvisiblememorysize": - osInfo.TotalVisibleMemorySizeKB = ulong.Parse(value); - break; - } - } - } - } - catch (ManagementException) - { - } - finally - { - results?.Dispose(); - win32OsInfo?.Dispose(); - } - - return Task.FromResult(osInfo); } private int GetLocalPortFromConsoleOutputRow(string portRow) @@ -446,17 +507,5 @@ private int GetLocalPortFromConsoleOutputRow(string portRow) return int.Parse(localPort); } - - // Not implemented. No Windows support. - public override int GetMaximumConfiguredFileHandlesCount() - { - return -1; - } - - // Not implemented. No Windows support. - public override int GetTotalAllocatedFileHandlesCount() - { - return -1; - } } } diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs index f9070a6e..61e4ea43 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/IProcessInfoProvider.cs @@ -4,8 +4,6 @@ // ------------------------------------------------------------ using System.Fabric; -using System.Threading; -using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs index 3e3cc8c1..57734004 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/LinuxProcFS.cs @@ -3,7 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
// ------------------------------------------------------------ -using System; using System.Collections.Generic; using System.IO; using System.Text; diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs index 2f000b6b..a430ec2e 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/ProcessInfoProvider.cs @@ -5,8 +5,6 @@ using System.Fabric; using System.Runtime.InteropServices; -using System.Threading; -using System.Threading.Tasks; namespace FabricObserver.Observers.Utilities { diff --git a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs index f4721dc5..099799ce 100644 --- a/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs +++ b/FabricObserver.Extensibility/Utilities/ProcessInfo/WindowsProcessInfoProvider.cs @@ -28,10 +28,10 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) processName = process.ProcessName; } } - catch (ArgumentException ex) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." - Logger.LogError(ex.Message); + Logger.LogWarning($"Handled Exception in GetProcessPrivateWorkingSetInMB: {e.Message}"); return 0F; } @@ -48,15 +48,11 @@ public override float GetProcessPrivateWorkingSetInMB(int processId) InstanceName = processName }; - // warm up counter. 
- _ = memProcessPrivateWorkingSetCounter.NextValue(); - return memProcessPrivateWorkingSetCounter.NextValue() / (1024 * 1024); } - catch (Exception e) when (e is ArgumentNullException || e is PlatformNotSupportedException || - e is Win32Exception || e is UnauthorizedAccessException) + catch (Exception e) when (e is ArgumentNullException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogError($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{CategoryName} {WorkingSetCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. return 0F; @@ -93,10 +89,10 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService processName = process.ProcessName; } } - catch (ArgumentException ex) + catch (Exception e) when (e is ArgumentException || e is InvalidOperationException || e is NotSupportedException) { // "Process with an Id of 12314 is not running." - Logger.LogError(ex.Message); + Logger.LogWarning($"Handled Exception in GetProcessAllocatedHandles: {e.Message}"); return -1F; } @@ -113,15 +109,11 @@ public override float GetProcessAllocatedHandles(int processId, StatelessService InstanceName = processName }; - // warm up counter. - _ = processFileHandleCounter.NextValue(); - return processFileHandleCounter.NextValue(); } - catch (Exception e) when (e is ArgumentNullException || e is PlatformNotSupportedException || - e is Win32Exception || e is UnauthorizedAccessException) + catch (Exception e) when (e is InvalidOperationException || e is Win32Exception || e is UnauthorizedAccessException) { - Logger.LogError($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); + Logger.LogWarning($"{CategoryName} {FileHandlesCounterName} PerfCounter handled error:{Environment.NewLine}{e}"); // Don't throw. 
return -1F; diff --git a/FabricObserver.Extensibility/Utilities/Retry.cs b/FabricObserver.Extensibility/Utilities/Retry.cs index 08c645ab..ac825b71 100644 --- a/FabricObserver.Extensibility/Utilities/Retry.cs +++ b/FabricObserver.Extensibility/Utilities/Retry.cs @@ -12,11 +12,7 @@ namespace FabricObserver.Observers.Utilities // https://stackoverflow.com/questions/1563191/cleanest-way-to-write-retry-logic public static class Retry { - public static void Do( - Action action, - TimeSpan retryInterval, - CancellationToken token, - int maxAttempts = 3) + public static void Do(Action action, TimeSpan retryInterval, CancellationToken token, int maxAttempts = 3) { _ = Do( () => @@ -30,11 +26,7 @@ public static void Do( maxAttempts); } - public static T Do( - Func action, - TimeSpan retryInterval, - CancellationToken token, - int maxAttemptCount = 3) + public static T Do(Func action, TimeSpan retryInterval, CancellationToken token, int maxAttemptCount = 3) { var exceptions = new List(); diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs index 637fc1a1..3a0be3a3 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/AppInsightsTelemetry.cs @@ -72,15 +72,15 @@ public string Key /// Error message on availability test run failure. /// A representing the asynchronous operation. 
public Task ReportAvailabilityAsync( - Uri serviceName, - string instance, - string testName, - DateTimeOffset captured, - TimeSpan duration, - string location, - bool success, - CancellationToken cancellationToken, - string message = null) + Uri serviceName, + string instance, + string testName, + DateTimeOffset captured, + TimeSpan duration, + string location, + bool success, + CancellationToken cancellationToken, + string message = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -110,14 +110,14 @@ public Task ReportAvailabilityAsync( /// Optional: TraceTelemetry context cloud instance name. /// A representing the asynchronous operation. public Task ReportHealthAsync( - HealthScope scope, - string propertyName, - HealthState state, - string unhealthyEvaluations, - string source, - CancellationToken cancellationToken, - string serviceName = null, - string instanceName = null) + HealthScope scope, + string propertyName, + HealthState state, + string unhealthyEvaluations, + string source, + CancellationToken cancellationToken, + string serviceName = null, + string instanceName = null) { if (!IsEnabled || cancellationToken.IsCancellationRequested) { @@ -147,7 +147,6 @@ public Task ReportHealthAsync( catch (Exception e) { logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); - throw; } return Task.CompletedTask; @@ -159,15 +158,11 @@ public Task ReportHealthAsync( /// TelemetryData instance. /// CancellationToken instance. /// a Task. 
- public Task ReportHealthAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportHealthAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { - if (!IsEnabled - || cancellationToken.IsCancellationRequested - || telemetryData == null) + if (!IsEnabled || cancellationToken.IsCancellationRequested || telemetryData == null) { - return Task.FromResult(1); + return Task.CompletedTask; } try @@ -183,31 +178,28 @@ public Task ReportHealthAsync( Dictionary properties = new Dictionary { - { "Application", telemetryData.ApplicationName ?? string.Empty }, { "ClusterId", telemetryData.ClusterId ?? string.Empty }, - { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.Description ?? string.Empty }, { "HealthState", telemetryData.HealthState ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "ErrorCode", telemetryData.Code ?? string.Empty }, + { "Description", telemetryData.Description ?? string.Empty }, { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", $"{telemetryData.PartitionId}" }, - { "Replica", $"{telemetryData.ReplicaId}" }, - { "Source", telemetryData.Source ?? string.Empty }, { "Value", value ?? string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, }; - telemetryClient.TrackEvent( - $"{telemetryData.ObserverName ?? 
"ClusterObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); } catch (Exception e) { - logger.LogWarning( - $"Unhandled exception in TelemetryClient.ReportHealthAsync:" + - $"{Environment.NewLine}{e}"); - - throw; + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportHealthAsync:{Environment.NewLine}{e}"); } return Task.CompletedTask; @@ -222,10 +214,10 @@ public Task ReportHealthAsync( /// cancellation token. /// A Task of bool. public Task ReportMetricAsync( - string metric, - T value, - string source, - CancellationToken cancellationToken) + string metric, + T value, + string source, + CancellationToken cancellationToken) { if (!IsEnabled || string.IsNullOrEmpty(metric) || cancellationToken.IsCancellationRequested) { @@ -233,8 +225,8 @@ public Task ReportMetricAsync( } telemetryClient?.TrackEvent( - string.IsNullOrEmpty(source) ? ObserverConstants.FabricObserverETWEventName : source, - new Dictionary { { metric, value?.ToString() } }); + string.IsNullOrEmpty(source) ? ObserverConstants.FabricObserverETWEventName : source, + new Dictionary { { metric, value?.ToString() } }); return Task.FromResult(true); } @@ -245,35 +237,44 @@ public Task ReportMetricAsync( /// TelemetryData instance. /// Cancellation token. /// A task. - public Task ReportMetricAsync( - TelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportMetricAsync(TelemetryData telemetryData, CancellationToken cancellationToken) { if (telemetryData == null) { return Task.CompletedTask; } - Dictionary properties = new Dictionary + string value = null; + + if (telemetryData.Value != null) { - { "Application", telemetryData.ApplicationName ?? string.Empty }, - { "ClusterId", telemetryData.ClusterId ?? string.Empty }, - { "ErrorCode", telemetryData.Code ?? string.Empty }, - { "HealthEventDescription", telemetryData.Description ?? string.Empty }, - { "HealthState", telemetryData.HealthState ?? 
string.Empty }, - { "Metric", telemetryData.Metric ?? string.Empty }, - { "NodeName", telemetryData.NodeName ?? string.Empty }, - { "ObserverName", telemetryData.ObserverName ?? string.Empty }, - { "OSPlatform", telemetryData.OS }, - { "Partition", telemetryData.PartitionId ?? string.Empty }, - { "Replica", telemetryData.ReplicaId ?? string.Empty }, - { "Source", telemetryData.Source ?? string.Empty }, - { "Value", telemetryData.Value?.ToString() ?? string.Empty }, - }; + value = telemetryData.Value.ToString(); + } + + try + { + Dictionary properties = new Dictionary + { + { "ClusterId", telemetryData.ClusterId ?? string.Empty }, + { "Application", telemetryData.ApplicationName ?? string.Empty }, + { "Service", telemetryData.ServiceName ?? string.Empty }, + { "ProcessId", telemetryData.ProcessId ?? string.Empty }, + { "SystemServiceProcessName", telemetryData.SystemServiceProcessName ?? string.Empty }, + { "Metric", telemetryData.Metric ?? string.Empty }, + { "Value", value ?? string.Empty }, + { "Partition", telemetryData.PartitionId }, + { "Replica", telemetryData.ReplicaId }, + { "Source", telemetryData.ObserverName }, + { "NodeName", telemetryData.NodeName ?? string.Empty }, + { "OS", telemetryData.OS ?? string.Empty }, + }; - telemetryClient.TrackEvent( - $"{telemetryData.ObserverName ?? "FabricObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } return Task.CompletedTask; } @@ -284,42 +285,45 @@ public Task ReportMetricAsync( /// TelemetryData instance. /// Cancellation token. /// A task. 
- public Task ReportMetricAsync( - MachineTelemetryData telemetryData, - CancellationToken cancellationToken) + public Task ReportMetricAsync(MachineTelemetryData telemetryData, CancellationToken cancellationToken) { if (telemetryData == null || cancellationToken.IsCancellationRequested) { return Task.CompletedTask; } - Dictionary properties = new Dictionary + try { - { "ActiveEphemeralPorts", telemetryData.ActiveEphemeralPorts.ToString() }, - { "ActiveFirewallRules", telemetryData.ActiveFirewallRules.ToString() }, - { "ActivePorts", telemetryData.ActivePorts.ToString() }, - { "AvailablePhysicalMemory", telemetryData.AvailablePhysicalMemoryGB.ToString() }, - { "AvailableVirtualMemory", telemetryData.AvailableVirtualMemoryGB.ToString() }, - { "DriveInfo", telemetryData.DriveInfo }, - { "FabricAppPortRange", telemetryData.FabricAppPortRange.ToString() }, - { "HotFixes", telemetryData.HotFixes.ToString() }, - { "LastBootUpTime", telemetryData.LastBootUpTime.ToString() }, - { "Level", telemetryData.HealthState.ToString() }, - { "LogicalDriveCount", telemetryData.LogicalDriveCount.ToString() }, - { "LogicalProcessorCount", telemetryData.LogicalProcessorCount.ToString() }, - { "Node", telemetryData.Node.ToString() }, - { "NumberOfRunningProcesses", telemetryData.NumberOfRunningProcesses.ToString() }, - { "Observer", telemetryData.Observer }, - { "OS", telemetryData.OS }, - { "OSInstallDate", telemetryData.OSInstallDate }, - { "OSVersion", telemetryData.OSVersion }, - { "TotalMemorySizeGB", telemetryData.TotalMemorySizeGB.ToString() }, - { "WindowsDynamicPortRange", telemetryData.WindowsDynamicPortRange }, - }; + Dictionary properties = new Dictionary + { + { "ActiveEphemeralPorts", telemetryData.ActiveEphemeralPorts.ToString() }, + { "ActiveFirewallRules", telemetryData.ActiveFirewallRules.ToString() }, + { "ActivePorts", telemetryData.ActivePorts.ToString() }, + { "AvailablePhysicalMemory", telemetryData.AvailablePhysicalMemoryGB.ToString() }, + { 
"AvailableVirtualMemory", telemetryData.AvailableVirtualMemoryGB.ToString() }, + { "DriveInfo", telemetryData.DriveInfo }, + { "FabricAppPortRange", telemetryData.FabricAppPortRange.ToString() }, + { "HotFixes", telemetryData.HotFixes.ToString() }, + { "LastBootUpTime", telemetryData.LastBootUpTime.ToString() }, + { "Level", telemetryData.HealthState.ToString() }, + { "LogicalDriveCount", telemetryData.LogicalDriveCount.ToString() }, + { "LogicalProcessorCount", telemetryData.LogicalProcessorCount.ToString() }, + { "Node", telemetryData.Node.ToString() }, + { "NumberOfRunningProcesses", telemetryData.NumberOfRunningProcesses.ToString() }, + { "Observer", telemetryData.Observer }, + { "OS", telemetryData.OS }, + { "OSInstallDate", telemetryData.OSInstallDate }, + { "OSVersion", telemetryData.OSVersion }, + { "TotalMemorySizeGB", telemetryData.TotalMemorySizeGB.ToString() }, + { "WindowsDynamicPortRange", telemetryData.WindowsDynamicPortRange }, + }; - telemetryClient.TrackEvent( - $"{telemetryData.Observer ?? "FabricObserver"}DataEvent", - properties); + telemetryClient.TrackEvent(ObserverConstants.FabricObserverETWEventName, properties); + } + catch (Exception e) + { + logger.LogWarning($"Unhandled exception in TelemetryClient.ReportMetricAsync:{Environment.NewLine}{e}"); + } return Task.CompletedTask; } diff --git a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs index 5dfbf99d..60cea80b 100644 --- a/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs +++ b/FabricObserver.Extensibility/Utilities/Telemetry/TelemetryProdiverType.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. 
// ------------------------------------------------------------ -using System; - namespace FabricObserver.Observers.Utilities.Telemetry { public enum TelemetryProviderType diff --git a/FabricObserver.nuspec.template b/FabricObserver.nuspec.template index 7ca5fc2b..0cbfd072 100644 --- a/FabricObserver.nuspec.template +++ b/FabricObserver.nuspec.template @@ -2,8 +2,18 @@ %PACKAGE_ID% - 3.1.8 - Updated Windows port monitoring code (ephemeral port range). Added new member to TelemetryData, ProcessId. Observers will no longer locally log Error or Warning Health Events when EnableVerboseLogging is set to false - and it should be false, generally. Only enable EnableVerboseLogging for debugging purposes. You should emit health telemetry to an external service as a general rule. FO supports ApplicationInsights and LogAnalytics out of the box. If you are using the FabricObserverWebApi service, you will still get local Error/Warning logs, regardless of EnableVerboseLogging setting. + 3.1.9 + + - Fixed minor EventSource logger bug. + - Added retry logic and logging to Windows port monitoring code. + - Added result set paging support in AppObserver's DeployedApplication query logic. + - Added retry logic to AppObserver's FabricClient calls. + - Added best effort code to automatically fix malformed targetApp values supplied in AppObserver configuration. + - Updated ApplicationInsights telemetry provider impl. + - Updated CsvLogger file management logic. + - Removed counter warm up calls in Windows Provider impls. Moved to callers. + - Code maintenance. 
+ Microsoft MIT false diff --git a/FabricObserver/ApplicationInsights.config b/FabricObserver/ApplicationInsights.config index e18496a5..bba013fa 100644 --- a/FabricObserver/ApplicationInsights.config +++ b/FabricObserver/ApplicationInsights.config @@ -1,7 +1,6 @@  - + diff --git a/FabricObserver/FabricObserver.cs b/FabricObserver/FabricObserver.cs index b15edd49..9352c900 100644 --- a/FabricObserver/FabricObserver.cs +++ b/FabricObserver/FabricObserver.cs @@ -96,10 +96,7 @@ private void LoadObserversFromPlugins(ServiceCollection services) foreach (string pluginDll in pluginDlls) { - PluginLoader loader = PluginLoader.CreateFromAssemblyFile( - pluginDll, - sharedTypes); - + PluginLoader loader = PluginLoader.CreateFromAssemblyFile(pluginDll, sharedTypes); pluginLoaders.Add(loader); } @@ -107,8 +104,7 @@ private void LoadObserversFromPlugins(ServiceCollection services) { Assembly pluginAssembly = pluginLoader.LoadDefaultAssembly(); - FabricObserverStartupAttribute[] startupAttributes = - pluginAssembly.GetCustomAttributes().ToArray(); + FabricObserverStartupAttribute[] startupAttributes = pluginAssembly.GetCustomAttributes().ToArray(); for (int i = 0; i < startupAttributes.Length; ++i) { diff --git a/FabricObserver/Observers/AppObserver.cs b/FabricObserver/Observers/AppObserver.cs index d24beb04..588db4d9 100644 --- a/FabricObserver/Observers/AppObserver.cs +++ b/FabricObserver/Observers/AppObserver.cs @@ -8,6 +8,7 @@ using System.ComponentModel; using System.Diagnostics; using System.Fabric; +using System.Fabric.Description; using System.Fabric.Health; using System.Fabric.Query; using System.IO; @@ -87,10 +88,10 @@ public override async Task ObserveAsync(CancellationToken token) if (!initialized) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "This observer was unable to initialize correctly due to missing configuration info."); + 
FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "This observer was unable to initialize correctly due to missing configuration info."); stopwatch.Stop(); stopwatch.Reset(); @@ -307,14 +308,13 @@ private async Task InitializeAsync() } configSettings.Initialize( - FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( - ObserverConstants.ObserverConfigurationPackageName)?.Settings, - ConfigurationSectionName, - "AppObserverDataFileName"); + FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( + ObserverConstants.ObserverConfigurationPackageName)?.Settings, + ConfigurationSectionName, + "AppObserverDataFileName"); - var appObserverConfigFileName = Path.Combine( - ConfigPackagePath ?? string.Empty, - configSettings.AppObserverConfigFileName ?? string.Empty); + // Unit tests may have null path and filename, thus the null equivalence operations. + var appObserverConfigFileName = Path.Combine(ConfigPackagePath ?? string.Empty, configSettings.AppObserverConfigFileName ?? string.Empty); if (!File.Exists(appObserverConfigFileName)) { @@ -361,19 +361,49 @@ private async Task InitializeAsync() return false; } - // Support for specifying single configuration item for any or all or * applications. + // Support for specifying single configuration item for all or * applications. if (userTargetList != null && userTargetList.Any(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*")) { ApplicationInfo application = userTargetList.Find(app => app.TargetApp?.ToLower() == "all" || app.TargetApp == "*"); - // TODO: This should be paged for cases where a node has hundreds of apps. - var appList = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync( - NodeName, - null, - ConfigurationSettings.AsyncTimeout, - Token).ConfigureAwait(false); + // Let's make sure that we page through app lists that are huge (like 4MB result set (that's a lot of apps)). 
+ var deployedAppQueryDesc = new PagedDeployedApplicationQueryDescription(NodeName) + { + IncludeHealthState = false, + MaxResults = 150, + }; + + var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + + // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. + // In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports + // all-app service process monitoring with a very simple configuration pattern. + var apps = appList.ToList(); + + // The GetDeployedApplicationPagedList api will set a continuation token value if it knows it did not return all the results in one swoop. + // Check that it is not null, and make a new query passing back the token it gave you. + while (appList.ContinuationToken != null) + { + Token.ThrowIfCancellationRequested(); + + deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; + + appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); - foreach (var app in appList) + apps.AddRange(appList.ToList()); + } + + foreach (var app in apps) { Token.ThrowIfCancellationRequested(); @@ -452,29 +482,61 @@ private async Task InitializeAsync() // Remove the All or * config item. 
userTargetList.Remove(application); + apps.Clear(); + apps = null; } - int settingSFail = 0; + int settingsFail = 0; - foreach (var application in userTargetList) + for (int i = 0; i < userTargetList.Count; i++) { Token.ThrowIfCancellationRequested(); + Uri appUri = null; + ApplicationInfo application = userTargetList[i]; + if (string.IsNullOrWhiteSpace(application.TargetApp) && string.IsNullOrWhiteSpace(application.TargetAppType)) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.ToString(), - ObserverName, - HealthState.Warning, - $"Initialize() | {application.TargetApp}: Required setting, target, is not set."); - - settingSFail++; + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"InitializeAsync() | {application.TargetApp}: Required setting, target, is not set."); + settingsFail++; continue; } + else if (!string.IsNullOrWhiteSpace(application.TargetApp)) + { + try + { + if (application.TargetApp.Contains(" ")) + { + application.TargetApp = application.TargetApp.Replace(" ", string.Empty); // Strip spaces before the scheme check so a value like " fabric:/MyApp" is not prefixed a second time. + } + + if (!application.TargetApp.StartsWith("fabric:/", StringComparison.Ordinal)) + { + application.TargetApp = application.TargetApp.Insert(0, "fabric:/"); + } + + appUri = new Uri(application.TargetApp); + } + catch (Exception e) when (e is ArgumentException || e is UriFormatException) + { + HealthReporter.ReportFabricObserverServiceHealth( + FabricServiceContext.ServiceName.ToString(), + ObserverName, + HealthState.Warning, + $"InitializeAsync() | {application.TargetApp}: Invalid TargetApp value. Value must be a valid Uri string of format \"fabric:/MyApp\", for example."); + + settingsFail++; + continue; + } + } // No required settings supplied for deployed application(s). 
- if (settingSFail == userTargetList.Count) + if (settingsFail == userTargetList.Count) { return false; } @@ -485,7 +547,7 @@ private async Task InitializeAsync() } else { - await SetDeployedApplicationReplicaOrInstanceListAsync(new Uri(application.TargetApp)).ConfigureAwait(false); + await SetDeployedApplicationReplicaOrInstanceListAsync(appUri).ConfigureAwait(false); } } @@ -496,7 +558,7 @@ private async Task InitializeAsync() try { // For hosted container apps, the host service is Fabric. AppObserver can't monitor these types of services. - // Please use ContainerObserver for SF container app service monitoring. + // Please use ContainerObserver for SF container app service monitoring. https://github.com/gittorre/ContainerObserver using Process p = Process.GetProcessById((int)rep.HostProcessId); if (p.ProcessName == "Fabric") @@ -609,7 +671,7 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) // Measure Total and Ephemeral ports. if (checkAllPorts) { - AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActivePortCount(currentProcess.Id, FabricServiceContext)); + AllAppTotalActivePortsData.FirstOrDefault(x => x.Id == id).Data.Add(OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(currentProcess.Id, FabricServiceContext)); } if (checkEphemeralPorts) @@ -636,19 +698,30 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) /* CPU and Memory Usage */ - TimeSpan duration = TimeSpan.FromSeconds(10); + TimeSpan duration = TimeSpan.FromSeconds(3); if (MonitorDuration > TimeSpan.MinValue) { duration = MonitorDuration; } - // Warm up the counters. + /* Warm up counters. 
*/ + if (checkCpu) { _ = cpuUsage.GetCpuUsagePercentageProcess(currentProcess); } + if (checkHandles) + { + _ = ProcessInfoProvider.Instance.GetProcessAllocatedHandles(currentProcess.Id, FabricServiceContext); + } + + if (checkMemMb || checkMemPct) + { + _ = ProcessInfoProvider.Instance.GetProcessPrivateWorkingSetInMB(currentProcess.Id); + } + timer.Start(); while (!currentProcess.HasExited && timer.Elapsed.Seconds <= duration.Seconds) @@ -722,14 +795,8 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) #endif continue; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in MonitorDeployedAppsAsync:{Environment.NewLine}{e}", @@ -748,31 +815,53 @@ private async Task MonitorDeployedAppsAsync(CancellationToken token) private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicationNameFilter = null, string applicationType = null) { - DeployedApplicationList deployedApps; + List deployedApps = new List(); if (applicationNameFilter != null) { - deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(true); + var app = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, applicationNameFilter).ConfigureAwait(false); + deployedApps.AddRange(app.ToList()); } - else + else if (!string.IsNullOrWhiteSpace(applicationType)) { - deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName).ConfigureAwait(true); - - if (deployedApps.Count > 0 && !string.IsNullOrWhiteSpace(applicationType)) + // Let's make sure that we page through app lists that are huge (like 4MB result set (that's a lot of apps)). 
+ var deployedAppQueryDesc = new PagedDeployedApplicationQueryDescription(NodeName) { - for (int i = 0; i < deployedApps.Count; i++) - { - Token.ThrowIfCancellationRequested(); + IncludeHealthState = false, + MaxResults = 150, + }; + + var appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + + // DeployedApplicationList is a wrapper around List, but does not support AddRange.. Thus, cast it ToList and add to the temp list, then iterate through it. + // In reality, this list will never be greater than, say, 1000 apps deployed to a node, but it's a good idea to be prepared since AppObserver supports + // all-app service process monitoring with a very simple configuration pattern. + deployedApps = appList.ToList(); + + // The GetDeployedApplicationPagedList api will set a continuation token value if it knows it did not return all the results in one swoop. + // Check that it is not null, and make a new query passing back the token it gave you. 
+ while (appList.ContinuationToken != null) + { + Token.ThrowIfCancellationRequested(); - if (deployedApps[i].ApplicationTypeName == applicationType) - { - continue; - } + deployedAppQueryDesc.ContinuationToken = appList.ContinuationToken; - deployedApps.Remove(deployedApps[i]); - --i; - } + appList = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedApplicationPagedListAsync( + deployedAppQueryDesc, + ConfigurationSettings.AsyncTimeout, + Token), + Token); + + deployedApps.AddRange(appList.ToList()); } + + deployedApps = deployedApps.Where(a => a.ApplicationTypeName == applicationType)?.ToList(); } var currentReplicaInfoList = new List(); @@ -784,10 +873,9 @@ private async Task SetDeployedApplicationReplicaOrInstanceListAsync(Uri applicat List filteredServiceList = null; // Filter service list if ServiceExcludeList/ServiceIncludeList config setting is non-empty. - var serviceFilter = userTargetList.Find(x => (x.TargetApp != null || x.TargetAppType != null) - && (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() - || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) - && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); + var serviceFilter = userTargetList.Find(x => (x.TargetApp?.ToLower() == deployedApp.ApplicationName?.OriginalString.ToLower() + || x.TargetAppType?.ToLower() == deployedApp.ApplicationTypeName?.ToLower()) + && (!string.IsNullOrWhiteSpace(x.ServiceExcludeList) || !string.IsNullOrWhiteSpace(x.ServiceIncludeList))); ServiceFilterType filterType = ServiceFilterType.None; @@ -829,7 +917,10 @@ private async Task> GetDeployedPrimaryRepl ServiceFilterType filterType = ServiceFilterType.None, string appTypeName = null) { - var deployedReplicaList = await FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, appName).ConfigureAwait(true); + var deployedReplicaList = 
await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( + () => FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, appName), + Token); + var replicaMonitoringList = new List(); SetInstanceOrReplicaMonitoringList( @@ -857,9 +948,20 @@ private void SetInstanceOrReplicaMonitoringList( ReplicaOrInstanceMonitoringInfo replicaInfo = null; - if (deployedReplica is DeployedStatefulServiceReplica statefulReplica - && statefulReplica.ReplicaRole == ReplicaRole.Primary) + if (deployedReplica is DeployedStatefulServiceReplica statefulReplica && statefulReplica.ReplicaRole == ReplicaRole.Primary) { + if (filterList != null && filterType != ServiceFilterType.None) + { + bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + + switch (filterType) + { + case ServiceFilterType.Include when !isInFilterList: + case ServiceFilterType.Exclude when isInFilterList: + continue; + } + } + replicaInfo = new ReplicaOrInstanceMonitoringInfo() { ApplicationName = appName, @@ -869,10 +971,12 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = statefulReplica.Partitionid, ServiceName = statefulReplica.ServiceName, }; - + } + else if (deployedReplica is DeployedStatelessServiceInstance statelessInstance) + { if (filterList != null && filterType != ServiceFilterType.None) { - bool isInFilterList = filterList.Any(s => statefulReplica.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); + bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); switch (filterType) { @@ -881,9 +985,7 @@ private void SetInstanceOrReplicaMonitoringList( continue; } } - } - else if (deployedReplica is DeployedStatelessServiceInstance statelessInstance) - { + replicaInfo = new ReplicaOrInstanceMonitoringInfo() { ApplicationName = appName, @@ -893,19 +995,6 @@ private void SetInstanceOrReplicaMonitoringList( PartitionId = 
statelessInstance.Partitionid, ServiceName = statelessInstance.ServiceName, }; - - if (filterList != null - && filterType != ServiceFilterType.None) - { - bool isInFilterList = filterList.Any(s => statelessInstance.ServiceName.OriginalString.ToLower().Contains(s.ToLower())); - - switch (filterType) - { - case ServiceFilterType.Include when !isInFilterList: - case ServiceFilterType.Exclude when isInFilterList: - continue; - } - } } if (replicaInfo != null) diff --git a/FabricObserver/Observers/CertificateObserver.cs b/FabricObserver/Observers/CertificateObserver.cs index 38ee28d8..1354afe9 100644 --- a/FabricObserver/Observers/CertificateObserver.cs +++ b/FabricObserver/Observers/CertificateObserver.cs @@ -79,8 +79,7 @@ public override async Task ObserveAsync(CancellationToken token) { // Only run once per specified time in Settings.xml. (default is already set to 1 day for CertificateObserver) // See Settings.xml, CertificateObserverConfiguration section, RunInterval parameter. - if (RunInterval > TimeSpan.MinValue - && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) + if (RunInterval > TimeSpan.MinValue && DateTime.Now.Subtract(LastRunDateTime) < RunInterval) { return; } @@ -165,18 +164,14 @@ public override Task ReportAsync(CancellationToken token) token.ThrowIfCancellationRequested(); // Someone calling without observing first, must be run after a new run of ObserveAsync - if (ExpiringWarnings == null || - ExpiredWarnings == null || - NotFoundWarnings == null) + if (ExpiringWarnings == null || ExpiredWarnings == null || NotFoundWarnings == null) { return Task.CompletedTask; } HealthReport healthReport; - if (ExpiringWarnings.Count == 0 - && ExpiredWarnings.Count == 0 - && NotFoundWarnings.Count == 0) + if (ExpiringWarnings.Count == 0 && ExpiredWarnings.Count == 0 && NotFoundWarnings.Count == 0) { healthReport = new HealthReport { @@ -226,27 +221,25 @@ public override Task ReportAsync(CancellationToken token) Value = 
FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Code = FOErrorWarningCodes.WarningCertificateExpiration, - HealthState = "Warning", - NodeName, - Metric = ErrorWarningProperty.CertificateExpiration, - HealthEventDescription = healthMessage, - ObserverName, - OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", - Source = ObserverConstants.FabricObserverName, - Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), - }); + ObserverConstants.FabricObserverETWEventName, + new + { + Code = FOErrorWarningCodes.WarningCertificateExpiration, + HealthState = "Warning", + NodeName, + Metric = ErrorWarningProperty.CertificateExpiration, + HealthEventDescription = healthMessage, + ObserverName, + OS = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? "Windows" : "Linux", + Source = ObserverConstants.FabricObserverName, + Value = FOErrorWarningCodes.GetErrorWarningNameFromFOCode(FOErrorWarningCodes.WarningCertificateExpiration), + }); } } @@ -300,14 +293,14 @@ private async Task Initialize(CancellationToken token) token.ThrowIfCancellationRequested(); var daysUntilClusterExpireWarningThreshold = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverDaysUntilClusterExpiryWarningThreshold); + ConfigurationSectionName, + ObserverConstants.CertificateObserverDaysUntilClusterExpiryWarningThreshold); DaysUntilClusterExpireWarningThreshold = !string.IsNullOrEmpty(daysUntilClusterExpireWarningThreshold) ? 
int.Parse(daysUntilClusterExpireWarningThreshold) : 14; var daysUntilAppExpireWarningClusterThreshold = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverDaysUntilAppExpiryWarningThreshold); + ConfigurationSectionName, + ObserverConstants.CertificateObserverDaysUntilAppExpiryWarningThreshold); DaysUntilAppExpireWarningThreshold = !string.IsNullOrEmpty(daysUntilAppExpireWarningClusterThreshold) ? int.Parse(daysUntilAppExpireWarningClusterThreshold) : 14; @@ -323,8 +316,8 @@ private async Task Initialize(CancellationToken token) if (AppCertificateCommonNamesToObserve == null) { var appCommonNamesToObserve = GetSettingParameterValue( - ConfigurationSectionName, - ObserverConstants.CertificateObserverAppCertificateCommonNames); + ConfigurationSectionName, + ObserverConstants.CertificateObserverAppCertificateCommonNames); AppCertificateCommonNamesToObserve = !string.IsNullOrEmpty(appCommonNamesToObserve) ? JsonHelper.ConvertFromString>(appCommonNamesToObserve) : new List(); } @@ -337,11 +330,7 @@ private async Task GetSecurityTypes(CancellationToken token) token.ThrowIfCancellationRequested(); SecurityConfiguration = new SecurityConfiguration(); - - string clusterManifestXml = await FabricClientInstance.ClusterManager.GetClusterManifestAsync( - AsyncClusterOperationTimeoutSeconds, - Token).ConfigureAwait(true); - + string clusterManifestXml = await FabricClientInstance.ClusterManager.GetClusterManifestAsync(AsyncClusterOperationTimeoutSeconds, Token).ConfigureAwait(true); XmlReader xreader = null; StringReader sreader = null; @@ -391,12 +380,12 @@ private async Task GetSecurityTypes(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { WriteToLogWithLevel( - ObserverName, - $"There was an issue parsing the cluster manifest. 
Observer cannot run.\nError Details:\n{e}", - LogLevel.Error); + ObserverName, + $"There was an issue parsing the cluster manifest. Observer cannot run. Error Details:{Environment.NewLine}{e}", + LogLevel.Error); throw; } @@ -459,11 +448,7 @@ private void CheckLatestBySubjectName(X509Store store, string subjectName, int w private void CheckByThumbprint(X509Store store, string thumbprint, int warningThreshold) { - X509Certificate2Collection certificates = store.Certificates.Find( - X509FindType.FindByThumbprint, - thumbprint, - validOnly: false); - + X509Certificate2Collection certificates = store.Certificates.Find(X509FindType.FindByThumbprint, thumbprint, validOnly: false); X509Certificate2 certificate; if (certificates.Count == 0) @@ -473,15 +458,15 @@ private void CheckByThumbprint(X509Store store, string thumbprint, int warningTh if (!TryFindCertificate("/var/lib/sfcerts", thumbprint, out certificate) && !TryFindCertificate("/var/lib/waagent", thumbprint, out certificate)) { - NotFoundWarnings.Add( - $"Could not find requested certificate with thumbprint: {thumbprint} in /var/lib/sfcerts, /var/lib/waagent, and LocalMachine/Root"); + NotFoundWarnings.Add($"Could not find requested certificate with thumbprint: {thumbprint} in /var/lib/sfcerts, /var/lib/waagent, and LocalMachine/Root"); + return; } } else { - NotFoundWarnings.Add( - $"Could not find requested certificate with thumbprint: {thumbprint} in LocalMachine/My"); + NotFoundWarnings.Add($"Could not find requested certificate with thumbprint: {thumbprint} in LocalMachine/My"); + return; } } @@ -502,16 +487,16 @@ private void CheckByThumbprint(X509Store store, string thumbprint, int warningTh if (timeUntilExpiry.TotalMilliseconds < 0) { ExpiredWarnings.Add($"Certificate Expired on {expiry.ToShortDateString()}: " + - $"Thumbprint: {certificate.Thumbprint} " + - $"Issuer {certificate.Issuer}, " + - $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); + $"Thumbprint: {certificate.Thumbprint} " + + 
$"Issuer {certificate.Issuer}, " + + $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); } else if (timeUntilExpiry.TotalDays < warningThreshold) { ExpiringWarnings.Add($"Certificate Expiring on {expiry.ToShortDateString()}: " + - $"Thumbprint: {certificate.Thumbprint} " + - $"Issuer {certificate.Issuer}, " + - $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); + $"Thumbprint: {certificate.Thumbprint} " + + $"Issuer {certificate.Issuer}, " + + $"Subject: {certificate.Subject}{Environment.NewLine}{message}"); } } } diff --git a/FabricObserver/Observers/DiskObserver.cs b/FabricObserver/Observers/DiskObserver.cs index 716e7b9f..32277cfe 100644 --- a/FabricObserver/Observers/DiskObserver.cs +++ b/FabricObserver/Observers/DiskObserver.cs @@ -181,14 +181,8 @@ public override async Task ObserveAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", diff --git a/FabricObserver/Observers/FabricSystemObserver.cs b/FabricObserver/Observers/FabricSystemObserver.cs index e3a844ad..9c82a120 100644 --- a/FabricObserver/Observers/FabricSystemObserver.cs +++ b/FabricObserver/Observers/FabricSystemObserver.cs @@ -193,14 +193,8 @@ public override async Task ObserveAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ObserveAsync:{Environment.NewLine}{e}", @@ -386,14 +380,8 @@ public override Task ReportAsync(CancellationToken token) } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in ReportAsync:{Environment.NewLine}{e}", @@ -796,7 +784,7 @@ private async Task GetProcessInfoAsync(string procName) Token.ThrowIfCancellationRequested(); // Ports - Active TCP All - int activePortCount = OperatingSystemInfoProvider.Instance.GetActivePortCount(process.Id, FabricServiceContext); + int activePortCount = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(process.Id, FabricServiceContext); // This is used for info report. TotalActivePortCountAllSystemServices += activePortCount; @@ -877,14 +865,8 @@ private async Task GetProcessInfoAsync(string procName) await Task.Delay(250, Token).ConfigureAwait(false); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled Exception thrown in GetProcessInfoAsync:{Environment.NewLine}{e}", @@ -908,13 +890,8 @@ private async Task GetProcessInfoAsync(string procName) #endif continue; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - WriteToLogWithLevel( ObserverName, $"Unhandled exception in GetProcessInfoAsync:{Environment.NewLine}{e}", diff --git a/FabricObserver/Observers/NetworkObserver.cs b/FabricObserver/Observers/NetworkObserver.cs index 5f51b2a3..2210e571 100644 --- a/FabricObserver/Observers/NetworkObserver.cs +++ b/FabricObserver/Observers/NetworkObserver.cs @@ -109,10 +109,7 @@ public override async Task ObserveAsync(CancellationToken token) stopwatch.Start(); // Run conn tests. - Retry.Do( - InternetConnectionStateIsConnected, - TimeSpan.FromSeconds(10), - token); + Retry.Do(InternetConnectionStateIsConnected, TimeSpan.FromSeconds(10), token); await ReportAsync(token).ConfigureAwait(true); @@ -161,22 +158,23 @@ public override Task ReportAsync(CancellationToken token) ObserverName = ObserverName, Metric = ErrorWarningProperty.InternetConnectionFailure, NodeName = NodeName, + Source = ObserverConstants.FabricObserverName, }; if (IsTelemetryEnabled) { - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } var report = new HealthReport { AppName = new Uri(conn.TargetApp), + Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, HealthData = telemetryData, HealthMessage = healthMessage, HealthReportTimeToLive = timeToLiveWarning, + SourceId = 
$"{ObserverConstants.NetworkObserverName}({FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable})", State = healthState, NodeName = NodeName, Observer = ObserverName, @@ -195,17 +193,18 @@ public override Task ReportAsync(CancellationToken token) if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - ApplicationName = conn.TargetApp, - Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, - HealthState = "Warning", - HealthEventDescription = healthMessage, - ObserverName, - Metric = ErrorWarningProperty.InternetConnectionFailure, - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + ApplicationName = conn.TargetApp, + Code = FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable, + HealthState = "Warning", + HealthEventDescription = healthMessage, + ObserverName, + Metric = ErrorWarningProperty.InternetConnectionFailure, + NodeName, + Source = ObserverConstants.FabricObserverName, + }); } } else @@ -226,6 +225,7 @@ public override Task ReportAsync(CancellationToken token) EmitLogEvent = EnableVerboseLogging || IsObserverWebApiAppDeployed, HealthMessage = healthMessage, HealthReportTimeToLive = default, + SourceId = $"{ObserverConstants.NetworkObserverName}({FOErrorWarningCodes.AppWarningNetworkEndpointUnreachable})", State = HealthState.Ok, NodeName = NodeName, Observer = ObserverName, @@ -247,28 +247,28 @@ public override Task ReportAsync(CancellationToken token) ObserverName = ObserverName, Metric = "Internet Connection State", NodeName = NodeName, + Source = ObserverConstants.FabricObserverName, }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. 
if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - ApplicationName = conn.TargetApp, - Code = FOErrorWarningCodes.Ok, - HealthState = "Ok", - HealthEventDescription = healthMessage, - ObserverName, - Metric = "Internet Connection State", - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + ApplicationName = conn.TargetApp, + Code = FOErrorWarningCodes.Ok, + HealthState = "Ok", + HealthEventDescription = healthMessage, + ObserverName, + Metric = "Internet Connection State", + NodeName, + Source = ObserverConstants.FabricObserverName, + }); } // Reset health state. @@ -297,16 +297,15 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) return string.Empty; } - var interfaceInfo = new StringBuilder( - $"Network Interface information for {iPGlobalProperties.HostName}:\n "); + var interfaceInfo = new StringBuilder($"Network Interface information for {iPGlobalProperties.HostName}:{Environment.NewLine} "); foreach (var nic in nics) { token.ThrowIfCancellationRequested(); - _ = interfaceInfo.Append("\n" + nic.Description + "\n"); - _ = interfaceInfo.AppendFormat(" Interface type : {0}\n", nic.NetworkInterfaceType); - _ = interfaceInfo.AppendFormat(" Operational status: {0}\n", nic.OperationalStatus); + _ = interfaceInfo.Append($"{Environment.NewLine}{nic.Description}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Interface type : {nic.NetworkInterfaceType}{Environment.NewLine}"); // Value is interpolated directly: "{0}" inside a $-string is the literal 0, not a format placeholder. + _ = interfaceInfo.Append($" Operational status: {nic.OperationalStatus}{Environment.NewLine}"); // Traffic. 
if (nic.OperationalStatus != OperationalStatus.Up) @@ -318,10 +317,10 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) var stats = nic.GetIPv4Statistics(); - _ = interfaceInfo.AppendFormat(" Bytes received: {0}\n", stats.BytesReceived); - _ = interfaceInfo.AppendFormat(" Bytes sent: {0}\n", stats.BytesSent); - _ = interfaceInfo.AppendFormat(" Incoming Packets With Errors: {0}\n", stats.IncomingPacketsWithErrors); - _ = interfaceInfo.AppendFormat(" Outgoing Packets With Errors: {0}\n", stats.OutgoingPacketsWithErrors); + _ = interfaceInfo.Append($" Bytes received: {stats.BytesReceived}{Environment.NewLine}"); // Value is interpolated directly: "{0}" inside a $-string is the literal 0, not a format placeholder. + _ = interfaceInfo.Append($" Bytes sent: {stats.BytesSent}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Incoming Packets With Errors: {stats.IncomingPacketsWithErrors}{Environment.NewLine}"); + _ = interfaceInfo.Append($" Outgoing Packets With Errors: {stats.OutgoingPacketsWithErrors}{Environment.NewLine}"); _ = interfaceInfo.AppendLine(); } @@ -338,11 +337,6 @@ private static string GetNetworkInterfaceInfo(CancellationToken token) private async Task InitializeAsync() { - WriteToLogWithLevel( - ObserverName, - $"Initializing {ObserverName} for network monitoring. | {NodeName}", - LogLevel.Information); - cancellationToken.ThrowIfCancellationRequested(); // This only needs to be logged once. 
@@ -356,54 +350,38 @@ private async Task InitializeAsync() if (!ObserverLogger.TryWriteLogFile(logPath, GetNetworkInterfaceInfo(cancellationToken))) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create NetInfo.txt file."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Unable to create NetInfo.txt file."); } } - var settings = - FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject( - ObserverConstants.ObserverConfigurationPackageName)?.Settings; - - configSettings.Initialize( - settings, - ConfigurationSectionName, - "NetworkObserverDataFileName"); - - var networkObserverConfigFileName = - Path.Combine(dataPackagePath ?? string.Empty, configSettings.NetworkObserverConfigFileName); + var settings = FabricServiceContext.CodePackageActivationContext.GetConfigurationPackageObject(ObserverConstants.ObserverConfigurationPackageName)?.Settings; + configSettings.Initialize(settings, ConfigurationSectionName, "NetworkObserverDataFileName"); + var networkObserverConfigFileName = Path.Combine(dataPackagePath ?? string.Empty, configSettings.NetworkObserverConfigFileName); if (string.IsNullOrWhiteSpace(networkObserverConfigFileName)) { ObserverLogger.LogWarning("NetworkObserver configuration file path not specified. Exiting."); - return false; } if (!File.Exists(networkObserverConfigFileName)) { ObserverLogger.LogWarning("NetworkObserver configuration file not found. 
Exiting."); - return false; } if (userConfig.Count == 0) { - using (Stream stream = new FileStream( - networkObserverConfigFileName, - FileMode.Open, - FileAccess.Read, - FileShare.Read)) + using (Stream stream = new FileStream(networkObserverConfigFileName, FileMode.Open, FileAccess.Read, FileShare.Read)) { var configs = JsonHelper.ReadFromJsonStream(stream); foreach (var netConfig in configs) { - var deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync( - NodeName, - new Uri(netConfig.TargetApp)).ConfigureAwait(false); + var deployedApps = await FabricClientInstance.QueryManager.GetDeployedApplicationListAsync(NodeName, new Uri(netConfig.TargetApp)).ConfigureAwait(false); if (deployedApps == null || deployedApps.Count < 1) { @@ -439,6 +417,8 @@ private void InternetConnectionStateIsConnected() foreach (var endpoint in config.Endpoints) { + cancellationToken.ThrowIfCancellationRequested(); + if (string.IsNullOrEmpty(endpoint.HostName)) { continue; @@ -452,16 +432,13 @@ private void InternetConnectionStateIsConnected() } bool passed = false; - cancellationToken.ThrowIfCancellationRequested(); // SQL Azure, other database services that are addressable over direct TCP. if (endpoint.Protocol == DirectInternetProtocol.Tcp) { passed = TcpEndpointDoConnectionTest(endpoint.HostName, endpoint.Port); } - - // Default is http. - else + else // Default is http. { // Service REST endpoints, CosmosDB REST endpoint, etc. // Http protocol means any enpoint/port pair that is addressable over HTTP/s. 
@@ -499,8 +476,7 @@ private void InternetConnectionStateIsConnected() } catch (IOException ie) { - if (ie.InnerException != null - && ie.InnerException is ProtocolViolationException) + if (ie.InnerException != null && ie.InnerException is ProtocolViolationException) { passed = true; } @@ -528,13 +504,13 @@ private void InternetConnectionStateIsConnected() passed = true; } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - e.ToString()); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + e.ToString()); // Fix the bug.. throw; @@ -586,8 +562,7 @@ private bool TcpEndpointDoConnectionTest(string hostName, int port) { var se = ie.InnerException as SocketException; - if (se.SocketErrorCode == SocketError.ConnectionRefused - || se.SocketErrorCode == SocketError.ConnectionReset) + if (se.SocketErrorCode == SocketError.ConnectionRefused || se.SocketErrorCode == SocketError.ConnectionReset) { if (tcpConnTestRetried <= MaxTcpConnTestRetries) { @@ -604,8 +579,7 @@ private bool TcpEndpointDoConnectionTest(string hostName, int port) } catch (SocketException se) { - if (se.SocketErrorCode == SocketError.ConnectionRefused - || se.SocketErrorCode == SocketError.ConnectionReset) + if (se.SocketErrorCode == SocketError.ConnectionRefused || se.SocketErrorCode == SocketError.ConnectionReset) { if (tcpConnTestRetried < MaxTcpConnTestRetries) { @@ -633,47 +607,43 @@ private void SetHealthState(Endpoint endpoint, string targetApp, bool passed) { if (passed) { - if (healthState == HealthState.Warning && - connectionStatus.Any(conn => conn.HostName == endpoint.HostName && - conn.Health == HealthState.Warning)) + if (healthState == HealthState.Warning && connectionStatus.Any(conn => conn.HostName == endpoint.HostName && conn.Health == 
HealthState.Warning)) { _ = connectionStatus.RemoveAll(conn => conn.HostName == endpoint.HostName); connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = true, - Health = HealthState.Warning, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = true, + Health = HealthState.Warning, + TargetApp = targetApp, + }); } else { connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = true, - Health = HealthState.Ok, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = true, + Health = HealthState.Ok, + TargetApp = targetApp, + }); } } else { - if (!connectionStatus.Any(conn => conn.HostName == endpoint.HostName && - conn.TargetApp == targetApp && - conn.Health == HealthState.Warning)) + if (!connectionStatus.Any(conn => conn.HostName == endpoint.HostName && conn.TargetApp == targetApp && conn.Health == HealthState.Warning)) { connectionStatus.Add( - new ConnectionState - { - HostName = endpoint.HostName, - Connected = false, - Health = HealthState.Warning, - TargetApp = targetApp, - }); + new ConnectionState + { + HostName = endpoint.HostName, + Connected = false, + Health = HealthState.Warning, + TargetApp = targetApp, + }); if (!AppNames.Contains(targetApp)) { diff --git a/FabricObserver/Observers/NodeObserver.cs b/FabricObserver/Observers/NodeObserver.cs index 6e8aa946..7a5efb54 100644 --- a/FabricObserver/Observers/NodeObserver.cs +++ b/FabricObserver/Observers/NodeObserver.cs @@ -394,13 +394,8 @@ public override Task ReportAsync(CancellationToken token) return Task.CompletedTask; } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, 
ObserverName, @@ -631,7 +626,7 @@ private async Task GetSystemCpuMemoryValuesAsync(CancellationToken token) // Ports. if (ActivePortsData != null && (ActivePortsErrorThreshold > 0 || ActivePortsWarningThreshold > 0)) { - int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActivePortCount(); + int activePortCountTotal = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); ActivePortsData.Data.Add(activePortCountTotal); } @@ -731,14 +726,8 @@ error on these conditions. */ timer.Stop(); timer.Reset(); } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, diff --git a/FabricObserver/Observers/OSObserver.cs b/FabricObserver/Observers/OSObserver.cs index 2fa5cac0..4b4157f4 100644 --- a/FabricObserver/Observers/OSObserver.cs +++ b/FabricObserver/Observers/OSObserver.cs @@ -84,10 +84,10 @@ public override async Task ObserveAsync(CancellationToken token) public override Task ReportAsync(CancellationToken token) { + token.ThrowIfCancellationRequested(); + try { - token.ThrowIfCancellationRequested(); - // OS Health. 
if (osStatus != null && !string.Equals(osStatus, "OK", StringComparison.OrdinalIgnoreCase)) { @@ -110,12 +110,26 @@ public override Task ReportAsync(CancellationToken token) if (IsTelemetryEnabled) { _ = TelemetryClient?.ReportHealthAsync( - HealthScope.Application, - FabricRuntime.GetActivationContext().ApplicationName, - HealthState.Error, - $"{NodeName} - OS reporting unhealthy: {osStatus}", - ObserverName, - Token); + HealthScope.Node, + ObserverConstants.FabricObserverName, + HealthState.Error, + healthMessage, + ObserverName, + Token); + } + + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + HealthScope = HealthScope.Node, + Source = ObserverConstants.FabricObserverName, + HealthState = HealthState.Error, + Description = healthMessage, + ObserverName + }); } } else if (HasActiveFabricErrorOrWarning && string.Equals(osStatus, "OK", StringComparison.OrdinalIgnoreCase)) @@ -129,11 +143,36 @@ public override Task ReportAsync(CancellationToken token) NodeName = NodeName, HealthMessage = healthMessage, State = HealthState.Ok, - HealthReportTimeToLive = default, + HealthReportTimeToLive = GetHealthReportTimeToLive(), }; HealthReporter.ReportHealthToServiceFabric(healthReport); + if (IsTelemetryEnabled) + { + _ = TelemetryClient?.ReportHealthAsync( + HealthScope.Node, + ObserverConstants.FabricObserverName, + HealthState.Error, + healthMessage, + ObserverName, + Token); + } + + if (IsEtwEnabled) + { + ObserverLogger.LogEtw( + ObserverConstants.FabricObserverETWEventName, + new + { + HealthScope = HealthScope.Node, + Source = ObserverConstants.FabricObserverName, + HealthState = HealthState.Error, + Description = healthMessage, + ObserverName + }); + } + // Reset internal health state. 
HasActiveFabricErrorOrWarning = false; } @@ -188,9 +227,7 @@ public override Task ReportAsync(CancellationToken token) HealthReporter.ReportHealthToServiceFabric(report); - if (IsTelemetryProviderEnabled - && IsTelemetryEnabled - && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + if (IsEtwEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { // Send Health Report as Telemetry (perhaps it signals an Alert from App Insights, for example.). var telemetryData = new TelemetryData(FabricClientInstance, token) @@ -204,36 +241,28 @@ public override Task ReportAsync(CancellationToken token) Source = ObserverConstants.FabricObserverName, }; - _ = TelemetryClient?.ReportMetricAsync( - telemetryData, - Token); + _ = TelemetryClient?.ReportHealthAsync(telemetryData, Token); } // ETW. if (IsEtwProviderEnabled && RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - HealthState = "Warning", - HealthEventDescription = auServiceEnabledMessage, - ObserverName, - Metric = "WUAutoDownloadEnabled", - Value = isAUAutomaticDownloadEnabled, - NodeName, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + HealthState = "Warning", + HealthEventDescription = auServiceEnabledMessage, + ObserverName, + Metric = "WUAutoDownloadEnabled", + Value = isAUAutomaticDownloadEnabled, + NodeName, + }); } } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. 
- if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, @@ -269,16 +298,16 @@ private async Task> GetInfrastructureServiceInstancesAsync( await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( () => FabricClientInstance.QueryManager.GetServiceListAsync( - new Uri("fabric:/System"), - null, - ConfigurationSettings.AsyncTimeout, - Token), + new Uri("fabric:/System"), + null, + ConfigurationSettings.AsyncTimeout, + Token), Token).ConfigureAwait(false); var infraInstances = allSystemServices.Where( - i => i.ServiceTypeName.Equals( - "InfrastructureServiceType", - StringComparison.InvariantCultureIgnoreCase)); + i => i.ServiceTypeName.Equals( + "InfrastructureServiceType", + StringComparison.InvariantCultureIgnoreCase)); return infraInstances; } @@ -377,7 +406,7 @@ private async Task GetComputerInfoAsync(CancellationToken token) osStatus = osInfo.Status; // Active, bound ports. - int activePorts = OperatingSystemInfoProvider.Instance.GetActivePortCount(); + int activePorts = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(); // Active, ephemeral ports. int activeEphemeralPorts = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(); @@ -535,32 +564,32 @@ private async Task GetComputerInfoAsync(CancellationToken token) } ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - HealthState = "Ok", - Node = NodeName, - Observer = ObserverName, - OS = osInfo.Name, - OSVersion = osInfo.Version, - OSInstallDate = osInfo.InstallDate, - AutoUpdateEnabled = auStateUnknown ? 
"Unknown" : isAUAutomaticDownloadEnabled.ToString(), - osInfo.LastBootUpTime, - WindowsAutoUpdateEnabled = isAUAutomaticDownloadEnabled, - TotalMemorySizeGB = (int)(osInfo.TotalVisibleMemorySizeKB / 1048576), - AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), - AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), - LogicalProcessorCount = logicalProcessorCount, - LogicalDriveCount = logicalDriveCount, - DriveInfo = driveInfo, - NumberOfRunningProcesses = osInfo.NumberOfProcesses, - ActiveFirewallRules = firewalls, - ActivePorts = activePorts, - ActiveEphemeralPorts = activeEphemeralPorts, - WindowsDynamicPortRange = osEphemeralPortRange, - FabricAppPortRange = fabricAppPortRange, - HotFixes = hotFixes, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + HealthState = "Ok", + Node = NodeName, + Observer = ObserverName, + OS = osInfo.Name, + OSVersion = osInfo.Version, + OSInstallDate = osInfo.InstallDate, + AutoUpdateEnabled = auStateUnknown ? 
"Unknown" : isAUAutomaticDownloadEnabled.ToString(), + osInfo.LastBootUpTime, + WindowsAutoUpdateEnabled = isAUAutomaticDownloadEnabled, + TotalMemorySizeGB = (int)(osInfo.TotalVisibleMemorySizeKB / 1048576), + AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), + AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), + LogicalProcessorCount = logicalProcessorCount, + LogicalDriveCount = logicalDriveCount, + DriveInfo = driveInfo, + NumberOfRunningProcesses = osInfo.NumberOfProcesses, + ActiveFirewallRules = firewalls, + ActivePorts = activePorts, + ActiveEphemeralPorts = activeEphemeralPorts, + WindowsDynamicPortRange = osEphemeralPortRange, + FabricAppPortRange = fabricAppPortRange, + HotFixes = hotFixes, + }); } // Telemetry @@ -572,40 +601,34 @@ private async Task GetComputerInfoAsync(CancellationToken token) } TelemetryClient?.ReportMetricAsync( - new MachineTelemetryData - { - HealthState = "Ok", - Node = NodeName, - Observer = ObserverName, - OS = osInfo.Name, - OSVersion = osInfo.Version, - OSInstallDate = osInfo.InstallDate, - LastBootUpTime = osInfo.LastBootUpTime, - WindowsUpdateAutoDownloadEnabled = isAUAutomaticDownloadEnabled, - TotalMemorySizeGB = (int)osInfo.TotalVisibleMemorySizeKB / 1048576, - AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), - AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), - LogicalProcessorCount = logicalProcessorCount, - LogicalDriveCount = logicalDriveCount, - DriveInfo = driveInfo, - NumberOfRunningProcesses = osInfo.NumberOfProcesses, - ActiveFirewallRules = firewalls, - ActivePorts = activePorts, - ActiveEphemeralPorts = activeEphemeralPorts, - WindowsDynamicPortRange = osEphemeralPortRange, - FabricAppPortRange = fabricAppPortRange, - HotFixes = hotFixes, - }, Token); + new MachineTelemetryData + { + HealthState = "Ok", + Node = NodeName, + Observer = ObserverName, + OS = osInfo.Name, + OSVersion 
= osInfo.Version, + OSInstallDate = osInfo.InstallDate, + LastBootUpTime = osInfo.LastBootUpTime, + WindowsUpdateAutoDownloadEnabled = isAUAutomaticDownloadEnabled, + TotalMemorySizeGB = (int)osInfo.TotalVisibleMemorySizeKB / 1048576, + AvailablePhysicalMemoryGB = Math.Round(osInfo.FreePhysicalMemoryKB / 1048576.0, 2), + AvailableVirtualMemoryGB = Math.Round(osInfo.FreeVirtualMemoryKB / 1048576.0, 2), + LogicalProcessorCount = logicalProcessorCount, + LogicalDriveCount = logicalDriveCount, + DriveInfo = driveInfo, + NumberOfRunningProcesses = osInfo.NumberOfProcesses, + ActiveFirewallRules = firewalls, + ActivePorts = activePorts, + ActiveEphemeralPorts = activeEphemeralPorts, + WindowsDynamicPortRange = osEphemeralPortRange, + FabricAppPortRange = fabricAppPortRange, + HotFixes = hotFixes, + }, Token); } } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { - // ObserverManager handles these. - if (e is OperationCanceledException || e is TaskCanceledException) - { - throw; - } - HealthReporter.ReportFabricObserverServiceHealth( FabricServiceContext.ServiceName.OriginalString, ObserverName, diff --git a/FabricObserver/Observers/ObserverManager.cs b/FabricObserver/Observers/ObserverManager.cs index 0191930a..e7825cda 100644 --- a/FabricObserver/Observers/ObserverManager.cs +++ b/FabricObserver/Observers/ObserverManager.cs @@ -5,7 +5,6 @@ using System; using System.Collections.Generic; -using System.Diagnostics; using System.Fabric; using System.Fabric.Health; using System.IO; @@ -122,7 +121,7 @@ private int MaxArchivedLogFileLifetimeDays /// /// Initializes a new instance of the class. - /// This is used for unit testing. + /// This is only used by unit tests. /// /// Observer instance. 
public ObserverManager(ObserverBase observer, FabricClient fabricClient) @@ -208,10 +207,10 @@ public ObserverManager(IServiceProvider serviceProvider, FabricClient fabricClie } #endif telemetryEvents = new TelemetryEvents( - FabricClientInstance, - FabricServiceContext, - ServiceEventSource.Current, - this.token); + FabricClientInstance, + FabricServiceContext, + ServiceEventSource.Current, + this.token); if (telemetryEvents.FabricObserverRuntimeNodeEvent(codePkgVersion, GetFabricObserverInternalConfiguration(), "HealthState.Initialized")) { @@ -356,8 +355,9 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig await Task.Delay(250).ConfigureAwait(false); } } - catch (Exception) + catch (FabricException) { + } await Task.Delay(250).ConfigureAwait(false); @@ -389,6 +389,7 @@ public async Task StopObserversAsync(bool shutdownSignaled = true, bool isConfig } catch (FabricException) { + } await Task.Delay(250).ConfigureAwait(false); @@ -452,6 +453,7 @@ private static bool IsObserverWebApiAppInstalled() } catch (Exception e) when (e is FabricException || e is TimeoutException) { + } return false; @@ -469,6 +471,7 @@ private static string GetConfigSettingValue(string parameterName) } catch (Exception e) when (e is KeyNotFoundException || e is FabricElementNotFoundException) { + } return null; diff --git a/FabricObserver/Observers/SFConfigurationObserver.cs b/FabricObserver/Observers/SFConfigurationObserver.cs index b7ce261f..2dd80ba4 100644 --- a/FabricObserver/Observers/SFConfigurationObserver.cs +++ b/FabricObserver/Observers/SFConfigurationObserver.cs @@ -100,17 +100,13 @@ public override async Task ObserveAsync(CancellationToken token) { } - catch (Exception e) when (e is OperationCanceledException || e is TaskCanceledException) - { - return; - } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { HealthReporter.ReportFabricObserverServiceHealth( - 
FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - $"this.NodeName | Unhandled Exception trying to read registry value:\n{e}"); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + $"Unhandled Exception in ObserveAsync:{Environment.NewLine}{e}"); throw; } @@ -124,7 +120,7 @@ public override async Task ReportAsync(CancellationToken token) var sb = new StringBuilder(); - _ = sb.AppendLine("\nService Fabric information:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Service Fabric information:{Environment.NewLine}"); if (!string.IsNullOrEmpty(SFVersion)) { @@ -182,10 +178,10 @@ public override async Task ReportAsync(CancellationToken token) if (!ObserverLogger.TryWriteLogFile(logPath, sb.ToString())) { HealthReporter.ReportFabricObserverServiceHealth( - FabricServiceContext.ServiceName.OriginalString, - ObserverName, - HealthState.Warning, - "Unable to create SFInfraInfo.txt file."); + FabricServiceContext.ServiceName.OriginalString, + ObserverName, + HealthState.Warning, + "Unable to create SFInfraInfo.txt file."); } _ = sb.Clear(); @@ -257,7 +253,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) token.ThrowIfCancellationRequested(); // Node Information. - _ = sb.AppendLine("\nNode Info:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Node Info:{Environment.NewLine}"); _ = sb.AppendLine($"Node Name: {NodeName}"); _ = sb.AppendLine($"Node Id: {FabricServiceContext.NodeContext.NodeId}"); _ = sb.AppendLine($"Node Instance Id: {FabricServiceContext.NodeContext.NodeInstanceId}"); @@ -288,7 +284,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) // Application Info. 
if (appList != null) { - _ = sb.AppendLine("\nDeployed Apps:\n"); + _ = sb.AppendLine($"{Environment.NewLine}Deployed Apps:{Environment.NewLine}"); foreach (var app in appList) { @@ -307,7 +303,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) _ = sb.AppendLine("Status: " + status); // Service(s). - _ = sb.AppendLine("\n\tServices:"); + _ = sb.AppendLine($"{Environment.NewLine}\tServices:"); var serviceList = await FabricClientInstance.QueryManager.GetServiceListAsync(app.ApplicationName).ConfigureAwait(true); var replicaList = await FabricClientInstance.QueryManager.GetDeployedReplicaListAsync(NodeName, app.ApplicationName).ConfigureAwait(true); @@ -333,7 +329,7 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) if (procId > -1) { - ports = OperatingSystemInfoProvider.Instance.GetActivePortCount(procId); + ports = OperatingSystemInfoProvider.Instance.GetActiveTcpPortCount(procId); ephemeralPorts = OperatingSystemInfoProvider.Instance.GetActiveEphemeralPortCount(procId); } @@ -359,25 +355,25 @@ private async Task GetDeployedAppsInfoAsync(CancellationToken token) if (IsEtwEnabled) { ObserverLogger.LogEtw( - ObserverConstants.FabricObserverETWEventName, - new - { - Level = 0, // Info - Node = NodeName, - Observer = ObserverName, - AppName = appName, - AppType = appType, - AppVersion = appVersion, - AppHealthState = healthState, - AppStatus = status, - ServiceName = serviceName.OriginalString, - ServiceTypeName = type, - Kind = kind, - ProcessModel = processModel, - ServiceManifestVersion = serviceManifestVersion, - ActivePorts = ports, - EphemeralPorts = ephemeralPorts, - }); + ObserverConstants.FabricObserverETWEventName, + new + { + Level = 0, // Info + Node = NodeName, + Observer = ObserverName, + AppName = appName, + AppType = appType, + AppVersion = appVersion, + AppHealthState = healthState, + AppStatus = status, + ServiceName = serviceName.OriginalString, + ServiceTypeName = type, + Kind = kind, + ProcessModel = 
processModel, + ServiceManifestVersion = serviceManifestVersion, + ActivePorts = ports, + EphemeralPorts = ephemeralPorts, + }); } break; diff --git a/FabricObserver/PackageRoot/Config/AppObserver.config.json b/FabricObserver/PackageRoot/Config/AppObserver.config.json index 1e4e294a..db8fb24f 100644 --- a/FabricObserver/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserver/PackageRoot/Config/AppObserver.config.json @@ -9,7 +9,7 @@ }, { "targetAppType": "SomeAppType", - "memoryWarningLimitMb": 1048, - "networkWarningEphemeralPorts": 7500 + "memoryWarningLimitMb": 500, + "networkWarningEphemeralPorts": 5000 } ] \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest._linux.xml b/FabricObserver/PackageRoot/ServiceManifest._linux.xml index 4cdd5c8a..fd1b4533 100644 --- a/FabricObserver/PackageRoot/ServiceManifest._linux.xml +++ b/FabricObserver/PackageRoot/ServiceManifest._linux.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + setcaps.sh @@ -27,11 +27,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserver/PackageRoot/ServiceManifest.xml b/FabricObserver/PackageRoot/ServiceManifest.xml index 907f7c8c..d8d7bace 100644 --- a/FabricObserver/PackageRoot/ServiceManifest.xml +++ b/FabricObserver/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricObserver @@ -21,11 +21,11 @@ - + - + \ No newline at end of file diff --git a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml index bcd16400..bb644e16 100644 --- a/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -133,7 +133,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. 
--> - + diff --git a/FabricObserverTests/ObserverTest.cs b/FabricObserverTests/ObserverTest.cs index d2a3a6eb..0f5eba2f 100644 --- a/FabricObserverTests/ObserverTest.cs +++ b/FabricObserverTests/ObserverTest.cs @@ -280,47 +280,7 @@ public async Task AppObserver_ObserveAsync_Successful_Observer_IsHealthy() var obs = new AppObserver(fabricClient, context) { - MonitorDuration = TimeSpan.FromSeconds(5), - ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), - ReplicaOrInstanceList = new List(), - }; - - await obs.ObserveAsync(token).ConfigureAwait(true); - - // observer ran to completion with no errors. - Assert.IsTrue(obs.LastRunDateTime > startDateTime); - - // observer detected no warning conditions. - Assert.IsFalse(obs.HasActiveFabricErrorOrWarning); - - // observer did not have any internal errors during run. - Assert.IsFalse(obs.IsUnhealthy); - - await CleanupTestHealthReportsAsync(obs); - - obs.Dispose(); - } - - /// - /// . - /// - /// A representing the result of the asynchronous operation. 
- [TestMethod] - public async Task AppObserver_ObserveAsync_TargetAppType_Successful_Observer_IsHealthy() - { - if (!isSFRuntimePresentOnTestMachine) - { - return; - } - - var startDateTime = DateTime.Now; - ObserverManager.FabricServiceContext = context; - ObserverManager.TelemetryEnabled = false; - ObserverManager.EtwEnabled = false; - - var obs = new AppObserver(fabricClient, context) - { - MonitorDuration = TimeSpan.FromSeconds(5), + MonitorDuration = TimeSpan.FromSeconds(1), ConfigPackagePath = Path.Combine(Environment.CurrentDirectory, "PackageRoot", "Config", "AppObserver.config.json"), ReplicaOrInstanceList = new List(), }; @@ -341,7 +301,6 @@ public async Task AppObserver_ObserveAsync_TargetAppType_Successful_Observer_IsH obs.Dispose(); } - [TestMethod] public async Task ClusterObserver_ObserveAsync_Successful_Observer_IsHealthy() { @@ -392,8 +351,8 @@ public async Task Successful_CertificateObserver_Run_Cancellation_Via_ObserverMa obsMgr.Dispose(); } - /* NOTE: These test can be flaky due to the Test infra. Try running them as a group, after running all the other tests as a group. - If any fail, then re-run the failed ones.. */ + /* NOTE: These tests are flaky due to the Test infra. Try running them as a group, after running all the other tests as a group. + If any fail, then re-run the failed ones. */ [TestMethod] public async Task Successful_AppObserver_Run_Cancellation_Via_ObserverManager() @@ -1113,8 +1072,6 @@ public async Task NetworkObserver_ObserveAsync_Successful_Observer_WritesLocalFi string outputFilePath = Path.Combine(Environment.CurrentDirectory, "observer_logs", "NetInfo.txt"); - Console.WriteLine($"outputFilePath: {outputFilePath}"); - // Output log file was created successfully during test. 
Assert.IsTrue(File.Exists(outputFilePath) && File.GetLastWriteTime(outputFilePath) > startDateTime @@ -1151,6 +1108,7 @@ public async Task NodeObserver_ObserveAsync_Successful_Observer_IsHealthy_Warnin UseCircularBuffer = true, CpuWarningUsageThresholdPct = 10, MemWarningUsageThresholdMb = 1, // This will generate Warning for sure. + ActivePortsWarningThreshold = 100, // This will generate Warning for sure. }; var obsMgr = new ObserverManager(obs, fabricClient); diff --git a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json index d1db115a..15ed9e75 100644 --- a/FabricObserverTests/PackageRoot/Config/AppObserver.config.json +++ b/FabricObserverTests/PackageRoot/Config/AppObserver.config.json @@ -1,10 +1,9 @@ [ { "targetApp": "*", - "cpuWarningLimitPercent": 60, - "networkWarningActivePorts": 1800, - "networkWarningEphemeralPorts": 1400, - "warningOpenFileHandles": 5000 + "cpuWarningLimitPercent": 90, + "networkWarningActivePorts": 8000, + "networkWarningEphemeralPorts": 5000 }, { "targetAppType": "MyAppType", @@ -15,5 +14,15 @@ "targetApp": "fabric:/MyHardWorkingApp42", "cpuWarningLimitPercent": 90, "memoryWarningLimitPercent": 60 + }, + { + "targetApp": "MalformedValueApp0", + "cpuWarningLimitPercent": 50, + "memoryWarningLimitPercent": 40 + }, + { + "targetApp": "Malformed Value App1", + "cpuWarningLimitPercent": 70, + "memoryWarningLimitPercent": 20 } ] \ No newline at end of file diff --git a/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json b/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json index 67fabea6..87412ed1 100644 --- a/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json +++ b/FabricObserverTests/PackageRoot/Config/NetworkObserver.config.json @@ -3,16 +3,24 @@ "targetApp": "fabric:/TestApp", "endpoints": [ { - "hostname": "www.facebook.com", - "port": 443 + "hostname": "https://myazuresrvice42.westus2.cloudapp.azure.com", + 
"port": 443, + "protocol": "http" }, { - "hostname": "www.google.com", - "port": 443 - }, + "hostname": "somesqlservername.database.windows.net", + "port": 1433, + "protocol": "tcp" + } + ] + }, + { + "targetApp": "fabric:/TestApp1", + "endpoints": [ { - "hostname": "www.microsoft.com", - "port": 443 + "hostname": "somesqlservername.database.windows.net", + "port": 1433, + "protocol": "tcp" } ] } diff --git a/README.md b/README.md index f737f62b..7938a92c 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# FabricObserver 3.1.8 +# FabricObserver 3.1.9 [**FabricObserver (FO)**](https://github.com/microsoft/service-fabric-observer/releases) is a complete implementation of a generic resource usage watchdog service written as a stateless, singleton Service Fabric .NET Core 3.1 application that 1. Monitors a broad range of resources that tend to be important to all Service Fabric applications, like disk, CPU, memory, networking, and cluster certificates out-of-the-box. @@ -57,7 +57,7 @@ For more information about **the design of FabricObserver**, please see the [Des 2. Install [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet-core/3.1) 3. Build. -Note: There is no need to run FO as system on Windows or root on Linux. +***Note: By default, FO runs as NetworkUser on Windows and sfappsuser on Linux. If you want to monitor SF service processes that run as elevated (System) on Windows, then you must also run FO as System on Windows. There is no reason to run as root on Linux under any circumstances (see the Capabilities binaries implementations, which allow for FO to run as sfappsuser and successfully execute specific commands that require elevated privilege).*** For Linux deployments, we have ensured that FO will work as expected as normal user (non-root user). 
In order for us to do this, we had to implement a setup script that sets [Capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html) on a proxy binary which can run netstat -tnap elevated. If you deploy from VS, then you will need to use FabricObserver/PackageRoot/ServiceManifest.linux.xml (just copy its contents into ServiceManifest.xml or add the new piece which is simply a SetupEntryPoint section). You will also need to do the same with ApplicationManifest.xml (see FabricObserverApp/ApplicationPackageRoot/ApplicationManifest.linux.xml for required changes). If you use our build scripts, they will take care of these modifications automatically for linux build output. diff --git a/SampleObserverPlugin/ReadMe.md b/SampleObserverPlugin/ReadMe.md index 42f70d17..180b44e1 100644 --- a/SampleObserverPlugin/ReadMe.md +++ b/SampleObserverPlugin/ReadMe.md @@ -126,7 +126,7 @@ You can deploy using the contents of your build out directory - just remove the * Create new instance of FO, which will contain your observer plugin ```Powershell $path = "[sourcedir]\MyObserverPlugin\bin\release\netstandard2.0\[target os platform, e.g., win-x64 or linux-x64]" -Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV316 -TimeoutSec 1800 +Copy-ServiceFabricApplicationPackage -ApplicationPackagePath $path -CompressPackage -ApplicationPackagePathInImageStore FabricObserverV319 -TimeoutSec 1800 Register-ServiceFabricApplicationType -ApplicationPathInImageStore FabricObserverV316 -New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.6 +New-ServiceFabricApplication -ApplicationName fabric:/FabricObserver -ApplicationTypeName FabricObserverType -ApplicationTypeVersion 3.1.9 ``` diff --git a/SampleObserverPlugin/SampleNewObserver.cs b/SampleObserverPlugin/SampleNewObserver.cs index 4ce49a17..1f3cbdfd 100644 --- 
a/SampleObserverPlugin/SampleNewObserver.cs +++ b/SampleObserverPlugin/SampleNewObserver.cs @@ -58,7 +58,7 @@ public override async Task ObserveAsync(CancellationToken token) totalNumberOfDeployedServices += services.Count; servicesInWarningError += services.Where(s => s.HealthState == HealthState.Warning || s.HealthState == HealthState.Error).Count(); - + foreach (var service in services) { var partitions = await FabricClientInstance.QueryManager.GetPartitionListAsync( diff --git a/SampleObserverPlugin/SampleObserverPlugin.csproj b/SampleObserverPlugin/SampleObserverPlugin.csproj index 7134e5d2..edb0b677 100644 --- a/SampleObserverPlugin/SampleObserverPlugin.csproj +++ b/SampleObserverPlugin/SampleObserverPlugin.csproj @@ -10,6 +10,6 @@ - + diff --git a/TelemetryLib/ITelemetryEventSource.cs b/TelemetryLib/ITelemetryEventSource.cs index ff137a71..23a5045d 100644 --- a/TelemetryLib/ITelemetryEventSource.cs +++ b/TelemetryLib/ITelemetryEventSource.cs @@ -3,8 +3,6 @@ // Licensed under the MIT License (MIT). See License.txt in the repo root for license information. // ------------------------------------------------------------ -using System; - namespace Microsoft.ServiceFabric.TelemetryLib { ///