From c2b10129d91020bed5c5ad1469dd277bdb823118 Mon Sep 17 00:00:00 2001 From: Tim Meusel Date: Mon, 30 Sep 2024 15:58:00 +0200 Subject: [PATCH] (#236) agent_state_summary: Count nodes without report as unhealthy It's possible that a Puppet Agent was stopped or disabled and all old reports were garbage collected from PuppetDB. The node still exists in PuppetDB, but when checking for a report the timestamp is null: ``` puppet query nodes[certname,report_timestamp]{} ``` ```json [ { "certname": "pe.tim.local", "report_timestamp": "2024-09-30T13:21:17.042Z" }, { "certname": "pe2.tim.local", "report_timestamp": null } ] ``` Previously we always assumed that `report_timestamp` has a valid timestamp. With this patch we explicitly validate the timestamp and count nodes without a timestamp as unhealthy. Now with the fix: ``` puppet plan run pe_status_check::agent_state_summary --environment peadm log_healthy_nodes=true log_unhealthy_nodes=true ``` ```json { "responsive": [ "pe.tim.local", "pe2.tim.local" ], "healthy_counter": 0, "total_counter": 2, "unhealthy_counter": 2, "noop": [], "unhealthy": [ "pe2.tim.local", "pe.tim.local" ], "healthy": [], "changed": [ "pe.tim.local" ], "no_report": [ "pe.tim.local" ], "corrective_changes": [], "used_cached_catalog": [ "pe2.tim.local" ], "unresponsive": [], "failed": [] } ``` --- README.md | 2 ++ plans/agent_state_summary.pp | 16 +++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2b5e8f74..000e3292 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,7 @@ The plan `pe_status_check::agent_state_summary` provides you a hash with all nod "failed": [ ], "changed": [ "student2.local" ], "unresponsive": [ "student3.local", "student4.local", "student1.local", "login.local" ], + "no_report": [ "newnode.with.report.local" ], "responsive": [ "pe.bastelfreak.local"], "unhealthy": [ "student2.local", "student3.local", "student4.local", "student1.local", "login.local" ], 
"unhealthy_counter": 5, @@ -181,6 +182,7 @@ The plan `pe_status_check::agent_state_summary` provides you a hash with all nod * `failed`: The last catalog couldn't be compiled or catalog application raised an error * `changed`: A node reported a change * `unresponsive`: Last report is older than 30 minutes (can be configured via the `runinterval` parameter) +* `no_report`: The node exists in PuppetDB but has no reports * `corrective_changes`: A node reported corrective changes * `used_cached_catalog`: The node didn't apply a new catalog but used a cached version * `unhealthy`: List of nodes that are in any of the above categories diff --git a/plans/agent_state_summary.pp b/plans/agent_state_summary.pp index d964dd3f..779d8c7f 100644 --- a/plans/agent_state_summary.pp +++ b/plans/agent_state_summary.pp @@ -16,18 +16,23 @@ $nodes = puppetdb_query('nodes[certname,latest_report_noop,latest_report_corrective_change,cached_catalog_status,latest_report_status,report_timestamp]{}') $fqdns = $nodes.map |$node| { $node['certname'] } - # check if the last catalog is older than X minutes + # check if the node has a report + # `report_timestamp` will be undef, or null, if no report exists + $no_report_nodes = $nodes.filter |$node| { $node['report_timestamp'] =~ Undef } + $no_report = $no_report_nodes.map |$node| { $node['certname'] } + + # check if the last report is older than X minutes, for all nodes that have a report $current_timestamp = Integer(Timestamp().strftime('%s')) $runinterval_seconds = $runinterval * 60 - $unresponsive = $nodes.map |$node| { + $unresponsive = ($nodes - $no_report_nodes).map |$node| { $old_timestamp = Integer(Timestamp($node['report_timestamp']).strftime('%s')) if ($current_timestamp - $old_timestamp) >= $runinterval_seconds { - $node['certname'] + $node } }.filter |$node| { $node =~ NotUndef } # all nodes that delivered a report in time - $responsive = $fqdns - $unresponsive + $responsive = $fqdns - $unresponsive - $no_report # all nodes that used 
noop for the last catalog $noop = $nodes.map |$node| { if ($node['latest_report_noop'] == true){ $node['certname'] } }.filter |$node| { $node =~ NotUndef } @@ -45,7 +50,7 @@ $changed = $nodes.map |$node| { if ($node['latest_report_status'] == 'changed'){ $node['certname'] } }.filter |$node| { $node =~ NotUndef } # all nodes that aren't healthy in any form - $unhealthy = [$noop, $corrective_changes, $used_cached_catalog, $failed, $changed, $unresponsive].flatten.unique + $unhealthy = [$noop, $corrective_changes, $used_cached_catalog, $failed, $changed, $unresponsive, $no_report].flatten.unique # all healthy nodes $healthy = $fqdns - $unhealthy @@ -58,6 +63,7 @@ 'failed' => $failed, 'changed' => $changed, 'unresponsive' => $unresponsive, + 'no_report' => $no_report, 'responsive' => $responsive, 'unhealthy' => $unhealthy, 'unhealthy_counter' => $unhealthy.count,