diff --git a/CHANGELOG.md b/CHANGELOG.md
index 12a485f..02ff08e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ This CHANGELOG follows the format listed [here](https://github.com/sensu-plugins
 ### Changed
 - check-mysql-replication-status: fix code flow if server is not a slave (@DrMurx)
 - check-mysql-replication-status: refactoring & spec tests (@DrMurx)
+- check-mysql-replication-status: added flapping protection (@DrMurx)
 
 ## [3.1.0] - 2018-12-15
 ### Added
diff --git a/README.md b/README.md
index 32e0a4b..23f7114 100644
--- a/README.md
+++ b/README.md
@@ -80,6 +80,15 @@ $ /opt/sensu/embedded/bin/check-mysql-threads.rb --host= --ini=/etc/sens
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host= --ini=/etc/sensu/my.ini
 ```
+**check-mysql-replication-status** example with flapping protection
+
+MariaDB/MySQL sometimes wrongly reports a very high replication lag for a short moment. Flapping protection helps mitigate this issue
+better than setting `occurrences` in Sensu's `checks` definition because you don't lose any alerting granularity.
+
+```bash
+$ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host= --ini=/etc/sensu/my.ini --flapping-retry=1 --flapping-lag=86400 --flapping-sleep=2
+```
+
 **check-mysql-msr-replication-status** example
 
 ```bash
 $ /opt/sensu/embedded/bin/check-mysql-replication-status.rb --host= --ini=/etc/sensu/my.ini
diff --git a/bin/check-mysql-replication-status.rb b/bin/check-mysql-replication-status.rb
index 83b5017..3998edd 100755
--- a/bin/check-mysql-replication-status.rb
+++ b/bin/check-mysql-replication-status.rb
@@ -94,6 +94,27 @@ class CheckMysqlReplicationStatus < Sensu::Plugin::Check::CLI
          # #YELLOW
          proc: lambda { |s| s.to_i } # rubocop:disable Lambda
 
+  option :flapping_lag,
+         short: '-l',
+         long: '--flapping-lag=VALUE',
+         description: 'Lag threshold (seconds) that triggers flapping protection',
+         default: 100000,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_retry,
+         short: '-r',
+         long: '--flapping-retry=VALUE',
+         description: 'Number of retries when lag flapping protection is triggered',
+         default: 0,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+  option :flapping_sleep,
+         long: '--flapping-sleep=VALUE',
+         description: 'Seconds to sleep between flapping protection retries',
+         default: 1,
+         proc: lambda { |s| s.to_i } # rubocop:disable Lambda
+
+
   def detect_replication_status?(row)
     %w[
       Slave_IO_State
@@ -175,19 +196,29 @@ def ok_slave_message
   def run
     db = open_connection
 
-    row = query_slave_status(db)
-    ok 'show slave status was nil. This server is not a slave.' if row.nil?
-    warn "couldn't detect replication status" unless detect_replication_status?(row)
-
-    slave_running = slave_running?(row)
-    critical broken_slave_message(row) unless slave_running
-
-    replication_delay = row['Seconds_Behind_Master'].to_i
-    message = "replication delayed by #{replication_delay}"
-    # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks
-    critical message if replication_delay > config[:crit]
-    warning message if replication_delay > config[:warn]
-    ok "#{ok_slave_message}, #{message}"
+    retries = config[:flapping_retry]
+    # Note: the endless loop exits via `ok`, `warning` or `critical`, or by raising an exception
+    loop do
+      row = query_slave_status(db)
+      ok 'show slave status was nil. This server is not a slave.' if row.nil?
+ warn "couldn't detect replication status" unless detect_replication_status?(row) + + slave_running = slave_running?(row) + critical broken_slave_message(row) unless slave_running + + replication_delay = row['Seconds_Behind_Master'].to_i + retries -= 1 + if replication_delay >= config[:flapping_lag] && retries >= 0 + sleep config[:flapping_sleep] + next + end + + message = "replication delayed by #{replication_delay}" + # TODO (breaking change): Thresholds are exclusive which is not consistent with all other checks + critical message if replication_delay > config[:crit] + warning message if replication_delay > config[:warn] + ok "#{ok_slave_message}, #{message}" + end rescue Mysql::Error => e errstr = "Error code: #{e.errno} Error message: #{e.error}" critical "#{errstr} SQLSTATE: #{e.sqlstate}" if e.respond_to?('sqlstate') diff --git a/test/check-mysql-replication-status_spec.rb b/test/check-mysql-replication-status_spec.rb index 09181b7..87cfb51 100644 --- a/test/check-mysql-replication-status_spec.rb +++ b/test/check-mysql-replication-status_spec.rb @@ -52,10 +52,10 @@ def checker.critical(*_args) ['No', 'Yes', nil, 2, 'critical'], ['Yes', 'No', nil, 2, 'critical'], ['No', 'No', nil, 2, 'critical'], - ['Yes', 'Yes', 899, 0, 'ok'], - ['Yes', 'Yes', 900, 1, 'warning'], - ['Yes', 'Yes', 1799, 1, 'warning'], - ['Yes', 'Yes', 1800, 2, 'critical'], + ['Yes', 'Yes', 900, 0, 'ok'], + ['Yes', 'Yes', 901, 1, 'warning'], + ['Yes', 'Yes', 1800, 1, 'warning'], + ['Yes', 'Yes', 1801, 2, 'critical'], ].each do |testdata| it "returns #{testdata[4]} for default thresholds" do slave_status_row = { @@ -76,4 +76,38 @@ def checker.critical(*_args) expect(exit_code).to eq testdata[3] end end + + it "sleeps with flapping protection for default thresholds" do + checker.config[:flapping_retry] = 1 + checker.config[:flapping_sleep] = 10 + + slave_status_row = [ + { + "Slave_IO_State" => '', + "Slave_IO_Running" => 'Yes', + "Slave_SQL_Running" => 'Yes', + "Last_IO_Error" => '', + "Last_SQL_Error" => '', + "Seconds_Behind_Master" => 100000 + }, + { + "Slave_IO_State" => '', + "Slave_IO_Running" => 'Yes', + "Slave_SQL_Running" => 'Yes', + "Last_IO_Error" => '', + "Last_SQL_Error" => '', + "Seconds_Behind_Master" => 99999 + } + ] + + begin + allow(checker).to receive(:open_connection) # do nothing + allow(checker).to receive(:query_slave_status).and_return slave_status_row[0], slave_status_row[1] + expect(checker).to receive(:sleep).with(10) + checker.run + rescue SystemExit => e + exit_code = e.status + end + expect(exit_code).to eq 2 + end end