From 61d08734f5faffbca631e1fa9843874289e29756 Mon Sep 17 00:00:00 2001 From: Firas Dib Date: Tue, 3 Oct 2023 17:37:02 +0200 Subject: [PATCH] Publish latest code --- .gitignore | 6 + README.md | 40 +++ config.json.example | 48 +++ config.schema.json | 252 ++++++++++++++ reports/__init__.py | 0 reports/discord_report.py | 144 ++++++++ reports/email_format.html | 26 ++ reports/email_report.py | 141 ++++++++ requirements.txt | 12 + snapper.py | 702 ++++++++++++++++++++++++++++++++++++++ utils.py | 12 + 11 files changed, 1383 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config.json.example create mode 100644 config.schema.json create mode 100644 reports/__init__.py create mode 100644 reports/discord_report.py create mode 100644 reports/email_format.html create mode 100644 reports/email_report.py create mode 100644 requirements.txt create mode 100644 snapper.py create mode 100644 utils.py
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..63478c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.idea/ +snapper.iml +snapper/ +config.json +__pycache__ +/*.sh
diff --git a/README.md b/README.md new file mode 100644 index 0000000..07dee05 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +# :turtle: Snapper + +Snapper is a simple Python script that executes [SnapRAID](https://github.com/amadvance/snapraid) in order to sync and scrub the array. Inspired by the great [snapraid-aio-script](https://github.com/auanasgheps/snapraid-aio-script), but with a more limited feature set. + +I created this because I wanted more granular control over how my setup works; consequently, this script is opinionated. + +## Features + +- Sanity checks the array +- Runs `touch` if necessary +- Runs `diff` before attempting to `sync` +- Allows you to pre-hash before syncing +- Allows you to automatically re-run `sync` if SnapRAID recommends it +- Allows you to run SnapRAID with a lower priority to keep the server and drives responsive +- Allows you to abort execution if configurable thresholds are exceeded +- Allows you to `scrub` after `sync` +- Logs the raw SnapRAID output as well as formatted text +- Creates a nicely formatted report and sends it via email or Discord +- Provides live insight into the sync/scrub process in Discord +- Spins down selected hard drives after script completion + +**This project is a work in progress and can change at any time.** + +I welcome bugfixes and contributions, but be aware that I will not merge PRs that I feel do not fit the intended usage of this tool. + +## How to use + +- Ensure you have Python 3.7 or later installed +- Download the [latest release](https://github.com/firasdib/snapper/releases/latest) of this project, or clone the git repository +- Install the necessary dependencies by running `pip3 install -r requirements.txt` +- Copy or rename `config.json.example` to `config.json` +- Run the script via `python3 snapper.py` + +You may run the script with the `--force` flag to force a sync/scrub and ignore any thresholds or sanity checks. + +## Configuration + +A `config.json` file is required and must be located in the same directory as this script. + +Please read through the [JSON schema](config.schema.json) to understand the exact details of each property. If you're not fluent in JSON Schema (I don't blame you), you could use something like [this](https://json-schema.app/view/%23?url=https%3A%2F%2Fraw.githubusercontent.com%2Ffirasdib%2Fsnapper%2Fmain%2Fconfig.schema.json) to get a better idea of the different options.
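Snapper validates `config.json` against `config.schema.json` at startup (see `snapper.py` below). If you want to check a config by hand before scheduling a run, a minimal standalone sketch of that same `jsonschema` validation could look like this, assuming it is run from the project root next to both files:

```python
import json

from jsonschema import validate, ValidationError

# Load the config and the schema from the project root.
with open('config.json', 'r') as f:
    config = json.load(f)

with open('config.schema.json', 'r') as f:
    schema = json.load(f)

try:
    # The same call snapper.py makes before running any jobs.
    validate(instance=config, schema=schema)
    print('config.json is valid')
except ValidationError as err:
    print(f'config.json is invalid: {err.message}')
```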
diff --git a/config.json.example b/config.json.example new file mode 100644 index 0000000..d372ad2 --- /dev/null +++ b/config.json.example @@ -0,0 +1,48 @@ +{ + "snapraid": { + "binary": "/usr/bin/snapraid", + "config": "/etc/snapraid.conf", + "nice": 10, + "diff": { + "thresholds": { + "added": 500, + "removed": 500 + } + }, + "sync": { + "pre_hash": true, + "auto_sync": { + "enabled": false, + "max_attempts": 3 + } + }, + "scrub": { + "enabled": true, + "check_percent": 3, + "min_age": 30, + "scrub_new": true + } + }, + "notifications": { + "email": { + "enabled": true, + "binary": "/usr/bin/mailx", + "from_email": "from@email.com", + "to_email": "to@email.com" + }, + "discord": { + "enabled": false, + "webhook_id": "", + "webhook_token": "" + } + }, + "logs": { + "dir": "/var/log/snapper", + "max_count": 14 + }, + "spindown": { + "enabled": true, + "binary": "/usr/sbin/hdparm", + "drives": "parity" + } +}
diff --git a/config.schema.json b/config.schema.json new file mode 100644 index 0000000..1746606 --- /dev/null +++ b/config.schema.json @@ -0,0 +1,252 @@ +{ + "type": "object", + "properties": { + "snapraid": { + "type": "object", + "properties": { + "binary": { + "type": "string", + "examples": [ + "/usr/bin/snapraid" + ], + "description": "The location of your snapraid executable." + }, + "config": { + "type": "string", + "examples": [ + "/etc/snapraid.conf" + ], + "description": "Location of the snapraid config file. Necessary for sanity checks." + }, + "nice": { + "type": "number", + "examples": [ + 10 + ], + "description": "Run SnapRAID at a given `nice` level. By default, processes run at `0`. Lower values mean higher priority. Ranges from -20 to +19.", + "minimum": -20, + "maximum": 19 + }, + "diff": { + "type": "object", + "properties": { + "thresholds": { + "type": "object", + "properties": { + "added": { + "type": "number", + "examples": [ + 500 + ], + "description": "If more files than the threshold amount have been added, don't execute jobs. Set to `0` to disable.", + "minimum": 0 + }, + "removed": { + "type": "number", + "examples": [ + 500 + ], + "description": "If more files than the threshold amount have been removed, don't execute jobs. Set to `0` to disable.", + "minimum": 0 + } + }, + "additionalProperties": false, + "required": ["added", "removed"] + } + }, + "additionalProperties": false, + "required": ["thresholds"] + }, + "sync": { + "type": "object", + "properties": { + "pre_hash": { + "type": "boolean", + "examples": [ + true + ], + "description": "Whether to pre-hash changed blocks before syncing." + }, + "auto_sync": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "examples": [ + false + ], + "description": "Whether or not to re-run the sync command if SnapRAID recommends it." + }, + "max_attempts": { + "type": "number", + "examples": [3], + "description": "The maximum number of `sync` attempts before bailing.", + "minimum": 0 + } + }, + "additionalProperties": false, + "required": ["enabled", "max_attempts"] + } + }, + "additionalProperties": false, + "required": ["pre_hash", "auto_sync"] + }, + "scrub": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "examples": [ + true + ], + "description": "Whether or not to scrub the array." + }, + "check_percent": { + "type": "number", + "examples": [ + 3 + ], + "description": "What percentage of the array to scrub each run. 
Set to `0` to disable scrubbing.", + "minimum": 0, + "maximum": 100 + }, + "min_age": { + "type": "number", + "examples": [ + 30 + ], + "description": "How old blocks have to be, in days, before they are considered for scrubbing.", + "minimum": 1 + }, + "scrub_new": { + "type": "boolean", + "examples": [ + true + ], + "description": "Whether to scrub newly synced blocks or not." + } + }, + "additionalProperties": false, + "required": ["enabled", "check_percent", "min_age", "scrub_new"] + } + }, + "additionalProperties": false, + "required": ["binary", "config", "nice", "diff", "sync", "scrub"] + }, + "notifications": { + "type": "object", + "properties": { + "email": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "examples": [ + true + ], + "description": "Whether or not to send notifications and reports to the defined email." + }, + "binary": { + "type": "string", + "examples": [ + "/usr/bin/mailx" + ], + "description": "The location of `mailx`." + }, + "from_email": { + "type": "string", + "examples": [ + "from@email.com" + ], + "description": "The sender's email address." + }, + "to_email": { + "type": "string", + "examples": [ + "to@email.com" + ], + "description": "The recipient's email address." + } + }, + "additionalProperties": false, + "required": ["enabled", "binary", "from_email", "to_email"] + }, + "discord": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "examples": [ + true + ], + "description": "Whether or not to send notifications and reports to Discord." + }, + "webhook_id": { + "type": "string", + "examples": ["1234567890"], + "description": "Discord webhook id." + }, + "webhook_token": { + "type": "string", + "examples": ["abc123"], + "description": "Discord webhook token." + } + }, + "additionalProperties": false, + "required": ["enabled", "webhook_id", "webhook_token"] + } + }, + "additionalProperties": false, + "required": ["email", "discord"] + }, + "logs": { + "type": "object", + "properties": { + "dir": { + "type": "string", + "examples": [ + "/var/log/snapper" + ], + "description": "The directory in which to save logs. Will be created if it does not exist." + }, + "max_count": { + "type": "number", + "examples": [ + 14 + ], + "description": "How many historic logs to keep. A new log is generated on each run, and the previous ones are rotated and gzipped.", + "minimum": 1 + } + }, + "additionalProperties": false, + "required": ["dir", "max_count"] + }, + "spindown": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "examples": [true], + "description": "Whether or not to spin down hard drives after script execution." + }, + "binary": { + "type": "string", + "examples": [ + "/usr/sbin/hdparm" + ], + "description": "The location of the `hdparm` executable." 
+ }, + "drives": { + "type": "string", + "enum": ["parity", "all"], + "examples": ["parity"], + "description": "Which drives to spin down after script execution completes." + } + }, + "additionalProperties": false, + "required": ["enabled", "binary", "drives"] + } + }, + "additionalProperties": false, + "required": ["snapraid", "notifications", "logs", "spindown"] +}
diff --git a/reports/__init__.py b/reports/__init__.py new file mode 100644 index 0000000..e69de29
diff --git a/reports/discord_report.py b/reports/discord_report.py new file mode 100644 index 0000000..994f9f6 --- /dev/null +++ b/reports/discord_report.py @@ -0,0 +1,144 @@ +from operator import itemgetter + +did_not_run_color = 8539930 +did_run_color = 1737287 + +empty_field = {'name': '** **', 'value': '** **'} + + +def create_discord_report(report_data): + sync_job_ran, scrub_job_ran, sync_job_time, scrub_job_time, diff_data, zero_subsecond_count, \ + scrub_stats, drive_stats, smart_drive_data, global_fp, total_time = itemgetter( + 'sync_job_ran', + 'scrub_job_ran', + 'sync_job_time', + 'scrub_job_time', + 'diff_data', + 'zero_subsecond_count', + 'scrub_stats', + 'drive_stats', + 'smart_drive_data', + 'global_fp', + 'total_time')(report_data) + + touch_embed = {'title': 'Touch Job'} + + if zero_subsecond_count > 0: + touch_embed['description'] = (f'A total of **{zero_subsecond_count}** file(s) had their ' + f'sub-second value fixed.') + touch_embed['color'] = did_run_color + else: + touch_embed['description'] = 'No zero sub-second files were found.' + touch_embed['color'] = did_not_run_color + + sync_embed = {'title': 'Sync Job'} + + if sync_job_ran: + sync_embed['color'] = did_run_color + sync_embed['description'] = 'Sync Job finished successfully :white_check_mark:' + sync_embed['fields'] = [{ + 'name': 'Added', + 'value': f'```{diff_data["added"]}```', + 'inline': True + }, { + 'name': 'Removed', + 'value': f'```{diff_data["removed"]}```', + 'inline': True + }, { + 'name': 'Updated', + 'value': f'```{diff_data["updated"]}```', + 'inline': True + }, { + 'name': 'Moved', + 'value': f'```{diff_data["moved"]}```', + 'inline': True + }, { + 'name': 'Copied', + 'value': f'```{diff_data["copied"]}```', + 'inline': True + }, { + 'name': 'Restored', + 'value': f'```{diff_data["restored"]}```', + 'inline': True + }] + + sync_embed['footer'] = { + 'text': f'Elapsed time {sync_job_time}' + } + else: + sync_embed['color'] = did_not_run_color + sync_embed['description'] = 'Sync job did **not** run.' + + scrub_embed = {'title': 'Scrub Job'} + + if scrub_job_ran: + scrub_embed['color'] = did_run_color + scrub_embed['description'] = f'''Scrub Job finished successfully :white_check_mark: + +**{scrub_stats["unscrubbed"]}%** of the array has not been scrubbed, with the oldest block at **{scrub_stats["scrub_age"]}** day(s), the median at **{scrub_stats["median"]}** day(s), and the newest at **{scrub_stats["newest"]}** day(s).''' + + scrub_embed['footer'] = { + 'text': f'Elapsed time {scrub_job_time}' + } + else: + scrub_embed['color'] = did_not_run_color + scrub_embed['description'] = 'Scrub Job did **not** run.'
+ + array_report_embed = { + 'title': 'Full Array Report', + 'color': did_run_color, + 'fields': [] + } + + for i, d in enumerate(drive_stats): + field = { + 'name': d['drive_name'] if d['drive_name'] else 'Full Array', + 'value': f'''``` +Total use (%)       {d["use_percent"]} +Fragmented Files    {d["fragmented_files"]} +Excess Fragments    {d["excess_fragments"]} +Wasted Space (GB)   {d["wasted_gb"]} +Used Space (GB)     {d["used_gb"]} +Free Space (GB)     {d["free_gb"]} +```'''.replace(' ', '\u00A0'), + 'inline': True + } + + array_report_embed['fields'].append(field) + + if (i + 1) % 2 == 0 and i + 1 != len(drive_stats): + array_report_embed['fields'].append(empty_field) + + smart_report_embed = { + 'title': 'SMART Report', + 'description': f'The current failure probability of any single drive this year is {global_fp}%.', + 'color': did_run_color, + 'fields': [], + } + + for i, d in enumerate(smart_drive_data): + field = { + 'name': f'{d["device"]} (`{d["serial"]}`)' if d['disk'] == '-' else f'{d["disk"]} ({d["device"]}, `{d["serial"]}`)', + 'value': f'''``` +Temperature (°C)       {d["temp"]} +Power On Time (days)   {d["power_on_days"]} +Error Count            {d["error_count"]} +Failure Probability    {d["fp"]} +Drive Size (TiB)       {d["size"]} +```'''.replace(' ', '\u00A0'), + 'inline': True + } + + smart_report_embed['fields'].append(field) + + if (i + 1) % 2 == 0 and i + 1 != len(smart_drive_data): + smart_report_embed['fields'].append(empty_field) + + embeds = [ + touch_embed, + sync_embed, + scrub_embed, + array_report_embed, + smart_report_embed + ] + + return f':turtle: SnapRAID job completed successfully in **{total_time}**', embeds
diff --git a/reports/email_format.html b/reports/email_format.html new file mode 100644 index 0000000..4aebf71 --- /dev/null +++ b/reports/email_format.html @@ -0,0 +1,26 @@ +<html> +<head> +<meta charset="utf-8"/> +<style> +body { +  font-family: sans-serif; +} +table { +  border-collapse: collapse; +} +th, td { +  border: 1px solid #cccccc; +  padding: 4px 8px; +  text-align: left; +} +</style> +</head> +<body> +SNAPRAID_REPORT_CONTENT +</body> +</html> \ No newline at end of file
diff --git a/reports/email_report.py b/reports/email_report.py new file mode 100644 index 0000000..35da73c --- /dev/null +++ b/reports/email_report.py @@ -0,0 +1,141 @@ +from operator import itemgetter +from utils import get_relative_path + +with open(get_relative_path(__file__, './email_format.html'), 'r') as f: + email_report_template = f.read() + + +def create_email_report(report_data): + sync_job_ran, scrub_job_ran, sync_job_time, scrub_job_time, diff_data, zero_subsecond_count, \ + scrub_stats, drive_stats, smart_drive_data, global_fp, total_time = itemgetter( + 'sync_job_ran', + 'scrub_job_ran', + 'sync_job_time', + 'scrub_job_time', + 'diff_data', + 'zero_subsecond_count', + 'scrub_stats', + 'drive_stats', + 'smart_drive_data', + 'global_fp', + 'total_time')(report_data) + + # + # Create email report + + sync_report = '<h3>Sync Job</h3>' + + if sync_job_ran: + sync_report = sync_report + f''' + <p>Job finished successfully in {sync_job_time}.</p> + <p>File diff summary as follows:</p> + <table> + <tr><td>Added</td><td>{diff_data["added"]}</td></tr> + <tr><td>Removed</td><td>{diff_data["removed"]}</td></tr> + <tr><td>Updated</td><td>{diff_data["updated"]}</td></tr> + <tr><td>Moved</td><td>{diff_data["moved"]}</td></tr> + <tr><td>Copied</td><td>{diff_data["copied"]}</td></tr> + <tr><td>Restored</td><td>{diff_data["restored"]}</td></tr> + </table> + ''' + else: + sync_report = sync_report + '<p>Sync job did not run.</p>' + + touch_report = '<h3>Touch Job</h3>' + + if zero_subsecond_count > 0: + touch_report = touch_report + (f'<p>A total of {zero_subsecond_count} ' + 'file(s) had their sub-second value fixed.</p>') + else: + touch_report = touch_report + '<p>No zero sub-second files were found.</p>' + + scrub_report = '<h3>Scrub Job</h3>' + + if scrub_job_ran: + scrub_report = scrub_report + f''' + <p>Job finished successfully in {scrub_job_time}.</p> + <p>{scrub_stats["unscrubbed"]}% of the array has not been scrubbed, + with the oldest block at {scrub_stats["scrub_age"]} day(s), the median + at {scrub_stats["median"]} + day(s), and the newest at {scrub_stats["newest"]} day(s).</p> + ''' + else: + scrub_report = scrub_report + '<p>Scrub Job did not run.</p>' + + array_drive_report = ''.join(f''' + <tr> + <td>{d["drive_name"] if d["drive_name"] else 'Full Array'}</td> + <td>{d["fragmented_files"]}</td> + <td>{d["excess_fragments"]}</td> + <td>{d["wasted_gb"]}</td> + <td>{d["used_gb"]}</td> + <td>{d["free_gb"]}</td> + <td>{d["use_percent"]}</td> + </tr> + ''' for d in drive_stats) + + array_report = f''' + <h3>SnapRAID Array Report</h3> + <table> + <thead> + <tr> + <th>Drive</th> + <th>Fragmented Files</th> + <th>Excess Fragments</th> + <th>Wasted Space (GB)</th> + <th>Used Space (GB)</th> + <th>Free Space (GB)</th> + <th>Total Used (%)</th> + </tr> + </thead> + <tbody> + {array_drive_report} + </tbody> + </table> + ''' + + smart_drive_report = ''.join(f''' + <tr> + <td>{d["disk"]} ({d["device"]})</td> + <td>{d["temp"]}</td> + <td>{d["power_on_days"]}</td> + <td>{d["error_count"]}</td> + <td>{d["fp"]}</td> + <td>{d["size"]}</td> + <td>{d["serial"]}</td> + </tr> + ''' for d in smart_drive_data) + + smart_report = f''' + <h3>SMART Report</h3> + <table> + <thead> + <tr> + <th>Drive</th> + <th>Temperature (°C)</th> + <th>Power On Time (days)</th> + <th>Error Count</th> + <th>Failure Probability</th> + <th>Drive Size (TiB)</th> + <th>Serial Number</th> + </tr> + </thead> + <tbody> + {smart_drive_report} + </tbody> + </table> + <p>The current failure probability of any single drive this year is {global_fp}%.</p> + ''' + + email_report = f''' + <h2>[Snapper] SnapRAID job completed successfully in {total_time}</h2> + {touch_report} + {sync_report} + {scrub_report} + {array_report} + {smart_report} + ''' + + email_message = email_report_template.replace('SNAPRAID_REPORT_CONTENT', email_report) + + return email_message
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a31bfa1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,12 @@ +attrs==23.1.0 +certifi==2023.7.22 +charset-normalizer==3.2.0 +idna==3.4 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 +psutil==5.9.5 +python-pidfile==3.1.1 +referencing==0.30.2 +requests==2.31.0 +rpds-py==0.10.3 +urllib3==2.0.5
diff --git a/snapper.py b/snapper.py new file mode 100644 index 0000000..caf179b --- /dev/null +++ b/snapper.py @@ -0,0 +1,702 @@ +import argparse +import concurrent.futures +import gzip +import json +import logging +import logging.handlers +import math +import os +import re +import shutil +import subprocess +import traceback +from datetime import datetime, timedelta +from operator import itemgetter + +import psutil +import requests +import pidfile +from jsonschema import validate + +from reports.discord_report import create_discord_report +from reports.email_report import create_email_report +from utils import format_delta, get_relative_path + +# +# Read config + +with open(get_relative_path(__file__, './config.json'), 'r') as f: + config = json.load(f) + +with open(get_relative_path(__file__, './config.schema.json'), 'r') as f: + schema = json.load(f) + +validate(instance=config, schema=schema) + + +# +# Configure logging + +def rotator(source, dest): + with open(source, 'rb') as f_in: + with gzip.open(dest, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + + os.remove(source) + + +def setup_logger(name, log_file, level='INFO'): + log_dir, max_count = itemgetter('dir', 'max_count')(config['logs']) + + if not os.path.exists(log_dir): + os.makedirs(log_dir) + + log_file_path = os.path.join(log_dir, log_file) + needs_rollover = os.path.isfile(log_file_path) + + handler = logging.handlers.RotatingFileHandler(log_file_path, backupCount=max(max_count, 1)) + handler.setFormatter(logging.Formatter('[%(asctime)s] - [%(levelname)s] - %(message)s')) + + handler.rotator = rotator + handler.namer = lambda file_name: file_name + '.gz' + + if needs_rollover: + handler.doRollover() + + logger = logging.getLogger(name) + logger.setLevel(level) + + if logger.hasHandlers(): + logger.handlers.clear() + + logger.addHandler(handler) + logger.propagate = False + + return logger + + +# +# Parse command line args + +parser = argparse.ArgumentParser(description='SnapRAID execution wrapper') +parser.add_argument('-f', '--force', + help='Ignore any set thresholds or warnings and execute all jobs regardless', + action='store_true') +args = vars(parser.parse_args()) + +force_script_execution = args['force'] + + +# +# Notification helpers + +def notify_and_handle_error(message, error): + log.error(message) + log.error(''.join(traceback.format_exception(None, error, error.__traceback__))) + + send_email('WARNING! SnapRAID jobs unsuccessful', message.replace('\n', '<br>
')) + notify_warning(message) + + exit(1) + + +def notify_warning(message, embeds=None): + return send_discord(f':warning: [**WARNING!**] {message}', embeds=embeds) + + +def notify_info(message, embeds=None, message_id=None): + return send_discord(f':information_source: [**INFO**] {message}', embeds, message_id) + + +def send_discord(message, embeds=None, message_id=None): + is_enabled, webhook_id, webhook_token = itemgetter( + 'enabled', 'webhook_id', 'webhook_token')(config['notifications']['discord']) + + if not is_enabled: + return + + if embeds is None: + embeds = [] + + data = { + 'content': message, + 'embeds': embeds, + 'username': 'Snapper', + } + + update_message = message_id is not None + base_url = f'https://discord.com/api/webhooks/{webhook_id}/{webhook_token}' + + if update_message: + discord_url = f'{base_url}/messages/{message_id}' + response = requests.patch(discord_url, json=data) + else: + discord_url = f'{base_url}?wait=true' + response = requests.post(discord_url, json=data) + + try: + response.raise_for_status() + log.debug('Successfully posted message to discord') + + if not update_message: + data = response.json() + + # Return the message ID in case we want to manipulate it + return data['id'] + except requests.exceptions.HTTPError as err: + # Handle error when trying to update a message + if update_message: + log.debug('Failed to update message, posting new.') + return send_discord(message, embeds=embeds) + + log.error('Unable to send message to discord') + log.error(str(err)) + + +def send_email(subject, message): + log.debug('Attempting to send email...') + + is_enabled, mail_bin, from_email, to_email = itemgetter( + 'enabled', 'binary', 'from_email', 'to_email')(config['notifications']['email']) + + if not is_enabled: + return + + if not os.path.isfile(mail_bin): + raise FileNotFoundError('Unable to find mail executable', mail_bin) + + result = subprocess.run([ + mail_bin, + '-a', 'Content-Type: text/html', + '-s', subject, + '-r', from_email, + to_email + ], input=message, capture_output=True, text=True) + + if result.stderr: + raise ConnectionError('Unable to send email', result.stderr) + + log.debug(f'Successfully sent email to {to_email}') + + +# +# Snapraid Helpers + +def is_running(): + for process in psutil.process_iter(attrs=['name']): + if process.name().lower() == 'snapraid': + return True + + return False + + +def set_snapraid_priority(): + # Setting nice is enough, as ionice follows that per the documentation here: + # https://www.kernel.org/doc/Documentation/block/ioprio.txt + # + # The default class `IOPRIO_CLASS_BE` sets ionice as: `io_nice = (cpu_nice + 20) / 5.` + # The default nice is 0, which sets ionice to 4. + # We set nice to 10, which results in ionice of 6 - this way it's not entirely down prioritized. 
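+ # A worked example of the mapping described above (values follow directly from the formula): + # nice = 0 -> ionice = floor((0 + 20) / 5) = 4 + # nice = 10 -> ionice = floor((10 + 20) / 5) = 6 + # nice = 19 -> ionice = floor((19 + 20) / 5) = 7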
+ + nice_level = config['snapraid']['nice'] + os.nice(nice_level) + p = psutil.Process(os.getpid()) + p.ionice(psutil.IOPRIO_CLASS_BE, math.floor((nice_level + 20) / 5)) + + +def spin_down(): + hdparm_bin, is_enabled, drives = itemgetter('binary', 'enabled', 'drives')(config['spindown']) + + if not is_enabled: + return + + if not os.path.isfile(hdparm_bin): + raise FileNotFoundError('Unable to find hdparm executable', hdparm_bin) + + log.info(f'Attempting to spin down all {drives} drives...') + + content_files, parity_files = get_snapraid_config() + drives_to_spin_down = parity_files + (content_files if drives == 'all' else []) + + shell_command = (f'{hdparm_bin} -y $(' + f'df {" ".join(drives_to_spin_down)} | ' # Get the drives + f'tail -n +2 | ' # Remove the header + f'cut -d " " -f1 | ' # Split by space, get the first item + f'tr "\\n" " "' # Replace newlines with spaces + f')') + + try: + process = subprocess.run(shell_command, shell=True, capture_output=True, text=True) + + rc = process.returncode + + if rc == 0: + log.info('Successfully spun down drives.') + else: + log.error(f'Unable to successfully spin down hard drives, see error output below.') + log.error(process.stderr) + log.error(f'Shell command executed: {shell_command}') + except Exception as err: + log.error(f'Encountered exception while attempting to spin down drives:') + log.error(str(err)) + + +# +# Snapraid Commands + +def run_snapraid(commands, progress_handler=None, allowed_return_codes=[]): + snapraid_bin, snapraid_config = itemgetter('binary', 'config')(config['snapraid']) + + if not os.path.isfile(snapraid_bin): + raise FileNotFoundError('Unable to find SnapRAID executable', snapraid_bin) + + if is_running(): + raise ChildProcessError('SnapRAID already seems to be running, unable to proceed.') + + std_out = [] + std_err = [] + + with (subprocess.Popen( + [snapraid_bin, '--conf', snapraid_config] + commands, + shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + preexec_fn=set_snapraid_priority, encoding="utf-8", + errors='replace' + ) as process, + concurrent.futures.ThreadPoolExecutor(2) as tpe, + ): + def read_stdout(file): + for line in file: + rline = line.rstrip() + + raw_log.info(rline) + + if progress_handler is None or not progress_handler(rline): + std_out.append(rline) + + def read_stderr(file): + for line in file: + rline = line.rstrip() + + raw_log.error(rline) + std_err.append(rline) + + f1 = tpe.submit(read_stdout, process.stdout) + f2 = tpe.submit(read_stderr, process.stderr) + f1.result() + f2.result() + + rc = process.returncode + + if not (rc == 0 or rc in allowed_return_codes): + last_lines = '\n'.join(std_err[-10:]) + + raise SystemError(f'A critical SnapRAID error was encountered during command ' + f'`snapraid {" ".join(commands)}`. 
The process exited with code `{rc}`.\n' + f'Here are the last **10 lines** from the error log:\n```\n' + f'{last_lines}\n```\nThis requires your immediate attention.', + '\n'.join(std_err)) + + return '\n'.join(std_out), '\n'.join(std_err)
+ + +def get_status(): + snapraid_status, _ = run_snapraid(['status']) + + stats_regex = re.compile( + r'^ *(?P<files>\d+) +(?P<fragmented_files>\d+) +(?P<excess_fragments>\d+) +' + r'(?P<wasted_gb>[-.\d]+) +(?P<used_gb>\d+) +(?P<free_gb>\d+) +(?P<use_percent>\d+)%' + r'(?: +(?P<drive_name>\S+)|$)', + flags=re.MULTILINE) + drive_stats = [m.groupdict() for m in stats_regex.finditer(snapraid_status)] + + scrub_info = re.search( + r'scrubbed (?P<scrub_age>\d+) days ago, the median (?P<median>\d+), the newest ' + r'(?P<newest>\d+)', + snapraid_status) + unscrubbed_percent = re.search( + r'^The (?P<unscrubbed>\d+)% of the array is not scrubbed', snapraid_status, + flags=re.MULTILINE) + error_count = re.search(r'^DANGER! In the array there are (?P<error_count>\d+) errors!', + snapraid_status, flags=re.MULTILINE) + zero_subsecond_count = re.search( + r'^You have (?P<zero_subsecond>\d+) files with zero sub-second timestamp', snapraid_status, + flags=re.MULTILINE) + + sync_in_progress = bool( + re.search(r'^You have a sync in progress', snapraid_status, flags=re.MULTILINE)) + + if scrub_info is None: + raise ValueError('Unable to parse SnapRAID status') + + unscrubbed_percent = 0 if unscrubbed_percent is None else int(unscrubbed_percent[1]) + zero_subsecond_count = 0 if zero_subsecond_count is None else int(zero_subsecond_count[1]) + error_count = 0 if error_count is None else int(error_count[1]) + + return ( + drive_stats, + { + 'unscrubbed': unscrubbed_percent, + 'scrub_age': int(scrub_info[1]), + 'median': int(scrub_info[2]), + 'newest': int(scrub_info[3]) + }, + error_count, + zero_subsecond_count, + sync_in_progress + )
+ + +def get_diff(): + snapraid_diff, _ = run_snapraid(['diff'], allowed_return_codes=[2]) + + diff_regex = re.compile(r'''^ *(?P<equal>\d+) equal$ +^ *(?P<added>\d+) added$ +^ *(?P<removed>\d+) removed$ +^ *(?P<updated>\d+) updated$ +^ *(?P<moved>\d+) moved$ +^ *(?P<copied>\d+) copied$ +^ *(?P<restored>\d+) restored$''', flags=re.MULTILINE) + + diff_data = [m.groupdict() for m in diff_regex.finditer(snapraid_diff)] + + if len(diff_data) == 0: + raise ValueError('Unable to parse diff output from SnapRAID, not proceeding.') + + diff_int = dict([a, int(x)] for a, x in diff_data[0].items()) + + return diff_int
+ + +def get_smart(): + smart_data, _ = run_snapraid(['smart']) + + drive_regex = re.compile(r'^ *(?P<temp>\d+|-) +(?P<power_on_days>\d+|-) +' + r'(?P<error_count>\d+|-) +(?P<fp>\d+%|-|SSD) +(?P<size>\S+) +' + r'(?P<serial>\S+) +(?P<device>\S+) +(?P<disk>\S+)$', flags=re.MULTILINE) + drive_data = [m.groupdict() for m in drive_regex.finditer(smart_data)] + + global_fp = re.search(r'next year is (?P<fp>\d+)%', smart_data) + + if len(drive_data) == 0 or global_fp is None: + raise ValueError('Unable to parse drive data or global failure percentage, not proceeding.') + + return drive_data, global_fp.group(1)
+ + +def handle_progress(): + start = datetime.now() + message_id = None + + def handler(data): + nonlocal start + nonlocal message_id + + progress_data = re.search(r'^(?P<progress>\d+)%, (?P<mb_processed>\d+) MB' + r'(?:, (?P<speed>\d+) MB/s, (?P<stripes>\d+) stripe/s, ' + r'CPU (?P<cpu>\d+)%, (?P<eta_hours>\d+):(?P<eta_minutes>\d+) ETA)?$', data, + flags=re.MULTILINE) + + is_progress = bool(progress_data) + + if is_progress and datetime.now() - start >= timedelta(minutes=1): + msg = f'Current progress **{progress_data.group(1)}%** (`{progress_data.group(2)} MB`)' + + if progress_data.group(3) is not None: + msg = (f'{msg} — processing at **{progress_data.group(3)} MB/s** ' + f'(*{progress_data.group(4)} stripe/s, {progress_data.group(5)}% CPU*). ' + f'**ETA:** {progress_data.group(6)}h {progress_data.group(7)}m') + + if message_id is None: + message_id = notify_info(msg) + else: + new_message_id = notify_info(msg, message_id=message_id) + + if new_message_id: + message_id = new_message_id + + start = datetime.now() + + return is_progress + + return handler
+ + +def _run_sync(run_count): + pre_hash, auto_sync = itemgetter('pre_hash', 'auto_sync')(config['snapraid']['sync']) + auto_sync_enabled, max_attempts = itemgetter('enabled', 'max_attempts')(auto_sync) + + try: + log.info(f'Running SnapRAID sync ({run_count}) ' + f'{"with" if pre_hash else "without"} pre-hashing...') + notify_info(f'Syncing **({run_count})**...') + + run_snapraid(['sync', '-h'] if pre_hash else ['sync'], handle_progress()) + except SystemError as err: + sync_errors = err.args[1] + + if sync_errors is None: + raise err + + # The three errors in the regex are considered "safe", i.e., + # a file was just modified or removed during the sync. + # + # This is normally nothing to be worried about, and the operation can just be rerun. + # If there are other errors in the output, and not only these, we don't want to re-run + # the sync command, because it could be things we need to have a closer look at. + + bad_errors = re.sub(r'^(?:' + r'WARNING! You cannot modify (?:files|data disk) during a sync|' + r'Unexpected (?:time|size) change at file .+|' + r'Missing file .+|' + r'Rerun the sync command when finished|' + r'WARNING! With \d+ disks it\'s recommended to use \w+ parity levels|' + r'WARNING! Unexpected file errors!' + r')\.\s*', + '', sync_errors, flags=re.MULTILINE | re.IGNORECASE) + should_rerun = bad_errors == '' and re.search(r'^Rerun the sync command when finished', + sync_errors, + flags=re.MULTILINE | re.IGNORECASE) + + if should_rerun: + log.info( + 'SnapRAID has indicated another sync is recommended, due to disks or files being ' + 'modified during the sync process.') + + if should_rerun and auto_sync_enabled and run_count < max_attempts: + log.info('Re-running sync command with identical options...') + _run_sync(run_count + 1) + else: + raise err
+ + +def run_sync(): + start = datetime.now() + _run_sync(1) + end = datetime.now() + + sync_job_time = format_delta(end - start) + + log.info(f'Sync job finished, elapsed time {sync_job_time}') + notify_info(f'Sync job finished, elapsed time **{sync_job_time}**') + + return sync_job_time
+ + +def run_scrub(): + enabled, scrub_new, check_percent, min_age = itemgetter( + 'enabled', 'scrub_new', 'check_percent', 'min_age')(config['snapraid']['scrub']) + + if not enabled: + log.info('Scrubbing not enabled, skipping.') + + return None + + log.info('Running scrub job...') + + start = datetime.now() + + if scrub_new: + log.info('Scrubbing new blocks...') + notify_info('Scrubbing new blocks...') + + scrub_new_output, _ = run_snapraid(['scrub', '-p', 'new'], handle_progress()) + + log.info('Scrubbing old blocks...') + notify_info('Scrubbing old blocks...') + + scrub_output, _ = run_snapraid( + ['scrub', '-p', str(check_percent), '-o', str(min_age)], + handle_progress()) + + end = datetime.now() + + scrub_job_time = format_delta(end - start) + + log.info(f'Scrub job finished, elapsed time {scrub_job_time}') + notify_info(f'Scrub job finished, elapsed time **{scrub_job_time}**') + + return scrub_job_time
+ + +def run_touch(): + run_snapraid(['touch'])
+ + +# +# Sanity Checker + +def get_snapraid_config(): + config_file = config['snapraid']['config'] + + if not os.path.isfile(config_file): + raise FileNotFoundError('Unable to find SnapRAID configuration', config_file) + + with open(config_file, 'r') as file: + snapraid_config = file.read() + + file_regex = re.compile(r'^(content|parity) +(.+/\w+\.(?:content|parity)) *$', + flags=re.MULTILINE) + parity_files = [] + content_files = [] + + for m in file_regex.finditer(snapraid_config): + if m[1] == 'content': + content_files.append(m[2]) + else: + parity_files.append(m[2]) + + return content_files, parity_files
+ + +def sanity_check(): + content_files, parity_files = get_snapraid_config() + files = content_files + parity_files + + for file in files: + if not os.path.isfile(file): + raise FileNotFoundError('Unable to locate required content/parity file', file) + + log.info(f'All {len(files)} content and parity files found, proceeding.')
+ + +# +# Main + +def main(): + try: + total_start = datetime.now() + + log.info('Snapper started') + notify_info('Starting SnapRAID jobs...') + + log.info('Running sanity checks...') + + sanity_check() + + log.info('Checking for errors and files with zero sub-second timestamps...') + + (_, _, error_count, zero_subsecond_count, sync_in_progress) = get_status() + + if error_count > 0: + if force_script_execution: + log.error(f'There are {error_count} errors in your array, ignoring due to forced run.') + notify_warning(f'There are **{error_count}** errors in your array, ignoring due to forced run.') + else: + raise SystemError(f'There are {error_count} errors in your array, you should review ' + f'this immediately. All jobs have been halted.') + + if zero_subsecond_count > 0: + log.info(f'Found {zero_subsecond_count} file(s) with zero sub-second timestamp') + log.info('Running touch job...') + run_touch() + + log.info('Getting SnapRAID diff...') + + diff_data = get_diff() + + log.info(f'Diff output: {diff_data["equal"]} equal, ' + + f'{diff_data["added"]} added, ' + + f'{diff_data["removed"]} removed, ' + + f'{diff_data["updated"]} updated, ' + + f'{diff_data["moved"]} moved, ' + + f'{diff_data["copied"]} copied, ' + + f'{diff_data["restored"]} restored') + + if sum(diff_data.values()) - diff_data["equal"] > 0 or sync_in_progress or \ + force_script_execution: + added_threshold, removed_threshold = itemgetter('added', 'removed')( + config['snapraid']['diff']['thresholds']) + + if force_script_execution: + log.info('Ignoring any thresholds and forcefully proceeding with sync.') + elif 0 < added_threshold < diff_data["added"]: + raise ValueError(f'More files ({diff_data["added"]}) have been added than the ' + f'configured max ({added_threshold})') + elif 0 < removed_threshold < diff_data["removed"]: + raise ValueError( + f'More files ({diff_data["removed"]}) have been removed than the configured ' + f'max ({removed_threshold})') + elif sync_in_progress: + log.info('A previous sync in progress has been detected, resuming.') + else: + log.info(f'Fewer files added ({diff_data["added"]}) than the configured ' + f'limit ({added_threshold}), proceeding.') + log.info(f'Fewer files removed ({diff_data["removed"]}) than the configured ' + f'limit ({removed_threshold}), proceeding.') + + sync_job_time = run_sync() + sync_job_ran = True + else: + log.info('No changes to sync, skipping.') + notify_info('No changes to sync') + + sync_job_ran = False + sync_job_time = None + + scrub_job_time = run_scrub() + scrub_job_ran = scrub_job_time is not None + + log.info('Fetching SnapRAID status...') + (drive_stats, scrub_stats, error_count, _, _) = get_status() + + log.info(f'{scrub_stats["unscrubbed"]}% of the array has not been scrubbed, with the ' + f'oldest block at {scrub_stats["scrub_age"]} day(s), the ' + f'median at {scrub_stats["median"]} day(s), and the newest at ' + f'{scrub_stats["newest"]} day(s).') + + log.info('Fetching SMART data...') + (smart_drive_data, global_fp) = get_smart() + + log.info(f'Drive failure probability this year is {global_fp}%') + + total_time = format_delta(datetime.now() - total_start) + + report_data = { + 'sync_job_ran': sync_job_ran, + 'scrub_job_ran': scrub_job_ran, + 'sync_job_time': sync_job_time, + 'scrub_job_time': scrub_job_time, + 'diff_data': diff_data, + 'zero_subsecond_count': zero_subsecond_count, + 'scrub_stats': scrub_stats, + 'drive_stats': drive_stats, + 'smart_drive_data': smart_drive_data, + 'global_fp': global_fp, + 'total_time': total_time + } + + email_report = create_email_report(report_data) + + send_email('SnapRAID Job Completed Successfully', email_report) + + if config['notifications']['discord']['enabled']: + (discord_message, embeds) = create_discord_report(report_data) + + send_discord(discord_message, embeds) + + spin_down() + + log.info('SnapRAID jobs completed successfully, exiting.') + except (ValueError, ChildProcessError, SystemError) as err: + notify_and_handle_error(err.args[0], err) + except ConnectionError as err: + log.error(str(err)) + except FileNotFoundError as err: + notify_and_handle_error(f'{err.args[0]} - missing file path `{err.args[1]}`', err) + except BaseException as err: + notify_and_handle_error( + f'Unhandled Python Exception `{str(err) if str(err) else "unknown error"}`', err)
+ + +try: + with pidfile.PIDFile('/tmp/snapper.pid'): + # Setup loggers after pidfile has been acquired + raw_log = setup_logger('snapper_raw', 'snapper_raw.log') + log = setup_logger('snapper', 'snapper.log') + + log.handlers = raw_log.handlers + log.handlers + log.addHandler(logging.StreamHandler()) + + main() +except pidfile.AlreadyRunningError: + print('snapper already appears to be running!')
diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..9042ee5 --- /dev/null +++ b/utils.py @@ -0,0 +1,12 @@ +from pathlib import Path + + +def format_delta(delta): + hours, remainder = divmod(int(delta.total_seconds()), 3600) + minutes, seconds = divmod(remainder, 60) + + return '{:02}h {:02}m {:02}s'.format(hours, minutes, seconds) + + +def get_relative_path(parent_path, file): + return Path(parent_path).parent / file
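For reference, a quick sketch of how the `format_delta` helper behaves (illustrative values only, assuming `utils.py` is importable from the working directory):

```python
from datetime import timedelta

from utils import format_delta

# Durations are rendered as zero-padded hours, minutes and seconds.
print(format_delta(timedelta(hours=1, minutes=2, seconds=3)))  # 01h 02m 03s
print(format_delta(timedelta(days=1, minutes=5)))              # 24h 05m 00s
```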