diff --git a/mac_pw_pool/.gitignore b/mac_pw_pool/.gitignore index be7fee3..c677dc2 100644 --- a/mac_pw_pool/.gitignore +++ b/mac_pw_pool/.gitignore @@ -1 +1,2 @@ +/dh_status.txt* /pw_status.txt* diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index e6a6845..a356876 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -59,7 +59,7 @@ inst_failure() { } # Find dedicated hosts to operate on. -dh_flt="Name=tag-key,Values=PWPoolReady" +dh_flt="Name=tag:Name,Values=MacM1-*" dh_qry='Hosts[].{HostID:HostId, Name:[Tags[?Key==`Name`].Value][] | [0]}' dh_searchout="$TEMPDIR/hosts.output" # JSON or error message if ! $AWS ec2 describe-hosts --filter "$dh_flt" --query "$dh_qry" &> "$dh_searchout"; then @@ -70,8 +70,9 @@ fi # Array item format: " " dh_fmt='.[] | .Name +" "+ .HostID' # Avoid always processing hosts in the same alpha-sorted order, as that would -# mean hosts at the end of the list consistently get the least management. -if ! readarray NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort --random-sort); then +# mean hosts at the end of the list consistently wait the longest for new +# instances to be created (see creation-stagger code below). +if ! readarray -t NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort --random-sort); then die "Extracting dedicated host 'Name' and 'HostID' fields $(ctx 0): $(<$dh_searchout)" fi @@ -90,28 +91,53 @@ fi # Therefore, instance creations must be staggered by according to this # window. CREATE_STAGGER_HOURS=3 -latest_launched="1970-01-01T00:00+00:00" # in case $PWSTATE is missing +latest_launched="1970-01-01T00:00+00:00" # in case $DHSTATE is missing dcmpfmt="+%Y%m%d%H%M" # date comparison format compatible with numeric 'test' -if [[ -r "$PWSTATE" ]]; then - # N/B: Asumes data in $PWSTATE represents reality. It may not, - # for example if instances have been manually created or terminated. - # Querying the actual state would make this script much more complex. - declare -a _pwstate - readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$PWSTATE") - for _pwentry in "${_pwstate[@]}"; do - read -r name instance_id launch_time<<<"$_pwentry" - launched_hour=$(date -u -d "$launch_time" "$dcmpfmt") - latest_launched_hour=$(date -u -d "$latest_launched" "$dcmpfmt") - dbg "instance $name launched on $launched_hour, latest launched hour: $latest_launched_hour" - if [[ $launched_hour -gt $latest_launched_hour ]]; then - latest_launched="$launch_time" - fi - done +# To find the latest instance launch time, script can't rely on reading +# $DHSTATE or $PWSTATE because they may not exist or be out of date. +# Search for all running instances by name and running state, returning +# their launch timestamps. +declare -a pw_filt +pw_filts=( + 'Name=tag:Name,Values=MacM1-*' + 'Name=tag:PWPoolReady,Values=true' + 'Name=instance-state-name,Values=running' +) +pw_query='Reservations[].Instances[].LaunchTime' +inst_lt_f=$TEMPDIR/inst_launch_times +dbg "Obtaining launch times for all running MacM1-* instances" +dbg "$AWS ec2 describe-instances --filters '${pw_filts[*]}' --query '$pw_query' &> '$inst_lt_f'" +if ! $AWS ec2 describe-instances --filters "${pw_filts[@]}" --query "$pw_query" &> "$inst_lt_f"; then + die "Can not query instances: +$(<$inst_lt_f)" +else + declare -a launchtimes + if ! readarray -t launchtimes<<<$(json_query '.[]?' "$inst_lt_f") || + [[ "${#launchtimes[@]}" -eq 0 ]] || + [[ "${launchtimes[0]}" == "" ]]; then + warn "Found no running instances, this should not happen." + else + dbg "launchtimes=[${launchtimes[*]}]" + for launch_time in "${launchtimes[@]}"; do + # Assume launch_time is never malformed + launched_hour=$(date -u -d "$launch_time" "$dcmpfmt") + latest_launched_hour=$(date -u -d "$latest_launched" "$dcmpfmt") + dbg "instance launched on $launched_hour; latest launch hour: $latest_launched_hour" + if [[ $launched_hour -gt $latest_launched_hour ]]; then + dbg "Updating latest launched timestamp" + latest_launched="$launch_time" + fi + done + fi fi -dbg "latest_launched=$latest_launched (according to \$PWSTATE)" -msg "Operating on $n_dh_total dedicated hosts at $(date -u -Iminutes):" -echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $PWSTATE)" +# Increase readability for humans by always ensuring the two important +# date stamps line up regardless of the length of $n_dh_total. +_n_dh_sp=$(printf ' %.0s' seq 1 ${#n_dh_total}) +msg "Operating on $n_dh_total dedicated hosts at $(date -u -Iseconds)" +msg " ${_n_dh_sp}Last instance launch on $latest_launched" +echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $DHSTATE)" + for name_hostid in "${NAME2HOSTID[@]}"; do n_dh=$(($n_dh+1)) _I=" " @@ -122,9 +148,9 @@ for name_hostid in "${NAME2HOSTID[@]}"; do hostoutput="$TEMPDIR/${name}_host.output" # JSON or error message from aws describe-hosts instoutput="$TEMPDIR/${name}_inst.output" # JSON or error message from aws describe-instance or run-instance - inststate="$TEMPDIR/${name}_inst.state" # Line to append to $PWSTATE + inststate="$TEMPDIR/${name}_inst.state" # Line to append to $DHSTATE - if ! $AWS ec2 describe-hosts --filter "Name=tag:Name,Values=$name" &> "$hostoutput"; then + if ! $AWS ec2 describe-hosts --host-ids $hostid &> "$hostoutput"; then host_failure "Failed to look up dedicated host." continue # Allow hosts to be taken out of service easily/manually by editing its tags. @@ -166,21 +192,21 @@ for name_hostid in "${NAME2HOSTID[@]}"; do # for Mac instances, but this is not reflected anywhere in the JSON. Trying to start a new # Mac instance on an already occupied host is bound to fail. Inconveniently this error # will look an aweful lot like many other types of errors, confusing any human examining - # $PWSTATE. Detect dedicated-hosts with existing instances. + # $DHSTATE. Detect dedicated-hosts with existing instances. InstanceId=$(set +e; jq -r '.Hosts?[0]?.Instances?[0].InstanceId?' "$hostoutput") dbg "InstanceId='$InstanceId'" # Stagger creation of instances by $CREATE_STAGGER_HOURS launch_new=0 if [[ "$InstanceId" == "null" ]] || [[ "$InstanceId" == "" ]]; then - launch_threshold=$(date -u -Iminutes -d "$latest_launched + $CREATE_STAGGER_HOURS hours") + launch_threshold=$(date -u -Iseconds -d "$latest_launched + $CREATE_STAGGER_HOURS hours") launch_threshold_hour=$(date -u -d "$launch_threshold" "$dcmpfmt") now_hour=$(date -u "$dcmpfmt") dbg "launch_threshold_hour=$launch_threshold_hour" dbg " now_hour=$now_hour" if [[ $now_hour -lt $launch_threshold_hour ]]; then msg "Cannot launch new instance until $launch_threshold" - echo "# $name HOST BUSY: Inst. creation delayed until $launch_threshold" > "$inststate" + echo "# $name HOST THROTTLE: Inst. creation delayed until $launch_threshold" > "$inststate" continue else launch_new=1 @@ -197,7 +223,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do continue else # Block further launches (assumes script is running in a 10m while loop). - latest_launched=$(date -u -Iminutes) + latest_launched=$(date -u -Iseconds) msg "Successfully created new instance; Waiting for 'running' state (~1m typical)..." # N/B: New Mac instances take ~5-10m to actually become ssh-able if ! InstanceId=$(json_query '.Instances?[0]?.InstanceId' "$instoutput"); then @@ -248,25 +274,18 @@ done _I="" msg " " msg "Processing all dedicated host and instance states." -new_error=0 +# Consuming state file in alpha-order is easier on human eyes +readarray -t NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort) for name_hostid in "${NAME2HOSTID[@]}"; do read -r name hostid<<<"$name_hostid" inststate="$TEMPDIR/${name}_inst.state" - [[ -r "$inststate" ]] || die "Expecting to find instance-state file $inststate for host '$name' $(ctx 0)." - if grep -E -q '^# .+ .+ ERROR: ' $inststate; then - new_error=1 - fi - cat "$inststate" >> "$TEMPDIR/$(basename $PWSTATE)" + [[ -r "$inststate" ]] || \ + die "Expecting to find instance-state file $inststate for host '$name' $(ctx 0)." + cat "$inststate" >> "$TEMPDIR/$(basename $DHSTATE)" done dbg "Creating/updating state file" -if [[ -r "$PWSTATE" ]]; then - cp "$PWSTATE" "${PWSTATE}~" -fi -mv "$TEMPDIR/$(basename $PWSTATE)" "$PWSTATE" - -dbg "Handling any errors" -if ((new_error)); then - die "Processing one or more hosts or instances $(ctx 0): -$(<$PWSTATE)" +if [[ -r "$DHSTATE" ]]; then + cp "$DHSTATE" "${DHSTATE}~" fi +mv "$TEMPDIR/$(basename $DHSTATE)" "$DHSTATE" diff --git a/mac_pw_pool/SetupInstances.sh b/mac_pw_pool/SetupInstances.sh index 16c4088..d869c8d 100755 --- a/mac_pw_pool/SetupInstances.sh +++ b/mac_pw_pool/SetupInstances.sh @@ -8,7 +8,7 @@ set -eo pipefail # from `LaunchInstances.sh` soas to "hide" the nearly 2-hours of cumulative # startup and termination wait time. This script depends on: # -# * The $PWSTATE file created/updated by `LaunchInstances.sh`. +# * The $DHSTATE file created/updated by `LaunchInstances.sh`. # * All requirements listed in the top `LaunchInstances.sh` comment. # * The local ssh-agent is able to supply the appropriate private key. # * The $POOLTOKEN env. var. is defined @@ -16,6 +16,29 @@ set -eo pipefail # shellcheck source-path=SCRIPTDIR source $(dirname ${BASH_SOURCE[0]})/pw_lib.sh +# Update temporary-dir status file for instance $name +# status type $1 and value $2. Where status type is +# 'setup', 'listener', 'tasks' or 'comment'. +set_pw_status() { + [[ -n "$name" ]] || \ + die "Expecting \$name to be set" + case $1 in + setup) ;; + listener) ;; + tasks) ;; + comment) ;; + *) die "Status type must be 'setup', 'listener', 'tasks', or 'comment'" + esac + if [[ "$1" != "comment" ]] && [[ -z "$2" ]]; then + die "Expecting status value to be non-empty." + fi + echo -n "$2" > $TEMPDIR/${name}.$1 +} + +# Wrapper around msg() and warn() which also set_pw_status() comment. +pwst_msg() { set_pw_status comment "$1"; msg "$1"; } +pwst_warn() { set_pw_status comment "$1"; warn "$1"; } + # Set non-zero to enable debugging / prevent removal of temp. dir. S_DEBUG="${S_DEBUG:0}" if ((S_DEBUG)); then @@ -27,26 +50,27 @@ fi [[ -n "$POOLTOKEN" ]] || \ die "Expecting \$POOLTOKEN to be defined/non-empty $(ctx 0)." -[[ -r "$PWSTATE" ]] || \ - die "Can't read from state file: $PWSTATE" +[[ -r "$DHSTATE" ]] || \ + die "Can't read from state file: $DHSTATE" -declare -a _pwstate -readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$PWSTATE") +declare -a _dhstate +readarray -t _dhstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE" | sort) n_inst=0 -n_inst_total="${#_pwstate[@]}" -if [[ $n_inst_total -eq 0 ]]; then - msg "No operable hosts found in $PWSTATE: -$(<$PWSTATE)" +n_inst_total="${#_dhstate[@]}" +if [[ $n_inst_total -eq 0 ]] || [[ -z "${_dhstate[0]}" ]]; then + msg "No operable hosts found in $DHSTATE: +$(<$DHSTATE)" # Assume this script is running in a loop, and unf. there are # simply no dedicated-hosts in 'available' state. exit 0 fi -# N/B: Assumes $PWSTATE represents reality -msg "Operating on $n_inst_total instances from $(head -1 $PWSTATE)" +# N/B: Assumes $DHSTATE represents reality +msg "Operating on $n_inst_total instances from $(head -1 $DHSTATE)" +echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $PWSTATE)" # Indent for messages inside loop -for _pwentry in "${_pwstate[@]}"; do - read -r name instance_id launch_time<<<"$_pwentry" +for _dhentry in "${_dhstate[@]}"; do + read -r name instance_id launch_time<<<"$_dhentry" _I=" " msg " " n_inst=$(($n_inst+1)) @@ -56,19 +80,25 @@ for _pwentry in "${_pwstate[@]}"; do ncoutput="$TEMPDIR/${name}_nc.output" logoutput="$TEMPDIR/${name}_ssh.output" + # Most operations below 'continue' looping on error. Ensure status files match. + for status_type in setup listener tasks; do + set_pw_status $status_type error + done + set_pw_status comment "" + if ! $AWS ec2 describe-instances --instance-ids $instance_id &> "$instoutput"; then - warn "Could not query instance $instance_id $(ctx 0)." + pwst_warn "Could not query instance $instance_id $(ctx 0)." continue fi # Check if instance should be managed if ! PWPoolReady=$(json_query '.Reservations?[0]?.Instances?[0]?.Tags? | map(select(.Key == "PWPoolReady")) | .[].Value' "$instoutput"); then - warn "Instance does not have a PWPoolReady tag" + pwst_warn "Instance does not have a PWPoolReady tag" PWPoolReady="tag absent" fi if [[ "$PWPoolReady" != "true" ]]; then - msg "Instance disabled via tag 'PWPoolReady' == '$PWPoolReady' != 'true'." + pwst_msg "Instance disabled via tag 'PWPoolReady' == '$PWPoolReady' != 'true'." continue fi @@ -79,24 +109,23 @@ for _pwentry in "${_pwstate[@]}"; do now=$(date -u +%Y%m%d) then=$(date -u -d "$launch_time" +%Y%m%d) # c/automation_images Cirrus-CI job terminates EC2 instances after 3 days - # TODO: Consider simply terminating instance here instead of hard-failing script. [[ $((now-then)) -le 3 ]] || \ - die "Instance found alive longer than it should be, please investigate $(ctx 0)" + pwst_warn "Excess instance lifetime, please investigate $(ctx 0)" msg "Looking up public DNS" if ! pub_dns=$(json_query '.Reservations?[0]?.Instances?[0]?.PublicDnsName?' "$instoutput"); then - warn "Could not lookup of public DNS for instance $instance_id $(ctx 0)" + pwst_warn "Could not lookup of public DNS for instance $instance_id $(ctx 0)" continue fi msg "Attempting to contact '$name' at $pub_dns" if ! nc -z -w 13 $pub_dns 22 &> "$ncoutput"; then - warn "Could not connect to port 22 on '$pub_dns' $(ctx 0)." + pwst_warn "Could not connect to port 22 on '$pub_dns' $(ctx 0)." continue fi if ! $SSH ec2-user@$pub_dns true; then - warn "Could not ssh to 'ec2-user@$pub_dns' $(ctx 0)." + pwst_warn "Could not ssh to 'ec2-user@$pub_dns' $(ctx 0)." continue fi @@ -105,27 +134,27 @@ for _pwentry in "${_pwstate[@]}"; do if ! $SSH ec2-user@$pub_dns test -r .setup.started; then if $SSH ec2-user@$pub_dns test -r setup.log; then - warn "Setup log found, prior executions may have failed $(ctx 0)." + pwst_warn "Setup log found, prior executions may have failed $(ctx 0)." fi - msg "Setting up new instance" + pwst_msg "Setting up new instance" # Switch to bash for consistency && some ssh commands below # don't play nicely with zsh. $SSH ec2-user@$pub_dns sudo chsh -s /bin/bash ec2-user &> /dev/null if ! $SCP $SETUP_SCRIPT $SPOOL_SCRIPT ec2-user@$pub_dns:/var/tmp/; then - warn "Could not scp scripts to instance $(ctx 0)." + pwst_warn "Could not scp scripts to instance $(ctx 0)." continue fi if ! $SCP $CIENV_SCRIPT ec2-user@$pub_dns:./; then - warn "Could not scp CI Env. script to instance $(ctx 0)." + pwst_warn "Could not scp CI Env. script to instance $(ctx 0)." continue fi if ! $SSH ec2-user@$pub_dns chmod +x /var/tmp/*.sh; then - warn "Could not chmod scripts $(ctx 0)." + pwst_warn "Could not chmod scripts $(ctx 0)." continue fi @@ -135,20 +164,24 @@ for _pwentry in "${_pwstate[@]}"; do bash -c '/var/tmp/setup.sh &> setup.log & disown %-1' msg "Setup script started" + set_pw_status setup started + # Let it run in the background continue fi since=$($SSH ec2-user@$pub_dns tail -1 .setup.started) msg "Setup not complete; Now: $(date -u -Iseconds)" - msg "Setup running since: $since" + pwst_msg "Setup running since: $since" continue fi msg "Instance setup has completed" + set_pw_status setup complete if ! $SSH ec2-user@$pub_dns pgrep -u "${name}-worker" -f -q "cirrus worker run"; then - warn "Cirrus worker listener process is not running $(ctx 0)." + pwst_warn "Cirrus worker listener process is not running $(ctx 0)." + set_pw_status listener dead continue fi @@ -157,21 +190,23 @@ for _pwentry in "${_pwstate[@]}"; do logpath="/private/tmp/${name}-worker.log" # set in setup.sh # The log output from "cirrus worker run" only records task starts if ! $SSH ec2-user@$pub_dns "cat $logpath" &> "$logoutput"; then - warn "Missing worker log $logpath" + pwst_warn "Missing worker log $logpath" else dbg "worker log: $(<$logoutput)" fi # First line of log should always match this - if ! grep -q 'worker successfully registered' "$logoutput"; then + if ! head -1 "$logoutput" | grep -q 'worker successfully registered'; then warn "Expecting successful registration log entry: $(head -1 "$logoutput")" continue fi msg "Cirrus worker listener successfully registered at least once" + set_pw_status listener alive + # Most of the time these are harmless, "can't connect" flakes. if grep -Eq 'level=error.+msg.+failed' "$logoutput"; then warn "Failure messages present in worker log" fi @@ -180,4 +215,27 @@ $(head -1 "$logoutput")" # make this known to humans in case they care. n_tasks=$(grep -Ei 'started' "$logoutput" | wc -l) || true msg "Apparent worker task executions: $n_tasks" + set_pw_status tasks $n_tasks done + +_I="" +msg " " +msg "Processing all persistent worker states." +for _dhentry in "${_dhstate[@]}"; do + read -r name otherstuff<<<"$_dhentry" + _f1=$name + _f2=$(<$TEMPDIR/${name}.setup) + _f3=$(<$TEMPDIR/${name}.listener) + _f4=$(<$TEMPDIR/${name}.tasks) + _f5=$(<$TEMPDIR/${name}.comment) + [[ -z "$_f5" ]] || _f5=" # $_f5" + + printf '%s %s %s %s%s\n' \ + "$_f1" "$_f2" "$_f3" "$_f4" "$_f5" >> "$TEMPDIR/$(basename $PWSTATE)" +done + +dbg "Creating/updating state file" +if [[ -r "$PWSTATE" ]]; then + cp "$PWSTATE" "${PWSTATE}~" +fi +mv "$TEMPDIR/$(basename $PWSTATE)" "$PWSTATE" diff --git a/mac_pw_pool/pw_lib.sh b/mac_pw_pool/pw_lib.sh index 7ca0897..b52b624 100644 --- a/mac_pw_pool/pw_lib.sh +++ b/mac_pw_pool/pw_lib.sh @@ -10,11 +10,21 @@ LIB_DIRPATH=$(dirname "${BASH_SOURCE[0]}") TEMPDIR=$(mktemp -d -p '' "${SCRIPT_FILENAME}_XXXXX.tmp") trap "rm -rf '$TEMPDIR'" EXIT -# Path to file recording one line per instance, with it's name, instance id, start +# Path to file recording the most recent state of each dedicated host. +# Format is simply one line per dedicated host, with it's name, instance id, start # date/time separated by a space. Exceptional conditions are recorded as comments # with the name and details. File is refreshed/overwritten each time script runs # without any fatal/uncaught command-errors. Intended for reference by humans # and/or other tooling. +DHSTATE="${PWSTATE:-$LIB_DIRPATH/dh_status.txt}" +# Similar to $DHSTATE but records the status of each instance. Format is +# instance name, setup status, listener status, and apparent # tasks executed +# or the word 'error' indicating a fault accessing the remote worker logfile. +# Optionally, there may be a final comment field, beginning with a # and text +# suggesting where there may be a fault. +# Possible status field values are as follows: +# setup - started, complete, error +# listener - alive, dead, error PWSTATE="${PWSTATE:-$LIB_DIRPATH/pw_status.txt}" # Name of launch template. Current/default version will be used. @@ -54,11 +64,11 @@ ctx() { } msg() { echo "${_I}${1:-No text message provided}"; } -warn() { echo "${1:-No warning message provided}" | awk -e '{print "'"${_I}"'WARNING: "$0}' > /dev/stderr; } -die() { echo "${1:-No error message provided}" | awk -e '{print "'"${_I}"'ERROR: "$0}' > /dev/stderr; exit 1; } +warn() { echo "${1:-No warning message provided}" | awk -e '{print "'"${_I}"'WARNING: "$0}' >> /dev/stderr; } +die() { echo "${1:-No error message provided}" | awk -e '{print "'"${_I}"'ERROR: "$0}' >> /dev/stderr; exit 1; } dbg() { if ((X_DEBUG)); then - msg "${1:-No debug message provided} $(ctx 1)" | awk -e '{print "'"${_I}"'DEBUG: "$0}' > /dev/stderr + msg "${1:-No debug message provided} $(ctx 1)" | awk -e '{print "'"${_I}"'DEBUG: "$0}' >> /dev/stderr fi }