From 475167d67736c03bf40972b8c71a1cd6805f5d8a Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Wed, 15 Nov 2023 10:22:44 -0500 Subject: [PATCH 1/7] Rename state file to better indicate content type The file relates to dedicated hosts (DH), not persistent-workers (PW). Also, don't exit non-zero if there is an error-status. Rely on consumers of state file to take appropriate action. Signed-off-by: Chris Evich --- mac_pw_pool/.gitignore | 2 +- mac_pw_pool/LaunchInstances.sh | 40 ++++++++++++++-------------------- mac_pw_pool/SetupInstances.sh | 24 ++++++++++---------- mac_pw_pool/pw_lib.sh | 5 +++-- 4 files changed, 32 insertions(+), 39 deletions(-) diff --git a/mac_pw_pool/.gitignore b/mac_pw_pool/.gitignore index be7fee3..9013acf 100644 --- a/mac_pw_pool/.gitignore +++ b/mac_pw_pool/.gitignore @@ -1 +1 @@ -/pw_status.txt* +/dh_status.txt* diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index e6a6845..5647450 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -70,7 +70,8 @@ fi # Array item format: " " dh_fmt='.[] | .Name +" "+ .HostID' # Avoid always processing hosts in the same alpha-sorted order, as that would -# mean hosts at the end of the list consistently get the least management. +# mean hosts at the end of the list consistently wait the longest for new +# instances to be created (see creation-stagger code below). if ! readarray NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort --random-sort); then die "Extracting dedicated host 'Name' and 'HostID' fields $(ctx 0): $(<$dh_searchout)" @@ -90,14 +91,14 @@ fi # Therefore, instance creations must be staggered by according to this # window. CREATE_STAGGER_HOURS=3 -latest_launched="1970-01-01T00:00+00:00" # in case $PWSTATE is missing +latest_launched="1970-01-01T00:00+00:00" # in case $DHSTATE is missing dcmpfmt="+%Y%m%d%H%M" # date comparison format compatible with numeric 'test' -if [[ -r "$PWSTATE" ]]; then - # N/B: Asumes data in $PWSTATE represents reality. It may not, +if [[ -r "$DHSTATE" ]]; then + # N/B: Asumes data in $DHSTATE represents reality. It may not, # for example if instances have been manually created or terminated. # Querying the actual state would make this script much more complex. declare -a _pwstate - readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$PWSTATE") + readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE") for _pwentry in "${_pwstate[@]}"; do read -r name instance_id launch_time<<<"$_pwentry" launched_hour=$(date -u -d "$launch_time" "$dcmpfmt") @@ -108,10 +109,10 @@ if [[ -r "$PWSTATE" ]]; then fi done fi -dbg "latest_launched=$latest_launched (according to \$PWSTATE)" +dbg "latest_launched=$latest_launched (according to \$DHSTATE)" msg "Operating on $n_dh_total dedicated hosts at $(date -u -Iminutes):" -echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $PWSTATE)" +echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $DHSTATE)" for name_hostid in "${NAME2HOSTID[@]}"; do n_dh=$(($n_dh+1)) _I=" " @@ -122,7 +123,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do hostoutput="$TEMPDIR/${name}_host.output" # JSON or error message from aws describe-hosts instoutput="$TEMPDIR/${name}_inst.output" # JSON or error message from aws describe-instance or run-instance - inststate="$TEMPDIR/${name}_inst.state" # Line to append to $PWSTATE + inststate="$TEMPDIR/${name}_inst.state" # Line to append to $DHSTATE if ! $AWS ec2 describe-hosts --filter "Name=tag:Name,Values=$name" &> "$hostoutput"; then host_failure "Failed to look up dedicated host." @@ -166,7 +167,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do # for Mac instances, but this is not reflected anywhere in the JSON. Trying to start a new # Mac instance on an already occupied host is bound to fail. Inconveniently this error # will look an aweful lot like many other types of errors, confusing any human examining - # $PWSTATE. Detect dedicated-hosts with existing instances. + # $DHSTATE. Detect dedicated-hosts with existing instances. InstanceId=$(set +e; jq -r '.Hosts?[0]?.Instances?[0].InstanceId?' "$hostoutput") dbg "InstanceId='$InstanceId'" @@ -248,25 +249,16 @@ done _I="" msg " " msg "Processing all dedicated host and instance states." -new_error=0 for name_hostid in "${NAME2HOSTID[@]}"; do read -r name hostid<<<"$name_hostid" inststate="$TEMPDIR/${name}_inst.state" - [[ -r "$inststate" ]] || die "Expecting to find instance-state file $inststate for host '$name' $(ctx 0)." - if grep -E -q '^# .+ .+ ERROR: ' $inststate; then - new_error=1 - fi - cat "$inststate" >> "$TEMPDIR/$(basename $PWSTATE)" + [[ -r "$inststate" ]] || \ + die "Expecting to find instance-state file $inststate for host '$name' $(ctx 0)." + cat "$inststate" >> "$TEMPDIR/$(basename $DHSTATE)" done dbg "Creating/updating state file" -if [[ -r "$PWSTATE" ]]; then - cp "$PWSTATE" "${PWSTATE}~" -fi -mv "$TEMPDIR/$(basename $PWSTATE)" "$PWSTATE" - -dbg "Handling any errors" -if ((new_error)); then - die "Processing one or more hosts or instances $(ctx 0): -$(<$PWSTATE)" +if [[ -r "$DHSTATE" ]]; then + cp "$DHSTATE" "${DHSTATE}~" fi +mv "$TEMPDIR/$(basename $DHSTATE)" "$DHSTATE" diff --git a/mac_pw_pool/SetupInstances.sh b/mac_pw_pool/SetupInstances.sh index 16c4088..e0396b8 100755 --- a/mac_pw_pool/SetupInstances.sh +++ b/mac_pw_pool/SetupInstances.sh @@ -8,7 +8,7 @@ set -eo pipefail # from `LaunchInstances.sh` soas to "hide" the nearly 2-hours of cumulative # startup and termination wait time. This script depends on: # -# * The $PWSTATE file created/updated by `LaunchInstances.sh`. +# * The $DHSTATE file created/updated by `LaunchInstances.sh`. # * All requirements listed in the top `LaunchInstances.sh` comment. # * The local ssh-agent is able to supply the appropriate private key. # * The $POOLTOKEN env. var. is defined @@ -27,26 +27,26 @@ fi [[ -n "$POOLTOKEN" ]] || \ die "Expecting \$POOLTOKEN to be defined/non-empty $(ctx 0)." -[[ -r "$PWSTATE" ]] || \ - die "Can't read from state file: $PWSTATE" +[[ -r "$DHSTATE" ]] || \ + die "Can't read from state file: $DHSTATE" declare -a _pwstate -readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$PWSTATE") +readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE") n_inst=0 -n_inst_total="${#_pwstate[@]}" -if [[ $n_inst_total -eq 0 ]]; then - msg "No operable hosts found in $PWSTATE: -$(<$PWSTATE)" +n_inst_total="${#_dhstate[@]}" +if [[ $n_inst_total -eq 0 ]] || [[ -z "${_dhstate[0]}" ]]; then + msg "No operable hosts found in $DHSTATE: +$(<$DHSTATE)" # Assume this script is running in a loop, and unf. there are # simply no dedicated-hosts in 'available' state. exit 0 fi -# N/B: Assumes $PWSTATE represents reality -msg "Operating on $n_inst_total instances from $(head -1 $PWSTATE)" +# N/B: Assumes $DHSTATE represents reality +msg "Operating on $n_inst_total instances from $(head -1 $DHSTATE)" # Indent for messages inside loop -for _pwentry in "${_pwstate[@]}"; do - read -r name instance_id launch_time<<<"$_pwentry" +for _dhentry in "${_dhstate[@]}"; do + read -r name instance_id launch_time<<<"$_dhentry" _I=" " msg " " n_inst=$(($n_inst+1)) diff --git a/mac_pw_pool/pw_lib.sh b/mac_pw_pool/pw_lib.sh index 7ca0897..f7a5dd6 100644 --- a/mac_pw_pool/pw_lib.sh +++ b/mac_pw_pool/pw_lib.sh @@ -10,12 +10,13 @@ LIB_DIRPATH=$(dirname "${BASH_SOURCE[0]}") TEMPDIR=$(mktemp -d -p '' "${SCRIPT_FILENAME}_XXXXX.tmp") trap "rm -rf '$TEMPDIR'" EXIT -# Path to file recording one line per instance, with it's name, instance id, start +# Path to file recording the most recent state of each dedicated host. +# Format is simply one line per dedicated host, with it's name, instance id, start # date/time separated by a space. Exceptional conditions are recorded as comments # with the name and details. File is refreshed/overwritten each time script runs # without any fatal/uncaught command-errors. Intended for reference by humans # and/or other tooling. -PWSTATE="${PWSTATE:-$LIB_DIRPATH/pw_status.txt}" +DHSTATE="${PWSTATE:-$LIB_DIRPATH/dh_status.txt}" # Name of launch template. Current/default version will be used. # https://us-east-1.console.aws.amazon.com/ec2/home?region=us-east-1#LaunchTemplates: From c5b3a9a9e1b0909103368ff5cc22311eee13a6b6 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Wed, 15 Nov 2023 10:58:25 -0500 Subject: [PATCH 2/7] Record status details for each worker Record the most recent status of all workers in a dedicated file. Intended for use by humans or other automation. Signed-off-by: Chris Evich --- mac_pw_pool/.gitignore | 1 + mac_pw_pool/SetupInstances.sh | 94 ++++++++++++++++++++++++++++------- mac_pw_pool/pw_lib.sh | 9 ++++ 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/mac_pw_pool/.gitignore b/mac_pw_pool/.gitignore index 9013acf..c677dc2 100644 --- a/mac_pw_pool/.gitignore +++ b/mac_pw_pool/.gitignore @@ -1 +1,2 @@ /dh_status.txt* +/pw_status.txt* diff --git a/mac_pw_pool/SetupInstances.sh b/mac_pw_pool/SetupInstances.sh index e0396b8..77f3ef9 100755 --- a/mac_pw_pool/SetupInstances.sh +++ b/mac_pw_pool/SetupInstances.sh @@ -16,6 +16,29 @@ set -eo pipefail # shellcheck source-path=SCRIPTDIR source $(dirname ${BASH_SOURCE[0]})/pw_lib.sh +# Update temporary-dir status file for instance $name +# status type $1 and value $2. Where status type is +# 'setup', 'listener', 'tasks' or 'comment'. +set_pw_status() { + [[ -n "$name" ]] || \ + die "Expecting \$name to be set" + case $1 in + setup) ;; + listener) ;; + tasks) ;; + comment) ;; + *) die "Status type must be 'setup', 'listener', 'tasks', or 'comment'" + esac + if [[ "$1" != "comment" ]] && [[ -z "$2" ]]; then + die "Expecting status value to be non-empty." + fi + echo -n "$2" > $TEMPDIR/${name}.$1 +} + +# Wrapper around msg() and warn() which also set_pw_status() comment. +pwst_msg() { set_pw_status comment "$1"; msg "$1"; } +pwst_warn() { set_pw_status comment "$1"; warn "$1"; } + # Set non-zero to enable debugging / prevent removal of temp. dir. S_DEBUG="${S_DEBUG:0}" if ((S_DEBUG)); then @@ -31,7 +54,7 @@ fi die "Can't read from state file: $DHSTATE" declare -a _pwstate -readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE") +readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE" | sort) n_inst=0 n_inst_total="${#_dhstate[@]}" if [[ $n_inst_total -eq 0 ]] || [[ -z "${_dhstate[0]}" ]]; then @@ -44,6 +67,7 @@ fi # N/B: Assumes $DHSTATE represents reality msg "Operating on $n_inst_total instances from $(head -1 $DHSTATE)" +echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $PWSTATE)" # Indent for messages inside loop for _dhentry in "${_dhstate[@]}"; do read -r name instance_id launch_time<<<"$_dhentry" @@ -56,19 +80,25 @@ for _dhentry in "${_dhstate[@]}"; do ncoutput="$TEMPDIR/${name}_nc.output" logoutput="$TEMPDIR/${name}_ssh.output" + # Most operations below 'continue' looping on error. Ensure status files match. + for status_type in setup listener tasks; do + set_pw_status $status_type error + done + set_pw_status comment "" + if ! $AWS ec2 describe-instances --instance-ids $instance_id &> "$instoutput"; then - warn "Could not query instance $instance_id $(ctx 0)." + pwst_warn "Could not query instance $instance_id $(ctx 0)." continue fi # Check if instance should be managed if ! PWPoolReady=$(json_query '.Reservations?[0]?.Instances?[0]?.Tags? | map(select(.Key == "PWPoolReady")) | .[].Value' "$instoutput"); then - warn "Instance does not have a PWPoolReady tag" + pwst_warn "Instance does not have a PWPoolReady tag" PWPoolReady="tag absent" fi if [[ "$PWPoolReady" != "true" ]]; then - msg "Instance disabled via tag 'PWPoolReady' == '$PWPoolReady' != 'true'." + pwst_msg "Instance disabled via tag 'PWPoolReady' == '$PWPoolReady' != 'true'." continue fi @@ -79,24 +109,23 @@ for _dhentry in "${_dhstate[@]}"; do now=$(date -u +%Y%m%d) then=$(date -u -d "$launch_time" +%Y%m%d) # c/automation_images Cirrus-CI job terminates EC2 instances after 3 days - # TODO: Consider simply terminating instance here instead of hard-failing script. [[ $((now-then)) -le 3 ]] || \ - die "Instance found alive longer than it should be, please investigate $(ctx 0)" + pwst_warn "Excess instance lifetime, please investigate $(ctx 0)" msg "Looking up public DNS" if ! pub_dns=$(json_query '.Reservations?[0]?.Instances?[0]?.PublicDnsName?' "$instoutput"); then - warn "Could not lookup of public DNS for instance $instance_id $(ctx 0)" + pwst_warn "Could not lookup of public DNS for instance $instance_id $(ctx 0)" continue fi msg "Attempting to contact '$name' at $pub_dns" if ! nc -z -w 13 $pub_dns 22 &> "$ncoutput"; then - warn "Could not connect to port 22 on '$pub_dns' $(ctx 0)." + pwst_warn "Could not connect to port 22 on '$pub_dns' $(ctx 0)." continue fi if ! $SSH ec2-user@$pub_dns true; then - warn "Could not ssh to 'ec2-user@$pub_dns' $(ctx 0)." + pwst_warn "Could not ssh to 'ec2-user@$pub_dns' $(ctx 0)." continue fi @@ -105,27 +134,27 @@ for _dhentry in "${_dhstate[@]}"; do if ! $SSH ec2-user@$pub_dns test -r .setup.started; then if $SSH ec2-user@$pub_dns test -r setup.log; then - warn "Setup log found, prior executions may have failed $(ctx 0)." + pwst_warn "Setup log found, prior executions may have failed $(ctx 0)." fi - msg "Setting up new instance" + pwst_msg "Setting up new instance" # Switch to bash for consistency && some ssh commands below # don't play nicely with zsh. $SSH ec2-user@$pub_dns sudo chsh -s /bin/bash ec2-user &> /dev/null if ! $SCP $SETUP_SCRIPT $SPOOL_SCRIPT ec2-user@$pub_dns:/var/tmp/; then - warn "Could not scp scripts to instance $(ctx 0)." + pwst_warn "Could not scp scripts to instance $(ctx 0)." continue fi if ! $SCP $CIENV_SCRIPT ec2-user@$pub_dns:./; then - warn "Could not scp CI Env. script to instance $(ctx 0)." + pwst_warn "Could not scp CI Env. script to instance $(ctx 0)." continue fi if ! $SSH ec2-user@$pub_dns chmod +x /var/tmp/*.sh; then - warn "Could not chmod scripts $(ctx 0)." + pwst_warn "Could not chmod scripts $(ctx 0)." continue fi @@ -135,20 +164,24 @@ for _dhentry in "${_dhstate[@]}"; do bash -c '/var/tmp/setup.sh &> setup.log & disown %-1' msg "Setup script started" + set_pw_status setup started + # Let it run in the background continue fi since=$($SSH ec2-user@$pub_dns tail -1 .setup.started) msg "Setup not complete; Now: $(date -u -Iseconds)" - msg "Setup running since: $since" + pwst_msg "Setup running since: $since" continue fi msg "Instance setup has completed" + set_pw_status setup complete if ! $SSH ec2-user@$pub_dns pgrep -u "${name}-worker" -f -q "cirrus worker run"; then - warn "Cirrus worker listener process is not running $(ctx 0)." + pwst_warn "Cirrus worker listener process is not running $(ctx 0)." + set_pw_status listener dead continue fi @@ -157,21 +190,23 @@ for _dhentry in "${_dhstate[@]}"; do logpath="/private/tmp/${name}-worker.log" # set in setup.sh # The log output from "cirrus worker run" only records task starts if ! $SSH ec2-user@$pub_dns "cat $logpath" &> "$logoutput"; then - warn "Missing worker log $logpath" + pwst_warn "Missing worker log $logpath" else dbg "worker log: $(<$logoutput)" fi # First line of log should always match this - if ! grep -q 'worker successfully registered' "$logoutput"; then + if ! head -1 "$logoutput" | grep -q 'worker successfully registered'; then warn "Expecting successful registration log entry: $(head -1 "$logoutput")" continue fi msg "Cirrus worker listener successfully registered at least once" + set_pw_status listener alive + # Most of the time these are harmless, "can't connect" flakes. if grep -Eq 'level=error.+msg.+failed' "$logoutput"; then warn "Failure messages present in worker log" fi @@ -180,4 +215,27 @@ $(head -1 "$logoutput")" # make this known to humans in case they care. n_tasks=$(grep -Ei 'started' "$logoutput" | wc -l) || true msg "Apparent worker task executions: $n_tasks" + set_pw_status tasks $n_tasks done + +_I="" +msg " " +msg "Processing all persistent worker states." +for _pwentry in "${_pwstate[@]}"; do + read -r name otherstuff<<<"$_pwentry" + _f1=$name + _f2=$(<$TEMPDIR/${name}.setup) + _f3=$(<$TEMPDIR/${name}.listener) + _f4=$(<$TEMPDIR/${name}.tasks) + _f5=$(<$TEMPDIR/${name}.comment) + [[ -z "$_f5" ]] || _f5=" # $_f5" + + printf '%s %s %s %s%s\n' \ + "$_f1" "$_f2" "$_f3" "$_f4" "$_f5" >> "$TEMPDIR/$(basename $PWSTATE)" +done + +dbg "Creating/updating state file" +if [[ -r "$PWSTATE" ]]; then + cp "$PWSTATE" "${PWSTATE}~" +fi +mv "$TEMPDIR/$(basename $PWSTATE)" "$PWSTATE" diff --git a/mac_pw_pool/pw_lib.sh b/mac_pw_pool/pw_lib.sh index f7a5dd6..9911942 100644 --- a/mac_pw_pool/pw_lib.sh +++ b/mac_pw_pool/pw_lib.sh @@ -17,6 +17,15 @@ trap "rm -rf '$TEMPDIR'" EXIT # without any fatal/uncaught command-errors. Intended for reference by humans # and/or other tooling. DHSTATE="${PWSTATE:-$LIB_DIRPATH/dh_status.txt}" +# Similar to $DHSTATE but records the status of each instance. Format is +# instance name, setup status, listener status, and apparent # tasks executed +# or the word 'error' indicating a fault accessing the remote worker logfile. +# Optionally, there may be a final comment field, beginning with a # and text +# suggesting where there may be a fault. +# Possible status field values are as follows: +# setup - started, complete, error +# listener - alive, dead, error +PWSTATE="${PWSTATE:-$LIB_DIRPATH/pw_status.txt}" # Name of launch template. Current/default version will be used. # https://us-east-1.console.aws.amazon.com/ec2/home?region=us-east-1#LaunchTemplates: From d6ec0981eb30e0b43dcf1f2f6595d418bc31d1f9 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Wed, 15 Nov 2023 11:53:00 -0500 Subject: [PATCH 3/7] Alpha-sort dedicated host state file Signed-off-by: Chris Evich --- mac_pw_pool/LaunchInstances.sh | 2 ++ mac_pw_pool/SetupInstances.sh | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index 5647450..e1ea6ba 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -249,6 +249,8 @@ done _I="" msg " " msg "Processing all dedicated host and instance states." +# Consuming state file in alpha-order is easier on human eyes +readarray NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort) for name_hostid in "${NAME2HOSTID[@]}"; do read -r name hostid<<<"$name_hostid" inststate="$TEMPDIR/${name}_inst.state" diff --git a/mac_pw_pool/SetupInstances.sh b/mac_pw_pool/SetupInstances.sh index 77f3ef9..d869c8d 100755 --- a/mac_pw_pool/SetupInstances.sh +++ b/mac_pw_pool/SetupInstances.sh @@ -53,8 +53,8 @@ fi [[ -r "$DHSTATE" ]] || \ die "Can't read from state file: $DHSTATE" -declare -a _pwstate -readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE" | sort) +declare -a _dhstate +readarray -t _dhstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE" | sort) n_inst=0 n_inst_total="${#_dhstate[@]}" if [[ $n_inst_total -eq 0 ]] || [[ -z "${_dhstate[0]}" ]]; then @@ -221,8 +221,8 @@ done _I="" msg " " msg "Processing all persistent worker states." -for _pwentry in "${_pwstate[@]}"; do - read -r name otherstuff<<<"$_pwentry" +for _dhentry in "${_dhstate[@]}"; do + read -r name otherstuff<<<"$_dhentry" _f1=$name _f2=$(<$TEMPDIR/${name}.setup) _f3=$(<$TEMPDIR/${name}.listener) From 7244323cefbad78827ee98b00244af3396397376 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 16 Nov 2023 10:47:10 -0500 Subject: [PATCH 4/7] Fix several DH management script bugs Previously it was possible to fail to launch any instances do to bugs and assumptions in the last-launch-time determination. Fix this by actually querying running instances, and searching for the most recent launch time. If there are no instances found, print a warning operators may observe. Also, fix missing `-t` option to several readarray() calls. Signed-off-by: Chris Evich --- mac_pw_pool/LaunchInstances.sh | 68 ++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index e1ea6ba..9236ab1 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -59,7 +59,7 @@ inst_failure() { } # Find dedicated hosts to operate on. -dh_flt="Name=tag-key,Values=PWPoolReady" +dh_flt="Name=tag:Name,Values=MacM1-*" dh_qry='Hosts[].{HostID:HostId, Name:[Tags[?Key==`Name`].Value][] | [0]}' dh_searchout="$TEMPDIR/hosts.output" # JSON or error message if ! $AWS ec2 describe-hosts --filter "$dh_flt" --query "$dh_qry" &> "$dh_searchout"; then @@ -72,7 +72,7 @@ dh_fmt='.[] | .Name +" "+ .HostID' # Avoid always processing hosts in the same alpha-sorted order, as that would # mean hosts at the end of the list consistently wait the longest for new # instances to be created (see creation-stagger code below). -if ! readarray NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort --random-sort); then +if ! readarray -t NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort --random-sort); then die "Extracting dedicated host 'Name' and 'HostID' fields $(ctx 0): $(<$dh_searchout)" fi @@ -93,26 +93,48 @@ fi CREATE_STAGGER_HOURS=3 latest_launched="1970-01-01T00:00+00:00" # in case $DHSTATE is missing dcmpfmt="+%Y%m%d%H%M" # date comparison format compatible with numeric 'test' -if [[ -r "$DHSTATE" ]]; then - # N/B: Asumes data in $DHSTATE represents reality. It may not, - # for example if instances have been manually created or terminated. - # Querying the actual state would make this script much more complex. - declare -a _pwstate - readarray -t _pwstate <<<$(grep -E -v '^($|#+| +)' "$DHSTATE") - for _pwentry in "${_pwstate[@]}"; do - read -r name instance_id launch_time<<<"$_pwentry" - launched_hour=$(date -u -d "$launch_time" "$dcmpfmt") - latest_launched_hour=$(date -u -d "$latest_launched" "$dcmpfmt") - dbg "instance $name launched on $launched_hour, latest launched hour: $latest_launched_hour" - if [[ $launched_hour -gt $latest_launched_hour ]]; then - latest_launched="$launch_time" - fi - done +# To find the latest instance launch time, script can't rely on reading +# $DHSTATE or $PWSTATE because they may not exist or be out of date. +# Search for all running instances by name and running state, returning +# their launch timestamps. +declare -a pw_filt +pw_filts=( + 'Name=tag:Name,Values=MacM1-*' + 'Name=tag:PWPoolReady,Values=true' + 'Name=instance-state-name,Values=running' +) +pw_query='Reservations[].Instances[].LaunchTime' +inst_lt_f=$TEMPDIR/inst_launch_times +dbg "Obtaining launch times for all running MacM1-* instances" +dbg "$AWS ec2 describe-instances --filters '${pw_filts[*]}' --query '$pw_query' &> '$inst_lt_f'" +if ! $AWS ec2 describe-instances --filters "${pw_filts[@]}" --query "$pw_query" &> "$inst_lt_f"; then + die "Can not query instances: +$(<$inst_lt_f)" +else + declare -a launchtimes + if ! readarray -t launchtimes<<<$(json_query '.[]?' "$inst_lt_f"); then + warn "Found no running instances, this should not happen." + else + for launch_time in "${launchtimes[@]}"; do + # Assume launch_time is never malformed + launched_hour=$(date -u -d "$launch_time" "$dcmpfmt") + latest_launched_hour=$(date -u -d "$latest_launched" "$dcmpfmt") + dbg "instance launched on $launched_hour; latest launch hour: $latest_launched_hour" + if [[ $launched_hour -gt $latest_launched_hour ]]; then + dbg "Updating latest launched timestamp" + latest_launched="$launch_time" + fi + done + fi fi -dbg "latest_launched=$latest_launched (according to \$DHSTATE)" -msg "Operating on $n_dh_total dedicated hosts at $(date -u -Iminutes):" +# Increase readability for humans by always ensuring the two important +# date stamps line up regardless of the length of $n_dh_total. +_n_dh_sp=$(printf ' %.0s' seq 1 ${#n_dh_total}) +msg "Operating on $n_dh_total dedicated hosts at $(date -u -Iseconds)" +msg " ${_n_dh_sp}Last instance launch on $latest_launched" echo -e "# $(basename ${BASH_SOURCE[0]}) run $(date -u -Iseconds)\n#" > "$TEMPDIR/$(basename $DHSTATE)" + for name_hostid in "${NAME2HOSTID[@]}"; do n_dh=$(($n_dh+1)) _I=" " @@ -125,7 +147,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do instoutput="$TEMPDIR/${name}_inst.output" # JSON or error message from aws describe-instance or run-instance inststate="$TEMPDIR/${name}_inst.state" # Line to append to $DHSTATE - if ! $AWS ec2 describe-hosts --filter "Name=tag:Name,Values=$name" &> "$hostoutput"; then + if ! $AWS ec2 describe-hosts --host-ids $hostid &> "$hostoutput"; then host_failure "Failed to look up dedicated host." continue # Allow hosts to be taken out of service easily/manually by editing its tags. @@ -174,7 +196,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do # Stagger creation of instances by $CREATE_STAGGER_HOURS launch_new=0 if [[ "$InstanceId" == "null" ]] || [[ "$InstanceId" == "" ]]; then - launch_threshold=$(date -u -Iminutes -d "$latest_launched + $CREATE_STAGGER_HOURS hours") + launch_threshold=$(date -u -Iseconds -d "$latest_launched + $CREATE_STAGGER_HOURS hours") launch_threshold_hour=$(date -u -d "$launch_threshold" "$dcmpfmt") now_hour=$(date -u "$dcmpfmt") dbg "launch_threshold_hour=$launch_threshold_hour" @@ -198,7 +220,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do continue else # Block further launches (assumes script is running in a 10m while loop). - latest_launched=$(date -u -Iminutes) + latest_launched=$(date -u -Iseconds) msg "Successfully created new instance; Waiting for 'running' state (~1m typical)..." # N/B: New Mac instances take ~5-10m to actually become ssh-able if ! InstanceId=$(json_query '.Instances?[0]?.InstanceId' "$instoutput"); then @@ -250,7 +272,7 @@ _I="" msg " " msg "Processing all dedicated host and instance states." # Consuming state file in alpha-order is easier on human eyes -readarray NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort) +readarray -t NAME2HOSTID <<<$(json_query "$dh_fmt" "$dh_searchout" | sort) for name_hostid in "${NAME2HOSTID[@]}"; do read -r name hostid<<<"$name_hostid" inststate="$TEMPDIR/${name}_inst.state" From 3a9c2d4675c6275ec1a123832c16632fc4fe5c9d Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Thu, 16 Nov 2023 16:41:21 -0500 Subject: [PATCH 5/7] Fix truncating duplicated & redirected script output For whatever reason, when a script that duplicates and redirects stdout/stderr to a log-file calls one of the management scripts, the log-file is truncated. Updating output functions to append their output seems to resolve this issue. Signed-off-by: Chris Evich --- mac_pw_pool/pw_lib.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mac_pw_pool/pw_lib.sh b/mac_pw_pool/pw_lib.sh index 9911942..b52b624 100644 --- a/mac_pw_pool/pw_lib.sh +++ b/mac_pw_pool/pw_lib.sh @@ -64,11 +64,11 @@ ctx() { } msg() { echo "${_I}${1:-No text message provided}"; } -warn() { echo "${1:-No warning message provided}" | awk -e '{print "'"${_I}"'WARNING: "$0}' > /dev/stderr; } -die() { echo "${1:-No error message provided}" | awk -e '{print "'"${_I}"'ERROR: "$0}' > /dev/stderr; exit 1; } +warn() { echo "${1:-No warning message provided}" | awk -e '{print "'"${_I}"'WARNING: "$0}' >> /dev/stderr; } +die() { echo "${1:-No error message provided}" | awk -e '{print "'"${_I}"'ERROR: "$0}' >> /dev/stderr; exit 1; } dbg() { if ((X_DEBUG)); then - msg "${1:-No debug message provided} $(ctx 1)" | awk -e '{print "'"${_I}"'DEBUG: "$0}' > /dev/stderr + msg "${1:-No debug message provided} $(ctx 1)" | awk -e '{print "'"${_I}"'DEBUG: "$0}' >> /dev/stderr fi } From d1a3503a7fe1f50ebf3dc14217dee29d1d8367b9 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Fri, 17 Nov 2023 14:50:27 -0500 Subject: [PATCH 6/7] Minor: Adjust status message The term "BUSY" implies the dedicated host is doing something else. This is not the case for staggering launches. Use a more descriptive status indicator for this. Signed-off-by: Chris Evich --- mac_pw_pool/LaunchInstances.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index 9236ab1..703d804 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -203,7 +203,7 @@ for name_hostid in "${NAME2HOSTID[@]}"; do dbg " now_hour=$now_hour" if [[ $now_hour -lt $launch_threshold_hour ]]; then msg "Cannot launch new instance until $launch_threshold" - echo "# $name HOST BUSY: Inst. creation delayed until $launch_threshold" > "$inststate" + echo "# $name HOST THROTTLE: Inst. creation delayed until $launch_threshold" > "$inststate" continue else launch_new=1 From 723fbf10390372c3d2c6d433b043c74ecf187216 Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Mon, 20 Nov 2023 10:09:34 -0500 Subject: [PATCH 7/7] Fix last-launch time query failure behavior If for whatever reason there is a failure in the query or search for last-launch times, `$latest_launched` could be set to the current time. This will ultimately result in no instances being launched. Fix this by improved detection of an empty/null launch time in `${launchtimes[@]}`. Signed-off-by: Chris Evich --- mac_pw_pool/LaunchInstances.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mac_pw_pool/LaunchInstances.sh b/mac_pw_pool/LaunchInstances.sh index 703d804..a356876 100755 --- a/mac_pw_pool/LaunchInstances.sh +++ b/mac_pw_pool/LaunchInstances.sh @@ -112,9 +112,12 @@ if ! $AWS ec2 describe-instances --filters "${pw_filts[@]}" --query "$pw_query" $(<$inst_lt_f)" else declare -a launchtimes - if ! readarray -t launchtimes<<<$(json_query '.[]?' "$inst_lt_f"); then + if ! readarray -t launchtimes<<<$(json_query '.[]?' "$inst_lt_f") || + [[ "${#launchtimes[@]}" -eq 0 ]] || + [[ "${launchtimes[0]}" == "" ]]; then warn "Found no running instances, this should not happen." else + dbg "launchtimes=[${launchtimes[*]}]" for launch_time in "${launchtimes[@]}"; do # Assume launch_time is never malformed launched_hour=$(date -u -d "$launch_time" "$dcmpfmt")