Skip to content

Commit

Permalink
HA test matrix
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Indenbaum <[email protected]>
  • Loading branch information
Alexander Indenbaum committed Feb 8, 2024
1 parent d30cc5f commit 02c6010
Show file tree
Hide file tree
Showing 6 changed files with 212 additions and 81 deletions.
88 changes: 7 additions & 81 deletions .github/workflows/build-container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,8 @@ jobs:
needs: [build, build-ceph]
strategy:
fail-fast: false
matrix:
test: ["sanity", "state_transitions"]
runs-on: ubuntu-latest
env:
HUGEPAGES: 768 # 3 spdk instances
Expand Down Expand Up @@ -537,27 +539,7 @@ jobs:
- name: Wait for gateways to be listening
timeout-minutes: 3
run: |
for i in $(seq 2); do
while true; do
sleep ${{ env.WAIT_INTERVAL_SECS }} # Adjust the sleep duration as needed
GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /'$i'/ {print $1}')
container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME")
if [ "$container_status" == "running" ]; then
echo "Container $i $GW_NAME is now running."
else
echo "Container $i $GW_NAME is still not running. Waiting..."
continue
fi
GW_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW_NAME")"
if docker-compose run --rm nvmeof-cli --server-address $GW_IP --server-port 5500 get_subsystems 2>&1 | grep -i failed; then
echo "Container $i $GW_NAME $GW_IP no subsystems. Waiting..."
continue
fi
echo "Container $i $GW_NAME $GW_IP subsystems:"
docker-compose run --rm nvmeof-cli --server-address $GW_IP --server-port 5500 get_subsystems
break;
done
done
source tests/ha/wait_gateways.sh
- name: List containers
if: success() || failure()
Expand All @@ -571,69 +553,13 @@ jobs:
- name: Set up target
run: |
set -xe
GW1_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /1/ {print $1}')
GW2_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /2/ {print $1}')
GW1_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW1_NAME")"
GW2_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW2_NAME")"
NQN="nqn.2016-06.io.spdk:cnode1"
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 subsystem add --subsystem $NQN -t
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 namespace add --subsystem $NQN --rbd-pool rbd --rbd-image demo_image1 --size 10M --rbd-create-image -l 1
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 namespace add --subsystem $NQN --rbd-pool rbd --rbd-image demo_image2 --size 10M --rbd-create-image -l 2
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 listener add --subsystem $NQN --gateway-name $GW1_NAME --traddr $GW1_IP --trsvcid 4420
docker-compose run --rm nvmeof-cli --server-address $GW2_IP --server-port 5500 listener add --subsystem $NQN --gateway-name $GW2_NAME --traddr $GW2_IP --trsvcid 4420
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 host add --subsystem $NQN --host "*"
docker-compose run --rm nvmeof-cli --server-address $GW1_IP --server-port 5500 get_subsystems
docker-compose run --rm nvmeof-cli --server-address $GW2_IP --server-port 5500 get_subsystems
source tests/ha/setup.sh
- name: Run bdevperf discovery
- name: Run HA ${{ matrix.test }} test
timeout-minutes: 5
run: |
set -xe
# See
# - https://github.com/spdk/spdk/blob/master/doc/jsonrpc.md
# - https://spdk.io/doc/nvmf_multipath_howto.html
. .env
container_ip() {
docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$1"
}
echo -n "ℹ️ Starting bdevperf container"
make up SVC=bdevperf OPTS="--detach"
sleep 10
echo "ℹ️ bdevperf start up logs"
make logs SVC=bdevperf
eval $(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_SOCKET | tr -d '\n\r' )
ip=$(container_ip $GW1)
echo "ℹ️ Using discovery service in gateway $GW1 ip $ip"
rpc="/usr/libexec/spdk/scripts/rpc.py"
echo "ℹ️ bdevperf bdev_nvme_set_options"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_set_options -r -1"
echo "ℹ️ bdevperf start discovery ip: $ip port: $NVMEOF_DISC_PORT"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_start_discovery -b Nvme0 -t tcp -a $ip -s $NVMEOF_DISC_PORT -f ipv4 -w"
echo "ℹ️ bdevperf bdev_nvme_get_discovery_info"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_get_discovery_info"
echo "ℹ️ bdevperf perform_tests"
eval $(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_TEST_DURATION | tr -d '\n\r' )
timeout=$(expr $BDEVPERF_TEST_DURATION \* 2)
failover_step=$(expr $BDEVPERF_TEST_DURATION / 4)
GW2_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /2/ {print $1}')
wreak_havoc() {
echo "Waiting $failover_step secs before failover..."
sleep $failover_step
echo "Stop gateway $GW2_NAME"
docker stop $GW2_NAME
echo "Waiting $failover_step secs before failback..."
sleep $failover_step
echo "Restart gateway $GW2_NAME"
docker start $GW2_NAME
echo "wreak_havoc() function completed."
}
wreak_havoc &
bdevperf="/usr/libexec/spdk/scripts/bdevperf.py"
make exec SVC=bdevperf OPTS=-T CMD="$bdevperf -v -t $timeout -s $BDEVPERF_SOCKET perform_tests"
wait
source "tests/ha/${{ matrix.test }}.sh"
- name: Check coredump existence
if: success() || failure()
Expand Down
31 changes: 31 additions & 0 deletions tests/ha/sanity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# HA sanity test: starts a bdevperf container, points it at a gateway's
# discovery service and runs bdevperf's perform_tests.
# Trace every command (-x) and stop at the first failing command (-e).
set -xe
# See
# - https://github.com/spdk/spdk/blob/master/doc/jsonrpc.md
# - https://spdk.io/doc/nvmf_multipath_howto.html
# Load settings from .env in the current working directory. The rest of this
# script reads GW1 and NVMEOF_DISC_PORT without assigning them, so they are
# presumably defined there (TODO confirm).
. .env
# Print the IP address docker reports for a container. The template walks
# every entry of .NetworkSettings.Networks and emits each IPAddress with no
# separator between them.
# Arguments: $1 - container name or ID, handed to `docker inspect` as-is
# Outputs:   the address(es) on stdout
container_ip() {
  local target=$1
  local template='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}'

  docker inspect -f "$template" "$target"
}

# Bring up the bdevperf container in the background, give it a fixed grace
# period to start, then show its start-up logs.
echo -n "ℹ️ Starting bdevperf container"
make up SVC=bdevperf OPTS="--detach"
sleep 10
echo "ℹ️ bdevperf start up logs"
make logs SVC=bdevperf
# Import BDEVPERF_SOCKET from the container's environment: dump `env`, keep the
# matching line, strip CR/LF and evaluate it as an assignment. The command
# substitution is quoted so the assignment reaches eval unsplit and unglobbed.
eval "$(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_SOCKET | tr -d '\n\r')"
: "${BDEVPERF_SOCKET:?BDEVPERF_SOCKET not found in the bdevperf container environment}"


# GW1 and NVMEOF_DISC_PORT are not assigned in this script; presumably they
# come from the .env file sourced earlier (TODO confirm).
ip=$(container_ip "$GW1")
echo "ℹ️ Using discovery service in gateway $GW1 ip $ip"
rpc="/usr/libexec/spdk/scripts/rpc.py"
echo "ℹ️ bdevperf bdev_nvme_set_options"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_set_options -r -1"
echo "ℹ️ bdevperf start discovery ip: $ip port: $NVMEOF_DISC_PORT"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_start_discovery -b Nvme0 -t tcp -a $ip -s $NVMEOF_DISC_PORT -f ipv4 -w"
echo "ℹ️ bdevperf bdev_nvme_get_discovery_info"
make exec SVC=bdevperf OPTS=-T CMD="$rpc -v -s $BDEVPERF_SOCKET bdev_nvme_get_discovery_info"
echo "ℹ️ bdevperf perform_tests"
eval "$(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_TEST_DURATION | tr -d '\n\r')"
: "${BDEVPERF_TEST_DURATION:?BDEVPERF_TEST_DURATION not found in the bdevperf container environment}"
# The RPC timeout is twice the configured test duration. Arithmetic expansion
# is used instead of `expr`, which exits non-zero when the result is 0 and
# would abort the script under `set -e`.
timeout=$((BDEVPERF_TEST_DURATION * 2))
bdevperf="/usr/libexec/spdk/scripts/bdevperf.py"
make exec SVC=bdevperf OPTS=-T CMD="$bdevperf -v -t $timeout -s $BDEVPERF_SOCKET perform_tests"
17 changes: 17 additions & 0 deletions tests/ha/setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Configure the HA target on two gateways: create one subsystem with two
# RBD-backed namespaces, add a listener on each gateway, allow any host, then
# dump the resulting subsystems from both gateways.
# Trace every command (-x) and stop at the first failing command (-e).
set -xe

# Container IDs of the running containers whose name contains "nvmeof" and
# the digit 1 / 2 respectively (first column of the `docker ps` listing).
GW1_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /1/ {print $1}')
GW2_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /2/ {print $1}')
# Report a missing gateway explicitly instead of failing later inside
# `docker inspect ""` with a less obvious message.
if [ -z "$GW1_NAME" ] || [ -z "$GW2_NAME" ]; then
  echo "setup: could not find both nvmeof gateway containers (gw1='$GW1_NAME' gw2='$GW2_NAME')" >&2
  exit 1
fi
GW1_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW1_NAME")"
GW2_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW2_NAME")"
NQN="nqn.2016-06.io.spdk:cnode1"

# CLI invocation prefixes, one per gateway, kept as arrays so every argument
# stays a separate, correctly quoted word.
gw1_cli=(docker-compose run --rm nvmeof-cli --server-address "$GW1_IP" --server-port 5500)
gw2_cli=(docker-compose run --rm nvmeof-cli --server-address "$GW2_IP" --server-port 5500)

"${gw1_cli[@]}" subsystem add --subsystem "$NQN" -t
"${gw1_cli[@]}" namespace add --subsystem "$NQN" --rbd-pool rbd --rbd-image demo_image1 --size 10M --rbd-create-image -l 1
"${gw1_cli[@]}" namespace add --subsystem "$NQN" --rbd-pool rbd --rbd-image demo_image2 --size 10M --rbd-create-image -l 2
# Each listener is registered through the gateway that owns it.
"${gw1_cli[@]}" listener add --subsystem "$NQN" --gateway-name "$GW1_NAME" --traddr "$GW1_IP" --trsvcid 4420
"${gw2_cli[@]}" listener add --subsystem "$NQN" --gateway-name "$GW2_NAME" --traddr "$GW2_IP" --trsvcid 4420
"${gw1_cli[@]}" host add --subsystem "$NQN" --host "*"
"${gw1_cli[@]}" get_subsystems
"${gw2_cli[@]}" get_subsystems

109 changes: 109 additions & 0 deletions tests/ha/state_transitions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# Trace every command (-x) and stop at the first failing command (-e).
set -xe
# Path of the SPDK rpc.py client; expect_optimized runs it inside a gateway
# container through `docker exec`.
rpc=/usr/libexec/spdk/scripts/rpc.py
# RPC method whose reply carries the per-listener `ana_states` array.
cmd=nvmf_subsystem_get_listeners
# Subsystem NQN passed as the argument of that RPC.
nqn=nqn.2016-06.io.spdk:cnode1

# Wait until a gateway reports the expected number of "optimized" ANA groups,
# then print the ids of those groups.
#
# Runs `$rpc -s <spdk.sock> $cmd $nqn` inside the container and inspects the
# `ana_states` array of the first element of the reply. Any failure (RPC error,
# unparsable reply) or a count different from the expected one leads to a
# one-second pause and another attempt. There is no internal timeout.
# Globals:   rpc, cmd, nqn (read)
# Arguments: $1 - gateway container name or ID
#            $2 - number of ANA groups expected to be in state "optimized"
# Outputs:   one optimized ana_group value per line, in reply order
expect_optimized() {
  local gw_name=$1
  local expected=$2
  local socket response groups group count

  socket=$(docker exec "$gw_name" find /var/run/ceph -name spdk.sock)
  while true; do
    # A single jq filter extracts the optimized group ids; jq exits non-zero
    # on an empty or malformed reply, which simply triggers a retry.
    if response=$(docker exec "$gw_name" "$rpc" -s "$socket" "$cmd" "$nqn") &&
      groups=$(jq -r '.[0].ana_states[]
        | select(.ana_state == "optimized")
        | .ana_group' <<<"$response"); then
      # Count non-empty lines (an empty result still yields one blank line
      # through the here-string).
      count=0
      while IFS= read -r group; do
        if [ -n "$group" ]; then
          count=$((count + 1))
        fi
      done <<<"$groups"

      if [ "$count" -eq "$expected" ]; then
        if [ -n "$groups" ]; then
          printf '%s\n' "$groups"
        fi
        return 0
      fi
    fi
    sleep 1
  done
}

# Gateway container lookup by index.
# Note: despite the function name, this prints the container ID (first column
# of the `docker ps` listing), not the container name.
# Arguments: $1 - index; used as a regular expression that the container name
#                 must match in addition to containing "nvmeof"
# Outputs:   the ID of every matching running container, one per line
gw_name() {
  local gw_index=$1

  # The index is handed to awk as a variable instead of being spliced into
  # the program text.
  docker ps --format '{{.ID}}\t{{.Names}}' |
    awk -v idx="$gw_index" '$2 ~ /nvmeof/ && $2 ~ idx {print $1}'
}

# Pick one line out of a newline-separated list.
# Arguments: $1 - the list, one value per line
#            $2 - zero-based index of the wanted line
# Outputs:   the selected line, or an empty line when the index is out of range
access_number_by_index() {
  local numbers=$1
  # awk's NR is one-based, hence the +1. Arithmetic expansion replaces `expr`,
  # which needs an unquoted operand and exits non-zero on a zero result.
  local line_no=$(($2 + 1))
  local number

  number=$(awk -v idx="$line_no" 'NR == idx {print}' <<<"$numbers")
  printf '%s\n' "$number"
}

# Check that two ANA group numbers are exactly the pair {1, 2}, in either
# order. Prints a confirmation on success; on any other input prints an error
# line and terminates the calling shell with status 1.
# Arguments: $1 - first group number
#            $2 - second group number
verify_ana_groups() {
  local first=$1
  local second=$2

  if [ "$first" -eq 1 ] && [ "$second" -eq 2 ]; then
    echo "Verified: first is 1 and second is 2"
    return
  fi

  if [ "$first" -eq 2 ] && [ "$second" -eq 1 ]; then
    echo "Verified: first is 2 and second is 1"
    return
  fi

  echo "Invalid numbers: first and second must be either 1 and 2 or 2 and 1"
  exit 1
}

GW1_NAME=$(gw_name 1)
GW2_NAME=$(gw_name 2)
# Fail fast when a gateway container was not found: expect_optimized keeps
# polling until it sees the expected state, so with an empty container
# reference it would never finish.
if [ -z "$GW1_NAME" ] || [ -z "$GW2_NAME" ]; then
  echo "Could not find both gateway containers: gw1='$GW1_NAME' gw2='$GW2_NAME'" >&2
  exit 1
fi

#
# Step 1 validate both gateways are optimized for one of ANA groups 1 and 2
#
GW1_OPTIMIZED=$(expect_optimized "$GW1_NAME" 1)
gw1_ana=$(access_number_by_index "$GW1_OPTIMIZED" 0)

GW2_OPTIMIZED=$(expect_optimized "$GW2_NAME" 1)
gw2_ana=$(access_number_by_index "$GW2_OPTIMIZED" 0)

verify_ana_groups "$gw1_ana" "$gw2_ana"

#
# Step 2 failover: stop gateway 2 and expect gateway 1 to become optimized
# for both ANA groups
#
echo "Stop gw $GW2_NAME"
docker stop "$GW2_NAME"
sleep 10
docker ps

GW1_FAILOVER_OPTIMIZED=$(expect_optimized "$GW1_NAME" 2)
gw1_ana1=$(access_number_by_index "$GW1_FAILOVER_OPTIMIZED" 0)
gw1_ana2=$(access_number_by_index "$GW1_FAILOVER_OPTIMIZED" 1)
verify_ana_groups "$gw1_ana1" "$gw1_ana2"

#
# Step 3 failback: restart gateway 2 and expect the step 1 state again
#
echo "Start gw $GW2_NAME"
docker start "$GW2_NAME"
sleep 10

GW1_OPTIMIZED=$(expect_optimized "$GW1_NAME" 1)
gw1_ana=$(access_number_by_index "$GW1_OPTIMIZED" 0)

GW2_OPTIMIZED=$(expect_optimized "$GW2_NAME" 1)
gw2_ana=$(access_number_by_index "$GW2_OPTIMIZED" 0)

verify_ana_groups "$gw1_ana" "$gw2_ana"
21 changes: 21 additions & 0 deletions tests/ha/wait_gateways.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Wait until both nvmeof gateway containers are running and answer
# `get_subsystems` on the CLI port (5500).
#
# This file is a plain shell script that gets sourced, so GitHub Actions
# `${{ ... }}` expressions are not expanded here (bash rejects them as a bad
# substitution). The polling interval is therefore read from the
# WAIT_INTERVAL_SECS environment variable; the fallback value is an arbitrary
# choice for runs where that variable is not exported.
wait_interval=${WAIT_INTERVAL_SECS:-5}

for i in 1 2; do
  while true; do
    sleep "$wait_interval" # Adjust the sleep duration as needed
    GW_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk -v idx="$i" '$2 ~ /nvmeof/ && $2 ~ idx {print $1}')
    # `docker inspect` fails while the container is not listed yet; treat that
    # as "not running" so a sourcing shell with `set -e` keeps waiting instead
    # of aborting.
    container_status=$(docker inspect -f '{{.State.Status}}' "$GW_NAME") || container_status=""
    if [ "$container_status" = "running" ]; then
      echo "Container $i $GW_NAME is now running."
    else
      echo "Container $i $GW_NAME is still not running. Waiting..."
      continue
    fi
    GW_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "$GW_NAME")"
    # The CLI output (stdout and stderr) containing "failed" is taken as the
    # gateway not being ready yet.
    if docker-compose run --rm nvmeof-cli --server-address "$GW_IP" --server-port 5500 get_subsystems 2>&1 | grep -i failed; then
      echo "Container $i $GW_NAME $GW_IP no subsystems. Waiting..."
      continue
    fi
    echo "Container $i $GW_NAME $GW_IP subsystems:"
    docker-compose run --rm nvmeof-cli --server-address "$GW_IP" --server-port 5500 get_subsystems
    break
  done
done
27 changes: 27 additions & 0 deletions tests/ha/wreak_havoc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Trace every command (-x) and stop at the first failing command (-e).
set -xe
echo "ℹ️ HA failover/failback test"
# Import BDEVPERF_TEST_DURATION from the bdevperf container's environment:
# dump `env`, keep the matching line, strip CR/LF and evaluate it as an
# assignment. Quoted so the assignment reaches eval unsplit and unglobbed.
eval "$(make run SVC=bdevperf OPTS="--entrypoint=env" | grep BDEVPERF_TEST_DURATION | tr -d '\n\r')"
: "${BDEVPERF_TEST_DURATION:?BDEVPERF_TEST_DURATION not found in the bdevperf container environment}"
# Pause used before the failover and again before the failback: a quarter of
# the test duration. Arithmetic expansion replaces `expr`, which exits with
# status 1 when the result is 0 (duration below 4) and would abort the script
# under `set -e`.
failover_step=$((BDEVPERF_TEST_DURATION / 4))
# Container ID of the running container whose name contains "nvmeof" and "2".
GW2_NAME=$(docker ps --format '{{.ID}}\t{{.Names}}' | awk '$2 ~ /nvmeof/ && $2 ~ /2/ {print $1}')
# Fault injector: pause, stop the second gateway, pause again, start it back.
# Globals:   failover_step (read) - seconds to sleep before each action
#            GW2_NAME (read)      - container reference given to docker stop/start
# Outputs:   progress messages on stdout
wreak_havoc() {
  local pause=$failover_step
  local gateway=$GW2_NAME

  # Failover phase.
  printf 'Waiting %s secs before failover...\n' "$pause"
  sleep $pause
  printf 'Stop gateway %s\n' "$gateway"
  docker stop $gateway

  # Failback phase.
  printf 'Waiting %s secs before failback...\n' "$pause"
  sleep $pause
  printf 'Restart gateway %s\n' "$gateway"
  docker start $gateway

  printf '%s\n' "wreak_havoc() function completed."
}

# Locate the directory that holds the HA test scripts: under the checkout
# when GITHUB_WORKSPACE is set, otherwise next to this file.
if [ -n "${GITHUB_WORKSPACE:-}" ]; then
  test_dir="$GITHUB_WORKSPACE/tests/ha"
else
  # BASH_SOURCE[0] is this file's path whether it is executed or sourced;
  # $0 would name the calling shell when the file is sourced.
  test_dir=$(dirname -- "${BASH_SOURCE[0]}")
fi
# Run the fault injector in the background while the sanity test drives I/O.
wreak_havoc &
havoc_pid=$!
source "$test_dir/sanity.sh"
# Wait for the fault injector and propagate its exit status, so a failed
# `docker stop`/`docker start` fails the test instead of passing silently.
wait "$havoc_pid"

0 comments on commit 02c6010

Please sign in to comment.