From 1f9ffa33b08d66b2302c3bc57f5414ed90ae416c Mon Sep 17 00:00:00 2001 From: "Louise K. Schmidtgen" Date: Tue, 12 Nov 2024 19:26:57 +0100 Subject: [PATCH] Performance Test Set-up (1) (#197) --- .github/workflows/e2e.yml | 4 + .github/workflows/performance.yaml | 141 ++++++++ test/performance/.copyright.tmpl | 1 + test/performance/Readme.md | 60 ++++ test/performance/lxd-profile.yaml | 105 ++++++ test/performance/requirements-dev.txt | 5 + test/performance/requirements-test.txt | 5 + test/performance/tests/conftest.py | 189 ++++++++++ test/performance/tests/test_performance.py | 32 ++ test/performance/tests/test_util/config.py | 45 +++ .../tests/test_util/harness/__init__.py | 11 + .../tests/test_util/harness/base.py | 114 ++++++ .../tests/test_util/harness/lxd.py | 181 ++++++++++ test/performance/tests/test_util/util.py | 324 ++++++++++++++++++ test/performance/tox.ini | 51 +++ 15 files changed, 1268 insertions(+) create mode 100644 .github/workflows/performance.yaml create mode 100644 test/performance/.copyright.tmpl create mode 100644 test/performance/Readme.md create mode 100644 test/performance/lxd-profile.yaml create mode 100644 test/performance/requirements-dev.txt create mode 100644 test/performance/requirements-test.txt create mode 100644 test/performance/tests/conftest.py create mode 100644 test/performance/tests/test_performance.py create mode 100644 test/performance/tests/test_util/config.py create mode 100644 test/performance/tests/test_util/harness/__init__.py create mode 100644 test/performance/tests/test_util/harness/base.py create mode 100644 test/performance/tests/test_util/harness/lxd.py create mode 100644 test/performance/tests/test_util/util.py create mode 100644 test/performance/tox.ini diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index dbd60d85..5429d51a 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -5,6 +5,10 @@ on: branches: [master] pull_request: +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: test: name: Test diff --git a/.github/workflows/performance.yaml b/.github/workflows/performance.yaml new file mode 100644 index 00000000..127c4cb9 --- /dev/null +++ b/.github/workflows/performance.yaml @@ -0,0 +1,141 @@ +name: Performance Test K8s-snap + +on: + push: + branches: ["master"] + pull_request: + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + BASE_SHA: ${{ github.before || github.event.pull_request.base.sha }} + BASE_BRANCH: ${{ github.base_ref || github.ref }} + TARGET_SHA: ${{ github.sha }} + +jobs: + build: + name: K8s-snap Performance Test + runs-on: ubuntu-20.04 + steps: + - name: Harden Runner + uses: step-security/harden-runner@v2 + with: + egress-policy: audit + - name: Checking out repo + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install tox + run: | + pip install tox + - name: Install Go + uses: actions/setup-go@v5 + with: + go-version: "1.22" + - name: Install lxd + run: | + sudo snap refresh lxd --channel 5.21/stable + sudo lxd init --auto + sudo usermod --append --groups lxd $USER + sg lxd -c 'lxc version' + - name: Ensure lxd network traffic flows by removing docker if installed + run: | + if command -v docker >/dev/null 2>&1; then + echo "Docker is installed, purging it" + sudo apt-get purge -y docker-engine docker docker.io docker-ce docker-ce-cli containerd runc + fi + - name: 
Download latest k8s-snap
+        run: |
+          sudo snap download k8s --channel=latest/edge --basename k8s
+      - name: Unpack Snap
+        run: |
+          sudo unsquashfs -d snap-unpack-dir k8s.snap
+      - name: Create snap with k8s-dqlite ${{ github.head_ref }}
+        run: |
+          make static
+          sudo cp ./bin/static/k8s-dqlite snap-unpack-dir/bin/k8s-dqlite
+          sudo chmod o+r snap-unpack-dir/bin/k8s-dqlite
+          sudo mksquashfs snap-unpack-dir head.snap -noappend -comp lzo -no-fragments
+      - name: Run Performance test ${{ github.head_ref }} snap
+        env:
+          TEST_SNAP: ${{ github.workspace }}/head.snap
+          TEST_SUBSTRATE: lxd
+          TEST_LXD_IMAGE: ubuntu:22.04
+          TEST_INSPECTION_REPORTS_DIR: ${{ github.workspace }}/inspection-reports
+        run: |
+          cd test/performance && sg lxd -c 'tox -e performance'
+      - name: Create snap with k8s-dqlite base code
+        run: |
+          set -o pipefail
+          git fetch origin $BASE_BRANCH
+          git reset --hard $BASE_SHA
+          make static
+          sudo cp ./bin/static/k8s-dqlite snap-unpack-dir/bin/k8s-dqlite
+          sudo chmod o+r snap-unpack-dir/bin/k8s-dqlite
+          sudo mksquashfs snap-unpack-dir base-code.snap -noappend -comp lzo -no-fragments
+      - name: Switch back to target branch
+        run: git reset --hard $TARGET_SHA
+      - name: Run Performance test for base code snap
+        env:
+          TEST_SNAP: ${{ github.workspace }}/base-code.snap
+          TEST_SUBSTRATE: lxd
+          TEST_LXD_IMAGE: ubuntu:22.04
+          TEST_INSPECTION_REPORTS_DIR: ${{ github.workspace }}/inspection-reports
+        run: |
+          cd test/performance && sg lxd -c 'tox -e performance'
+      - name: Create snap with k8s-dqlite v1.1.11
+        run: |
+          set -o pipefail
+          git fetch origin --tags
+          git reset --hard v1.1.11
+          make static
+          sudo cp ./bin/static/k8s-dqlite snap-unpack-dir/bin/k8s-dqlite
+          sudo chmod o+r snap-unpack-dir/bin/k8s-dqlite
+          sudo mksquashfs snap-unpack-dir v1-1-11.snap -noappend -comp lzo -no-fragments
+      - name: Switch back to target branch
+        run: git reset --hard $TARGET_SHA
+      - name: Run Performance test for v1.1.11 snap
+        env:
+          TEST_SNAP: ${{ github.workspace }}/v1-1-11.snap
+          TEST_SUBSTRATE: lxd
+          TEST_LXD_IMAGE: ubuntu:22.04
+          TEST_INSPECTION_REPORTS_DIR: ${{ github.workspace }}/inspection-reports
+        run: |
+          cd test/performance && sg lxd -c 'tox -e performance'
+      - name: Create snap with k8s-dqlite v1.2.0
+        run: |
+          set -o pipefail
+          git fetch origin --tags
+          git reset --hard v1.2.0
+          make static
+          sudo cp ./bin/static/k8s-dqlite snap-unpack-dir/bin/k8s-dqlite
+          sudo chmod o+r snap-unpack-dir/bin/k8s-dqlite
+          sudo mksquashfs snap-unpack-dir v1-2-0.snap -noappend -comp lzo -no-fragments
+      - name: Switch back to target branch
+        run: git reset --hard $TARGET_SHA
+      - name: Run Performance test for v1.2.0 snap
+        env:
+          TEST_SNAP: ${{ github.workspace }}/v1-2-0.snap
+          TEST_SUBSTRATE: lxd
+          TEST_LXD_IMAGE: ubuntu:22.04
+          TEST_INSPECTION_REPORTS_DIR: ${{ github.workspace }}/inspection-reports
+        run: |
+          cd test/performance && sg lxd -c 'tox -e performance'
+      - name: Prepare inspection reports
+        if: failure()
+        run: |
+          tar -czvf inspection-reports.tar.gz -C ${{ github.workspace }} inspection-reports
+          echo "artifact_name=inspection-reports" >> $GITHUB_ENV
+      - name: Upload inspection report artifact
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.artifact_name }}
+          path: ${{ github.workspace }}/inspection-reports.tar.gz
diff --git a/test/performance/.copyright.tmpl b/test/performance/.copyright.tmpl
new file mode 100644
index 00000000..ecbed6c7
--- /dev/null
+++ b/test/performance/.copyright.tmpl
@@ -0,0 +1 @@
+Copyright ${years} ${owner}.
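The four build-and-test step pairs above differ only in the snap they exercise (head, base, v1.1.11, v1.2.0). For local debugging, the same comparison can be driven by hand. The sketch below is a hypothetical helper, not part of this patch; it assumes the repacked snaps already exist in the repository root and invokes the suite exactly as the workflow does, via `tox -e performance`.

```python
import os
import subprocess

# Hypothetical local driver mirroring the CI steps above. The snap file
# names match the ones built by the workflow; adjust paths as needed.
SNAPS = ["head.snap", "base-code.snap", "v1-1-11.snap", "v1-2-0.snap"]

for snap in SNAPS:
    env = dict(
        os.environ,
        TEST_SNAP=os.path.abspath(snap),  # full path, as the Readme requires
        TEST_SUBSTRATE="lxd",
        TEST_LXD_IMAGE="ubuntu:22.04",
        TEST_INSPECTION_REPORTS_DIR=os.path.abspath("inspection-reports"),
    )
    # Same entry point the workflow uses for every snap under test.
    subprocess.run(
        ["tox", "-e", "performance"], cwd="test/performance", env=env, check=True
    )
```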
diff --git a/test/performance/Readme.md b/test/performance/Readme.md
new file mode 100644
index 00000000..9bcabdd5
--- /dev/null
+++ b/test/performance/Readme.md
@@ -0,0 +1,60 @@
+# Performance Testing
+
+## Overview
+
+The performance tests are written in Python. They are built on top of a [Harness](./tests/conftest.py) fixture so that they can run in multiple environments, such as LXD or, in the future, the local machine.
+
+The tests can be configured using environment variables. You can see all available options in [./tests/test_util/config.py](./tests/test_util/config.py).
+
+## Running the performance tests
+
+Running the performance tests requires `python3` and `tox`. Install them with:
+
+```bash
+sudo apt install python3-virtualenv
+virtualenv .venv
+. .venv/bin/activate
+pip install 'tox<5'
+```
+
+Further, make sure that you have downloaded the `k8s.snap`:
+
+```bash
+sudo snap download k8s --channel=latest/edge --basename k8s
+```
+
+All performance tests require the local path to the snap package under test, specified via the `TEST_SNAP` environment variable. Make sure to specify the full path to the file.
+
+The tests are typically run with: `cd test/performance && tox -e performance`
+
+### Running the performance tests on the local machine
+
+```bash
+export TEST_SNAP=$PWD/k8s.snap
+export TEST_SUBSTRATE=local
+
+cd test/performance && tox -e performance
+```
+
+> *NOTE*: When running locally, performance tests that create more than one instance will fail.
+
+### Running the performance tests on LXD containers
+
+First, make sure that you have initialized LXD:
+
+```bash
+sudo lxd init --auto
+```
+
+Then, run the tests with:
+
+```bash
+export TEST_SNAP=$PWD/k8s.snap
+export TEST_SUBSTRATE=lxd
+
+export TEST_LXD_IMAGE=ubuntu:22.04 # (optionally) specify which image to use for LXD containers
+export TEST_LXD_PROFILE_NAME=k8s-performance # (optionally) specify profile name to configure
+export TEST_SKIP_CLEANUP=1 # (optionally) do not destroy machines after tests finish
+
+cd test/performance && tox -e performance
+```
diff --git a/test/performance/lxd-profile.yaml b/test/performance/lxd-profile.yaml
new file mode 100644
index 00000000..c6a05f38
--- /dev/null
+++ b/test/performance/lxd-profile.yaml
@@ -0,0 +1,105 @@
+description: "LXD profile for Canonical Kubernetes"
+config:
+  linux.kernel_modules: ip_vs,ip_vs_rr,ip_vs_wrr,ip_vs_sh,ip_tables,ip6_tables,iptable_raw,netlink_diag,nf_nat,overlay,br_netfilter,xt_socket
+  raw.lxc: |
+    lxc.apparmor.profile=unconfined
+    lxc.mount.auto=proc:rw sys:rw cgroup:rw
+    lxc.cgroup.devices.allow=a
+    lxc.cap.drop=
+  security.nesting: "true"
+  security.privileged: "true"
+devices:
+  aadisable2:
+    path: /dev/kmsg
+    source: /dev/kmsg
+    type: unix-char
+  dev-loop-control:
+    major: "10"
+    minor: "237"
+    path: /dev/loop-control
+    type: unix-char
+  dev-loop0:
+    major: "7"
+    minor: "0"
+    path: /dev/loop0
+    type: unix-block
+  dev-loop1:
+    major: "7"
+    minor: "1"
+    path: /dev/loop1
+    type: unix-block
+  dev-loop2:
+    major: "7"
+    minor: "2"
+    path: /dev/loop2
+    type: unix-block
+  dev-loop3:
+    major: "7"
+    minor: "3"
+    path: /dev/loop3
+    type: unix-block
+  dev-loop4:
+    major: "7"
+    minor: "4"
+    path: /dev/loop4
+    type: unix-block
+  dev-loop5:
+    major: "7"
+    minor: "5"
+    path: /dev/loop5
+    type: unix-block
+  dev-loop6:
+    major: "7"
+    minor: "6"
+    path: /dev/loop6
+    type: unix-block
+  dev-loop7:
+    major: "7"
+    minor: "7"
+    path: /dev/loop7
+    type: unix-block
+  dev-loop8:
+    major: "7"
+    minor: "8"
+    path: /dev/loop8
+    type: unix-block
+  dev-loop9:
+    major: "7"
+    minor: "9"
+ path: /dev/loop9 + type: unix-block + dev-loop10: + major: "7" + minor: "10" + path: /dev/loop10 + type: unix-block + dev-loop11: + major: "7" + minor: "11" + path: /dev/loop11 + type: unix-block + dev-loop12: + major: "7" + minor: "12" + path: /dev/loop12 + type: unix-block + dev-loop13: + major: "7" + minor: "13" + path: /dev/loop13 + type: unix-block + dev-loop14: + major: "7" + minor: "14" + path: /dev/loop14 + type: unix-block + dev-loop15: + major: "7" + minor: "15" + path: /dev/loop15 + type: unix-block + dev-loop16: + major: "7" + minor: "16" + path: /dev/loop16 + type: unix-block diff --git a/test/performance/requirements-dev.txt b/test/performance/requirements-dev.txt new file mode 100644 index 00000000..a66721ae --- /dev/null +++ b/test/performance/requirements-dev.txt @@ -0,0 +1,5 @@ +black==24.3.0 +codespell==2.2.4 +flake8==6.0.0 +isort==5.12.0 +licenseheaders==0.8.8 diff --git a/test/performance/requirements-test.txt b/test/performance/requirements-test.txt new file mode 100644 index 00000000..91282e09 --- /dev/null +++ b/test/performance/requirements-test.txt @@ -0,0 +1,5 @@ +coverage[toml]==7.2.5 +pytest==7.3.1 +PyYAML==6.0.1 +tenacity==8.2.3 +pylint==3.2.5 diff --git a/test/performance/tests/conftest.py b/test/performance/tests/conftest.py new file mode 100644 index 00000000..ad815994 --- /dev/null +++ b/test/performance/tests/conftest.py @@ -0,0 +1,189 @@ +# +# Copyright 2024 Canonical, Ltd.# +import itertools +import logging +from pathlib import Path +from typing import Generator, Iterator, List, Optional, Union + +import pytest +from test_util import config, harness, util + +LOG = logging.getLogger(__name__) + + +def _harness_clean(h: harness.Harness): + "Clean up created instances within the test harness." + + if config.SKIP_CLEANUP: + LOG.warning( + "Skipping harness cleanup. " + "It is your job now to clean up cloud resources" + ) + else: + LOG.debug("Cleanup") + h.cleanup() + + +def _generate_inspection_report(h: harness.Harness, instance_id: str): + LOG.debug("Generating inspection report for %s", instance_id) + + inspection_path = Path(config.INSPECTION_REPORTS_DIR) + result = h.exec( + instance_id, + ["/snap/k8s/current/k8s/scripts/inspect.sh", "/inspection-report.tar.gz"], + capture_output=True, + text=True, + check=False, + ) + + (inspection_path / instance_id).mkdir(parents=True, exist_ok=True) + (inspection_path / instance_id / "inspection_report_logs.txt").write_text( + result.stdout + ) + + try: + h.pull_file( + instance_id, + "/inspection-report.tar.gz", + (inspection_path / instance_id / "inspection_report.tar.gz").as_posix(), + ) + except harness.HarnessError as e: + LOG.warning("Failed to pull inspection report: %s", e) + + +@pytest.fixture(scope="session") +def h() -> harness.Harness: + LOG.debug("Create harness for %s", config.SUBSTRATE) + # if config.SUBSTRATE == "local": + # h = harness.LocalHarness() + if config.SUBSTRATE == "lxd": + h = harness.LXDHarness() + else: + raise harness.HarnessError( + "TEST_SUBSTRATE must be one of: local, lxd, multipass, juju" + ) + + yield h + + if config.INSPECTION_REPORTS_DIR is not None: + for instance_id in h.instances: + LOG.debug("Generating inspection reports for session instances") + _generate_inspection_report(h, instance_id) + + _harness_clean(h) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "bootstrap_config: Provide a custom bootstrap config to the bootstrapping node.\n" + "disable_k8s_bootstrapping: By default, the first k8s node is bootstrapped. 
This marker disables that.\n" + "no_setup: No setup steps (pushing snap, bootstrapping etc.) are performed on any node for this test.\n" + "network_type: Specify network type to use for the infrastructure (IPv4, Dualstack or IPv6).\n" + "etcd_count: Mark a test to specify how many etcd instance nodes need to be created (None by default)\n" + "node_count: Mark a test to specify how many instance nodes need to be created\n" + "snap_versions: Mark a test to specify snap_versions for each node\n", + ) + + +@pytest.fixture(scope="function") +def node_count(request) -> int: + node_count_marker = request.node.get_closest_marker("node_count") + if not node_count_marker: + return 1 + node_count_arg, *_ = node_count_marker.args + return int(node_count_arg) + + +def snap_versions(request) -> Iterator[Optional[str]]: + """An endless iterable of snap versions for each node in the test.""" + marking = () + if snap_version_marker := request.node.get_closest_marker("snap_versions"): + marking, *_ = snap_version_marker.args + # endlessly repeat of the configured snap version after exhausting the marking + return itertools.chain(marking, itertools.repeat(None)) + + +@pytest.fixture(scope="function") +def disable_k8s_bootstrapping(request) -> bool: + return bool(request.node.get_closest_marker("disable_k8s_bootstrapping")) + + +@pytest.fixture(scope="function") +def no_setup(request) -> bool: + return bool(request.node.get_closest_marker("no_setup")) + + +@pytest.fixture(scope="function") +def bootstrap_config(request) -> Union[str, None]: + bootstrap_config_marker = request.node.get_closest_marker("bootstrap_config") + if not bootstrap_config_marker: + return None + config, *_ = bootstrap_config_marker.args + return config + + +@pytest.fixture(scope="function") +def network_type(request) -> Union[str, None]: + bootstrap_config_marker = request.node.get_closest_marker("network_type") + if not bootstrap_config_marker: + return "IPv4" + network_type, *_ = bootstrap_config_marker.args + return network_type + + +@pytest.fixture(scope="function") +def instances( + h: harness.Harness, + node_count: int, + tmp_path: Path, + disable_k8s_bootstrapping: bool, + no_setup: bool, + bootstrap_config: Union[str, None], + request, + network_type: str, +) -> Generator[List[harness.Instance], None, None]: + """Construct instances for a cluster. + + Bootstrap and setup networking on the first instance, if `disable_k8s_bootstrapping` marker is not set. + """ + if node_count <= 0: + pytest.xfail("Test requested 0 or fewer instances, skip this test.") + + LOG.info(f"Creating {node_count} instances") + instances: List[harness.Instance] = [] + + for _, snap in zip(range(node_count), snap_versions(request)): + # Create instances and setup the k8s snap in each. + instance = h.new_instance(network_type=network_type) + instances.append(instance) + if not no_setup: + util.setup_k8s_snap(instance, tmp_path, snap) + + if not disable_k8s_bootstrapping and not no_setup: + first_node, *_ = instances + + if bootstrap_config is not None: + first_node.exec( + ["k8s", "bootstrap", "--file", "-"], + input=str.encode(bootstrap_config), + ) + else: + first_node.exec(["k8s", "bootstrap"]) + + yield instances + + if config.SKIP_CLEANUP: + LOG.warning("Skipping clean-up of instances, delete them on your own") + return + + # Cleanup after each test. + # We cannot execute _harness_clean() here as this would also + # remove the session_instance. The harness ensures that everything is cleaned up + # at the end of the test session. 
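+    # Collect each instance's inspection report first, while the instance
+    # still exists, and only then delete the instance.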
+    for instance in instances:
+        if config.INSPECTION_REPORTS_DIR is not None:
+            LOG.debug("Generating inspection reports for test instances")
+            _generate_inspection_report(h, instance.id)
+
+        h.delete_instance(instance.id)
diff --git a/test/performance/tests/test_performance.py b/test/performance/tests/test_performance.py
new file mode 100644
index 00000000..2d05e1e0
--- /dev/null
+++ b/test/performance/tests/test_performance.py
@@ -0,0 +1,32 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+import logging
+from typing import List
+
+import pytest
+from test_util import harness, util
+
+LOG = logging.getLogger(__name__)
+
+
+@pytest.mark.node_count(3)
+def test_load_test(instances: List[harness.Instance]):
+    cluster_node = instances[0]
+    joining_node = instances[1]
+    joining_node_2 = instances[2]
+
+    join_token = util.get_join_token(cluster_node, joining_node)
+    join_token_2 = util.get_join_token(cluster_node, joining_node_2)
+
+    assert join_token != join_token_2
+
+    util.join_cluster(joining_node, join_token)
+    util.join_cluster(joining_node_2, join_token_2)
+
+    util.wait_until_k8s_ready(cluster_node, instances)
+    nodes = util.ready_nodes(cluster_node)
+    assert len(nodes) == 3, "nodes should have joined cluster"
+
+    assert "control-plane" in util.get_local_node_status(cluster_node)
+    assert "control-plane" in util.get_local_node_status(joining_node)
+    assert "control-plane" in util.get_local_node_status(joining_node_2)
diff --git a/test/performance/tests/test_util/config.py b/test/performance/tests/test_util/config.py
new file mode 100644
index 00000000..6272a019
--- /dev/null
+++ b/test/performance/tests/test_util/config.py
@@ -0,0 +1,45 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+import os
+from pathlib import Path
+
+DIR = Path(__file__).absolute().parent
+
+# The following defaults define how long to wait for a condition to be met.
+DEFAULT_WAIT_RETRIES = int(os.getenv("TEST_DEFAULT_WAIT_RETRIES") or 50)
+DEFAULT_WAIT_DELAY_S = int(os.getenv("TEST_DEFAULT_WAIT_DELAY_S") or 10)
+
+MANIFESTS_DIR = DIR / ".." / ".." / "templates"
+
+# INSPECTION_REPORTS_DIR is the directory where inspection reports are stored.
+# If empty, no reports are generated.
+INSPECTION_REPORTS_DIR = os.getenv("TEST_INSPECTION_REPORTS_DIR")
+
+# SKIP_CLEANUP can be used to prevent machines from being automatically
+# destroyed after the tests complete.
+SKIP_CLEANUP = (os.getenv("TEST_SKIP_CLEANUP") or "") == "1"
+
+# SNAP is the path to the snap under test.
+SNAP = os.getenv("TEST_SNAP") or ""
+
+# SNAP_NAME is the name of the snap under test.
+SNAP_NAME = os.getenv("TEST_SNAP_NAME") or "k8s"
+
+# FLAVOR is the flavour to use for running the performance tests.
+FLAVOR = os.getenv("TEST_FLAVOR") or ""
+
+# SUBSTRATE is the substrate to use for running the performance tests.
+# Default 'lxd'.
+SUBSTRATE = os.getenv("TEST_SUBSTRATE") or "lxd"
+
+# LXD_IMAGE is the image to use for LXD containers.
+LXD_IMAGE = os.getenv("TEST_LXD_IMAGE") or "ubuntu:22.04"
+
+# LXD_PROFILE is the profile to use for LXD containers.
+LXD_PROFILE = (
+    os.getenv("TEST_LXD_PROFILE")
+    or (DIR / ".." / ".." / "lxd-profile.yaml").read_text()
+)
+
+# LXD_PROFILE_NAME is the profile name to use for LXD containers.
+LXD_PROFILE_NAME = os.getenv("TEST_LXD_PROFILE_NAME") or "k8s-performance"
diff --git a/test/performance/tests/test_util/harness/__init__.py b/test/performance/tests/test_util/harness/__init__.py
new file mode 100644
index 00000000..d1016962
--- /dev/null
+++ b/test/performance/tests/test_util/harness/__init__.py
@@ -0,0 +1,11 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+from test_util.harness.base import Harness, HarnessError, Instance
+from test_util.harness.lxd import LXDHarness
+
+__all__ = [
+    "HarnessError",
+    "Harness",
+    "Instance",
+    "LXDHarness",
+]
diff --git a/test/performance/tests/test_util/harness/base.py b/test/performance/tests/test_util/harness/base.py
new file mode 100644
index 00000000..381f1304
--- /dev/null
+++ b/test/performance/tests/test_util/harness/base.py
@@ -0,0 +1,114 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+import subprocess
+from functools import cached_property, partial
+
+
+class HarnessError(Exception):
+    """Base error for all our harness failures"""
+
+    pass
+
+
+class Instance:
+    """Reference to a harness and a given instance id.
+
+    Provides convenience methods for an instance to call its harness' methods
+    """
+
+    def __init__(self, h: "Harness", id: str) -> None:
+        self._h = h
+        self._id = id
+
+        self.send_file = partial(h.send_file, id)
+        self.pull_file = partial(h.pull_file, id)
+        self.exec = partial(h.exec, id)
+        self.delete_instance = partial(h.delete_instance, id)
+
+    @property
+    def id(self) -> str:
+        return self._id
+
+    @cached_property
+    def arch(self) -> str:
+        """Return the architecture of the instance"""
+        return self.exec(
+            ["dpkg", "--print-architecture"], text=True, capture_output=True
+        ).stdout.strip()
+
+    def __str__(self) -> str:
+        return f"{self._h.name}:{self.id}"
+
+
+class Harness:
+    """Abstract how performance tests can start and manage multiple machines. This allows
+    writing performance tests that can run on the local machine, LXD, or Multipass with minimum
+    effort.
+    """
+
+    name: str
+
+    def new_instance(self, network_type: str = "IPv4") -> Instance:
+        """Creates a new instance on the infrastructure and returns an object
+        which can be used to interact with it.
+
+        network_type: The network stack for the instance; currently only "IPv4" is supported.
+
+        If the operation fails, a HarnessError is raised.
+        """
+        raise NotImplementedError
+
+    def send_file(self, instance_id: str, source: str, destination: str):
+        """Send a local file to the instance.
+
+        :param instance_id: The instance_id, as returned by new_instance()
+        :param source: Path to the file that will be copied to the instance
+        :param destination: Path in the instance where the file will be copied.
+            This must always be an absolute path.
+
+        If the operation fails, a HarnessError is raised.
+        """
+        raise NotImplementedError
+
+    def pull_file(self, instance_id: str, source: str, destination: str):
+        """Pull a file from the instance and save it on the local machine
+
+        :param instance_id: The instance_id, as returned by new_instance()
+        :param source: Path to the file that will be copied from the instance.
+            This must always be an absolute path.
+        :param destination: Path on the local machine the file will be saved.
+
+        If the operation fails, a HarnessError is raised.
+        """
+        raise NotImplementedError
+
+    def exec(
+        self, instance_id: str, command: list, **kwargs
+    ) -> subprocess.CompletedProcess:
+        """Run a command as root on the instance.
+
+        :param instance_id: The instance_id, as returned by new_instance()
+        :param command: Command for subprocess.run()
+        :param kwargs: Keyword args compatible with subprocess.run()
+
+        If the operation fails, a subprocess.CalledProcessError is raised.
+        """
+        raise NotImplementedError
+
+    def delete_instance(self, instance_id: str):
+        """Delete a previously created instance.
+
+        :param instance_id: The instance_id, as returned by new_instance()
+
+        If the operation fails, a HarnessError is raised.
+        """
+        raise NotImplementedError
+
+    def cleanup(self):
+        """Delete any leftover resources after the tests are done, e.g. delete any
+        instances that might still be running.
+
+        If the operation fails, a HarnessError is raised.
+        """
+        raise NotImplementedError
diff --git a/test/performance/tests/test_util/harness/lxd.py b/test/performance/tests/test_util/harness/lxd.py
new file mode 100644
index 00000000..ce6041aa
--- /dev/null
+++ b/test/performance/tests/test_util/harness/lxd.py
@@ -0,0 +1,181 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+import logging
+import os
+import shlex
+import subprocess
+from pathlib import Path
+from typing import List
+
+from test_util import config
+from test_util.harness import Harness, HarnessError, Instance
+from test_util.util import run, stubbornly
+
+LOG = logging.getLogger(__name__)
+
+
+class LXDHarness(Harness):
+    """A Harness that creates an LXD container for each instance."""
+
+    name = "lxd"
+
+    def next_id(self) -> int:
+        self._next_id += 1
+        return self._next_id
+
+    def __init__(self):
+        super(LXDHarness, self).__init__()
+
+        self._next_id = 0
+        self.profile = config.LXD_PROFILE_NAME
+        self.image = config.LXD_IMAGE
+        self.instances = set()
+
+        self._configure_profile(self.profile, config.LXD_PROFILE)
+
+        self._configure_network(
+            "lxdbr0",
+            "ipv4.address=auto",
+            "ipv4.nat=true",
+        )
+
+        LOG.debug(
+            "Configured LXD substrate (profile %s, image %s)", self.profile, self.image
+        )
+
+    def new_instance(self, network_type: str = "IPv4") -> Instance:
+        instance_id = f"k8s-performance-{os.urandom(3).hex()}-{self.next_id()}"
+
+        LOG.debug("Creating instance %s with image %s", instance_id, self.image)
+        launch_lxd_command = [
+            "lxc",
+            "launch",
+            self.image,
+            instance_id,
+            "-p",
+            "default",
+            "-p",
+            self.profile,
+        ]
+
+        if network_type.lower() != "ipv4":
+            raise HarnessError(
+                f"unknown network type {network_type}, must be 'IPv4'"
+            )
+
+        try:
+            stubbornly(retries=3, delay_s=1).exec(launch_lxd_command)
+            self.instances.add(instance_id)
+
+        except subprocess.CalledProcessError as e:
+            raise HarnessError(f"Failed to create LXD container {instance_id}") from e
+
+        self.exec(instance_id, ["snap", "wait", "system", "seed.loaded"])
+        return Instance(self, instance_id)
+
+    def _configure_profile(self, profile_name: str, profile_config: str):
+        LOG.debug("Checking for LXD profile %s", profile_name)
+        try:
+            run(["lxc", "profile", "show", profile_name])
+        except subprocess.CalledProcessError:
+            try:
+                LOG.debug("Creating LXD profile %s", profile_name)
+                run(["lxc", "profile", "create", profile_name])
+
+            except subprocess.CalledProcessError as e:
+                raise HarnessError(
+                    f"Failed to create LXD profile {profile_name}"
+                ) from e
+
+        try:
+            LOG.debug("Configuring LXD profile %s", profile_name)
+            run(
+                ["lxc", "profile", "edit", profile_name],
+                input=profile_config.encode(),
+            )
+        except subprocess.CalledProcessError as e:
+            raise HarnessError(f"Failed to configure LXD profile {profile_name}") from e
+
+    def _configure_network(self, network_name: str, *network_args: List[str]):
+        LOG.debug("Checking for LXD network %s", network_name)
+        try:
+            run(["lxc", "network", "show", network_name])
+        except subprocess.CalledProcessError:
+            try:
+                LOG.debug("Creating LXD network %s", network_name)
+                run(["lxc", "network", "create", network_name, *network_args])
+
+            except subprocess.CalledProcessError as e:
+                raise HarnessError(
+                    f"Failed to create LXD network {network_name}"
+                ) from e
+
+    def send_file(self, instance_id: str, source: str, destination: str):
+        if instance_id not in self.instances:
+            raise HarnessError(f"unknown instance {instance_id}")
+
+        if not Path(destination).is_absolute():
+            raise HarnessError(f"path {destination} must be absolute")
+
+        LOG.debug(
+            "Copying file %s to instance %s at %s", source, instance_id, destination
+        )
+        try:
+            self.exec(
+                instance_id,
+                ["mkdir", "-m=0777", "-p", Path(destination).parent.as_posix()],
+                capture_output=True,
+            )
+            run(
+                ["lxc", "file", "push", source, f"{instance_id}{destination}"],
+                capture_output=True,
+            )
+        except subprocess.CalledProcessError as e:
+            LOG.error(f"command {e.cmd} failed")
+            LOG.error(f"  {e.returncode=}")
+            LOG.error(f"  {e.stdout.decode()=}")
+            LOG.error(f"  {e.stderr.decode()=}")
+            raise HarnessError("failed to push file") from e
+
+    def pull_file(self, instance_id: str, source: str, destination: str):
+        if instance_id not in self.instances:
+            raise HarnessError(f"unknown instance {instance_id}")
+
+        if not Path(source).is_absolute():
+            raise HarnessError(f"path {source} must be absolute")
+
+        LOG.debug(
+            "Copying file %s from instance %s to %s", source, instance_id, destination
+        )
+        try:
+            run(
+                ["lxc", "file", "pull", f"{instance_id}{source}", destination],
+                stdout=subprocess.DEVNULL,
+            )
+        except subprocess.CalledProcessError as e:
+            raise HarnessError("lxc file pull command failed") from e
+
+    def exec(self, instance_id: str, command: list, **kwargs):
+        if instance_id not in self.instances:
+            raise HarnessError(f"unknown instance {instance_id}")
+
+        LOG.debug("Execute command %s in instance %s", command, instance_id)
+        return run(
+            ["lxc", "shell", instance_id, "--", "bash", "-c", shlex.join(command)],
+            **kwargs,
+        )
+
+    def delete_instance(self, instance_id: str):
+        if instance_id not in self.instances:
+            raise HarnessError(f"unknown instance {instance_id}")
+
+        try:
+            run(["lxc", "rm", instance_id, "--force"])
+        except subprocess.CalledProcessError as e:
+            raise HarnessError(f"failed to delete instance {instance_id}") from e
+
+        self.instances.discard(instance_id)
+
+    def cleanup(self):
+        for instance_id in self.instances.copy():
+            self.delete_instance(instance_id)
diff --git a/test/performance/tests/test_util/util.py b/test/performance/tests/test_util/util.py
new file mode 100644
index 00000000..0a0d45f7
--- /dev/null
+++ b/test/performance/tests/test_util/util.py
@@ -0,0 +1,324 @@
+#
+# Copyright 2024 Canonical, Ltd.
+#
+import json
+import logging
+import re
+import shlex
+import subprocess
+import urllib.request
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, List, Mapping, Optional, Union
+
+import pytest
+from tenacity import (
+    RetryCallState,
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    stop_never,
+    wait_fixed,
+)
+from test_util import config, harness
+
+LOG = logging.getLogger(__name__)
+RISKS = ["stable", "candidate", "beta", "edge"]
+TRACK_RE = re.compile(r"^(\d+)\.(\d+)(\S*)$")
+
+
+def run(command: list, **kwargs) -> subprocess.CompletedProcess:
+    """Log and run command."""
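+    # check=True is applied by default below, so a failing command raises
+    # subprocess.CalledProcessError unless the caller explicitly overrides it.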
kwargs.setdefault("check", True) + + LOG.debug("Execute command %s (kwargs=%s)", shlex.join(command), kwargs) + return subprocess.run(command, **kwargs) + + +def stubbornly( + retries: Optional[int] = None, + delay_s: Optional[Union[float, int]] = None, + exceptions: Optional[tuple] = None, + **retry_kds, +): + """ + Retry a command for a while, using tenacity + + By default, retry immediately and forever until no exceptions occur. + + Some commands need to execute until they pass some condition + > stubbornly(*retry_args).until(*some_condition).exec(*some_command) + + Some commands need to execute until they complete + > stubbornly(*retry_args).exec(*some_command) + + : param retries int: convenience param to use stop=retry.stop_after_attempt() + : param delay_s float|int: convenience param to use wait=retry.wait_fixed(delay_s) + : param exceptions Tuple[Exception]: convenience param to use retry=retry.retry_if_exception_type(exceptions) + : param retry_kds Mapping: direct interface to all tenacity arguments for retrying + """ + + def _before_sleep(retry_state: RetryCallState): + attempt = retry_state.attempt_number + tries = f"/{retries}" if retries is not None else "" + LOG.info( + f"Attempt {attempt}{tries} failed. Error: {retry_state.outcome.exception()}" + ) + LOG.info(f"Retrying in {delay_s} seconds...") + + _waits = wait_fixed(delay_s) if delay_s is not None else wait_fixed(0) + _stops = stop_after_attempt(retries) if retries is not None else stop_never + _exceptions = exceptions or (Exception,) # default to retry on all exceptions + + _retry_args = dict( + wait=_waits, + stop=_stops, + retry=retry_if_exception_type(_exceptions), + before_sleep=_before_sleep, + ) + # Permit any tenacity retry overrides from these ^defaults + _retry_args.update(retry_kds) + + class Retriable: + def __init__(self) -> None: + self._condition = None + self._run = partial(run, capture_output=True) + + @retry(**_retry_args) + def exec( + self, + command_args: List[str], + **command_kwds, + ): + """ + Execute a command against a harness or locally with subprocess to be retried. + + :param List[str] command_args: The command to be executed, as a str or list of str + :param Mapping[str,str] command_kwds: Additional keyword arguments to be passed to exec + """ + + try: + resp = self._run(command_args, **command_kwds) + except subprocess.CalledProcessError as e: + LOG.warning(f" rc={e.returncode}") + LOG.warning(f" stdout={e.stdout.decode()}") + LOG.warning(f" stderr={e.stderr.decode()}") + raise + if self._condition: + assert self._condition(resp), "Failed to meet condition" + return resp + + def on(self, instance: harness.Instance) -> "Retriable": + """ + Target the command at some instance. + + :param instance Instance: Instance on a test harness. 
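+
+            Example (hypothetical; mirrors wait_until_k8s_ready below):
+                stubbornly(retries=3, delay_s=2).on(node).until(
+                    lambda p: "Ready" in p.stdout.decode()
+                ).exec(["k8s", "kubectl", "get", "node", "--no-headers"])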
+ """ + self._run = partial(instance.exec, capture_output=True) + return self + + def until( + self, condition: Callable[[subprocess.CompletedProcess], bool] = None + ) -> "Retriable": + """ + Test the output of the executed command against an expected response + + :param Callable condition: a callable which returns a truth about the command output + """ + self._condition = condition + return self + + return Retriable() + + +def _as_int(value: Optional[str]) -> Optional[int]: + """Convert a string to an integer.""" + try: + return int(value) + except (TypeError, ValueError): + return None + + +def setup_k8s_snap( + instance: harness.Instance, + tmp_path: Path, + snap: Optional[str] = None, + connect_interfaces=True, +): + """Installs and sets up the snap on the given instance and connects the interfaces. + + Args: + instance: instance on which to install the snap + tmp_path: path to store the snap on the instance + snap: choice of track, channel, revision, or file path + a snap track to install + a snap channel to install + a snap revision to install + a path to the snap to install + """ + cmd = ["snap", "install", "--classic"] + which_snap = snap or config.SNAP + + if not which_snap: + pytest.fail("Set TEST_SNAP to the channel, revision, or path to the snap") + + if isinstance(which_snap, str) and which_snap.startswith("/"): + LOG.info("Install k8s snap by path") + snap_path = (tmp_path / "k8s.snap").as_posix() + instance.send_file(which_snap, snap_path) + cmd += ["--dangerous", snap_path] + elif snap_revision := _as_int(which_snap): + LOG.info("Install k8s snap by revision") + cmd += [config.SNAP_NAME, "--revision", snap_revision] + elif "/" in which_snap or which_snap in RISKS: + LOG.info("Install k8s snap by specific channel: %s", which_snap) + cmd += [config.SNAP_NAME, "--channel", which_snap] + elif channel := tracks_least_risk(which_snap, instance.arch): + LOG.info("Install k8s snap by least risky channel: %s", channel) + cmd += [config.SNAP_NAME, "--channel", channel] + + instance.exec(cmd) + if connect_interfaces: + LOG.info("Ensure k8s interfaces and network requirements") + instance.exec(["/snap/k8s/current/k8s/hack/init.sh"], stdout=subprocess.DEVNULL) + + +def wait_until_k8s_ready( + control_node: harness.Instance, + instances: List[harness.Instance], + retries: int = config.DEFAULT_WAIT_RETRIES, + delay_s: int = config.DEFAULT_WAIT_DELAY_S, + node_names: Mapping[str, str] = {}, +): + """ + Validates that the K8s node is in Ready state. + + By default, the hostname of the instances is used as the node name. + If the instance name is different from the hostname, the instance name should be passed to the + node_names dictionary, e.g. {"instance_id": "node_name"}. 
+ """ + for instance in instances: + if instance.id in node_names: + node_name = node_names[instance.id] + else: + node_name = hostname(instance) + + result = ( + stubbornly(retries=retries, delay_s=delay_s) + .on(control_node) + .until(lambda p: " Ready" in p.stdout.decode()) + .exec(["k8s", "kubectl", "get", "node", node_name, "--no-headers"]) + ) + LOG.info("Kubelet registered successfully!") + LOG.info("%s", result.stdout.decode()) + + +def wait_for_dns(instance: harness.Instance): + LOG.info("Waiting for DNS to be ready") + instance.exec(["k8s", "x-wait-for", "dns"]) + + +def wait_for_network(instance: harness.Instance): + LOG.info("Waiting for network to be ready") + instance.exec(["k8s", "x-wait-for", "network"]) + + +def hostname(instance: harness.Instance) -> str: + """Return the hostname for a given instance.""" + resp = instance.exec(["hostname"], capture_output=True) + return resp.stdout.decode().strip() + + +def get_local_node_status(instance: harness.Instance) -> str: + resp = instance.exec(["k8s", "local-node-status"], capture_output=True) + return resp.stdout.decode().strip() + + +def get_nodes(control_node: harness.Instance) -> List[Any]: + """Get a list of existing nodes. + + Args: + control_node: instance on which to execute check + + Returns: + list of nodes + """ + result = control_node.exec( + ["k8s", "kubectl", "get", "nodes", "-o", "json"], capture_output=True + ) + assert result.returncode == 0, "Failed to get nodes with kubectl" + node_list = json.loads(result.stdout.decode()) + assert node_list["kind"] == "List", "Should have found a list of nodes" + return [node for node in node_list["items"]] + + +def ready_nodes(control_node: harness.Instance) -> List[Any]: + """Get a list of the ready nodes. + + Args: + control_node: instance on which to execute check + + Returns: + list of nodes + """ + return [ + node + for node in get_nodes(control_node) + if all( + condition["status"] == "False" + for condition in node["status"]["conditions"] + if condition["type"] != "Ready" + ) + ] + + +# Create a token to join a node to an existing cluster +def get_join_token( + initial_node: harness.Instance, joining_cplane_node: harness.Instance, *args: str +) -> str: + out = initial_node.exec( + ["k8s", "get-join-token", joining_cplane_node.id, *args], + capture_output=True, + ) + return out.stdout.decode().strip() + + +# Join an existing cluster. +def join_cluster(instance: harness.Instance, join_token: str): + instance.exec(["k8s", "join-cluster", join_token]) + +def tracks_least_risk(track: str, arch: str) -> str: + """Determine the snap channel with the least risk in the provided track. 
+ + Args: + track: the track to determine the least risk channel for + arch: the architecture to narrow the revision + + Returns: + the channel associated with the least risk + """ + LOG.debug("Determining least risk channel for track: %s on %s", track, arch) + if track == "latest": + return f"latest/edge/{config.FLAVOR or 'classic'}" + + INFO_URL = f"https://api.snapcraft.io/v2/snaps/info/{config.SNAP_NAME}" + HEADERS = { + "Snap-Device-Series": "16", + "User-Agent": "Mozilla/5.0", + } + + req = urllib.request.Request(INFO_URL, headers=HEADERS) + with urllib.request.urlopen(req) as response: + snap_info = json.loads(response.read().decode()) + + risks = [ + channel["channel"]["risk"] + for channel in snap_info["channel-map"] + if channel["channel"]["track"] == track + and channel["channel"]["architecture"] == arch + ] + if not risks: + raise ValueError(f"No risks found for track: {track}") + risk_level = {"stable": 0, "candidate": 1, "beta": 2, "edge": 3} + channel = f"{track}/{min(risks, key=lambda r: risk_level[r])}" + LOG.info("Least risk channel from track %s is %s", track, channel) + return channel diff --git a/test/performance/tox.ini b/test/performance/tox.ini new file mode 100644 index 00000000..e3d8431c --- /dev/null +++ b/test/performance/tox.ini @@ -0,0 +1,51 @@ +[tox] +skipsdist = True +skip_missing_interpreters = True +env_list = format, lint, performance + +[testenv] +set_env = + PYTHONBREAKPOINT=pdb.set_trace + PY_COLORS=1 +passenv = + PYTHONPATH + +[testenv:format] +description = Apply coding style standards to code +deps = -r {toxinidir}/requirements-dev.txt +commands = + licenseheaders -t {toxinidir}/.copyright.tmpl -cy -o 'Canonical, Ltd' -d {toxinidir}/tests + isort {toxinidir}/tests --profile=black + black {toxinidir}/tests + +[testenv:lint] +description = Check code against coding style standards +deps = -r {toxinidir}/requirements-dev.txt +commands = + codespell {toxinidir}/tests + flake8 {toxinidir}/tests + licenseheaders -t {toxinidir}/.copyright.tmpl -cy -o 'Canonical, Ltd' -d {toxinidir}/tests --dry + isort {toxinidir}/tests --profile=black --check + black {toxinidir}/tests --check --diff + +[testenv:performance] +description = Run performance tests +deps = + -r {toxinidir}/requirements-test.txt +commands = + pytest -vv \ + --maxfail 1 \ + --tb native \ + --log-cli-level DEBUG \ + --disable-warnings \ + {posargs} \ + {toxinidir}/tests +passenv = + TEST_* + +[flake8] +max-line-length = 120 +select = E,W,F,C,N +ignore = W503 +exclude = venv,.git,.tox,.tox_env,.venv,build,dist,*.egg_info +show-source = true
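As a usage sketch of the utilities this patch introduces, a new performance test could be added under `tests/` along the lines below. This is a hypothetical example, not part of the patch; it reuses only helpers defined above (`get_join_token`, `join_cluster`, `wait_until_k8s_ready`, `ready_nodes`) together with the `instances` fixture and `node_count` marker from `conftest.py`.

```python
#
# Copyright 2024 Canonical, Ltd.
#
from typing import List

import pytest
from test_util import harness, util


@pytest.mark.node_count(2)
def test_two_node_join(instances: List[harness.Instance]):
    # The fixture bootstraps the first instance; the second joins it.
    cluster_node, joining_node = instances

    # Tokens are minted on the bootstrapped node for each joining node.
    token = util.get_join_token(cluster_node, joining_node)
    util.join_cluster(joining_node, token)

    # Wait until both nodes report Ready before asserting on the cluster.
    util.wait_until_k8s_ready(cluster_node, instances)
    assert len(util.ready_nodes(cluster_node)) == 2
```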