Merge 0e3adbf into 7f693e2

canonical · Jan 30, 2025 · 4565455 · 4565455
2 parents 7f693e2 + 0e3adbf
commit 4565455
Show file tree

Hide file tree

Showing 12 changed files with 149 additions and 25 deletions.
diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
@@ -17,7 +17,7 @@ jobs:
   # INTEGRATION_TEST_ARGS to operator-workflows automatically.
   integration-tests:
     name: Integration test with juju 3.1
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.1/stable
@@ -30,19 +30,20 @@ jobs:
       self-hosted-runner-label: stg-private-endpoint
   openstack-interface-tests-private-endpoint:
     name: openstack interface test using private-endpoint
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.6/stable
       pre-run-script: scripts/setup-lxd.sh
       provider: lxd
       test-tox-env: integration-juju3.6
       modules: '["test_runner_manager_openstack"]'
+      extra-arguments: '--log-format="%(asctime)s %(levelname)s %(message)s"'
       self-hosted-runner: true
       self-hosted-runner-label: stg-private-endpoint
   openstack-integration-tests-private-endpoint:
     name: Integration test using private-endpoint
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.6/stable

diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py
@@ -189,7 +189,11 @@ def _run_health_check_cloud_init(
     """
     result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status")
     if not result.ok:
-        logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr)
+        logger.error("cloud-init status command failed on %s: %s.", server_name, result.stderr)
+        cloud_init_log_output_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init-output.log")
+        logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
+        cloud_init_log_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init.log")
+        logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout)
         return False
 
     if CloudInitStatus.DONE in result.stdout:

diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py
@@ -653,6 +653,10 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
             logger.warning(
                 "cloud-init status command failed on %s: %s.", instance.server_name, result.stderr
             )
+            cloud_init_log_output_result = ssh_conn.run("cat /var/log/cloud-init-output.log", warn=True, timeout=60)
+            logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
+            cloud_init_log_result = ssh_conn.run("cat /var/log/cloud-init.log")
+            logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout, warn=True, timeout=60)
             raise RunnerStartError(f"Runner startup process not found on {instance.server_name}")
         # A short running job may have already completed and exited the runner, hence check the
         # condition via cloud-init status check.

diff --git a/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 b/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2
@@ -9,6 +9,7 @@ su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env'
 
 {% if aproxy_address %}
 snap install aproxy --edge
+snap watch --last=auto-refresh?
 snap set aproxy proxy={{ aproxy_address }} listen=:54969
 cat << EOF > /etc/nftables.conf
 define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+')

diff --git a/src/charm.py b/src/charm.py
@@ -110,7 +110,7 @@ class ReconcileRunnersEvent(EventBase):
 
 
 def catch_charm_errors(
-    func: Callable[["GithubRunnerCharm", EventT], None]
+    func: Callable[["GithubRunnerCharm", EventT], None],
 ) -> Callable[["GithubRunnerCharm", EventT], None]:
     """Catch common errors in charm.
 
@@ -145,7 +145,7 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None:
 
 
 def catch_action_errors(
-    func: Callable[["GithubRunnerCharm", ActionEvent], None]
+    func: Callable[["GithubRunnerCharm", ActionEvent], None],
 ) -> Callable[["GithubRunnerCharm", ActionEvent], None]:
     """Catch common errors in actions.
 
@@ -519,15 +519,13 @@ def _apt_install(self, packages: Sequence[str]) -> None:
     def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
         """Handle debug ssh relation changed event."""
         state = self._setup_state()
+        self.unit.status = MaintenanceStatus("Reconciling runners - added debug-ssh")
 
         if not self._get_set_image_ready_status():
             return
         runner_scaler = self._get_runner_scaler(state)
         runner_scaler.flush()
-        try:
-            runner_scaler.reconcile(state.runner_config.virtual_machines)
-        except ReconcileError:
-            logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
+        self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)
 
     @catch_charm_errors
     def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None:

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -45,7 +45,7 @@
 from tests.integration.helpers.openstack import OpenStackInstanceHelper, PrivateEndpointConfigs
 from tests.status_name import ACTIVE
 
-IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 30 * 60
+IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 20 * 60
 
 # The following line is required because we are using request.getfixturevalue in conjunction
 # with pytest-asyncio. See https://github.com/pytest-dev/pytest-asyncio/issues/112
@@ -317,7 +317,7 @@ async def app_no_runner(
 ) -> AsyncIterator[Application]:
     """Application with no runner."""
     await basic_app.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "0"})
-    await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=20 * 60)
     yield basic_app
 
 
@@ -339,7 +339,8 @@ async def image_builder_fixture(
             config={
                 "app-channel": "edge",
                 "build-interval": "12",
-                "revision-history-limit": "5",
+                # JAVI be careful, maybe all tests use the same names for the images
+                "revision-history-limit": "15",
                 "openstack-auth-url": private_endpoint_config["auth_url"],
                 # Bandit thinks this is a hardcoded password
                 "openstack-password": private_endpoint_config["password"],  # nosec: B105
@@ -401,9 +402,33 @@ async def app_openstack_runner_fixture(
             wait_idle=False,
         )
         await model.integrate(f"{image_builder.name}:image", f"{application.name}:image")
-    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(
+        apps=[application.name, image_builder.name], status=ACTIVE, timeout=20 * 60
+    )
 
-    return application
+    # better use test-mode charm config... but let's see
+    command = "find /var/lib/juju -type f -name 'constants.py' -exec sed -i 's/^CREATE_SERVER_TIMEOUT = .*/CREATE_SERVER_TIMEOUT = 900/gI' {} \\;"
+    run_actions = await application.run(command)
+    logging.info("JAVI run_actions %s", run_actions)
+    for action_result in run_actions.actions:
+        logging.info("JAVI action_result %s", action_result)
+        action = action_result.action
+        logging.info("JAVI action %s", action)
+        # no comment...
+        action_id = action.tag
+        if action_id.startswith("action-"):
+            # strip the action- part of "action-<num>" tag
+            action_id = action_id[7:]
+        action = await model._wait_for_new("action", action_id)
+        result = await action.wait()
+        logging.info("JAVI output of one unit of CREATE_SERVER_TIMEOUT %s", result.results)
+
+    yield application
+    try:
+        logging.info("JAVI after yield in app_openstack_runner_fixture")
+        # get_file_content(unit, filename)
+    except Exception:
+        logging.exception("JAVI something failed after yield")
 
 
 @pytest_asyncio.fixture(scope="module", name="app_scheduled_events")
@@ -415,7 +440,7 @@ async def app_scheduled_events_fixture(
     application = app_openstack_runner
     await application.set_config({"reconcile-interval": "8"})
     await application.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"})
-    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=20 * 60)
     await reconcile(app=application, model=model)
     return application
 

diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py
@@ -2,6 +2,7 @@
 #  See LICENSE file for licensing details.
 import logging
 import secrets
+import threading
 from asyncio import sleep
 from typing import Optional, TypedDict
 
@@ -17,6 +18,50 @@
 logger = logging.getLogger(__name__)
 
 
+async def javi_wait_for_idle(openstack_connection, model, *args, **kwargs) -> None:
+    """TODO.
+
+    Args:
+        openstack_connection: OpenStack connection object.
+        model: model
+        args: args
+        kwargs: kwargs
+    """
+    logger.info("javi_wait_for_idle")
+    e = threading.Event()
+
+    def _log_openstack():
+        """TODO."""
+        end_loop = False
+        while True:
+            end_loop = e.wait(20)
+            # probably not thread safe, but...
+            try:
+                servers = openstack_connection.list_servers()
+            except Exception as ex:
+                logger.exception("JAVI in log openstack thread")
+                raise ex
+            logger.info(" [ runner list ]")
+            for runner in servers:
+                logger.info(
+                    " [ runner %s ] status %s created %s updated %s",
+                    runner.name,
+                    runner.status,
+                    runner.created_at,
+                    runner.updated_at,
+                )
+            if end_loop:
+                break
+
+    try:
+        t = threading.Thread(target=_log_openstack)
+        t.start()
+        await model.wait_for_idle(*args, **kwargs)
+    finally:
+        e.set()
+        t.join()
+
+
 class OpenStackInstanceHelper:
     """Helper class to interact with OpenStack instances."""
 
@@ -179,6 +224,26 @@ async def get_runner_name(self, unit: Unit) -> str:
         assert len(runners) == 1
         return runners[0].name
 
+    def log_runners(self, unit: Unit) -> None:
+        """TODO LOG RUNNERS.
+
+        Expects only one runner to be present.
+
+        Args:
+            unit: The GitHub Runner Charm unit to get the runner name for.
+        """
+        runners = self._get_runners(unit)
+        logger.info("[ list of runners for unit %s]", unit)
+        for runner in runners:
+            logger.info(
+                " [ runner %s ] status %s created %s updated %s",
+                runner.name,
+                runner.status,
+                runner.created_at,
+                runner.updated_at,
+            )
+        logger.info("[ end list of runners for unit %s]")
+
     async def delete_single_runner(self, unit: Unit) -> None:
         """Delete the only runner.
 

diff --git a/tests/integration/test_charm_fork_path_change.py b/tests/integration/test_charm_fork_path_change.py
@@ -44,15 +44,31 @@ async def test_path_config_change(
 
     logger.info("Ensure there is a runner (this calls reconcile)")
     await instance_helper.ensure_charm_has_runner(app_with_forked_repo)
+    logger.info("after ensure_charm_has_runner")
+    instance_helper.log_runners(unit)
 
+    logger.info("Change Path config option")
     await app_with_forked_repo.set_config({PATH_CONFIG_NAME: path})
+    instance_helper.log_runners(unit)
+
+    status = await model.get_status()
+    logger.info(" status : %s", status)
 
     logger.info("Reconciling (again)")
     await reconcile(app=app_with_forked_repo, model=model)
 
+    logger.info("after Reconciling (again)")
+    instance_helper.log_runners(unit)
+
+    status = await model.get_status()
+    logger.info("JAVI status 2: %s", status)
+
     runner_names = await instance_helper.get_runner_names(unit)
     logger.info("runners: %s", runner_names)
     assert len(runner_names) == 1
+    #this will crash if there is not exactly one
+    logger.info("runner info: %s", instance_helper._get_single_runner(unit))
+
     runner_name = runner_names[0]
 
     runners_in_repo = github_repository.get_self_hosted_runners()
@@ -62,4 +78,4 @@ async def test_path_config_change(
         filter(lambda runner: runner.name == runner_name, runners_in_repo)
     )
 
-    assert len(runner_in_repo_with_same_name) == 1
+    assert len(runner_in_repo_with_same_name) == 1, "there has to be 1 runner in the repo"
diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py
@@ -15,7 +15,7 @@
 from juju.model import Model
 
 from tests.integration.helpers.common import wait_for
-from tests.integration.helpers.openstack import OpenStackInstanceHelper
+from tests.integration.helpers.openstack import OpenStackInstanceHelper, javi_wait_for_idle
 from tests.status_name import ACTIVE
 
 logger = logging.getLogger(__name__)
@@ -28,6 +28,7 @@ async def test_update_interval(
     model: Model,
     app_scheduled_events: Application,
     instance_helper: OpenStackInstanceHelper,
+    openstack_connection,
 ) -> None:
     """
     arrange: A working application with one runner.
@@ -57,7 +58,7 @@ async def _no_runners_available() -> bool:
 
     logger.info("Wait for 10 minutes")
     await sleep(10 * 60)
-    await model.wait_for_idle(status=ACTIVE, timeout=20 * 60)
+    await javi_wait_for_idle(openstack_connection, model, status=ACTIVE, timeout=20 * 60)
 
     newnames = await instance_helper.get_runner_names(unit)
     assert len(newnames) == 1, "There should be one runner after reconciliation"

diff --git a/tests/integration/test_charm_upgrade.py b/tests/integration/test_charm_upgrade.py
@@ -91,10 +91,10 @@ async def test_charm_upgrade(
     )
     await model.integrate(f"{image_builder.name}:image", f"{application.name}:image")
     await model.wait_for_idle(
-        apps=[application.name],
+        apps=[application.name, image_builder.name],
         raise_on_error=False,
         wait_for_active=True,
-        timeout=180 * 60,
+        timeout=20 * 60,
         check_freq=30,
     )
     origin = client.CharmOrigin(
@@ -125,6 +125,6 @@ async def test_charm_upgrade(
         apps=[application.name],
         raise_on_error=False,
         wait_for_active=True,
-        timeout=180 * 60,
+        timeout=20 * 60,
         check_freq=30,
     )
diff --git a/tests/integration/test_debug_ssh.py b/tests/integration/test_debug_ssh.py
@@ -11,7 +11,7 @@
 from juju.model import Model
 
 from tests.integration.helpers.common import dispatch_workflow, get_job_logs
-from tests.integration.helpers.openstack import OpenStackInstanceHelper
+from tests.integration.helpers.openstack import OpenStackInstanceHelper, javi_wait_for_idle
 from tests.status_name import ACTIVE
 
 logger = logging.getLogger(__name__)
@@ -28,24 +28,30 @@ async def test_ssh_debug(
     test_github_branch: Branch,
     tmate_ssh_server_unit_ip: str,
     instance_helper: OpenStackInstanceHelper,
+    openstack_connection,
 ):
     """
     arrange: given an integrated GitHub-Runner charm and tmate-ssh-server charm.
     act: when canonical/action-tmate is triggered.
     assert: the ssh connection info from action-log and tmate-ssh-server matches.
     """
-    await model.wait_for_idle(status=ACTIVE, timeout=60 * 120)
+    await javi_wait_for_idle(openstack_connection, model, status=ACTIVE, timeout=60 * 120)
 
     unit = app_no_wait_tmate.units[0]
     # We need the runner to connect to the current machine, instead of the tmate_ssh_server unit,
     # as the tmate_ssh_server is not routable.
+    logger.info("before iptables")
+    instance_helper.log_runners(unit)
+
     dnat_comman_in_runner = f"sudo iptables -t nat -A OUTPUT -p tcp -d {tmate_ssh_server_unit_ip} --dport 10022 -j DNAT --to-destination 127.0.0.1:10022"
     _, _, _ = await instance_helper.run_in_instance(
         unit,
         dnat_comman_in_runner,
         assert_on_failure=True,
     )
     await instance_helper.expose_to_instance(unit=unit, port=10022, host=tmate_ssh_server_unit_ip)
+    logger.info("after exposing instance")
+    instance_helper.log_runners(unit)
 
     # trigger tmate action
     logger.info("Dispatching workflow_dispatch_ssh_debug.yaml workflow.")
@@ -59,6 +65,8 @@ async def test_ssh_debug(
         workflow_id_or_name=SSH_DEBUG_WORKFLOW_FILE_NAME,
     )
 
+    logger.info("after workflow run")
+    instance_helper.log_runners(unit)
     logs = get_job_logs(workflow_run.jobs("latest")[0])
 
     # ensure ssh connection info printed in logs.