Merge 097c98d into 7f693e2

canonical · Jan 30, 2025 · 73d3228 · 73d3228
2 parents 7f693e2 + 097c98d
commit 73d3228
Show file tree

Hide file tree

Showing 12 changed files with 166 additions and 25 deletions.
diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml
@@ -17,7 +17,7 @@ jobs:
   # INTEGRATION_TEST_ARGS to operator-workflows automatically.
   integration-tests:
     name: Integration test with juju 3.1
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.1/stable
@@ -30,19 +30,20 @@ jobs:
       self-hosted-runner-label: stg-private-endpoint
   openstack-interface-tests-private-endpoint:
     name: openstack interface test using private-endpoint
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.6/stable
       pre-run-script: scripts/setup-lxd.sh
       provider: lxd
       test-tox-env: integration-juju3.6
       modules: '["test_runner_manager_openstack"]'
+      extra-arguments: '--log-format="%(asctime)s %(levelname)s %(message)s"'
       self-hosted-runner: true
       self-hosted-runner-label: stg-private-endpoint
   openstack-integration-tests-private-endpoint:
     name: Integration test using private-endpoint
-    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main
+    uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing
     secrets: inherit
     with:
       juju-channel: 3.6/stable

diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py
@@ -189,7 +189,11 @@ def _run_health_check_cloud_init(
     """
     result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status")
     if not result.ok:
-        logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr)
+        logger.error("cloud-init status command failed on %s: %s.", server_name, result.stderr)
+        cloud_init_log_output_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init-output.log")
+        logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
+        cloud_init_log_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init.log")
+        logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout)
         return False
 
     if CloudInitStatus.DONE in result.stdout:

diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py
@@ -653,6 +653,10 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
             logger.warning(
                 "cloud-init status command failed on %s: %s.", instance.server_name, result.stderr
             )
+            cloud_init_log_output_result = ssh_conn.run("cat /var/log/cloud-init-output.log", warn=True, timeout=60)
+            logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout)
+            cloud_init_log_result = ssh_conn.run("cat /var/log/cloud-init.log", warn=True, timeout=60)
+            logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout)
             raise RunnerStartError(f"Runner startup process not found on {instance.server_name}")
         # A short running job may have already completed and exited the runner, hence check the
         # condition via cloud-init status check.

diff --git a/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 b/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2
@@ -9,6 +9,8 @@ su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env'
 
 {% if aproxy_address %}
 snap install aproxy --edge
+snap refresh --hold=2h
+snap watch --last=auto-refresh?
 snap set aproxy proxy={{ aproxy_address }} listen=:54969
 cat << EOF > /etc/nftables.conf
 define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+')

diff --git a/src/charm.py b/src/charm.py
@@ -110,7 +110,7 @@ class ReconcileRunnersEvent(EventBase):
 
 
 def catch_charm_errors(
-    func: Callable[["GithubRunnerCharm", EventT], None]
+    func: Callable[["GithubRunnerCharm", EventT], None],
 ) -> Callable[["GithubRunnerCharm", EventT], None]:
     """Catch common errors in charm.
 
@@ -145,7 +145,7 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None:
 
 
 def catch_action_errors(
-    func: Callable[["GithubRunnerCharm", ActionEvent], None]
+    func: Callable[["GithubRunnerCharm", ActionEvent], None],
 ) -> Callable[["GithubRunnerCharm", ActionEvent], None]:
     """Catch common errors in actions.
 
@@ -336,6 +336,7 @@ def _on_upgrade_charm(self, _: UpgradeCharmEvent) -> None:
     @catch_charm_errors
     def _on_config_changed(self, _: ConfigChangedEvent) -> None:
         """Handle the configuration change."""
+        logger.info("JAVI CHARM _ON_CONFIG_CHANGED")
         state = self._setup_state()
         self._set_reconcile_timer()
 
@@ -351,6 +352,7 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None:
         if not self._get_set_image_ready_status():
             return
         if state.charm_config.token != self._stored.token:
+            logger.info("JAVI CHARM _ON_CONFIG_CHANGED FLUSH RECONCILE")
             runner_scaler = self._get_runner_scaler(state)
             runner_scaler.flush(flush_mode=FlushMode.FLUSH_IDLE)
             self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)
@@ -359,16 +361,19 @@ def _on_config_changed(self, _: ConfigChangedEvent) -> None:
     @catch_charm_errors
     def _on_reconcile_runners(self, _: ReconcileRunnersEvent) -> None:
         """Event handler for reconciling runners."""
+        logger.info("JAVI CHARM _on_reconcile_runners")
         self._trigger_reconciliation()
 
     @catch_charm_errors
     def _on_database_created(self, _: ops.RelationEvent) -> None:
         """Handle the MongoDB database created event."""
+        logger.info("JAVI CHARM _on_database_created")
         self._trigger_reconciliation()
 
     @catch_charm_errors
     def _on_endpoints_changed(self, _: ops.RelationEvent) -> None:
         """Handle the MongoDB endpoints changed event."""
+        logger.info("JAVI CHARM _on_endpoints_changed")
         self._trigger_reconciliation()
 
     def _trigger_reconciliation(self) -> None:
@@ -388,6 +393,7 @@ def _on_check_runners_action(self, event: ActionEvent) -> None:
         Args:
             event: The event fired on check_runners action.
         """
+        logger.info("JAVI CHARM _on_check_runners_action")
         state = self._setup_state()
 
         runner_scaler = self._get_runner_scaler(state)
@@ -410,6 +416,7 @@ def _on_reconcile_runners_action(self, event: ActionEvent) -> None:
         Args:
             event: Action event of reconciling the runner.
         """
+        logger.info("JAVI CHARM _on_reconcile_runners_action")
         self.unit.status = MaintenanceStatus("Reconciling runners")
         state = self._setup_state()
 
@@ -437,6 +444,7 @@ def _on_flush_runners_action(self, event: ActionEvent) -> None:
         Args:
             event: Action event of flushing all runners.
         """
+        logger.info("JAVI CHARM _on_flush_runners_action")
         state = self._setup_state()
 
         # Flushing mode not implemented for OpenStack yet.
@@ -461,6 +469,7 @@ def _on_update_dependencies_action(self, event: ActionEvent) -> None:
         Args:
             event: Action event of updating dependencies.
         """
+        logger.info("JAVI CHARM _on_update_dependencies_action")
         # No dependencies managed by the charm for OpenStack-based runners.
         event.set_results({"flush": False})
 
@@ -518,16 +527,15 @@ def _apt_install(self, packages: Sequence[str]) -> None:
     @catch_charm_errors
     def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None:
         """Handle debug ssh relation changed event."""
+        logger.info("JAVI CHARM _on_debug_ssh_relation_changed")
+        self.unit.status = MaintenanceStatus("Added debug-ssh relation")
         state = self._setup_state()
 
         if not self._get_set_image_ready_status():
             return
         runner_scaler = self._get_runner_scaler(state)
         runner_scaler.flush()
-        try:
-            runner_scaler.reconcile(state.runner_config.virtual_machines)
-        except ReconcileError:
-            logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG)
+        self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines)
 
     @catch_charm_errors
     def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None:
@@ -543,6 +551,7 @@ def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None:
     @catch_charm_errors
     def _on_image_relation_changed(self, _: ops.RelationChangedEvent) -> None:
         """Handle image relation changed event."""
+        logger.info("JAVI CHARM _on_image_relation_changed")
         state = self._setup_state()
         self.unit.status = MaintenanceStatus("Update image for runners")
 

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -45,7 +45,7 @@
 from tests.integration.helpers.openstack import OpenStackInstanceHelper, PrivateEndpointConfigs
 from tests.status_name import ACTIVE
 
-IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 30 * 60
+IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 20 * 60
 
 # The following line is required because we are using request.getfixturevalue in conjunction
 # with pytest-asyncio. See https://github.com/pytest-dev/pytest-asyncio/issues/112
@@ -317,7 +317,7 @@ async def app_no_runner(
 ) -> AsyncIterator[Application]:
     """Application with no runner."""
     await basic_app.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "0"})
-    await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=20 * 60)
     yield basic_app
 
 
@@ -339,7 +339,8 @@ async def image_builder_fixture(
             config={
                 "app-channel": "edge",
                 "build-interval": "12",
-                "revision-history-limit": "5",
+                # JAVI be careful, maybe all tests use the same names for the images
+                "revision-history-limit": "15",
                 "openstack-auth-url": private_endpoint_config["auth_url"],
                 # Bandit thinks this is a hardcoded password
                 "openstack-password": private_endpoint_config["password"],  # nosec: B105
@@ -401,9 +402,33 @@ async def app_openstack_runner_fixture(
             wait_idle=False,
         )
         await model.integrate(f"{image_builder.name}:image", f"{application.name}:image")
-    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(
+        apps=[application.name, image_builder.name], status=ACTIVE, timeout=20 * 60
+    )
 
-    return application
+    # better use test-mode charm config... but let's see
+    command = "find /var/lib/juju -type f -name 'constants.py' -exec sed -i 's/^CREATE_SERVER_TIMEOUT = .*/CREATE_SERVER_TIMEOUT = 900/gI' {} \\;"
+    run_actions = await application.run(command)
+    logging.info("JAVI run_actions %s", run_actions)
+    for action_result in run_actions.actions:
+        logging.info("JAVI action_result %s", action_result)
+        action = action_result.action
+        logging.info("JAVI action %s", action)
+        # no comment...
+        action_id = action.tag
+        if action_id.startswith("action-"):
+            # strip the action- part of "action-<num>" tag
+            action_id = action_id[7:]
+        action = await model._wait_for_new("action", action_id)
+        result = await action.wait()
+        logging.info("JAVI output of one unit of CREATE_SERVER_TIMEOUT %s", result.results)
+
+    yield application
+    try:
+        logging.info("JAVI after yield in app_openstack_runner_fixture")
+        # get_file_content(unit, filename)
+    except Exception:
+        logging.exception("JAVI something failed after yield")
 
 
 @pytest_asyncio.fixture(scope="module", name="app_scheduled_events")
@@ -415,7 +440,7 @@ async def app_scheduled_events_fixture(
     application = app_openstack_runner
     await application.set_config({"reconcile-interval": "8"})
     await application.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"})
-    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60)
+    await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=20 * 60)
     await reconcile(app=application, model=model)
     return application
 
@@ -595,6 +620,7 @@ async def app_with_forked_repo(
     Test should ensure it returns with the application in a good state and has
     one runner.
     """
+    logging.info("JAVI forked_github_repository.full_name: %s", forked_github_repository.full_name)
     await basic_app.set_config({PATH_CONFIG_NAME: forked_github_repository.full_name})
 
     return basic_app

diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py
@@ -2,6 +2,7 @@
 #  See LICENSE file for licensing details.
 import logging
 import secrets
+import threading
 from asyncio import sleep
 from typing import Optional, TypedDict
 
@@ -17,6 +18,50 @@
 logger = logging.getLogger(__name__)
 
 
+async def javi_wait_for_idle(openstack_connection, model, *args, **kwargs) -> None:
+    """Await model.wait_for_idle while a thread logs the OpenStack servers.
+
+    Args:
+        openstack_connection: OpenStack connection object.
+        model: Object whose wait_for_idle coroutine is awaited.
+        args: Positional arguments forwarded to model.wait_for_idle.
+        kwargs: Keyword arguments forwarded to model.wait_for_idle.
+    """
+    logger.info("javi_wait_for_idle")
+    stop_event = threading.Event()
+
+    def _log_openstack():
+        """Log the server list every 20 seconds, and once more when stopped."""
+        while True:
+            end_loop = stop_event.wait(20)
+            # probably not thread safe, but...
+            try:
+                servers = openstack_connection.list_servers()
+            except Exception:
+                logger.exception("JAVI in log openstack thread")
+                raise
+            logger.info(" [ runner list ]")
+            for runner in servers:
+                logger.info(
+                    " [ runner %s ] status %s created %s updated %s",
+                    runner.name,
+                    runner.status,
+                    runner.created_at,
+                    runner.updated_at,
+                )
+            if end_loop:
+                break
+
+    # Started outside the try so that finally never joins an unbound thread.
+    log_thread = threading.Thread(target=_log_openstack)
+    log_thread.start()
+    try:
+        await model.wait_for_idle(*args, **kwargs)
+    finally:
+        stop_event.set()
+        log_thread.join()
+
+
 class OpenStackInstanceHelper:
     """Helper class to interact with OpenStack instances."""
 
@@ -179,6 +224,26 @@ async def get_runner_name(self, unit: Unit) -> str:
         assert len(runners) == 1
         return runners[0].name
 
+    def log_runners(self, unit: Unit) -> None:
+        """Log name, status and timestamps of the runners of a unit.
+
+        Unlike get_runner_name, no assertion is made on the number of runners.
+
+        Args:
+            unit: The GitHub Runner Charm unit whose runners are logged.
+        """
+        runners = self._get_runners(unit)
+        logger.info("[ list of runners for unit %s]", unit)
+        for runner in runners:
+            logger.info(
+                " [ runner %s ] status %s created %s updated %s",
+                runner.name,
+                runner.status,
+                runner.created_at,
+                runner.updated_at,
+            )
+        logger.info("[ end list of runners for unit %s]", unit)
+
     async def delete_single_runner(self, unit: Unit) -> None:
         """Delete the only runner.
 

diff --git a/tests/integration/test_charm_fork_path_change.py b/tests/integration/test_charm_fork_path_change.py
@@ -44,15 +44,31 @@ async def test_path_config_change(
 
     logger.info("Ensure there is a runner (this calls reconcile)")
     await instance_helper.ensure_charm_has_runner(app_with_forked_repo)
+    logger.info("after ensure_charm_has_runner")
+    instance_helper.log_runners(unit)
 
+    logger.info("Change Path config option to %s", path)
     await app_with_forked_repo.set_config({PATH_CONFIG_NAME: path})
+    instance_helper.log_runners(unit)
+
+    status = await model.get_status()
+    logger.info(" status : %s", status)
 
     logger.info("Reconciling (again)")
     await reconcile(app=app_with_forked_repo, model=model)
 
+    logger.info("after Reconciling (again)")
+    instance_helper.log_runners(unit)
+
+    status = await model.get_status()
+    logger.info("JAVI status 2: %s", status)
+
     runner_names = await instance_helper.get_runner_names(unit)
     logger.info("runners: %s", runner_names)
     assert len(runner_names) == 1
+    # This will crash if there is not exactly one runner.
+    logger.info("runner info: %s", instance_helper._get_single_runner(unit))
+
     runner_name = runner_names[0]
 
     runners_in_repo = github_repository.get_self_hosted_runners()
@@ -62,4 +78,4 @@ async def test_path_config_change(
         filter(lambda runner: runner.name == runner_name, runners_in_repo)
     )
 
-    assert len(runner_in_repo_with_same_name) == 1
+    assert len(runner_in_repo_with_same_name) == 1, "there has to be 1 runner in the repo"