diff --git a/.github/workflows/integration_test.yaml b/.github/workflows/integration_test.yaml index ddd3b32b4..9bf5f6b66 100644 --- a/.github/workflows/integration_test.yaml +++ b/.github/workflows/integration_test.yaml @@ -17,7 +17,7 @@ jobs: # INTEGRATION_TEST_ARGS to operator-workflows automatically. integration-tests: name: Integration test with juju 3.1 - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing secrets: inherit with: juju-channel: 3.1/stable @@ -30,7 +30,7 @@ jobs: self-hosted-runner-label: stg-private-endpoint openstack-interface-tests-private-endpoint: name: openstack interface test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing secrets: inherit with: juju-channel: 3.6/stable @@ -38,11 +38,12 @@ jobs: provider: lxd test-tox-env: integration-juju3.6 modules: '["test_runner_manager_openstack"]' + extra-arguments: '--log-format="%(asctime)s %(levelname)s %(message)s"' self-hosted-runner: true self-hosted-runner-label: stg-private-endpoint openstack-integration-tests-private-endpoint: name: Integration test using private-endpoint - uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@main + uses: canonical/operator-workflows/.github/workflows/integration_test.yaml@javi-testing secrets: inherit with: juju-channel: 3.6/stable diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py index 75796b14d..2770a1b88 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/health_checks.py @@ -189,7 +189,11 @@ def _run_health_check_cloud_init( """ result: invoke.runners.Result = _execute_ssh_command(ssh_conn, "cloud-init status") if not result.ok: - logger.warning("cloud-init status command failed on %s: %s.", server_name, result.stderr) + logger.error("cloud-init status command failed on %s: %s.", server_name, result.stderr) + cloud_init_log_output_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init-output.log") + logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout) + cloud_init_log_result = _execute_ssh_command(ssh_conn, "cat /var/log/cloud-init.log") + logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout) return False if CloudInitStatus.DONE in result.stdout: diff --git a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py index cf0e519c6..4b30ce77a 100644 --- a/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py +++ b/github-runner-manager/src/github_runner_manager/openstack_cloud/openstack_runner_manager.py @@ -653,6 +653,10 @@ def _wait_runner_startup(self, instance: OpenstackInstance) -> None: logger.warning( "cloud-init status command failed on %s: %s.", instance.server_name, result.stderr ) + cloud_init_log_output_result = ssh_conn.run("cat /var/log/cloud-init-output.log", warn=True, timeout=60) + logger.error("/var/log/cloud-init-output.log stdout: %s", cloud_init_log_output_result.stdout) + cloud_init_log_result = ssh_conn.run("cat /var/log/cloud-init.log") + logger.error("/var/log/cloud-init.log stdout: %s", cloud_init_log_result.stdout, warn=True, timeout=60) raise RunnerStartError(f"Runner startup process not found on {instance.server_name}") # A short running job may have already completed and exited the runner, hence check the # condition via cloud-init status check. diff --git a/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 b/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 index 1c406f841..3514c96c2 100644 --- a/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 +++ b/github-runner-manager/src/github_runner_manager/templates/openstack-userdata.sh.j2 @@ -9,6 +9,7 @@ su - ubuntu -c 'cd ~/actions-runner && echo "{{ env_contents }}" > .env' {% if aproxy_address %} snap install aproxy --edge +snap watch --last=auto-refresh? snap set aproxy proxy={{ aproxy_address }} listen=:54969 cat << EOF > /etc/nftables.conf define default-ip = $(ip route get $(ip route show 0.0.0.0/0 | grep -oP 'via \K\S+') | grep -oP 'src \K\S+') diff --git a/src/charm.py b/src/charm.py index 114a12673..597e80117 100755 --- a/src/charm.py +++ b/src/charm.py @@ -110,7 +110,7 @@ class ReconcileRunnersEvent(EventBase): def catch_charm_errors( - func: Callable[["GithubRunnerCharm", EventT], None] + func: Callable[["GithubRunnerCharm", EventT], None], ) -> Callable[["GithubRunnerCharm", EventT], None]: """Catch common errors in charm. @@ -145,7 +145,7 @@ def func_with_catch_errors(self: "GithubRunnerCharm", event: EventT) -> None: def catch_action_errors( - func: Callable[["GithubRunnerCharm", ActionEvent], None] + func: Callable[["GithubRunnerCharm", ActionEvent], None], ) -> Callable[["GithubRunnerCharm", ActionEvent], None]: """Catch common errors in actions. @@ -519,15 +519,13 @@ def _apt_install(self, packages: Sequence[str]) -> None: def _on_debug_ssh_relation_changed(self, _: ops.RelationChangedEvent) -> None: """Handle debug ssh relation changed event.""" state = self._setup_state() + self.unit.status = MaintenanceStatus("Reconciling runners - added debug-ssh") if not self._get_set_image_ready_status(): return runner_scaler = self._get_runner_scaler(state) runner_scaler.flush() - try: - runner_scaler.reconcile(state.runner_config.virtual_machines) - except ReconcileError: - logger.exception(FAILED_TO_RECONCILE_RUNNERS_MSG) + self._reconcile_openstack_runners(runner_scaler, state.runner_config.virtual_machines) @catch_charm_errors def _on_image_relation_joined(self, _: ops.RelationJoinedEvent) -> None: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 385a1afa1..798ce0c0b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -45,7 +45,7 @@ from tests.integration.helpers.openstack import OpenStackInstanceHelper, PrivateEndpointConfigs from tests.status_name import ACTIVE -IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 30 * 60 +IMAGE_BUILDER_DEPLOY_TIMEOUT_IN_SECONDS = 20 * 60 # The following line is required because we are using request.getfixturevalue in conjunction # with pytest-asyncio. See https://github.com/pytest-dev/pytest-asyncio/issues/112 @@ -317,7 +317,7 @@ async def app_no_runner( ) -> AsyncIterator[Application]: """Application with no runner.""" await basic_app.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "0"}) - await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=90 * 60) + await model.wait_for_idle(apps=[basic_app.name], status=ACTIVE, timeout=20 * 60) yield basic_app @@ -339,7 +339,8 @@ async def image_builder_fixture( config={ "app-channel": "edge", "build-interval": "12", - "revision-history-limit": "5", + # JAVI be careful, maybe all tests use the same names for the images + "revision-history-limit": "15", "openstack-auth-url": private_endpoint_config["auth_url"], # Bandit thinks this is a hardcoded password "openstack-password": private_endpoint_config["password"], # nosec: B105 @@ -401,9 +402,33 @@ async def app_openstack_runner_fixture( wait_idle=False, ) await model.integrate(f"{image_builder.name}:image", f"{application.name}:image") - await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60) + await model.wait_for_idle( + apps=[application.name, image_builder.name], status=ACTIVE, timeout=20 * 60 + ) - return application + # better use test-mode charm config... but let's see + command = "find /var/lib/juju -type f -name 'constants.py' -exec sed -i 's/^CREATE_SERVER_TIMEOUT = .*/CREATE_SERVER_TIMEOUT = 900/gI' {} \\;" + run_actions = await application.run(command) + logging.info("JAVI run_actions %s", run_actions) + for action_result in run_actions.actions: + logging.info("JAVI action_result %s", action_result) + action = action_result.action + logging.info("JAVI action %s", action) + # no comment... + action_id = action.tag + if action_id.startswith("action-"): + # strip the action- part of "action-" tag + action_id = action_id[7:] + action = await model._wait_for_new("action", action_id) + result = await action.wait() + logging.info("JAVI output of one unit of CREATE_SERVER_TIMEOUT %s", result.results) + + yield application + try: + logging.info("JAVI after yield in app_openstack_runner_fixture") + # get_file_content(unit, filename) + except Exception: + logging.exception("JAVI something failed after yield") @pytest_asyncio.fixture(scope="module", name="app_scheduled_events") @@ -415,7 +440,7 @@ async def app_scheduled_events_fixture( application = app_openstack_runner await application.set_config({"reconcile-interval": "8"}) await application.set_config({VIRTUAL_MACHINES_CONFIG_NAME: "1"}) - await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=90 * 60) + await model.wait_for_idle(apps=[application.name], status=ACTIVE, timeout=20 * 60) await reconcile(app=application, model=model) return application diff --git a/tests/integration/helpers/openstack.py b/tests/integration/helpers/openstack.py index 5533f33bb..904660680 100644 --- a/tests/integration/helpers/openstack.py +++ b/tests/integration/helpers/openstack.py @@ -2,6 +2,7 @@ # See LICENSE file for licensing details. import logging import secrets +import threading from asyncio import sleep from typing import Optional, TypedDict @@ -17,6 +18,50 @@ logger = logging.getLogger(__name__) +async def javi_wait_for_idle(openstack_connection, model, *args, **kwargs) -> None: + """TODO. + + Args: + openstack_connection: OpenStack connection object. + model: model + args: args + kwargs: kwargs + """ + logger.info("javi_wait_for_idle") + e = threading.Event() + + def _log_openstack(): + """TODO.""" + end_loop = False + while True: + end_loop = e.wait(20) + # probably not thread safe, but... + try: + servers = openstack_connection.list_servers() + except Exception as ex: + logger.exception("JAVI in log openstack thread") + raise ex + logger.info(" [ runner list ]") + for runner in servers: + logger.info( + " [ runner %s ] status %s created %s updated %s", + runner.name, + runner.status, + runner.created_at, + runner.updated_at, + ) + if end_loop: + break + + try: + t = threading.Thread(target=_log_openstack) + t.start() + await model.wait_for_idle(*args, **kwargs) + finally: + e.set() + t.join() + + class OpenStackInstanceHelper: """Helper class to interact with OpenStack instances.""" @@ -179,6 +224,26 @@ async def get_runner_name(self, unit: Unit) -> str: assert len(runners) == 1 return runners[0].name + def log_runners(self, unit: Unit) -> None: + """TODO LOG RUNNERS. + + Expects only one runner to be present. + + Args: + unit: The GitHub Runner Charm unit to get the runner name for. + """ + runners = self._get_runners(unit) + logger.info("[ list of runners for unit %s]", unit) + for runner in runners: + logger.info( + " [ runner %s ] status %s created %s updated %s", + runner.name, + runner.status, + runner.created_at, + runner.updated_at, + ) + logger.info("[ end list of runners for unit %s]") + async def delete_single_runner(self, unit: Unit) -> None: """Delete the only runner. diff --git a/tests/integration/test_charm_fork_path_change.py b/tests/integration/test_charm_fork_path_change.py index 0ee0b145f..6584e20fc 100644 --- a/tests/integration/test_charm_fork_path_change.py +++ b/tests/integration/test_charm_fork_path_change.py @@ -44,15 +44,31 @@ async def test_path_config_change( logger.info("Ensure there is a runner (this calls reconcile)") await instance_helper.ensure_charm_has_runner(app_with_forked_repo) + logger.info("after ensure_charm_has_runner") + instance_helper.log_runners(unit) + logger.info("Change Path config option") await app_with_forked_repo.set_config({PATH_CONFIG_NAME: path}) + instance_helper.log_runners(unit) + + status = await model.get_status() + logger.info(" status : %s", status) logger.info("Reconciling (again)") await reconcile(app=app_with_forked_repo, model=model) + logger.info("after Reconciling (again)") + instance_helper.log_runners(unit) + + status = await model.get_status() + logger.info("JAVI status 2: %s", status) + runner_names = await instance_helper.get_runner_names(unit) logger.info("runners: %s", runner_names) assert len(runner_names) == 1 + #this will crash if there is not exactly one + logger.info("runner info: %s", instance_helper._get_single_runner(unit)) + runner_name = runner_names[0] runners_in_repo = github_repository.get_self_hosted_runners() @@ -62,4 +78,4 @@ async def test_path_config_change( filter(lambda runner: runner.name == runner_name, runners_in_repo) ) - assert len(runner_in_repo_with_same_name) == 1 + assert len(runner_in_repo_with_same_name) == 1, "there has to be 1 runner in the repo" diff --git a/tests/integration/test_charm_scheduled_events.py b/tests/integration/test_charm_scheduled_events.py index bc507e4d0..2b41dcebd 100644 --- a/tests/integration/test_charm_scheduled_events.py +++ b/tests/integration/test_charm_scheduled_events.py @@ -15,7 +15,7 @@ from juju.model import Model from tests.integration.helpers.common import wait_for -from tests.integration.helpers.openstack import OpenStackInstanceHelper +from tests.integration.helpers.openstack import OpenStackInstanceHelper, javi_wait_for_idle from tests.status_name import ACTIVE logger = logging.getLogger(__name__) @@ -28,6 +28,7 @@ async def test_update_interval( model: Model, app_scheduled_events: Application, instance_helper: OpenStackInstanceHelper, + openstack_connection, ) -> None: """ arrange: A working application with one runner. @@ -57,7 +58,7 @@ async def _no_runners_available() -> bool: logger.info("Wait for 10 minutes") await sleep(10 * 60) - await model.wait_for_idle(status=ACTIVE, timeout=20 * 60) + await javi_wait_for_idle(openstack_connection, model, status=ACTIVE, timeout=20 * 60) newnames = await instance_helper.get_runner_names(unit) assert len(newnames) == 1, "There should be one runner after reconciliation" diff --git a/tests/integration/test_charm_upgrade.py b/tests/integration/test_charm_upgrade.py index a1123a6eb..bfb9754f3 100644 --- a/tests/integration/test_charm_upgrade.py +++ b/tests/integration/test_charm_upgrade.py @@ -91,10 +91,10 @@ async def test_charm_upgrade( ) await model.integrate(f"{image_builder.name}:image", f"{application.name}:image") await model.wait_for_idle( - apps=[application.name], + apps=[application.name, image_builder.name], raise_on_error=False, wait_for_active=True, - timeout=180 * 60, + timeout=20 * 60, check_freq=30, ) origin = client.CharmOrigin( @@ -125,6 +125,6 @@ async def test_charm_upgrade( apps=[application.name], raise_on_error=False, wait_for_active=True, - timeout=180 * 60, + timeout=20 * 60, check_freq=30, ) diff --git a/tests/integration/test_debug_ssh.py b/tests/integration/test_debug_ssh.py index c4ce8b658..1962b7173 100644 --- a/tests/integration/test_debug_ssh.py +++ b/tests/integration/test_debug_ssh.py @@ -11,7 +11,7 @@ from juju.model import Model from tests.integration.helpers.common import dispatch_workflow, get_job_logs -from tests.integration.helpers.openstack import OpenStackInstanceHelper +from tests.integration.helpers.openstack import OpenStackInstanceHelper, javi_wait_for_idle from tests.status_name import ACTIVE logger = logging.getLogger(__name__) @@ -28,17 +28,21 @@ async def test_ssh_debug( test_github_branch: Branch, tmate_ssh_server_unit_ip: str, instance_helper: OpenStackInstanceHelper, + openstack_connection, ): """ arrange: given an integrated GitHub-Runner charm and tmate-ssh-server charm. act: when canonical/action-tmate is triggered. assert: the ssh connection info from action-log and tmate-ssh-server matches. """ - await model.wait_for_idle(status=ACTIVE, timeout=60 * 120) + await javi_wait_for_idle(openstack_connection, model, status=ACTIVE, timeout=60 * 120) unit = app_no_wait_tmate.units[0] # We need the runner to connect to the current machine, instead of the tmate_ssh_server unit, # as the tmate_ssh_server is not routable. + logger.info("before iptables") + instance_helper.log_runners(unit) + dnat_comman_in_runner = f"sudo iptables -t nat -A OUTPUT -p tcp -d {tmate_ssh_server_unit_ip} --dport 10022 -j DNAT --to-destination 127.0.0.1:10022" _, _, _ = await instance_helper.run_in_instance( unit, @@ -46,6 +50,8 @@ async def test_ssh_debug( assert_on_failure=True, ) await instance_helper.expose_to_instance(unit=unit, port=10022, host=tmate_ssh_server_unit_ip) + logger.info("after exposing instance") + instance_helper.log_runners(unit) # trigger tmate action logger.info("Dispatching workflow_dispatch_ssh_debug.yaml workflow.") @@ -59,6 +65,8 @@ async def test_ssh_debug( workflow_id_or_name=SSH_DEBUG_WORKFLOW_FILE_NAME, ) + logger.info("after workflow run") + instance_helper.log_runners(unit) logs = get_job_logs(workflow_run.jobs("latest")[0]) # ensure ssh connection info printed in logs. diff --git a/tests/integration/test_runner_manager_openstack.py b/tests/integration/test_runner_manager_openstack.py index 25ee580de..3139a5762 100644 --- a/tests/integration/test_runner_manager_openstack.py +++ b/tests/integration/test_runner_manager_openstack.py @@ -32,6 +32,7 @@ ) from github_runner_manager.metrics import events from github_runner_manager.openstack_cloud import health_checks +from github_runner_manager.openstack_cloud import constants from github_runner_manager.openstack_cloud.openstack_runner_manager import ( OpenStackCredentials, OpenStackRunnerManager, @@ -50,7 +51,7 @@ ) logger = logging.getLogger(__name__) - +constants.CREATE_SERVER_TIMEOUT = 900 @pytest.fixture(scope="module", name="runner_label") def runner_label():