From e1b3abf9a13a07372bcc0f744adbaa2c730c3246 Mon Sep 17 00:00:00 2001 From: Guillaume Boutry Date: Tue, 20 Feb 2024 13:40:54 +0100 Subject: [PATCH] MAAS: deploy to controller plan to right size Merge resize and deploy steps into deploy. --- sunbeam-python/sunbeam/commands/openstack.py | 180 ++++-------------- sunbeam-python/sunbeam/commands/resize.py | 5 +- .../sunbeam/plugins/interface/v1/openstack.py | 6 +- .../sunbeam/provider/maas/client.py | 15 +- .../sunbeam/provider/maas/commands.py | 6 +- .../unit/sunbeam/commands/test_openstack.py | 123 ++++-------- 6 files changed, 89 insertions(+), 246 deletions(-) diff --git a/sunbeam-python/sunbeam/commands/openstack.py b/sunbeam-python/sunbeam/commands/openstack.py index 3e192a19..66c9092c 100644 --- a/sunbeam-python/sunbeam/commands/openstack.py +++ b/sunbeam-python/sunbeam/commands/openstack.py @@ -46,7 +46,6 @@ from sunbeam.jobs.juju import ( JujuHelper, JujuWaitException, - ModelNotFoundException, TimeoutException, run_sync, ) @@ -61,19 +60,6 @@ TOPOLOGY_KEY = "Topology" -def determine_target_topology_at_bootstrap() -> str: - """Determines the target topology at bootstrap time. - - Under a threshold of 20GiB RAM on the bootstrapping node, - target is considered to be 'single' - Otherwise, target is considered to be 'multi' - """ - host_total_ram = get_host_total_ram() - if host_total_ram < RAM_32_GB_IN_KB: - return "single" - return "multi" - - def determine_target_topology(client: Client) -> str: """Determines the target topology. @@ -103,7 +89,7 @@ def compute_ha_scale(topology: str, control_nodes: int) -> int: def compute_os_api_scale(topology: str, control_nodes: int) -> int: if topology == "single": return 1 - if topology == "multi": + if topology == "multi" or control_nodes < 3: return min(control_nodes, 3) if topology == "large": return min(control_nodes + 2, 7) @@ -113,7 +99,7 @@ def compute_os_api_scale(topology: str, control_nodes: int) -> int: def compute_ingress_scale(topology: str, control_nodes: int) -> int: if topology == "single": return 1 - return control_nodes + return min(control_nodes, 3) def compute_ceph_replica_scale(osds: int) -> int: @@ -140,25 +126,26 @@ def __init__( topology: str, database: str, machine_model: str, + force: bool = False, ): super().__init__( "Deploying OpenStack Control Plane", "Deploying OpenStack Control Plane to Kubernetes (this may take a while)", ) + self.client = client self.manifest = manifest self.jhelper = jhelper self.topology = topology self.database = database self.machine_model = machine_model + self.force = force self.model = OPENSTACK_MODEL self.cloud = MICROK8S_CLOUD - self.client = client self.tfplan = "openstack-plan" - def get_storage_tfvars(self) -> dict: + def get_storage_tfvars(self, storage_nodes: list[dict]) -> dict: """Create terraform variables related to storage.""" tfvars = {} - storage_nodes = self.client.cluster.list_nodes_by_role("storage") if storage_nodes: tfvars["ceph-osd-replication-count"] = compute_ceph_replica_scale( run_sync(_get_number_of_osds(self.jhelper, self.machine_model)) @@ -185,38 +172,53 @@ def is_skip(self, status: Optional[Status] = None) -> Result: # Config was never registered in database previous_config = {} - determined_topology = determine_target_topology_at_bootstrap() + determined_topology = determine_target_topology(self.client) if self.topology == "auto": self.topology = previous_config.get("topology", determined_topology) - LOG.debug(f"Bootstrap: topology {self.topology}") + LOG.debug(f"topology {self.topology}") if self.database == "auto": self.database = previous_config.get("database", determined_topology) - LOG.debug(f"Bootstrap: database topology {self.database}") - + if self.database == "large": + # multi and large are the same + self.database = "multi" + LOG.debug(f"database topology {self.database}") if (database := previous_config.get("database")) and database != self.database: return Result( ResultType.FAILED, "Database topology cannot be changed, please destroy and re-bootstrap", ) + is_not_compatible = self.database == "single" and self.topology == "large" + if not self.force and is_not_compatible: + return Result( + ResultType.FAILED, + ( + "Cannot deploy control plane to large with single database," + " use -f/--force to override" + ), + ) + return Result(ResultType.COMPLETED) def run(self, status: Optional[Status] = None) -> Result: """Execute configuration using terraform.""" # TODO(jamespage): # This needs to evolve to add support for things like: - # - Enabling HA # - Enabling/disabling specific services - # - Switch channels for the charmed operators update_config( self.client, TOPOLOGY_KEY, {"topology": self.topology, "database": self.database}, ) - extra_tfvars = self.get_storage_tfvars() + self.update_status(status, "fetching cluster nodes") + control_nodes = self.client.cluster.list_nodes_by_role("control") + storage_nodes = self.client.cluster.list_nodes_by_role("storage") + + self.update_status(status, "computing deployment sizing") + extra_tfvars = self.get_storage_tfvars(storage_nodes) extra_tfvars.update( { "model": self.model, @@ -224,10 +226,15 @@ def run(self, status: Optional[Status] = None) -> Result: "credential": f"{self.cloud}{CREDENTIAL_SUFFIX}", "config": {"workload-storage": MICROK8S_DEFAULT_STORAGECLASS}, "many-mysql": self.database == "multi", + "ha-scale": compute_ha_scale(self.topology, len(control_nodes)), + "os-api-scale": compute_os_api_scale(self.topology, len(control_nodes)), + "ingress-scale": compute_ingress_scale( + self.topology, len(control_nodes) + ), } ) + self.update_status(status, "deploying services") try: - self.update_status(status, "deploying services") self.manifest.update_tfvars_and_apply_tf( self.client, tfplan=self.tfplan, @@ -262,125 +269,6 @@ def run(self, status: Optional[Status] = None) -> Result: return Result(ResultType.COMPLETED) -class ResizeControlPlaneStep(BaseStep, JujuStepHelper): - """Resize OpenStack using Terraform cloud.""" - - _CONFIG = CONFIG_KEY - - def __init__( - self, - client: Client, - manifest: Manifest, - jhelper: JujuHelper, - topology: str, - machine_model: str, - force: bool = False, - ): - super().__init__( - "Resizing OpenStack Control Plane", - "Resizing OpenStack Control Plane to match appropriate topology", - ) - self.client = client - self.manifest = manifest - self.jhelper = jhelper - self.topology = topology - self.machine_model = machine_model - self.force = force - self.model = OPENSTACK_MODEL - self.tfplan = "openstack-plan" - - def is_skip(self, status: Optional[Status] = None) -> Result: - """Determines if the step should be skipped or not. - - :return: ResultType.SKIPPED if the Step should be skipped, - ResultType.COMPLETED or ResultType.FAILED otherwise - """ - try: - run_sync(self.jhelper.get_model(OPENSTACK_MODEL)) - except ModelNotFoundException: - return Result( - ResultType.FAILED, - "OpenStack control plane is not deployed, cannot resize", - ) - - return Result(ResultType.COMPLETED) - - def run(self, status: Optional[Status] = None) -> Result: - """Execute configuration using terraform.""" - self.update_status(status, "fetching configuration") - topology_dict = read_config(self.client, TOPOLOGY_KEY) - if self.topology == "auto": - topology = determine_target_topology(self.client) - else: - topology = self.topology - topology_dict["topology"] = topology - is_not_compatible = ( - topology_dict["database"] == "single" and topology == "large" - ) - if not self.force and is_not_compatible: - return Result( - ResultType.FAILED, - ( - "Cannot resize control plane to large with single database," - " use -f/--force to override" - ), - ) - update_config( - self.client, - TOPOLOGY_KEY, - topology_dict, - ) - control_nodes = self.client.cluster.list_nodes_by_role("control") - storage_nodes = self.client.cluster.list_nodes_by_role("storage") - # NOTE(jamespage) - # When dedicated control nodes are used, ceph is not enabled during - # bootstrap - however storage nodes may be added later so re-assess - extra_tfvars = { - "ha-scale": compute_ha_scale(topology, len(control_nodes)), - "os-api-scale": compute_os_api_scale(topology, len(control_nodes)), - "ingress-scale": compute_ingress_scale(topology, len(control_nodes)), - "enable-ceph": len(storage_nodes) > 0, - "ceph-offer-url": f"admin/{self.machine_model}.{microceph.APPLICATION}", - "ceph-osd-replication-count": compute_ceph_replica_scale( - run_sync(_get_number_of_osds(self.jhelper, self.machine_model)) - ), - } - - self.update_status(status, "scaling services") - try: - self.manifest.update_tfvars_and_apply_tf( - self.client, - tfplan=self.tfplan, - tfvar_config=self._CONFIG, - override_tfvars=extra_tfvars, - ) - except TerraformException as e: - LOG.exception("Error resizing control plane") - return Result(ResultType.FAILED, str(e)) - - # Remove cinder-ceph from apps to wait on if ceph is not enabled - apps = run_sync(self.jhelper.get_application_names(self.model)) - if not storage_nodes and "cinder-ceph" in apps: - apps.remove("cinder-ceph") - task = run_sync(update_status_background(self, apps, status)) - try: - run_sync( - self.jhelper.wait_until_active( - self.model, - apps, - timeout=OPENSTACK_DEPLOY_TIMEOUT, - ) - ) - except (JujuWaitException, TimeoutException) as e: - LOG.warning(str(e)) - return Result(ResultType.FAILED, str(e)) - finally: - if not task.done(): - task.cancel() - - return Result(ResultType.COMPLETED) - - class PatchLoadBalancerServicesStep(BaseStep): SERVICES = ["traefik", "traefik-public", "rabbitmq", "ovn-relay"] MODEL = OPENSTACK_MODEL diff --git a/sunbeam-python/sunbeam/commands/resize.py b/sunbeam-python/sunbeam/commands/resize.py index a62b7ffb..9b53d65a 100644 --- a/sunbeam-python/sunbeam/commands/resize.py +++ b/sunbeam-python/sunbeam/commands/resize.py @@ -18,7 +18,7 @@ from rich.console import Console from sunbeam.clusterd.client import Client -from sunbeam.commands.openstack import ResizeControlPlaneStep +from sunbeam.commands.openstack import DeployControlPlaneStep from sunbeam.commands.terraform import TerraformInitStep from sunbeam.jobs.common import click_option_topology, run_plan from sunbeam.jobs.deployment import Deployment @@ -47,11 +47,12 @@ def resize(ctx: click.Context, topology: str, force: bool = False) -> None: jhelper = JujuHelper(deployment.get_connected_controller()) plan = [ TerraformInitStep(manifest_obj.get_tfhelper(tfplan)), - ResizeControlPlaneStep( + DeployControlPlaneStep( client, manifest_obj, jhelper, topology, + "auto", deployment.infrastructure_model, force=force, ), diff --git a/sunbeam-python/sunbeam/plugins/interface/v1/openstack.py b/sunbeam-python/sunbeam/plugins/interface/v1/openstack.py index 6f55db25..5f17383d 100644 --- a/sunbeam-python/sunbeam/plugins/interface/v1/openstack.py +++ b/sunbeam-python/sunbeam/plugins/interface/v1/openstack.py @@ -30,7 +30,7 @@ from sunbeam.commands.juju import JujuStepHelper from sunbeam.commands.openstack import ( OPENSTACK_MODEL, - determine_target_topology_at_bootstrap, + TOPOLOGY_KEY, ) from sunbeam.commands.terraform import TerraformException, TerraformInitStep from sunbeam.jobs.checks import VerifyBootstrappedCheck @@ -210,7 +210,9 @@ def get_tfvar_config_key(self) -> str: def get_database_topology(self) -> str: """Returns the database topology of the cluster.""" # Database topology can be set only during bootstrap and cannot be changed. - return determine_target_topology_at_bootstrap() + client = self.deployment.get_client() + topology = read_config(client, TOPOLOGY_KEY) + return topology["database"] def set_application_timeout_on_enable(self) -> int: """Set Application Timeout on enabling the plugin. diff --git a/sunbeam-python/sunbeam/provider/maas/client.py b/sunbeam-python/sunbeam/provider/maas/client.py index eab82fab..9d9cbd2a 100644 --- a/sunbeam-python/sunbeam/provider/maas/client.py +++ b/sunbeam-python/sunbeam/provider/maas/client.py @@ -122,16 +122,17 @@ def from_deployment(cls, deployment: Deployment) -> "MaasClient": ) -def _to_root_disk(device: dict, size: int = 0) -> dict: - """Convert device to root disk. - - Device can be a blockdevice or a partition. - """ +def _to_root_disk(device: dict, partition: dict | None = None) -> dict: + """Convert device to root disk.""" + if partition: + size = partition["size"] + else: + size = device["size"] root_disk = { "name": device["name"], "tags": device["tags"], "root_partition": { - "size": size or device["size"], + "size": size, }, } return root_disk @@ -160,7 +161,7 @@ def _convert_raw_machine(machine_raw: dict) -> dict: for partition in blockdevice.get("partitions", []): fs = partition.get("filesystem") if fs.get("label") == "root": - root_disk = _to_root_disk(partition, size=partition["size"]) + root_disk = _to_root_disk(blockdevice, partition) spaces = [] nics = [] diff --git a/sunbeam-python/sunbeam/provider/maas/commands.py b/sunbeam-python/sunbeam/provider/maas/commands.py index d30bccb6..d21ea637 100644 --- a/sunbeam-python/sunbeam/provider/maas/commands.py +++ b/sunbeam-python/sunbeam/provider/maas/commands.py @@ -361,6 +361,7 @@ def deploy( ctx: click.Context, manifest: Path | None = None, accept_defaults: bool = False, + topology: str = "auto", ) -> None: """Deploy the MAAS-backed deployment. @@ -520,8 +521,9 @@ def deploy( client, manifest, jhelper, - "auto", - "auto", # TODO(gboutry): use the right values + topology, + # maas deployment always deploys multiple databases + "large", deployment.infrastructure_model, ) ) diff --git a/sunbeam-python/tests/unit/sunbeam/commands/test_openstack.py b/sunbeam-python/tests/unit/sunbeam/commands/test_openstack.py index 8b37082a..e283f701 100644 --- a/sunbeam-python/tests/unit/sunbeam/commands/test_openstack.py +++ b/sunbeam-python/tests/unit/sunbeam/commands/test_openstack.py @@ -24,7 +24,6 @@ DeployControlPlaneStep, PatchLoadBalancerServicesStep, ReapplyOpenStackTerraformPlanStep, - ResizeControlPlaneStep, compute_ceph_replica_scale, compute_ha_scale, compute_ingress_scale, @@ -67,6 +66,10 @@ def setUp(self): self.jhelper.run_action.return_value = {} self.manifest = Mock() self.client = Mock() + self.client.cluster.list_nodes_by_role.side_effect = [ + [{"name": f"control-{i}"} for i in range(4)], + [{"name": f"storage-{i}"} for i in range(4)], + ] def test_run_pristine_installation(self): self.jhelper.get_application.side_effect = ApplicationNotFoundException( @@ -173,96 +176,42 @@ def test_is_skip_database_changed(self): assert result.result_type == ResultType.FAILED - -class TestResizeControlPlaneStep(unittest.TestCase): - def __init__(self, methodName: str = "runTest") -> None: - super().__init__(methodName) - self.read_config = patch( + def test_is_skip_incompatible_topology(self): + step = DeployControlPlaneStep( + self.client, + self.manifest, + self.jhelper, + "large", + "auto", + MODEL, + force=False, + ) + with patch( "sunbeam.commands.openstack.read_config", Mock(return_value={"topology": "single", "database": "single"}), - ) - - def setUp(self): - self.client = Mock( - cluster=Mock(list_nodes_by_role=Mock(return_value=[1, 2, 3, 4])) - ) - self.read_config.start() - self.jhelper = AsyncMock() - self.jhelper.run_action.return_value = {} - self.manifest = Mock() - - def tearDown(self): - self.read_config.stop() - - def test_run_pristine_installation(self): - self.jhelper.get_application.side_effect = ApplicationNotFoundException( - "not found" - ) - - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, "single", MODEL, False - ) - result = step.run() - - self.manifest.update_tfvars_and_apply_tf.assert_called_once() - assert result.result_type == ResultType.COMPLETED - - def test_run_tf_apply_failed(self): - self.manifest.update_tfvars_and_apply_tf.side_effect = TerraformException( - "apply failed..." - ) - - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, TOPOLOGY, MODEL, False - ) - result = step.run() - - self.manifest.update_tfvars_and_apply_tf.assert_called_once() - assert result.result_type == ResultType.FAILED - assert result.message == "apply failed..." - - def test_run_waiting_timed_out(self): - self.jhelper.wait_until_active.side_effect = TimeoutException("timed out") - - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, TOPOLOGY, MODEL, False - ) - result = step.run() - - self.jhelper.wait_until_active.assert_called_once() - assert result.result_type == ResultType.FAILED - assert result.message == "timed out" - - def test_run_unit_in_error_state(self): - self.jhelper.wait_until_active.side_effect = JujuWaitException( - "Unit in error: placement/0" - ) - - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, TOPOLOGY, MODEL, False - ) - result = step.run() - - self.jhelper.wait_until_active.assert_called_once() - assert result.result_type == ResultType.FAILED - assert result.message == "Unit in error: placement/0" - - def test_run_incompatible_topology(self): - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, "large", MODEL, force=False - ) - result = step.run() + ): + result = step.is_skip() assert result.result_type == ResultType.FAILED - assert "Cannot resize control plane to large" in result.message + assert result.message + assert "use -f/--force to override" in result.message - def test_run_force_incompatible_topology(self): - step = ResizeControlPlaneStep( - self.client, self.manifest, self.jhelper, "large", MODEL, force=True + def test_is_skip_force_incompatible_topology(self): + step = DeployControlPlaneStep( + self.client, + self.manifest, + self.jhelper, + "large", + "auto", + MODEL, + force=True, ) - result = step.run() + with patch( + "sunbeam.commands.openstack.read_config", + Mock(return_value={"topology": "single", "database": "single"}), + ): + result = step.is_skip() - self.jhelper.wait_until_active.assert_called_once() assert result.result_type == ResultType.COMPLETED @@ -405,9 +354,9 @@ def test_compute_os_api_scale(topology, control_nodes, scale): ("single", 1, 1), ("multi", 2, 2), ("multi", 3, 3), - ("multi", 9, 9), - ("large", 4, 4), - ("large", 9, 9), + ("multi", 9, 3), + ("large", 4, 3), + ("large", 9, 3), ], ) def test_compute_ingress_scale(topology, control_nodes, scale):