Skip to content

Commit

Permalink
MAAS: deploy control plane at the right size
Browse files Browse the repository at this point in the history
Merge resize and deploy steps into deploy.
  • Loading branch information
gboutry committed Feb 20, 2024
1 parent 147b063 commit e1b3abf
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 246 deletions.
180 changes: 34 additions & 146 deletions sunbeam-python/sunbeam/commands/openstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
from sunbeam.jobs.juju import (
JujuHelper,
JujuWaitException,
ModelNotFoundException,
TimeoutException,
run_sync,
)
Expand All @@ -61,19 +60,6 @@
TOPOLOGY_KEY = "Topology"


def determine_target_topology_at_bootstrap() -> str:
    """Pick the target topology at bootstrap time.

    The value returned by ``get_host_total_ram()`` for the bootstrapping
    node is compared against ``RAM_32_GB_IN_KB``: strictly below that
    threshold the target is 'single', otherwise it is 'multi'.
    """
    return "single" if get_host_total_ram() < RAM_32_GB_IN_KB else "multi"


def determine_target_topology(client: Client) -> str:
"""Determines the target topology.
Expand Down Expand Up @@ -103,7 +89,7 @@ def compute_ha_scale(topology: str, control_nodes: int) -> int:
def compute_os_api_scale(topology: str, control_nodes: int) -> int:
if topology == "single":
return 1
if topology == "multi":
if topology == "multi" or control_nodes < 3:
return min(control_nodes, 3)
if topology == "large":
return min(control_nodes + 2, 7)
Expand All @@ -113,7 +99,7 @@ def compute_os_api_scale(topology: str, control_nodes: int) -> int:
def compute_ingress_scale(topology: str, control_nodes: int) -> int:
    """Return the number of ingress units to deploy.

    :param topology: Name of the deployment topology (e.g. 'single').
    :param control_nodes: Number of control nodes in the cluster.
    :return: 1 for the 'single' topology; otherwise one unit per control
             node, capped at three.
    """
    if topology == "single":
        return 1
    # Never scale ingress beyond three units, however many control nodes
    # exist; below three it simply follows the control node count.
    return min(control_nodes, 3)


def compute_ceph_replica_scale(osds: int) -> int:
Expand All @@ -140,25 +126,26 @@ def __init__(
topology: str,
database: str,
machine_model: str,
force: bool = False,
):
super().__init__(
"Deploying OpenStack Control Plane",
"Deploying OpenStack Control Plane to Kubernetes (this may take a while)",
)
self.client = client
self.manifest = manifest
self.jhelper = jhelper
self.topology = topology
self.database = database
self.machine_model = machine_model
self.force = force
self.model = OPENSTACK_MODEL
self.cloud = MICROK8S_CLOUD
self.client = client
self.tfplan = "openstack-plan"

def get_storage_tfvars(self) -> dict:
def get_storage_tfvars(self, storage_nodes: list[dict]) -> dict:
"""Create terraform variables related to storage."""
tfvars = {}
storage_nodes = self.client.cluster.list_nodes_by_role("storage")
if storage_nodes:
tfvars["ceph-osd-replication-count"] = compute_ceph_replica_scale(
run_sync(_get_number_of_osds(self.jhelper, self.machine_model))
Expand All @@ -185,49 +172,69 @@ def is_skip(self, status: Optional[Status] = None) -> Result:
# Config was never registered in database
previous_config = {}

determined_topology = determine_target_topology_at_bootstrap()
determined_topology = determine_target_topology(self.client)

if self.topology == "auto":
self.topology = previous_config.get("topology", determined_topology)
LOG.debug(f"Bootstrap: topology {self.topology}")
LOG.debug(f"topology {self.topology}")

if self.database == "auto":
self.database = previous_config.get("database", determined_topology)
LOG.debug(f"Bootstrap: database topology {self.database}")

if self.database == "large":
# multi and large are the same
self.database = "multi"
LOG.debug(f"database topology {self.database}")
if (database := previous_config.get("database")) and database != self.database:
return Result(
ResultType.FAILED,
"Database topology cannot be changed, please destroy and re-bootstrap",
)

is_not_compatible = self.database == "single" and self.topology == "large"
if not self.force and is_not_compatible:
return Result(
ResultType.FAILED,
(
"Cannot deploy control plane to large with single database,"
" use -f/--force to override"
),
)

return Result(ResultType.COMPLETED)

def run(self, status: Optional[Status] = None) -> Result:
"""Execute configuration using terraform."""
# TODO(jamespage):
# This needs to evolve to add support for things like:
# - Enabling HA
# - Enabling/disabling specific services
# - Switch channels for the charmed operators
update_config(
self.client,
TOPOLOGY_KEY,
{"topology": self.topology, "database": self.database},
)

extra_tfvars = self.get_storage_tfvars()
self.update_status(status, "fetching cluster nodes")
control_nodes = self.client.cluster.list_nodes_by_role("control")
storage_nodes = self.client.cluster.list_nodes_by_role("storage")

self.update_status(status, "computing deployment sizing")
extra_tfvars = self.get_storage_tfvars(storage_nodes)
extra_tfvars.update(
{
"model": self.model,
"cloud": self.cloud,
"credential": f"{self.cloud}{CREDENTIAL_SUFFIX}",
"config": {"workload-storage": MICROK8S_DEFAULT_STORAGECLASS},
"many-mysql": self.database == "multi",
"ha-scale": compute_ha_scale(self.topology, len(control_nodes)),
"os-api-scale": compute_os_api_scale(self.topology, len(control_nodes)),
"ingress-scale": compute_ingress_scale(
self.topology, len(control_nodes)
),
}
)
self.update_status(status, "deploying services")
try:
self.update_status(status, "deploying services")
self.manifest.update_tfvars_and_apply_tf(
self.client,
tfplan=self.tfplan,
Expand Down Expand Up @@ -262,125 +269,6 @@ def run(self, status: Optional[Status] = None) -> Result:
return Result(ResultType.COMPLETED)


class ResizeControlPlaneStep(BaseStep, JujuStepHelper):
    """Re-apply the OpenStack terraform plan with recomputed scale values."""

    _CONFIG = CONFIG_KEY

    def __init__(
        self,
        client: Client,
        manifest: Manifest,
        jhelper: JujuHelper,
        topology: str,
        machine_model: str,
        force: bool = False,
    ):
        """Store the collaborators and options used by the step.

        :param client: Cluster client used to read/write config and list nodes.
        :param manifest: Manifest used to update tfvars and apply terraform.
        :param jhelper: Juju helper used to query and wait on the model.
        :param topology: Requested topology; "auto" lets ``run`` determine it.
        :param machine_model: Model name used for the ceph offer URL and OSD
                              count lookup.
        :param force: When True, skip the single-database/large-topology
                      compatibility check.
        """
        super().__init__(
            "Resizing OpenStack Control Plane",
            "Resizing OpenStack Control Plane to match appropriate topology",
        )
        self.client = client
        self.manifest = manifest
        self.jhelper = jhelper
        self.topology = topology
        self.machine_model = machine_model
        self.force = force
        self.model = OPENSTACK_MODEL
        self.tfplan = "openstack-plan"

    def is_skip(self, status: Optional[Status] = None) -> Result:
        """Check that the OpenStack model exists before resizing.

        :return: ResultType.FAILED when the model lookup raises
                 ModelNotFoundException, ResultType.COMPLETED otherwise
        """
        try:
            run_sync(self.jhelper.get_model(OPENSTACK_MODEL))
        except ModelNotFoundException:
            return Result(
                ResultType.FAILED,
                "OpenStack control plane is not deployed, cannot resize",
            )
        else:
            return Result(ResultType.COMPLETED)

    def run(self, status: Optional[Status] = None) -> Result:
        """Recompute scale tfvars, apply terraform and wait for the apps.

        :return: ResultType.FAILED on an incompatible topology request, a
                 terraform error, or a wait failure/timeout;
                 ResultType.COMPLETED otherwise
        """
        self.update_status(status, "fetching configuration")
        stored_topology = read_config(self.client, TOPOLOGY_KEY)
        target = (
            determine_target_topology(self.client)
            if self.topology == "auto"
            else self.topology
        )
        stored_topology["topology"] = target

        # The stored database topology is read unconditionally, before the
        # force flag is consulted.
        incompatible = stored_topology["database"] == "single" and target == "large"
        if incompatible and not self.force:
            return Result(
                ResultType.FAILED,
                (
                    "Cannot resize control plane to large with single database,"
                    " use -f/--force to override"
                ),
            )

        update_config(self.client, TOPOLOGY_KEY, stored_topology)

        control_nodes = self.client.cluster.list_nodes_by_role("control")
        storage_nodes = self.client.cluster.list_nodes_by_role("storage")
        control_count = len(control_nodes)
        # NOTE(jamespage)
        # When dedicated control nodes are used, ceph is not enabled during
        # bootstrap - however storage nodes may be added later so re-assess
        extra_tfvars = {
            "ha-scale": compute_ha_scale(target, control_count),
            "os-api-scale": compute_os_api_scale(target, control_count),
            "ingress-scale": compute_ingress_scale(target, control_count),
            "enable-ceph": len(storage_nodes) > 0,
            "ceph-offer-url": f"admin/{self.machine_model}.{microceph.APPLICATION}",
            "ceph-osd-replication-count": compute_ceph_replica_scale(
                run_sync(_get_number_of_osds(self.jhelper, self.machine_model))
            ),
        }

        self.update_status(status, "scaling services")
        try:
            self.manifest.update_tfvars_and_apply_tf(
                self.client,
                tfplan=self.tfplan,
                tfvar_config=self._CONFIG,
                override_tfvars=extra_tfvars,
            )
        except TerraformException as tf_error:
            LOG.exception("Error resizing control plane")
            return Result(ResultType.FAILED, str(tf_error))

        # Remove cinder-ceph from apps to wait on if ceph is not enabled
        apps = run_sync(self.jhelper.get_application_names(self.model))
        if not storage_nodes and "cinder-ceph" in apps:
            apps.remove("cinder-ceph")

        # Background task that keeps the status updated while waiting; it is
        # cancelled in ``finally`` whatever the outcome of the wait.
        task = run_sync(update_status_background(self, apps, status))
        try:
            run_sync(
                self.jhelper.wait_until_active(
                    self.model,
                    apps,
                    timeout=OPENSTACK_DEPLOY_TIMEOUT,
                )
            )
        except (JujuWaitException, TimeoutException) as wait_error:
            LOG.warning(str(wait_error))
            return Result(ResultType.FAILED, str(wait_error))
        finally:
            if not task.done():
                task.cancel()

        return Result(ResultType.COMPLETED)


class PatchLoadBalancerServicesStep(BaseStep):
SERVICES = ["traefik", "traefik-public", "rabbitmq", "ovn-relay"]
MODEL = OPENSTACK_MODEL
Expand Down
5 changes: 3 additions & 2 deletions sunbeam-python/sunbeam/commands/resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from rich.console import Console

from sunbeam.clusterd.client import Client
from sunbeam.commands.openstack import ResizeControlPlaneStep
from sunbeam.commands.openstack import DeployControlPlaneStep
from sunbeam.commands.terraform import TerraformInitStep
from sunbeam.jobs.common import click_option_topology, run_plan
from sunbeam.jobs.deployment import Deployment
Expand Down Expand Up @@ -47,11 +47,12 @@ def resize(ctx: click.Context, topology: str, force: bool = False) -> None:
jhelper = JujuHelper(deployment.get_connected_controller())
plan = [
TerraformInitStep(manifest_obj.get_tfhelper(tfplan)),
ResizeControlPlaneStep(
DeployControlPlaneStep(
client,
manifest_obj,
jhelper,
topology,
"auto",
deployment.infrastructure_model,
force=force,
),
Expand Down
6 changes: 4 additions & 2 deletions sunbeam-python/sunbeam/plugins/interface/v1/openstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from sunbeam.commands.juju import JujuStepHelper
from sunbeam.commands.openstack import (
OPENSTACK_MODEL,
determine_target_topology_at_bootstrap,
TOPOLOGY_KEY,
)
from sunbeam.commands.terraform import TerraformException, TerraformInitStep
from sunbeam.jobs.checks import VerifyBootstrappedCheck
Expand Down Expand Up @@ -210,7 +210,9 @@ def get_tfvar_config_key(self) -> str:
def get_database_topology(self) -> str:
    """Return the database topology recorded for the cluster.

    The value is the "database" entry of the configuration stored under
    ``TOPOLOGY_KEY``, read through the deployment's client.
    """
    # Database topology can be set only during bootstrap and cannot be changed,
    # so the stored value is used instead of re-deriving one from the host.
    client = self.deployment.get_client()
    topology = read_config(client, TOPOLOGY_KEY)
    return topology["database"]

def set_application_timeout_on_enable(self) -> int:
"""Set Application Timeout on enabling the plugin.
Expand Down
15 changes: 8 additions & 7 deletions sunbeam-python/sunbeam/provider/maas/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,16 +122,17 @@ def from_deployment(cls, deployment: Deployment) -> "MaasClient":
)


def _to_root_disk(device: dict, size: int = 0) -> dict:
"""Convert device to root disk.
Device can be a blockdevice or a partition.
"""
def _to_root_disk(device: dict, partition: dict | None = None) -> dict:
"""Convert device to root disk."""
if partition:
size = partition["size"]
else:
size = device["size"]
root_disk = {
"name": device["name"],
"tags": device["tags"],
"root_partition": {
"size": size or device["size"],
"size": size,
},
}
return root_disk
Expand Down Expand Up @@ -160,7 +161,7 @@ def _convert_raw_machine(machine_raw: dict) -> dict:
for partition in blockdevice.get("partitions", []):
fs = partition.get("filesystem")
if fs.get("label") == "root":
root_disk = _to_root_disk(partition, size=partition["size"])
root_disk = _to_root_disk(blockdevice, partition)

spaces = []
nics = []
Expand Down
6 changes: 4 additions & 2 deletions sunbeam-python/sunbeam/provider/maas/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def deploy(
ctx: click.Context,
manifest: Path | None = None,
accept_defaults: bool = False,
topology: str = "auto",
) -> None:
"""Deploy the MAAS-backed deployment.
Expand Down Expand Up @@ -520,8 +521,9 @@ def deploy(
client,
manifest,
jhelper,
"auto",
"auto", # TODO(gboutry): use the right values
topology,
# maas deployment always deploys multiple databases
"large",
deployment.infrastructure_model,
)
)
Expand Down
Loading

0 comments on commit e1b3abf

Please sign in to comment.