diff --git a/aviso-server/monitoring/aviso_monitoring/config.py b/aviso-server/monitoring/aviso_monitoring/config.py index cc9ae9b..a6ded4d 100644 --- a/aviso-server/monitoring/aviso_monitoring/config.py +++ b/aviso-server/monitoring/aviso_monitoring/config.py @@ -27,6 +27,7 @@ def __init__( aviso_auth_reporter=None, etcd_reporter=None, prometheus_reporter=None, + kube_state_metrics=None, ): try: # we build the configuration in priority order from the lower to the higher @@ -41,6 +42,7 @@ def __init__( self.aviso_auth_reporter = aviso_auth_reporter self.etcd_reporter = etcd_reporter self.prometheus_reporter = prometheus_reporter + self.kube_state_metrics = kube_state_metrics logger.debug("Loading configuration completed") @@ -113,6 +115,8 @@ def _create_default_config() -> Dict: }, } + kube_state_metrics = {"ssl_enabled": False, "token": None} + # main config config = {} config["udp_server"] = udp_server @@ -121,6 +125,7 @@ def _create_default_config() -> Dict: config["aviso_auth_reporter"] = aviso_auth_reporter config["etcd_reporter"] = etcd_reporter config["prometheus_reporter"] = prometheus_reporter + config["kube_state_metrics"] = kube_state_metrics return config def _read_env_variables(self) -> Dict: @@ -179,7 +184,7 @@ def aviso_rest_reporter(self, aviso_rest_reporter): assert ar is not None, "aviso_rest_reporter has not been configured" assert ar.get("tlms") is not None, "aviso_rest_reporter tlms has not been configured" assert ar.get("enabled") is not None, "aviso_rest_reporter enabled has not been configured" - if type(ar["enabled"]) is str: + if isinstance(ar["enabled"], str): ar["enabled"] = ar["enabled"].casefold() == "true".casefold() assert ar.get("frequency") is not None, "aviso_rest_reporter frequency has not been configured" self._aviso_rest_reporter = ar @@ -199,7 +204,7 @@ def aviso_auth_reporter(self, aviso_auth_reporter): assert aa is not None, "aviso_auth_reporter has not been configured" assert aa.get("tlms") is not None, "aviso_auth_reporter tlms has not been configured" assert aa.get("enabled") is not None, "aviso_auth_reporter enabled has not been configured" - if type(aa["enabled"]) is str: + if isinstance(aa["enabled"], str): aa["enabled"] = aa["enabled"].casefold() == "true".casefold() assert aa.get("frequency") is not None, "aviso_auth_reporter frequency has not been configured" self._aviso_auth_reporter = aa @@ -219,7 +224,7 @@ def etcd_reporter(self, etcd_reporter): assert e is not None, "etcd_reporter has not been configured" assert e.get("tlms") is not None, "etcd_reporter tlms has not been configured" assert e.get("enabled") is not None, "etcd_reporter enabled has not been configured" - if type(e["enabled"]) is str: + if isinstance(e["enabled"], str): e["enabled"] = e["enabled"].casefold() == "true".casefold() assert e.get("frequency") is not None, "etcd_reporter frequency has not been configured" assert e.get("member_urls") is not None, "etcd_reporter member_urls has not been configured" @@ -241,11 +246,29 @@ def prometheus_reporter(self, prometheus_reporter): assert pr is not None, "prometheus_reporter has not been configured" assert pr.get("host") is not None, "prometheus_reporter host has not been configured" assert pr.get("enabled") is not None, "prometheus_reporter enabled has not been configured" - if type(pr["enabled"]) is str: + if isinstance(pr["enabled"], str): pr["enabled"] = pr["enabled"].casefold() == "true".casefold() assert pr.get("port") is not None, "prometheus_reporter port has not been configured" self._prometheus_reporter = pr + @property + def kube_state_metrics(self): + return self._kube_state_metrics + + @kube_state_metrics.setter + def kube_state_metrics(self, kube_state_metrics): + ksm = self._config.get("kube_state_metrics") + if kube_state_metrics is not None and ksm is not None: + Config.deep_update(ksm, kube_state_metrics) + elif kube_state_metrics is not None: + ksm = kube_state_metrics + # verify is valid + assert ksm is not None, "kube_state_metrics has not been configured" + assert ksm.get("ssl_enabled") is not None, "kube_state_metrics ssl_enabled has not been configured" + if ksm["ssl_enabled"]: + assert ksm.get("token") is not None, "kube_state_metrics token has not been configured" + self._kube_state_metrics = ksm + def __str__(self): config_string = ( f"udp_server: {self.udp_server}" @@ -254,6 +277,7 @@ def __str__(self): + f", aviso_auth_reporter: {self.aviso_auth_reporter}" + f", etcd_reporter: {self.etcd_reporter}" + f", prometheus_reporter: {self.prometheus_reporter}" + + f", kube_state_metrics: {self.kube_state_metrics}" ) return config_string diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py index 7a118f8..6526930 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_auth_reporter.py @@ -201,7 +201,12 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def metric(self): - pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-auth-\w+"}' + namespace = self.get_k8s_pod_namespace() + if not namespace: + logger.warning("Could not determine the pod's namespace.") + namespace = "aviso" + + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-auth"}}' # defaults status = 0 message = "All pods available" @@ -243,3 +248,30 @@ def metric(self): m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"} logger.debug(f"{self.metric_name} metric: {m_status}") return m_status + + @staticmethod + def get_k8s_pod_namespace(): + """ + Retrieves the Kubernetes (k8s) namespace in which the current pod is running. + + This function reads the namespace name from a file that Kubernetes automatically + mounts inside the pod. This file is typically located at: + '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + + Returns: + str: The namespace in which the pod is running. If the namespace cannot be determined + (e.g., the file doesn't exist or the pod is not running in a k8s environment), + the function returns None. + """ + namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + try: + with open(namespace_file, "r") as file: + return file.read().strip() + except FileNotFoundError: + logger.error(f"Namespace file not found: {namespace_file}") + except IOError as e: + logger.error(f"I/O error occurred when reading namespace file: {e}") + except Exception as e: + logger.exception(f"Unexpected error occurred when reading namespace file: {e}") + + return None diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py index da520a8..384f59d 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/aviso_rest_reporter.py @@ -18,6 +18,8 @@ def __init__(self, config, *args, **kwargs): self.frequency = aviso_rest_config["frequency"] self.enabled = aviso_rest_config["enabled"] self.tlms = aviso_rest_config["tlms"] + # configure the metric vars once only here + OpsviewReporter.configure_metric_vars(config) super().__init__(config, *args, **kwargs) def process_messages(self): @@ -182,7 +184,12 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def metric(self): - pattern = r'kube_deployment_status_replicas{namespace="aviso",deployment="aviso-rest-\w+"}' + namespace = self.get_k8s_pod_namespace() + if not namespace: + logger.warning("Could not determine the pod's namespace.") + namespace = "aviso" + + pattern = rf'kube_deployment_status_replicas{{namespace="{namespace}",deployment="aviso-rest"}}' # defaults status = 0 message = "All pods available" @@ -224,3 +231,30 @@ def metric(self): m_status = {"name": self.metric_name, "status": 1, "message": "Metric could not be retrieved"} logger.debug(f"{self.metric_name} metric: {m_status}") return m_status + + @staticmethod + def get_k8s_pod_namespace(): + """ + Retrieves the Kubernetes (k8s) namespace in which the current pod is running. + + This function reads the namespace name from a file that Kubernetes automatically + mounts inside the pod. This file is typically located at: + '/var/run/secrets/kubernetes.io/serviceaccount/namespace' + + Returns: + str: The namespace in which the pod is running. If the namespace cannot be determined + (e.g., the file doesn't exist or the pod is not running in a k8s environment), + the function returns None. + """ + namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + try: + with open(namespace_file, "r") as file: + return file.read().strip() + except FileNotFoundError: + logger.error(f"Namespace file not found: {namespace_file}") + except IOError as e: + logger.error(f"I/O error occurred when reading namespace file: {e}") + except Exception as e: + logger.exception(f"Unexpected error occurred when reading namespace file: {e}") + + return None diff --git a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py index 09ba543..1dc3b6a 100644 --- a/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py +++ b/aviso-server/monitoring/aviso_monitoring/reporter/opsview_reporter.py @@ -19,12 +19,25 @@ class OpsviewReporter(ABC): + metric_ssl_enabled = False + metric_token = "" + def __init__(self, config: Config, msg_receiver=None): urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) self.monitor_servers = config.monitor_servers self.msg_receiver = msg_receiver self.token = {} + @classmethod + def configure_metric_vars(cls, config): + """ + Configures the class attributes based on the provided config. + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + if config.kube_state_metrics["ssl_enabled"]: + cls.metric_ssl_enabled = True + cls.metric_token = config.kube_state_metrics["token"] + def ms_authenticate(self, m_server): """ This method authenticate to the monitoring server @@ -199,7 +212,8 @@ def aggregate_unique_counter_tlms(tlms): } return agg_tlm - def retrieve_metrics(metric_servers, req_timeout): + @classmethod + def retrieve_metrics(cls, metric_servers, req_timeout): """ This methods retrieves the metrics provided by specific metric servers using a Prometheus interface. """ @@ -207,8 +221,11 @@ def retrieve_metrics(metric_servers, req_timeout): for u in metric_servers: url = u + "/metrics" logger.debug(f"Retrieving metrics from {url}...") + headers = {} try: - resp = requests.get(url, verify=False, timeout=req_timeout) + if cls.metric_ssl_enabled: + headers["Authorization"] = f"Bearer {cls.metric_token}" + resp = requests.get(url, verify=False, timeout=req_timeout, headers=headers) except Exception as e: logger.exception(f"Not able to get metrics from {url}, error {e}") raw_tlms[u] = None