From 99846af29d1c3a0d9de4a52dbbad65240c3b6ed2 Mon Sep 17 00:00:00 2001 From: jean-christophe81 <98889244+jean-christophe81@users.noreply.github.com> Date: Wed, 8 Jan 2025 17:56:50 +0100 Subject: [PATCH] fix defunct process and grpc coredump (#1978) fix(grpc layers): coredump on process shutdown fix(agent): hang on exec issue --- agent/inc/com/centreon/agent/bireactor.hh | 7 +- agent/src/bireactor.cc | 11 +-- agent/src/main.cc | 3 +- .../inc/com/centreon/broker/grpc/stream.hh | 7 +- broker/grpc/src/stream.cc | 13 ++-- .../detail/centreon_posix_process_launcher.hh | 4 ++ .../centreon_agent/agent_impl.hh | 2 +- .../src/centreon_agent/agent_impl.cc | 22 +++--- tests/broker-engine/cma.robot | 69 +++++++++++++++++-- tests/resources/Common.py | 17 +++++ tests/resources/resources.resource | 11 +++ 11 files changed, 137 insertions(+), 29 deletions(-) diff --git a/agent/inc/com/centreon/agent/bireactor.hh b/agent/inc/com/centreon/agent/bireactor.hh index 16af5594c81..1cb4347a286 100644 --- a/agent/inc/com/centreon/agent/bireactor.hh +++ b/agent/inc/com/centreon/agent/bireactor.hh @@ -28,7 +28,12 @@ class bireactor : public bireactor_class, public std::enable_shared_from_this> { private: - static std::set> _instances; + /** + * @brief we store reactor instances in this container until OnDone is called + * by grpc layers. We allocate this container and never free this because + * threads terminate in unknown order. + */ + static std::set>* _instances; static std::mutex _instances_m; bool _write_pending; diff --git a/agent/src/bireactor.cc b/agent/src/bireactor.cc index 6e396a0bf12..6a7f07dd4e8 100644 --- a/agent/src/bireactor.cc +++ b/agent/src/bireactor.cc @@ -29,8 +29,9 @@ using namespace com::centreon::agent; * @tparam bireactor_class */ template -std::set>> - bireactor::_instances; +std::set>>* + bireactor::_instances = + new std::set>>; template std::mutex bireactor::_instances_m; @@ -61,7 +62,7 @@ template void bireactor::register_stream( const std::shared_ptr& strm) { std::lock_guard l(_instances_m); - _instances.insert(strm); + _instances->insert(strm); } template @@ -162,7 +163,7 @@ void bireactor::OnDone() { std::lock_guard l(_instances_m); SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone() to {}", static_cast(me.get()), peer); - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase(std::static_pointer_cast>(me)); }); } @@ -186,7 +187,7 @@ void bireactor::OnDone(const ::grpc::Status& status) { static_cast(me.get()), peer, status.error_message(), status.error_details()); } - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase(std::static_pointer_cast>(me)); }); } diff --git a/agent/src/main.cc b/agent/src/main.cc index 42ca20cd889..07719168736 100644 --- a/agent/src/main.cc +++ b/agent/src/main.cc @@ -164,7 +164,8 @@ int main(int argc, char* argv[]) { g_logger->flush_on(spdlog::level::warn); - spdlog::flush_every(std::chrono::seconds(1)); + // don't use it because spdlog mutex would hang child process + // spdlog::flush_every(std::chrono::seconds(1)); SPDLOG_LOGGER_INFO(g_logger, "centreon-monitoring-agent start, you can decrease log " diff --git a/broker/grpc/inc/com/centreon/broker/grpc/stream.hh b/broker/grpc/inc/com/centreon/broker/grpc/stream.hh index d6fa2b17e97..79a1dd38d35 100644 --- a/broker/grpc/inc/com/centreon/broker/grpc/stream.hh +++ b/broker/grpc/inc/com/centreon/broker/grpc/stream.hh @@ -79,7 +79,12 @@ template class stream : public io::stream, public bireactor_class, public std::enable_shared_from_this> { - static std::set> _instances; + /** + * @brief we store reactor instances in this container until OnDone is called + * by grpc layers. We allocate this container and never free this because + * threads terminate in unknown order. + */ + static std::set>* _instances; static std::mutex _instances_m; using read_queue = std::queue; diff --git a/broker/grpc/src/stream.cc b/broker/grpc/src/stream.cc index 282ae57582b..6ab028ac28b 100644 --- a/broker/grpc/src/stream.cc +++ b/broker/grpc/src/stream.cc @@ -104,8 +104,9 @@ const std::string com::centreon::broker::grpc::authorization_header( * @tparam bireactor_class */ template -std::set>> - stream::_instances; +std::set>>* + stream::_instances = + new std::set>>; template std::mutex stream::_instances_m; @@ -149,7 +150,7 @@ template void stream::register_stream( const std::shared_ptr>& strm) { std::lock_guard l(_instances_m); - _instances.insert(strm); + _instances->insert(strm); } /** @@ -375,7 +376,8 @@ void stream::OnDone() { std::lock_guard l(_instances_m); SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone()", static_cast(me.get())); - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase( + std::static_pointer_cast>(me)); }); } @@ -402,7 +404,8 @@ void stream::OnDone(const ::grpc::Status& status) { SPDLOG_LOGGER_DEBUG(logger, "{:p} client::OnDone({}) {}", static_cast(me.get()), status.error_message(), status.error_details()); - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase( + std::static_pointer_cast>(me)); }); } diff --git a/common/process/inc/com/centreon/common/process/detail/centreon_posix_process_launcher.hh b/common/process/inc/com/centreon/common/process/detail/centreon_posix_process_launcher.hh index 79aa5a6ce8a..69cc9868e83 100644 --- a/common/process/inc/com/centreon/common/process/detail/centreon_posix_process_launcher.hh +++ b/common/process/inc/com/centreon/common/process/detail/centreon_posix_process_launcher.hh @@ -190,6 +190,10 @@ struct centreon_posix_default_launcher { BOOST_PROCESS_V2_ASSIGN_EC(ec, child_error, system_category()) if (ec) { + if (pid > 0) { + ::kill(pid, SIGKILL); + ::waitpid(pid, nullptr, 0); + } detail::on_error(*this, executable, argv, ec, inits...); return basic_process{exec}; } diff --git a/engine/modules/opentelemetry/inc/com/centreon/engine/modules/opentelemetry/centreon_agent/agent_impl.hh b/engine/modules/opentelemetry/inc/com/centreon/engine/modules/opentelemetry/centreon_agent/agent_impl.hh index 21695050119..614cd4490b6 100644 --- a/engine/modules/opentelemetry/inc/com/centreon/engine/modules/opentelemetry/centreon_agent/agent_impl.hh +++ b/engine/modules/opentelemetry/inc/com/centreon/engine/modules/opentelemetry/centreon_agent/agent_impl.hh @@ -53,7 +53,7 @@ class agent_impl std::shared_ptr _last_sent_config ABSL_GUARDED_BY(_protect); - static std::set> _instances + static std::set>* _instances ABSL_GUARDED_BY(_instances_m); static absl::Mutex _instances_m; diff --git a/engine/modules/opentelemetry/src/centreon_agent/agent_impl.cc b/engine/modules/opentelemetry/src/centreon_agent/agent_impl.cc index c57e7cc822f..c5ca22158f0 100644 --- a/engine/modules/opentelemetry/src/centreon_agent/agent_impl.cc +++ b/engine/modules/opentelemetry/src/centreon_agent/agent_impl.cc @@ -36,8 +36,9 @@ using namespace com::centreon::engine::modules::opentelemetry::centreon_agent; * @tparam bireactor_class */ template -std::set>> - agent_impl::_instances; +std::set>>* + agent_impl::_instances = + new std::set>>; template absl::Mutex agent_impl::_instances_m; @@ -119,7 +120,7 @@ template void agent_impl::all_agent_calc_and_send_config_if_needed( const agent_config::pointer& new_conf) { absl::MutexLock l(&_instances_m); - for (auto& instance : _instances) { + for (auto& instance : *_instances) { instance->calc_and_send_config_if_needed(new_conf); } } @@ -262,7 +263,7 @@ template void agent_impl::register_stream( const std::shared_ptr& strm) { absl::MutexLock l(&_instances_m); - _instances.insert(strm); + _instances->insert(strm); } /** @@ -379,7 +380,8 @@ void agent_impl::OnDone() { absl::MutexLock l(&_instances_m); SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone()", static_cast(me.get())); - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase( + std::static_pointer_cast>(me)); }); } @@ -410,7 +412,8 @@ void agent_impl::OnDone(const ::grpc::Status& status) { static_cast(me.get()), status.error_message(), status.error_details()); } - _instances.erase(std::static_pointer_cast>(me)); + _instances->erase( + std::static_pointer_cast>(me)); }); } @@ -432,12 +435,13 @@ void agent_impl::shutdown() { */ template void agent_impl::shutdown_all() { - std::set> to_shutdown; + std::set>* to_shutdown; { absl::MutexLock l(&_instances_m); - to_shutdown = std::move(_instances); + to_shutdown = _instances; + _instances = new std::set>>; } - for (std::shared_ptr conn : to_shutdown) { + for (std::shared_ptr conn : *to_shutdown) { conn->shutdown(); } } diff --git a/tests/broker-engine/cma.robot b/tests/broker-engine/cma.robot index e79a806e01b..8e486ee79e7 100644 --- a/tests/broker-engine/cma.robot +++ b/tests/broker-engine/cma.robot @@ -360,7 +360,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_CPU Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -428,7 +428,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_STORAGE Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -496,7 +496,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_UPTIME Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -562,7 +562,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_MEMORY Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -629,7 +629,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_SERVICE Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -698,7 +698,7 @@ BEOTEL_CENTREON_AGENT_CHECK_HEALTH Ctn Engine Config Set Value 0 log_level_checks trace - Ctn Clear Db metrics + Ctn Clear Metrics Ctn Config Broker central Ctn Config Broker module @@ -795,6 +795,63 @@ BEOTEL_CENTREON_AGENT_CEIP Should Be True ${result} agent_information table not updated as expected +BEOTEL_CENTREON_AGENT_LINUX_NO_DEFUNCT_PROCESS + [Documentation] agent check host and we expect to get it in check result + [Tags] broker engine opentelemetry MON-156455 + + ${run_env} Ctn Run Env + Pass Execution If "${run_env}" == "WSL" "This test is only for linux agent version" + + Ctn Config Engine ${1} ${2} ${2} + Ctn Add Otl ServerModule + ... 0 + ... {"otel_server":{"host": "0.0.0.0","port": 4317},"max_length_grpc_log":0, "centreon_agent":{"check_interval":10, "export_period":10}} + Ctn Config Add Otl Connector + ... 0 + ... OTEL connector + ... opentelemetry --processor=centreon_agent --extractor=attributes --host_path=resource_metrics.resource.attributes.host.name --service_path=resource_metrics.resource.attributes.service.name + Ctn Engine Config Replace Value In Hosts ${0} host_1 check_command otel_check_icmp + Ctn Set Hosts Passive ${0} host_1 + + Ctn Engine Config Add Command ${0} otel_check_icmp turlututu OTEL connector + + Ctn Engine Config Set Value 0 log_level_checks trace + + Ctn Config Broker central + Ctn Config Broker module + Ctn Config Broker rrd + Ctn Config Centreon Agent + Ctn Broker Config Log central sql trace + + Ctn Config BBDO3 1 + Ctn Clear Retention + + ${start} Get Current Date + Ctn Start Broker + Ctn Start Engine + Ctn Start Agent + + # Let's wait for the otel server start + Ctn Wait For Otel Server To Be Ready ${start} + Sleep 30s + + ${nb_agent_process} Ctn Get Nb Process centagent + Should Be True ${nb_agent_process} >= 1 and ${nb_agent_process} <= 2 "There should be no centagent defunct process" + + Log To Console Stop agent + Ctn Kindly Stop Agent + + FOR ${i} IN RANGE 1 10 + ${nb_agent_process} Ctn Get Nb Process centagent + IF ${nb_agent_process} == 0 + Exit For Loop + ELSE + Sleep 2s + END + END + + Should Be True ${nb_agent_process} == 0 "There should be no centagent process" + *** Keywords *** Ctn Create Cert And Init diff --git a/tests/resources/Common.py b/tests/resources/Common.py index 76da0cd8c95..1a2ec8dce4f 100644 --- a/tests/resources/Common.py +++ b/tests/resources/Common.py @@ -2055,3 +2055,20 @@ def ctn_check_agent_information(total_nb_agent: int, nb_poller:int, timeout: int logger.console(f"unexpected result: {result}") return False + +def ctn_get_nb_process(exe:str): + """ + ctn_get_nb_process + + get the number of process with a specific executable + Args: + exe: executable to search + Returns: number of process + """ + + counter = 0 + + for p in psutil.process_iter(): + if exe in p.name() or exe in ' '.join(p.cmdline()): + counter += 1 + return counter diff --git a/tests/resources/resources.resource b/tests/resources/resources.resource index ff7da97a671..a0f8c04fe8a 100644 --- a/tests/resources/resources.resource +++ b/tests/resources/resources.resource @@ -107,6 +107,13 @@ Ctn Kindly Stop Broker ... If bbdo2 is set to True, the rrd broker will not be checked for RRD logs ... since there are still bugs in this context. [Arguments] ${only_central}=False + + #in some case tear down calls this function whereas broker has not been started + ${b1_obj} Get Process Object b1 + IF "${b1_obj}" == "${None}" + Log To Console "central broker not started" + RETURN + END Send Signal To Process SIGTERM b1 IF not ${only_central} Send Signal To Process SIGTERM b2 ${result} Wait For Process b1 timeout=60s @@ -228,6 +235,10 @@ Ctn Stop Custom Engine Ctn Stop Engine Log To Console "stop centengine" ${count} Ctn Get Engines Count + IF ${count} == 0 + Log To Console "no engine to stop" + RETURN + END FOR ${idx} IN RANGE 0 ${count} Send Signal To Process SIGTERM e${idx} END