Skip to content

Commit

Permalink
fix defunct process and grpc coredump (#1978)
Browse files Browse the repository at this point in the history
fix(grpc layers): coredump on process shutdown
fix(agent): hang on exec issue
  • Loading branch information
jean-christophe81 authored Jan 8, 2025
1 parent 294b720 commit 99846af
Show file tree
Hide file tree
Showing 11 changed files with 137 additions and 29 deletions.
7 changes: 6 additions & 1 deletion agent/inc/com/centreon/agent/bireactor.hh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,12 @@ class bireactor
: public bireactor_class,
public std::enable_shared_from_this<bireactor<bireactor_class>> {
private:
static std::set<std::shared_ptr<bireactor>> _instances;
/**
* @brief we store reactor instances in this container until OnDone is called
* by grpc layers. We allocate this container and never free this because
* threads terminate in unknown order.
*/
static std::set<std::shared_ptr<bireactor>>* _instances;
static std::mutex _instances_m;

bool _write_pending;
Expand Down
11 changes: 6 additions & 5 deletions agent/src/bireactor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ using namespace com::centreon::agent;
* @tparam bireactor_class
*/
template <class bireactor_class>
std::set<std::shared_ptr<bireactor<bireactor_class>>>
bireactor<bireactor_class>::_instances;
std::set<std::shared_ptr<bireactor<bireactor_class>>>*
bireactor<bireactor_class>::_instances =
new std::set<std::shared_ptr<bireactor<bireactor_class>>>;

template <class bireactor_class>
std::mutex bireactor<bireactor_class>::_instances_m;
Expand Down Expand Up @@ -61,7 +62,7 @@ template <class bireactor_class>
void bireactor<bireactor_class>::register_stream(
const std::shared_ptr<bireactor>& strm) {
std::lock_guard l(_instances_m);
_instances.insert(strm);
_instances->insert(strm);
}

template <class bireactor_class>
Expand Down Expand Up @@ -162,7 +163,7 @@ void bireactor<bireactor_class>::OnDone() {
std::lock_guard l(_instances_m);
SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone() to {}",
static_cast<void*>(me.get()), peer);
_instances.erase(std::static_pointer_cast<bireactor<bireactor_class>>(me));
_instances->erase(std::static_pointer_cast<bireactor<bireactor_class>>(me));
});
}

Expand All @@ -186,7 +187,7 @@ void bireactor<bireactor_class>::OnDone(const ::grpc::Status& status) {
static_cast<void*>(me.get()), peer,
status.error_message(), status.error_details());
}
_instances.erase(std::static_pointer_cast<bireactor<bireactor_class>>(me));
_instances->erase(std::static_pointer_cast<bireactor<bireactor_class>>(me));
});
}

Expand Down
3 changes: 2 additions & 1 deletion agent/src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ int main(int argc, char* argv[]) {

g_logger->flush_on(spdlog::level::warn);

spdlog::flush_every(std::chrono::seconds(1));
// don't use it because spdlog mutex would hang child process
// spdlog::flush_every(std::chrono::seconds(1));

SPDLOG_LOGGER_INFO(g_logger,
"centreon-monitoring-agent start, you can decrease log "
Expand Down
7 changes: 6 additions & 1 deletion broker/grpc/inc/com/centreon/broker/grpc/stream.hh
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ template <class bireactor_class>
class stream : public io::stream,
public bireactor_class,
public std::enable_shared_from_this<stream<bireactor_class>> {
static std::set<std::shared_ptr<stream>> _instances;
/**
* @brief we store reactor instances in this container until OnDone is called
* by grpc layers. We allocate this container and never free this because
* threads terminate in unknown order.
*/
static std::set<std::shared_ptr<stream>>* _instances;
static std::mutex _instances_m;

using read_queue = std::queue<event_ptr>;
Expand Down
13 changes: 8 additions & 5 deletions broker/grpc/src/stream.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,9 @@ const std::string com::centreon::broker::grpc::authorization_header(
* @tparam bireactor_class
*/
template <class bireactor_class>
std::set<std::shared_ptr<stream<bireactor_class>>>
stream<bireactor_class>::_instances;
std::set<std::shared_ptr<stream<bireactor_class>>>*
stream<bireactor_class>::_instances =
new std::set<std::shared_ptr<stream<bireactor_class>>>;

template <class bireactor_class>
std::mutex stream<bireactor_class>::_instances_m;
Expand Down Expand Up @@ -149,7 +150,7 @@ template <class bireactor_class>
void stream<bireactor_class>::register_stream(
const std::shared_ptr<stream<bireactor_class>>& strm) {
std::lock_guard l(_instances_m);
_instances.insert(strm);
_instances->insert(strm);
}

/**
Expand Down Expand Up @@ -375,7 +376,8 @@ void stream<bireactor_class>::OnDone() {
std::lock_guard l(_instances_m);
SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone()",
static_cast<void*>(me.get()));
_instances.erase(std::static_pointer_cast<stream<bireactor_class>>(me));
_instances->erase(
std::static_pointer_cast<stream<bireactor_class>>(me));
});
}

Expand All @@ -402,7 +404,8 @@ void stream<bireactor_class>::OnDone(const ::grpc::Status& status) {
SPDLOG_LOGGER_DEBUG(logger, "{:p} client::OnDone({}) {}",
static_cast<void*>(me.get()),
status.error_message(), status.error_details());
_instances.erase(std::static_pointer_cast<stream<bireactor_class>>(me));
_instances->erase(
std::static_pointer_cast<stream<bireactor_class>>(me));
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ struct centreon_posix_default_launcher {
BOOST_PROCESS_V2_ASSIGN_EC(ec, child_error, system_category())

if (ec) {
if (pid > 0) {
::kill(pid, SIGKILL);
::waitpid(pid, nullptr, 0);
}
detail::on_error(*this, executable, argv, ec, inits...);
return basic_process<Executor>{exec};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class agent_impl
std::shared_ptr<agent::MessageToAgent> _last_sent_config
ABSL_GUARDED_BY(_protect);

static std::set<std::shared_ptr<agent_impl>> _instances
static std::set<std::shared_ptr<agent_impl>>* _instances
ABSL_GUARDED_BY(_instances_m);
static absl::Mutex _instances_m;

Expand Down
22 changes: 13 additions & 9 deletions engine/modules/opentelemetry/src/centreon_agent/agent_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@ using namespace com::centreon::engine::modules::opentelemetry::centreon_agent;
* @tparam bireactor_class
*/
template <class bireactor_class>
std::set<std::shared_ptr<agent_impl<bireactor_class>>>
agent_impl<bireactor_class>::_instances;
std::set<std::shared_ptr<agent_impl<bireactor_class>>>*
agent_impl<bireactor_class>::_instances =
new std::set<std::shared_ptr<agent_impl<bireactor_class>>>;

template <class bireactor_class>
absl::Mutex agent_impl<bireactor_class>::_instances_m;
Expand Down Expand Up @@ -119,7 +120,7 @@ template <class bireactor_class>
void agent_impl<bireactor_class>::all_agent_calc_and_send_config_if_needed(
const agent_config::pointer& new_conf) {
absl::MutexLock l(&_instances_m);
for (auto& instance : _instances) {
for (auto& instance : *_instances) {
instance->calc_and_send_config_if_needed(new_conf);
}
}
Expand Down Expand Up @@ -262,7 +263,7 @@ template <class bireactor_class>
void agent_impl<bireactor_class>::register_stream(
const std::shared_ptr<agent_impl>& strm) {
absl::MutexLock l(&_instances_m);
_instances.insert(strm);
_instances->insert(strm);
}

/**
Expand Down Expand Up @@ -379,7 +380,8 @@ void agent_impl<bireactor_class>::OnDone() {
absl::MutexLock l(&_instances_m);
SPDLOG_LOGGER_DEBUG(logger, "{:p} server::OnDone()",
static_cast<void*>(me.get()));
_instances.erase(std::static_pointer_cast<agent_impl<bireactor_class>>(me));
_instances->erase(
std::static_pointer_cast<agent_impl<bireactor_class>>(me));
});
}

Expand Down Expand Up @@ -410,7 +412,8 @@ void agent_impl<bireactor_class>::OnDone(const ::grpc::Status& status) {
static_cast<void*>(me.get()), status.error_message(),
status.error_details());
}
_instances.erase(std::static_pointer_cast<agent_impl<bireactor_class>>(me));
_instances->erase(
std::static_pointer_cast<agent_impl<bireactor_class>>(me));
});
}

Expand All @@ -432,12 +435,13 @@ void agent_impl<bireactor_class>::shutdown() {
*/
template <class bireactor_class>
void agent_impl<bireactor_class>::shutdown_all() {
std::set<std::shared_ptr<agent_impl>> to_shutdown;
std::set<std::shared_ptr<agent_impl>>* to_shutdown;
{
absl::MutexLock l(&_instances_m);
to_shutdown = std::move(_instances);
to_shutdown = _instances;
_instances = new std::set<std::shared_ptr<agent_impl<bireactor_class>>>;
}
for (std::shared_ptr<agent_impl> conn : to_shutdown) {
for (std::shared_ptr<agent_impl> conn : *to_shutdown) {
conn->shutdown();
}
}
Expand Down
69 changes: 63 additions & 6 deletions tests/broker-engine/cma.robot
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_CPU

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -428,7 +428,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_STORAGE

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -496,7 +496,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_UPTIME

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -562,7 +562,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_MEMORY

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -629,7 +629,7 @@ BEOTEL_CENTREON_AGENT_CHECK_NATIVE_SERVICE

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -698,7 +698,7 @@ BEOTEL_CENTREON_AGENT_CHECK_HEALTH

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Clear Db metrics
Ctn Clear Metrics

Ctn Config Broker central
Ctn Config Broker module
Expand Down Expand Up @@ -795,6 +795,63 @@ BEOTEL_CENTREON_AGENT_CEIP
Should Be True ${result} agent_information table not updated as expected


BEOTEL_CENTREON_AGENT_LINUX_NO_DEFUNCT_PROCESS
[Documentation] agent check host and we expect to get it in check result
[Tags] broker engine opentelemetry MON-156455
${run_env} Ctn Run Env
Pass Execution If "${run_env}" == "WSL" "This test is only for linux agent version"

Ctn Config Engine ${1} ${2} ${2}
Ctn Add Otl ServerModule
... 0
... {"otel_server":{"host": "0.0.0.0","port": 4317},"max_length_grpc_log":0, "centreon_agent":{"check_interval":10, "export_period":10}}
Ctn Config Add Otl Connector
... 0
... OTEL connector
... opentelemetry --processor=centreon_agent --extractor=attributes --host_path=resource_metrics.resource.attributes.host.name --service_path=resource_metrics.resource.attributes.service.name
Ctn Engine Config Replace Value In Hosts ${0} host_1 check_command otel_check_icmp
Ctn Set Hosts Passive ${0} host_1

Ctn Engine Config Add Command ${0} otel_check_icmp turlututu OTEL connector

Ctn Engine Config Set Value 0 log_level_checks trace

Ctn Config Broker central
Ctn Config Broker module
Ctn Config Broker rrd
Ctn Config Centreon Agent
Ctn Broker Config Log central sql trace

Ctn Config BBDO3 1
Ctn Clear Retention

${start} Get Current Date
Ctn Start Broker
Ctn Start Engine
Ctn Start Agent

# Let's wait for the otel server start
Ctn Wait For Otel Server To Be Ready ${start}
Sleep 30s

${nb_agent_process} Ctn Get Nb Process centagent
Should Be True ${nb_agent_process} >= 1 and ${nb_agent_process} <= 2 "There should be no centagent defunct process"

Log To Console Stop agent
Ctn Kindly Stop Agent

FOR ${i} IN RANGE 1 10
${nb_agent_process} Ctn Get Nb Process centagent
IF ${nb_agent_process} == 0
Exit For Loop
ELSE
Sleep 2s
END
END

Should Be True ${nb_agent_process} == 0 "There should be no centagent process"


*** Keywords ***
Ctn Create Cert And Init
Expand Down
17 changes: 17 additions & 0 deletions tests/resources/Common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2055,3 +2055,20 @@ def ctn_check_agent_information(total_nb_agent: int, nb_poller:int, timeout: int
logger.console(f"unexpected result: {result}")
return False


def ctn_get_nb_process(exe:str):
"""
ctn_get_nb_process
get the number of process with a specific executable
Args:
exe: executable to search
Returns: number of process
"""

counter = 0

for p in psutil.process_iter():
if exe in p.name() or exe in ' '.join(p.cmdline()):
counter += 1
return counter
11 changes: 11 additions & 0 deletions tests/resources/resources.resource
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,13 @@ Ctn Kindly Stop Broker
... If bbdo2 is set to True, the rrd broker will not be checked for RRD logs
... since there are still bugs in this context.
[Arguments] ${only_central}=False

#in some case tear down calls this function whereas broker has not been started
${b1_obj} Get Process Object b1
IF "${b1_obj}" == "${None}"
Log To Console "central broker not started"
RETURN
END
Send Signal To Process SIGTERM b1
IF not ${only_central} Send Signal To Process SIGTERM b2
${result} Wait For Process b1 timeout=60s
Expand Down Expand Up @@ -228,6 +235,10 @@ Ctn Stop Custom Engine
Ctn Stop Engine
Log To Console "stop centengine"
${count} Ctn Get Engines Count
IF ${count} == 0
Log To Console "no engine to stop"
RETURN
END
FOR ${idx} IN RANGE 0 ${count}
Send Signal To Process SIGTERM e${idx}
END
Expand Down

1 comment on commit 99846af

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Robot Results

✅ Passed ❌ Failed ⏭️ Skipped Total Pass % ⏱️ Duration
4 1 0 5 80.00 3m0.708665s

Failed Tests

Name Message ⏱️ Duration Suite
EBNSGU3_BBDO3 The servicegroup 1 still exists 85.330 s Servicegroups

Please sign in to comment.