Skip to content

Commit

Permalink
fix(broker): mariadb connection with errno0 (#1888)
Browse files Browse the repository at this point in the history
* fix(broker/mysql): mysql_connection had an issue with last mariadb server

REFS: MON-153666
  • Loading branch information
bouda1 committed Nov 22, 2024
1 parent 0f28488 commit 7cf73e3
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 9 deletions.
27 changes: 18 additions & 9 deletions broker/core/src/mysql_connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/

#include <errmsg.h>
#include <mysqld_error.h>

#include "com/centreon/broker/config/applier/init.hh"
#include "com/centreon/broker/log_v2.hh"
Expand Down Expand Up @@ -463,18 +464,26 @@ void mysql_connection::_statement(mysql_task* t) {
"mysql_connection {:p}: execute statement {} attempt {}: {}",
static_cast<const void*>(this), task->statement_id, attempts, query);
if (mysql_stmt_execute(stmt)) {
std::string err_msg(
fmt::format("{} errno={} {}", mysql_error::msg[task->error_code],
::mysql_errno(_conn), ::mysql_stmt_error(stmt)));
SPDLOG_LOGGER_ERROR(log_v2::sql(),
"connection fail to execute statement {:p}: {}",
static_cast<const void*>(this), err_msg);
if (_server_error(::mysql_stmt_errno(stmt))) {
int32_t err_code = ::mysql_stmt_errno(stmt);
std::string err_msg(fmt::format("{} errno={} {}",
mysql_error::msg[task->error_code],
err_code, ::mysql_stmt_error(stmt)));
if (err_code == 0) {
SPDLOG_LOGGER_ERROR(
log_v2::sql(),
"mysql_connection: errno=0, so we simulate a server error 1213");
err_code = CR_SERVER_LOST;
} else {
SPDLOG_LOGGER_ERROR(log_v2::sql(),
"connection fail to execute statement {:p}: {}",
static_cast<const void*>(this), err_msg);
}
if (_server_error(err_code)) {
set_error_message(err_msg);
break;
}
if (mysql_stmt_errno(stmt) != 1213 &&
mysql_stmt_errno(stmt) != 1205) // Dead Lock error
if (err_code != ER_LOCK_DEADLOCK &&
err_code != ER_LOCK_WAIT_TIMEOUT) // Dead Lock error
attempts = MAX_ATTEMPTS;

if (mysql_commit(_conn)) {
Expand Down
143 changes: 143 additions & 0 deletions tests/broker-engine/services-and-bulk-stmt.robot
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ EBBPS1
IF "${output}" == "((0,),)" BREAK
END
Should Be Equal As Strings ${output} ((0,),)
Disconnect From Database

FOR ${i} IN RANGE ${1000}
Ctn Process Service Check Result host_1 service_${i+1} 2 warning${i}
Expand Down Expand Up @@ -102,6 +103,7 @@ EBBPS1
IF "${output}" == "((0,),)" BREAK
END
Should Be Equal As Strings ${output} ((0,),)
Disconnect From Database

EBBPS2
[Documentation] 1000 service check results are sent to the poller. The test is done with the unified_sql stream, no service status is lost, we find the 1000 results in the database: table services.
Expand Down Expand Up @@ -148,6 +150,7 @@ EBBPS2
IF "${output}" == "((0,),)" BREAK
END
Should Be Equal As Strings ${output} ((0,),)
Disconnect From Database

FOR ${i} IN RANGE ${1000}
Ctn Process Service Check Result host_1 service_${i+1} 2 critical${i}
Expand Down Expand Up @@ -184,6 +187,7 @@ EBBPS2
IF "${output}" == "((0,),)" BREAK
END
Should Be Equal As Strings ${output} ((0,),)
Disconnect From Database

EBMSSM
[Documentation] 1000 services are configured with 100 metrics each. The rrd output is removed from the broker configuration. GetSqlManagerStats is called to measure writes into data_bin.
Expand Down Expand Up @@ -230,6 +234,7 @@ EBMSSM
Sleep 1s
END
Should Be True ${output[0][0]} >= 100000
Disconnect From Database

EBPS2
[Documentation] 1000 services are configured with 20 metrics each. The rrd output is removed from the broker configuration to avoid to write too many rrd files. While metrics are written in bulk, the database is stopped. This must not crash broker.
Expand Down Expand Up @@ -449,6 +454,144 @@ Services_and_bulks_${id}
... 2 150


EBMSSMDBD
[Documentation] 1000 services are configured with 100 metrics each.
... The rrd output is removed from the broker configuration.
... While metrics are written in the database, we stop the database and then restart it.
... Broker must recover its connection to the database and continue to write metrics.
[Tags] broker engine unified_sql MON-153323
Ctn Clear Metrics
Ctn Config Engine ${1} ${1} ${1000}
# All the services are made passive so that no unexpected check interferes with the test.
Ctn Set Services Passive ${0} service_.*
Ctn Config Broker central
Ctn Config Broker rrd
Ctn Config Broker module ${1}
Ctn Config BBDO3 1
Ctn Broker Config Log central core error
Ctn Broker Config Log central tcp error
Ctn Broker Config Log central sql debug
Ctn Config Broker Sql Output central unified_sql
Ctn Config Broker Remove Rrd Output central
Ctn Clear Retention
${start} Get Current Date
Ctn Start Broker
Ctn Start Engine

Ctn Wait For Engine To Be Ready ${start} 1

${start} Ctn Get Round Current Date
# Submit one check result with metrics for each of the 1000 services of host_1.
Log To Console Many service checks with 100 metrics each are processed.
FOR ${i} IN RANGE ${1000}
Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100
END

Log To Console We wait for at least one metric to be written in the database.
# Poll the database (at most 500 queries, 1s apart) until at least one metric
# is attached to a service whose last_check is not older than ${start}.
Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort}
FOR ${i} IN RANGE ${500}
${output} Query
... SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start}
IF ${output[0][0]} >= 1 BREAK
Sleep 1s
END
# The test connection is closed before the database gets stopped below.
Disconnect From Database

Log To Console Let's start some database manipulation...
${start} Get Current Date

# Three stop/restart cycles of the database, each with a 10s outage. After each
# restart, the central log is searched for the data_bin insertion failure message.
# NOTE(review): the search result is only printed, nothing is asserted in this
# test case - confirm this is intended.
FOR ${i} IN RANGE ${3}
Ctn Stop Mysql
Sleep 10s
Ctn Start Mysql
${content} Create List could not insert data in data_bin
${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10
Log To Console ${result}
END

EBMSSMPART
[Documentation] 1000 services are configured with 100 metrics each.
... The rrd output is removed from the broker configuration.
... The data_bin table is configured with two partitions p1 and p2 such
... that p1 contains old data and p2 contains current data.
... While metrics are written in the database, we remove the p2 partition.
... Once the p2 partition is recreated, broker must recover its connection
... to the database and continue to write metrics.
... To check that last point, we force a last service check and we check
... that its metrics are written in the database.
[Tags] broker engine unified_sql MON-153323
Ctn Clear Metrics
Ctn Config Engine ${1} ${1} ${1000}
# All the services are made passive so that no unexpected check interferes with the test.
Ctn Set Services Passive ${0} service_.*
Ctn Config Broker central
Ctn Config Broker rrd
Ctn Config Broker module ${1}
Ctn Config BBDO3 1
Ctn Broker Config Log central core error
Ctn Broker Config Log central tcp error
Ctn Broker Config Log central sql trace
Ctn Config Broker Sql Output central unified_sql
Ctn Config Broker Remove Rrd Output central
Ctn Clear Retention

# data_bin is rebuilt with its p1/p2 partitions before Broker is started.
Ctn Prepare Partitions For Data Bin
${start} Get Current Date
Ctn Start Broker
Ctn Start Engine

Ctn Wait For Engine To Be Ready ${start} 1

${start} Ctn Get Round Current Date
# Submit one check result with metrics for each of the 1000 services of host_1.
Log To Console Many service checks with 100 metrics each are processed.
FOR ${i} IN RANGE ${1000}
Ctn Process Service Check Result With Metrics host_1 service_${i+1} 1 warning${i} 100
END

Log To Console We wait for at least one metric to be written in the database.
# Poll the database (at most 500 queries, 1s apart) until at least one metric
# is attached to a service whose last_check is not older than ${start}.
Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort}
FOR ${i} IN RANGE ${500}
${output} Query
... SELECT COUNT(s.last_check) FROM metrics m LEFT JOIN index_data i ON m.index_id = i.id LEFT JOIN services s ON s.host_id = i.host_id AND s.service_id = i.service_id WHERE metric_name LIKE "metric_%%" AND s.last_check >= ${start}
IF ${output[0][0]} >= 1 BREAK
Sleep 1s
END
Disconnect From Database

Log To Console Let's start some database manipulation...
Ctn Remove P2 From Data Bin
${start} Get Current Date

# Search the central log for an "errno=" message, with at most 6 attempts of 10s each.
# NOTE(review): the test goes on even when the message is never found - confirm
# this is intended.
${content} Create List errno=
FOR ${i} IN RANGE ${6}
${result} Ctn Find In Log With Timeout ${centralLog} ${start} ${content} 10
IF ${result} BREAK
END

Log To Console Let's recreate the p2 partition...
Ctn Add P2 To Data Bin

${start} Ctn Get Round Current Date
Ctn Process Service Check Result With Metrics host_1 service_1 0 Last Output OK 100

Log To Console Let's wait for the last service check to be in the database...
# Poll data_bin (at most 120 queries, 1s apart) until at least 100 rows have a
# ctime greater than or equal to ${start} - 10.
Connect To Database pymysql ${DBName} ${DBUser} ${DBPass} ${DBHost} ${DBPort}
FOR ${i} IN RANGE ${120}
${output} Query SELECT count(*) FROM data_bin WHERE ctime >= ${start} - 10
Log To Console ${output}
IF ${output[0][0]} >= 100 BREAK
Sleep 1s
END
Log To Console ${output}
Should Be True ${output[0][0]} >= 100
Disconnect From Database

# Cleanup: data_bin is recreated without partition.
Ctn Init Data Bin Without Partition


*** Keywords ***
Ctn Test Clean
Ctn Stop Engine
Expand Down
98 changes: 98 additions & 0 deletions tests/resources/Broker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2566,3 +2566,101 @@ def ctn_broker_get_ba(port: int, ba_id: int, output_file: str, timeout=TIMEOUT):
except:
logger.console("gRPC server not ready")
return res


def ctn_prepare_partitions_for_data_bin():
    """
    Rebuild the data_bin table with two RANGE partitions on ctime.

    The table is dropped when it exists, then created again with:
      * p1: rows whose ctime is lower than now - 60,
      * p2: rows whose ctime is lower than now + 3600.
    """
    settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "password": DB_PASS,
        "database": DB_NAME_STORAGE,
        "charset": 'utf8mb4',
        "cursorclass": pymysql.cursors.DictCursor,
    }
    conn = pymysql.connect(**settings)

    # Upper bounds of the two partitions, relative to the current time.
    now = int(time.time())
    p1_limit, p2_limit = now - 60, now + 3600
    with conn, conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS data_bin")
        create_stmt = f"""CREATE TABLE `data_bin` (
`id_metric` int(11) DEFAULT NULL,
`ctime` int(11) DEFAULT NULL,
`value` float DEFAULT NULL,
`status` enum('0','1','2','3','4') DEFAULT NULL,
KEY `index_metric` (`id_metric`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1
PARTITION BY RANGE (`ctime`)
(PARTITION `p1` VALUES LESS THAN ({p1_limit}) ENGINE = InnoDB,
PARTITION `p2` VALUES LESS THAN ({p2_limit}) ENGINE = InnoDB)"""
        cur.execute(create_stmt)
        conn.commit()


def ctn_remove_p2_from_data_bin():
    """
    Drop the partition named p2 from the data_bin table.
    """
    conn = pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME_STORAGE,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )

    # The connection and its cursor are both released when the block is left.
    with conn, conn.cursor() as cur:
        cur.execute("ALTER TABLE data_bin DROP PARTITION p2")
        conn.commit()


def ctn_add_p2_to_data_bin():
    """
    Add the partition p2 to the data_bin table.

    The new partition accepts rows whose ctime is lower than now + 3600.
    """
    conn = pymysql.connect(
        host=DB_HOST,
        user=DB_USER,
        password=DB_PASS,
        database=DB_NAME_STORAGE,
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )

    # Upper bound of p2: one hour after the current time.
    upper_bound = int(time.time()) + 3600
    statement = f"ALTER TABLE data_bin ADD PARTITION (PARTITION p2 VALUES LESS THAN ({upper_bound}))"
    with conn, conn.cursor() as cur:
        cur.execute(statement)
        conn.commit()


def ctn_init_data_bin_without_partition():
    """
    Recreate the data_bin table without partition.

    The table is dropped when it exists, then created again with the same
    columns and index but no PARTITION clause.
    """
    connection = pymysql.connect(host=DB_HOST,
                                 user=DB_USER,
                                 password=DB_PASS,
                                 database=DB_NAME_STORAGE,
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    with connection:
        with connection.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS data_bin")
            # No partition bound is needed here, so the statement is a plain
            # string with no time-based value interpolated.
            sql = """CREATE TABLE `data_bin` (
`id_metric` int(11) DEFAULT NULL,
`ctime` int(11) DEFAULT NULL,
`value` float DEFAULT NULL,
`status` enum('0','1','2','3','4') DEFAULT NULL,
KEY `index_metric` (`id_metric`)
) ENGINE=InnoDB DEFAULT CHARSET=latin1"""
            cursor.execute(sql)
            connection.commit()

1 comment on commit 7cf73e3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Robot Results

✅ Passed ❌ Failed ⏭️ Skipped Total Pass % ⏱️ Duration
13 1 0 14 92.86 0s

Passed Tests

Name ⏱️ Duration Suite
BETAG1 0.000 s Tags
BETAG2 0.000 s Tags
BEUTAG1 0.000 s Tags
BEUTAG2 0.000 s Tags
BEUTAG3 0.000 s Tags
BEUTAG4 0.000 s Tags
BEUTAG5 0.000 s Tags
BEUTAG6 0.000 s Tags
BEUTAG8 0.000 s Tags
BEUTAG9 0.000 s Tags
BEUTAG10 0.000 s Tags
BEUTAG12 0.000 s Tags
BEUTAG_REMOVE_HOST_FROM_HOSTGROUP 0.000 s Tags

Failed Tests

Name Message ⏱️ Duration Suite
BEUTAG11 First step: Service (1, 4) should have servicegroup tags 2 and 4 0.000 s Tags

Please sign in to comment.