From 368dafe5f1e5398f6b05cab872bdcf6fd35f4ea2 Mon Sep 17 00:00:00 2001 From: Abinaya Dhandapani Date: Sun, 26 Jan 2025 23:04:17 -0600 Subject: [PATCH] Handle APML_ALERT assertion for SBRMI::[AlertSts] 1. If a core poison consumption error occurs, APML_ALERT is toggled low and nothing clears it to de-assert APML_ALERT. This commit clears the RMI_ALERTSTATUS registers [0x10 ~ 0x1f] and [0x50 ~ 0x5f], which clears the SBRMI::[AlertSts] bit and de-asserts APML_ALERT. 2. Incremented the minor version by one to indicate a code change to ADDC for the v14 release. 3. Handled the error case where the family is not Turin/Genoa. Signed-off-by: Abinaya Dhandapani --- inc/cper.hpp | 2 +- src/main.cpp | 62 +++++++++++++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/inc/cper.hpp b/inc/cper.hpp index 0446587..3291623 100644 --- a/inc/cper.hpp +++ b/inc/cper.hpp @@ -32,7 +32,7 @@ * CPER section descriptor revision, used in revision field in struct * cper_section_descriptor */ -#define CPER_MINOR_REV (0x000C) +#define CPER_MINOR_REV (0x000D) #define ADDC_GEN_NUMBER_1 (0x01) #define ADDC_GEN_NUMBER_2 (0x02) diff --git a/src/main.cpp b/src/main.cpp index 1565323..308b9e8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -502,29 +502,49 @@ void clearSbrmiAlertMask() oob_status_t ret; - sd_journal_print(LOG_ERR, - "Clear Alert Mask bit of SBRMI Control register \n"); - uint8_t buffer; + for (uint8_t socNum = 0; socNum < num_of_proc; socNum++) + { + sd_journal_print( + LOG_INFO, + "Clear Alert Mask bit of SBRMI Control register for socket %d\n", + socNum); - ret = read_register(p0_info, SBRMI_CONTROL_REGISTER, &buffer); + uint8_t buffer; - if (ret == OOB_SUCCESS) - { - buffer = buffer & 0xFE; - write_register(p0_info, SBRMI_CONTROL_REGISTER, - static_cast(buffer)); - } + ret = read_register(socNum, SBRMI_CONTROL_REGISTER, &buffer); - if (num_of_proc == TWO_SOCKET) - { - buffer = 0; - ret = read_register(p1_info, 
SBRMI_CONTROL_REGISTER, &buffer); if (ret == OOB_SUCCESS) { buffer = buffer & 0xFE; - write_register(p1_info, SBRMI_CONTROL_REGISTER, + write_register(socNum, SBRMI_CONTROL_REGISTER, static_cast(buffer)); } + + for (uint8_t i = 0; i < sizeof(alert_status); i++) + { + ret = read_register(socNum, alert_status[i], &buffer); + + if (ret == OOB_SUCCESS) + { + if ((buffer & MASK_0X0F) != 0) + { + sd_journal_print( + LOG_INFO, + "Socket%d: MCE Stat of SBRMIx[0x%x] is set to 0x%x\n", + socNum, alert_status[i], buffer); + + buffer = buffer & INT_255; + write_register(socNum, alert_status[i], + static_cast(buffer)); + } + } + else + { + sd_journal_print(LOG_ERR, + "Socket%d: Failed to read SBRMIx[0x%x]", + socNum, alert_status[i]); + } + } } } @@ -625,8 +645,6 @@ void performPlatformInitialization() oob_status_t ret = OOB_MAILBOX_CMD_UNKNOWN; struct processor_info platInfo[INDEX_1]; - std::cout << "perform performPlatformInitialization" << std::endl; - if (platformInitialized == false) { while (ret != OOB_SUCCESS) @@ -641,7 +659,6 @@ void performPlatformInitialization() } sleep(INDEX_1); } - std::cout << "platformInitialized " << std::endl; if (ret == OOB_SUCCESS) { @@ -655,7 +672,6 @@ void performPlatformInitialization() } else if (platInfo->family == TURIN_FAMILY_ID) { - std::cout << "Turin platform " << std::endl; currentHostStateMonitor(); clearSbrmiAlertMask(); @@ -668,6 +684,12 @@ void performPlatformInitialization() runtimeErrPollingSupported = true; } + else + { + throw std::runtime_error( + "This program is not supported for the platform 0x%x\n" + + platInfo->family); + } platformInitialized = true; apmlInitialized = true; } @@ -703,7 +725,7 @@ void apmlActiveMonitor() { ret = get_bmc_ras_oob_config(INDEX_0, &d_out); - if(ret == OOB_MAILBOX_CMD_UNKNOWN) + if (ret == OOB_MAILBOX_CMD_UNKNOWN) { ret = esmi_get_processor_info(INDEX_0, plat_info); }