-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enable APML RAS Manager Initialization
- Added initialization for APML RAS Manager. - Included conditional compilation for APML support. - Added a placeholder error log for PLDM RAS capabilities, indicating that they are yet to be enabled. - The init function repeatedly attempts to get the BMC RAS OOB configuration until successful. - The function initializes the platform with the block IDs that need to be harvested during a crashdump and sets up a D-Bus match on watchdog state changes to detect BIOS POST completion. - It reads CPU IDs for all CPUs and logs errors on failure. - The function also handles BIOS POST completion by configuring PCIe OOB settings and enabling PCIe error thresholds based on watchdog timer changes. - It also clears the SbrmiAlertMask register so APML_ALERT_L will be asserted during a syncflood in the system. - The commit adds oem_cper.h, which outlines the file format for both runtime and crashdump CPER records. - Overall, this commit provides all the preparation needed to enable the crashdump flow. Signed-off-by: aasboddu <[email protected]>, Abinaya Dhandapani <[email protected]>
- Loading branch information
Showing
16 changed files
with
2,863 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
#include "error_monitor.hpp" | ||
extern "C" | ||
{ | ||
#include "apml.h" | ||
#include "apml_common.h" | ||
#include "esmi_cpuid_msr.h" | ||
#include "esmi_mailbox.h" | ||
#include "esmi_rmi.h" | ||
} | ||
#include <boost/asio/deadline_timer.hpp> | ||
#include <boost/asio/posix/stream_descriptor.hpp> | ||
#include <gpiod.hpp> | ||
|
||
namespace amd | ||
{ | ||
namespace ras | ||
{ | ||
namespace apml | ||
{ | ||
/// @brief APML-specific RAS manager derived from amd::ras::Manager. | ||
/// | ||
/// @details Holds the GPIO lines and asio stream descriptors for the P0 and | ||
/// P1 APML alert signals and declares the OOB configuration, error threshold | ||
/// and error harvesting helpers. Default construction, copying and moving | ||
/// are all deleted. | ||
class Manager : public amd::ras::Manager | ||
{ | ||
public: | ||
Manager() = delete; | ||
~Manager() = default; | ||
Manager(const Manager&) = delete; | ||
Manager& operator=(const Manager&) = delete; | ||
Manager(Manager&&) = delete; | ||
Manager& operator=(Manager&&) = delete; | ||
|
||
/// @brief Construct the APML RAS manager. | ||
/// | ||
/// @details Forwards the configuration manager to the amd::ras::Manager | ||
/// base, constructs both alert stream descriptors on the given io_context | ||
/// and stores references to the object server, system bus and io_context. | ||
/// | ||
/// @param[in] manager - RAS configuration manager handed to the base class. | ||
/// @param[in] objectServer - D-Bus object server, stored by reference. | ||
/// @param[in] systemBus - D-Bus connection. The shared_ptr itself is stored | ||
/// by reference, so the caller's shared_ptr object must outlive this Manager. | ||
/// @param[in] io - io_context used to construct the alert stream descriptors; | ||
/// also stored by reference. | ||
Manager(amd::ras::config::Manager& manager, | ||
sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus, | ||
boost::asio::io_context& io) : | ||
amd::ras::Manager(manager), p0apmlAlertEvent(io), p1apmlAlertEvent(io), | ||
objectServer(objectServer), systemBus(systemBus), io(io) | ||
{} | ||
|
||
/// @brief Initialize the manager (defined outside this header). | ||
/// NOTE(review): presumably overrides a virtual of amd::ras::Manager; if so, | ||
/// consider marking it `override` - confirm against the base class. | ||
virtual void init(); | ||
|
||
/// @brief Apply configuration (defined outside this header). | ||
/// NOTE(review): presumably overrides a virtual of amd::ras::Manager; if so, | ||
/// consider marking it `override` - confirm against the base class. | ||
virtual void configure(); | ||
|
||
|
||
/** | ||
* @brief Requests GPIO events for hardware alert handling. | ||
* | ||
* This function configures a GPIO line and stream descriptor to listen for | ||
* events. It triggers the provided callback function upon event detection. | ||
* | ||
* @param[in] gpioPin The GPIO pin to monitor. | ||
* @param[in] callback The function to call when an event is detected. | ||
* @param[in] line The GPIO line to use for event detection. | ||
* @param[in] stream The stream descriptor used to listen for events. | ||
*/ | ||
void requestGPIOEvents(const std::string&, const std::function<void()>&, | ||
gpiod::line&, | ||
boost::asio::posix::stream_descriptor&); | ||
|
||
/** | ||
* @brief Handler for P0 alert events. | ||
* | ||
* This function is invoked when an alert event occurs on P0. The function | ||
* handles the event by processing the necessary response. | ||
*/ | ||
void p0AlertEventHandler(); | ||
|
||
/** | ||
* @brief Handler for P1 alert events. | ||
* | ||
* This function is invoked when an alert event occurs on P1. The function | ||
* handles the event by processing the necessary response. | ||
*/ | ||
void p1AlertEventHandler(); | ||
|
||
/** | ||
* @brief GPIO line for handling P0 alert events. | ||
* | ||
* This GPIO line is used to detect hardware alerts for P0 and trigger | ||
* events for processing. | ||
*/ | ||
gpiod::line p0apmlAlertLine; | ||
|
||
/** | ||
* @brief GPIO line for handling P1 alert events. | ||
* | ||
* This GPIO line is used to detect hardware alerts for P1 and trigger | ||
* events for processing. | ||
*/ | ||
gpiod::line p1apmlAlertLine; | ||
|
||
/** @brief Stream descriptor for handling P0 APML alert events. | ||
* | ||
* @details This stream descriptor listens for alert events related to the | ||
* P0 sensor and triggers actions upon detection. | ||
*/ | ||
boost::asio::posix::stream_descriptor p0apmlAlertEvent; | ||
|
||
/** @brief Stream descriptor for handling P1 APML alert events. | ||
* | ||
* @details This stream descriptor listens for alert events related to the | ||
* P1 sensor and triggers actions upon detection. | ||
*/ | ||
boost::asio::posix::stream_descriptor p1apmlAlertEvent; | ||
|
||
private: | ||
/// D-Bus handles held by reference; this class does not own them. | ||
sdbusplus::asio::object_server& objectServer; | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus; | ||
|
||
/// Internal state. NOTE(review): the purpose of the counters, flags and | ||
/// mutexes below is only suggested by their names; their use is in the | ||
/// implementation file - confirm there before relying on these comments. | ||
uint8_t progId = 1; | ||
uint64_t recordId = 1; | ||
uint8_t watchdogTimerCounter = 0; | ||
boost::asio::io_context& io; | ||
std::mutex harvestMutex; | ||
std::vector<uint8_t> blockId; | ||
bool apmlInitialized = false; | ||
bool platformInitialized = false; | ||
bool p0AlertProcessed = false; | ||
bool p1AlertProcessed = false; | ||
std::mutex mcaErrorHarvestMtx; | ||
std::mutex dramErrorHarvestMtx; | ||
std::mutex pcieErrorHarvestMtx; | ||
|
||
/** @brief Update processor OOB configuration. | ||
* | ||
* @details This API updates processor OOB configuration | ||
* for MCA, DRAM and PCIe with the user input. | ||
* | ||
* @param[in] oob_config_d_in - oob configuration data containing | ||
* mca_oob_misc0_ec_enable, dram_cecc_oob_ec_mode, | ||
* dram_cecc_leak_rate, pcie_err_reporting_en, | ||
* pcie_ue_oob_counter_en and core_mca_err_reporting_en. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setRasOobConfig(struct oob_config_d_in); | ||
|
||
/** @brief Get processor OOB configuration. | ||
* | ||
* @details This API reads processor OOB configuration | ||
* for MCA, DRAM and PCIe. | ||
* | ||
* @param[out] oob_config_d_in - oob configuration data containing | ||
* mca_oob_misc0_ec_enable, dram_cecc_oob_ec_mode, | ||
* dram_cecc_leak_rate, pcie_err_reporting_en, | ||
* pcie_ue_oob_counter_en and core_mca_err_reporting_en. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t getRasOobConfig(struct oob_config_d_in*); | ||
|
||
/** @brief Set PCIe OOB error reporting. | ||
* | ||
* @details This API enables OOB configuration for PCIe | ||
* based on PcieAerPollingEn attribute in rasConfigTable. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieOobConfig(); | ||
|
||
/** @brief Update PCIe OOB configuration. | ||
* | ||
* @details This API updates PCIe OOB registers and enables | ||
* PCIe OOB error reporting. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieOobRegisters(); | ||
|
||
/** @brief Set RAS error threshold configuration. | ||
* | ||
* @details This API updates RAS error thresholds for | ||
* MCA, DRAM and PCIe with the user input. | ||
* | ||
* @param[in] run_time_threshold - runtime threshold configuration | ||
* containing error type [00(MCA), 01(DRAM CECC), 10(PCIE_UE), | ||
* 11(PCIE_CE)], error count threshold and max interrupt rate. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setRasErrThreshold(struct run_time_threshold); | ||
|
||
/** @brief Set PCIe error threshold configuration. | ||
* | ||
* @details This API enables PCIe error thresholds | ||
* based on PcieAerThresholdEn attribute in rasConfigTable. | ||
* | ||
* @return OOB_SUCCESS is returned upon successful call. | ||
* @return APML_ERR error code is returned upon failure. | ||
*/ | ||
oob_status_t setPcieErrThreshold(); | ||
|
||
/** @brief Clear the SBRMI alert mask bit. | ||
* | ||
* @details Clears alert mask bit in SBRMI control register for the given | ||
* SOC socket number. | ||
* | ||
* @param[in] socNum - Socket number of the processor. | ||
*/ | ||
void clearSbrmiAlertMask(uint8_t socNum); | ||
|
||
/** @brief Monitors the current host power state. | ||
* | ||
* @details This API monitors the current host power state using | ||
* xyz.openbmc_project.State.Host D-bus Interface. | ||
*/ | ||
void currentHostStateMonitor(); | ||
|
||
/** @brief Initializes platform-specific settings. | ||
* | ||
* @details It initializes the platform based on the family ID. Block ID's | ||
* are selected based on the platform that needs to be harvested during a | ||
* crashdump. It also invokes clearSbrmiAlertMask() API to clear | ||
* Sbrmi::AlertMask bit | ||
*/ | ||
void platformInitialize(); | ||
|
||
|
||
/// Error decoding and harvesting helpers. The parameters are unnamed in this | ||
/// header. NOTE(review): the descriptions below are inferred from the | ||
/// function and type names only - confirm against the definitions. | ||
/// | ||
/// decodeInterrupt: presumably decodes an alert interrupt for the socket | ||
/// given by the uint8_t argument; meaning of the bool result - TODO confirm. | ||
/// harvestMcaValidityCheck: presumably checks MCA data validity for a socket | ||
/// and reports two uint16_t values through the pointers - TODO confirm. | ||
bool decodeInterrupt(uint8_t); | ||
bool harvestMcaValidityCheck(uint8_t, uint16_t*, uint16_t*); | ||
|
||
/// Presumably queries valid runtime error instances for the given request | ||
/// type and writes them to the ras_rt_valid_err_inst out-parameter; returns | ||
/// an oob_status_t - TODO confirm argument meaning. | ||
oob_status_t runTimeErrValidityCheck(uint8_t, struct ras_rt_err_req_type, | ||
struct ras_rt_valid_err_inst*); | ||
|
||
/// Presumably harvests runtime errors described by the two | ||
/// ras_rt_valid_err_inst arguments - TODO confirm what each one represents. | ||
void harvestRuntimeErrors(uint8_t, struct ras_rt_valid_err_inst, | ||
struct ras_rt_valid_err_inst); | ||
|
||
/// NOTE(review): undocumented; takes two uint8_t values whose meaning is not | ||
/// visible in this header. | ||
void runTimeErrorInfoCheck(uint8_t, uint8_t); | ||
|
||
/// NOTE(review): undocumented; presumably harvests MCA bank data using the | ||
/// two uint16_t values produced by harvestMcaValidityCheck - TODO confirm. | ||
void harvestMcaDataBanks(uint8_t, uint16_t, uint16_t); | ||
|
||
|
||
/// Helpers operating on a shared FatalCperRecord. NOTE(review): argument | ||
/// meanings are not visible in this header - confirm against definitions. | ||
void getLastTransAddr(const std::shared_ptr<FatalCperRecord>&, uint8_t); | ||
void harvestDebugLogDump(const std::shared_ptr<FatalCperRecord>&, uint8_t, | ||
uint8_t, int64_t*, uint16_t&); | ||
|
||
/// @brief Fill a processor error section of a CPER record of type T. | ||
/// NOTE(review): parameter semantics are inferred from their names (record | ||
/// pointer, socket number, valid error instance, error category, section | ||
/// index, per-section severity and check-info arrays) - TODO confirm. | ||
template <typename T> | ||
void dumpProcErrorSection(const std::shared_ptr<T>& data, uint8_t soc_num, | ||
struct ras_rt_valid_err_inst inst, | ||
uint8_t category, uint16_t Section, | ||
uint32_t* Severity, uint64_t* CheckInfo); | ||
|
||
/// NOTE(review): undocumented; presumably harvests DRAM CECC error counters | ||
/// for the given valid error instance and socket - TODO confirm. | ||
void harvestDramCeccErrorCounters(struct ras_rt_valid_err_inst, | ||
uint8_t ); | ||
}; | ||
|
||
} // namespace apml | ||
} // namespace ras | ||
} // namespace amd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
#pragma once | ||
|
||
#include "oem_cper.hpp" | ||
#include "hex_util.hpp" | ||
#include "crashdump_manager.hpp" | ||
#include "host_util.hpp" | ||
#include "error_monitor.hpp" | ||
#include <phosphor-logging/lg2.hpp> | ||
#include <phosphor-logging/log.hpp> | ||
|
||
#include <chrono> | ||
#include <cstring> | ||
#include <ctime> | ||
#include <filesystem> | ||
#include <memory> | ||
#include <regex> | ||
|
||
/// Error-type identifier strings shared by all users of this header.
/// Declared `inline` (C++17) rather than `static` so that every translation
/// unit including the header refers to one shared object instead of
/// constructing its own internal-linkage copy at start-up.
/// NOTE(review): presumably these are the values passed as the `errorType`
/// argument of the cper_util functions - confirm against the callers.
inline const std::string runtimeMcaErr = "RUNTIME_MCA_ERROR";
inline const std::string runtimePcieErr = "RUNTIME_PCIE_ERROR";
inline const std::string runtimeDramErr = "RUNTIME_DRAM_ERROR";
inline const std::string fatalErr = "FATAL";
|
||
namespace amd | ||
{ | ||
namespace ras | ||
{ | ||
/// Declarations of the CPER record helper functions and related constants. | ||
namespace cper_util | ||
{ | ||
|
||
/// Severity codes. NOTE(review): presumably the CPER error severity encoding | ||
/// for non-fatal uncorrected and corrected errors - confirm against the spec. | ||
constexpr int SEV_NON_FATAL_UNCORRECTED = 0; | ||
constexpr int SEV_NON_FATAL_CORRECTED = 2; | ||
|
||
/// Validation bits, generation numbers, CPU family IDs and other fixed values | ||
/// used by the CPER helpers. NOTE(review): exact meaning of each constant is | ||
/// inferred from its name only - confirm where it is used. | ||
constexpr int CPER_VALID_PLATFORM_ID = 0x1; | ||
constexpr int CPER_VALID_TIMESTAMP = 0x2; | ||
constexpr int ADDC_GEN_NUMBER_1 = 0x01; | ||
constexpr int ADDC_GEN_NUMBER_2 = 0x02; | ||
constexpr int TURIN_FAMILY_ID = 0x1A; | ||
constexpr int GENOA_FAMILY_ID = 0x19; | ||
constexpr uint16_t PCIE_VENDOR_ID = 0x1022; | ||
constexpr int MINOR_REVISION = 0xB; | ||
|
||
/// The block comment below holds disabled explicit instantiation declarations | ||
/// for the templates declared further down; it has no effect on compilation. | ||
/*template void dumpHeaderSection(const std::shared_ptr<FatalCperRecord>& data, | ||
uint16_t sectionCount, uint32_t errorSeverity, | ||
const std::string& errorType, | ||
unsigned int boardId, uint64_t& recordId); | ||
template void dumpHeaderSection( | ||
const std::shared_ptr<McaRuntimeCperRecord>& data, uint16_t sectionCount, | ||
uint32_t errorSeverity, const std::string& errorType, unsigned int boardId, | ||
uint64_t& recordId); | ||
template void dumpHeaderSection( | ||
const std::shared_ptr<PcieRuntimeCperRecord>& data, uint16_t sectionCount, | ||
uint32_t errorSeverity, const std::string& errorType, unsigned int boardId, | ||
uint64_t& recordId); | ||
template void dumpErrorDescriptorSection( | ||
const std::shared_ptr<FatalCperRecord>&, uint16_t, const std::string&, | ||
uint32_t*, uint8_t, uint32_t); | ||
template void dumpErrorDescriptorSection( | ||
const std::shared_ptr<McaRuntimeCperRecord>&, uint16_t, const std::string&, | ||
uint32_t*, uint8_t, uint32_t); | ||
template void dumpErrorDescriptorSection( | ||
const std::shared_ptr<PcieRuntimeCperRecord>&, uint16_t, const std::string&, | ||
uint32_t*, uint8_t, uint32_t); | ||
template void createCperFile(const std::shared_ptr<FatalCperRecord>&, | ||
const std::string&, uint16_t, int&); | ||
template void createCperFile(const std::shared_ptr<McaRuntimeCperRecord>&, | ||
const std::string&, uint16_t, int&); | ||
template void createCperFile(const std::shared_ptr<PcieRuntimeCperRecord>&, | ||
const std::string&, uint16_t, int&); | ||
*/ | ||
/// Returns a CPER file name for the given number. NOTE(review): whether this | ||
/// searches existing files or only builds a name is not visible here. | ||
std::string findCperFilename(int number); | ||
|
||
/// Exports CPER entry `num` with the given timestamp via the supplied object | ||
/// server and bus. NOTE(review): the D-Bus object layout is defined in the | ||
/// implementation - confirm there. | ||
void exportCperToDBus( | ||
int num, const EFI_ERROR_TIME_STAMP& TimeStampStr, | ||
sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus); | ||
|
||
/// NOTE(review): undocumented; presumably creates or restores CPER record | ||
/// objects on D-Bus using the supplied object server and bus - TODO confirm. | ||
void createCperRecord( | ||
sdbusplus::asio::object_server& objectServer, | ||
std::shared_ptr<sdbusplus::asio::connection>& systemBus); | ||
|
||
/// Presumably fills the timestamp of the CPER record of type T - TODO confirm. | ||
template <typename T> | ||
void calculateTimeStamp(const std::shared_ptr<T>&); | ||
|
||
/// Fills the header section of a CPER record of type T. Going by the disabled | ||
/// instantiations above, the arguments are: record, sectionCount, | ||
/// errorSeverity, errorType, boardId and recordId (taken by reference). | ||
template <typename T> | ||
void dumpHeaderSection(const std::shared_ptr<T>&, uint16_t, uint32_t, | ||
const std::string&, unsigned int, uint64_t&); | ||
/// Fills the error descriptor section of a CPER record of type T. | ||
/// NOTE(review): parameters are unnamed; their meaning is not visible here. | ||
template <typename T> | ||
void dumpErrorDescriptorSection(const std::shared_ptr<T>&, uint16_t, | ||
const std::string&, uint32_t*, uint8_t, | ||
uint32_t); | ||
|
||
/// Fills the processor error section of a fatal CPER record for socket | ||
/// `socNum`, using the per-CPU `cpuId` array of `cpuCount` entries. | ||
/// NOTE(review): description inferred from the signature - TODO confirm. | ||
void dumpProcessorErrorSection( | ||
const std::shared_ptr<FatalCperRecord>& fatalPtr, uint8_t socNum, | ||
const std::unique_ptr<CpuId[]>& cpuId, uint8_t cpuCount); | ||
|
||
/// Fills processor error info sections of an MCA runtime CPER record. | ||
/// NOTE(review): the roles of sectionStart, sectionCount and checkInfo are | ||
/// inferred from their names - TODO confirm. | ||
void dumpProcErrorInfoSection( | ||
const std::shared_ptr<McaRuntimeCperRecord>& procPtr, uint16_t sectionCount, | ||
uint64_t* checkInfo, uint32_t sectionStart, uint8_t cpuCount, | ||
const std::unique_ptr<CpuId[]>& cpuId); | ||
|
||
/// Fills context information of a fatal CPER record. NOTE(review): `ppin` and | ||
/// `uCode` are presumably per-CPU arrays of `cpuCount` entries - TODO confirm. | ||
void dumpContextInfo(const std::shared_ptr<FatalCperRecord>& fatalPtr, | ||
uint16_t numbanks, uint16_t bytespermca, uint8_t socNum, | ||
const std::unique_ptr<uint64_t[]>& ppin, | ||
const std::unique_ptr<uint32_t[]>& uCode, uint8_t cpuCount); | ||
|
||
/// Fills PCIe error info sections of a PCIe runtime CPER record. | ||
/// NOTE(review): section range semantics inferred from parameter names. | ||
void dumpPcieErrorInfoSection( | ||
const std::shared_ptr<PcieRuntimeCperRecord>& data, uint16_t sectionStart, | ||
uint16_t sectionCount); | ||
|
||
/// Returns the CPER file name for index `num`. NOTE(review): how this differs | ||
/// from findCperFilename() is not visible in this header. | ||
std::string getCperFilename(int num); | ||
|
||
/// Writes a CPER record of type T to a file. NOTE(review): parameters are | ||
/// unnamed; the int& is presumably an index updated by the call - confirm. | ||
template <typename T> | ||
void createCperFile(const std::shared_ptr<T>&, const std::string&, uint16_t, | ||
int&); | ||
|
||
/// Checks the fatal record `rcd` against the configured signature ID list. | ||
/// NOTE(review): the matching rule and the meaning of the map keys and values | ||
/// are defined in the implementation - TODO confirm. | ||
bool checkSignatureIdMatch(std::map<std::string, std::string>* configSigIdList, | ||
const std::shared_ptr<FatalCperRecord>& rcd); | ||
|
||
/* Determines the highest severity among all section severities, for use in | ||
the CPER header. Severity order: fatal > non-fatal uncorrected > corrected. | ||
NOTE(review): the result is presumably written to highestSeverity; the | ||
meaning of the bool return value is not visible in this header.*/ | ||
bool calculateErrorSeverity(uint32_t* severity, uint16_t sectionCount, | ||
uint32_t* highestSeverity, | ||
const std::string& errorType); | ||
|
||
} | ||
} | ||
} |
Oops, something went wrong.