Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ option(BUILD_EXAMPLES "Build examples" OFF)
# Enable shared libraries for gtest
option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON)

# User-facing switch for the AMDSMI ESMI library integration; defaults to ON.
option(BUILD_ESMI "Enable AMDSMI ESMI Library" ON)
if(BUILD_ESMI)
# Adds the ENABLE_ESMI_LIB=1 preprocessor definition at directory scope, so it
# applies to sources compiled from this directory and its subdirectories.
add_definitions("-DENABLE_ESMI_LIB=1")
endif()

# Enable address sanitizer
set(ADDRESS_SANITIZER_DEFAULT OFF)
if(DEFINED ENV{ADDRESS_SANITIZER})
Expand Down
3 changes: 0 additions & 3 deletions CPPLINT.cfg

This file was deleted.

53 changes: 53 additions & 0 deletions common/rdc_field.data
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,56 @@ FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit",
FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true)
FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true)
FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true)

// CPU-related fields description
FLD_DESC_ENT(RDC_FI_CPU_COUNT, "The number of CPU cores", "CPU_COUNT", false)
FLD_DESC_ENT(RDC_FI_CPU_UTIL_TOTAL, "CPU total percentage of time in use", "CPU_UTIL_TOTAL", false)
FLD_DESC_ENT(RDC_FI_CPU_UTIL_USER, "Percent of time in use by the user", "CPU_UTIL_USER", false)
FLD_DESC_ENT(RDC_FI_CPU_UTIL_NICE, "Percent of time in use by low priority programs", "CPU_UTIL_NICE", false)
FLD_DESC_ENT(RDC_FI_CPU_UTIL_SYS, "Percent of time in use by the system", "CPU_UTIL_SYS", false)
FLD_DESC_ENT(RDC_FI_CPU_UTIL_IRQ, "Percent of time in use by interrupts", "CPU_UTIL_IRQ", false)
FLD_DESC_ENT(RDC_FI_CPU_TEMP_CURRENT, "Temperature (Celsius)", "CPU_TEMP_CURRENT", false)
FLD_DESC_ENT(RDC_FI_CPU_CLOCK_CURRENT, "Clock speed (KHz)", "CPU_CLOCK_CURRENT", false)
FLD_DESC_ENT(RDC_FI_CPU_POWER_UTIL_CURRENT, "Power usage (watts)", "CPU_POWER_UTIL_CURRENT", false)
FLD_DESC_ENT(RDC_FI_CPU_POWER_LIMIT, "Power limit (watts)", "CPU_POWER_LIMIT", false)
FLD_DESC_ENT(RDC_FI_CPU_VENDOR, "Name of the vendor", "CPU_VENDOR", false)
FLD_DESC_ENT(RDC_FI_CPU_MODEL, "Name of the model", "CPU_MODEL", false)

// AI-generated field definitions:
FLD_DESC_ENT(RDC_FI_CPU_CORE_ENERGY, "CPU core energy consumption (microjoules)", "CPU_CORE_ENERGY", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_ENERGY, "CPU socket energy consumption (microjoules)", "CPU_SOCKET_ENERGY", false)
FLD_DESC_ENT(RDC_FI_CPU_THREADS_PER_CORE, "Number of threads per CPU core", "CPU_THREADS_PER_CORE", false)
FLD_DESC_ENT(RDC_FI_CPU_HSMP_DRIVER_VERSION, "HSMP driver version", "CPU_HSMP_DRIVER_VERSION", false)
FLD_DESC_ENT(RDC_FI_CPU_SMU_FW_VERSION, "SMU firmware version", "CPU_SMU_FW_VERSION", false)
FLD_DESC_ENT(RDC_FI_CPU_HSMP_PROTO_VERSION, "HSMP protocol version", "CPU_HSMP_PROTO_VERSION", false)
FLD_DESC_ENT(RDC_FI_CPU_PROCHOT_STATUS, "CPU PROCHOT status", "CPU_PROCHOT_STATUS", false)
FLD_DESC_ENT(RDC_FI_CPU_FCLK_FREQUENCY, "CPU fabric clock frequency (MHz)", "CPU_FCLK_FREQUENCY", false)
FLD_DESC_ENT(RDC_FI_CPU_MCLK_FREQUENCY, "CPU memory clock frequency (MHz)", "CPU_MCLK_FREQUENCY", false)
FLD_DESC_ENT(RDC_FI_CPU_CCLK_LIMIT, "CPU core clock limit (MHz)", "CPU_CCLK_LIMIT", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT, "CPU socket active frequency limit (MHz)", "CPU_SOCKET_ACTIVE_FREQ_LIMIT", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC, "CPU socket frequency limit source type", "CPU_SOCKET_FREQ_LIMIT_SRC", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX, "CPU socket maximum frequency range (MHz)", "CPU_SOCKET_FREQ_RANGE_MAX", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN, "CPU socket minimum frequency range (MHz)", "CPU_SOCKET_FREQ_RANGE_MIN", false)
FLD_DESC_ENT(RDC_FI_CPU_CORE_FREQ_LIMIT, "CPU core current frequency limit (MHz)", "CPU_CORE_FREQ_LIMIT", false)
FLD_DESC_ENT(RDC_FI_CPU_CORE_BOOST_LIMIT, "CPU core boost limit (MHz)", "CPU_CORE_BOOST_LIMIT", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_C0_RESIDENCY, "CPU socket C0 residency percentage", "CPU_SOCKET_C0_RESIDENCY", false)
FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_MAX_BW, "CPU DDR maximum bandwidth (MB/s)", "CPU_DDR_BW_MAX_BW", false)
FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_UTILIZED_BW, "CPU DDR utilized bandwidth (MB/s)", "CPU_DDR_BW_UTILIZED_BW", false)
FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_UTILIZED_PCT, "CPU DDR utilized bandwidth percentage", "CPU_DDR_BW_UTILIZED_PCT", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_TEMPERATURE, "CPU socket temperature (millidegrees Celsius)", "CPU_SOCKET_TEMPERATURE", false)
FLD_DESC_ENT(RDC_FI_CPU_DIMM_TEMP_RANGE, "CPU DIMM temperature range", "CPU_DIMM_TEMP_RANGE", false)
FLD_DESC_ENT(RDC_FI_CPU_DIMM_REFRESH_RATE, "CPU DIMM refresh rate", "CPU_DIMM_REFRESH_RATE", false)
FLD_DESC_ENT(RDC_FI_CPU_DIMM_POWER_CONSUMPTION, "CPU DIMM power consumption (milliwatts)", "CPU_DIMM_POWER_CONSUMPTION", false)
FLD_DESC_ENT(RDC_FI_CPU_DIMM_THERMAL_SENSOR, "CPU DIMM thermal sensor temperature (millidegrees)", "CPU_DIMM_THERMAL_SENSOR", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL, "CPU socket LCLK DPM level", "CPU_SOCKET_LCLK_DPM_LEVEL", false)
FLD_DESC_ENT(RDC_FI_CPU_IO_BANDWIDTH, "CPU I/O bandwidth (MB/s)", "CPU_IO_BANDWIDTH", false)
FLD_DESC_ENT(RDC_FI_CPU_XGMI_BANDWIDTH, "CPU XGMI bandwidth (MB/s)", "CPU_XGMI_BANDWIDTH", false)
FLD_DESC_ENT(RDC_FI_CPU_HSMP_METRICS_VERSION, "HSMP metrics table version", "CPU_HSMP_METRICS_VERSION", false)
FLD_DESC_ENT(RDC_FI_CPU_HSMP_METRICS_TABLE, "HSMP metrics table data", "CPU_HSMP_METRICS_TABLE", false)
FLD_DESC_ENT(RDC_FI_CPU_FIRST_ONLINE_CORE, "First online core on CPU socket", "CPU_FIRST_ONLINE_CORE", false)
FLD_DESC_ENT(RDC_FI_CPU_FAMILY, "CPU family identifier", "CPU_FAMILY", false)
FLD_DESC_ENT(RDC_FI_CPU_MODEL_ID, "CPU model identifier", "CPU_MODEL_ID", false)
FLD_DESC_ENT(RDC_FI_CPU_CORES_PER_SOCKET, "Number of CPU cores per socket", "CPU_CORES_PER_SOCKET", false)
FLD_DESC_ENT(RDC_FI_CPU_SOCKET_COUNT, "Number of CPU sockets", "CPU_SOCKET_COUNT", false)

2 changes: 1 addition & 1 deletion common/rdc_fields_supported.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ static const fld_id2name_map_t field_id_to_descript = {

#define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) {#ID, (ID)},
static fld_name2id_map_t field_name_to_id = {
#include "common/rdc_field.data" // NOLINT
#include "common/rdc_field.data"
};
#undef FLD_DESC_ENT

Expand Down
55 changes: 55 additions & 0 deletions include/rdc/rdc.h
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,61 @@ typedef enum {
RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums
RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter
RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds)
/**
* @brief RDC CPU related fields
*/
RDC_FI_CPU_COUNT = 10000, //!< CPU count
RDC_FI_CPU_UTIL_TOTAL, //!< CPU total percentage of time in use
RDC_FI_CPU_UTIL_USER, //!< Percent of time in use by the user
RDC_FI_CPU_UTIL_NICE, //!< Percent of time in use by low priority programs
RDC_FI_CPU_UTIL_SYS, //!< Percent of time in use by the system
RDC_FI_CPU_UTIL_IRQ, //!< Percent of time in use by interrupts
RDC_FI_CPU_TEMP_CURRENT, //!< Temperature (Celsius)
RDC_FI_CPU_CLOCK_CURRENT, //!< Clock speed (KHz)
RDC_FI_CPU_POWER_UTIL_CURRENT, //!< Power usage (watts)
RDC_FI_CPU_POWER_LIMIT, //!< Power limit (watts)
RDC_FI_CPU_VENDOR, //!< Name of the vendor
RDC_FI_CPU_MODEL, //!< Name of the model

/**
 * @brief The fields below were AI generated from functions.
 * TODO: Clean up duplicates.
 */
RDC_FI_CPU_CORE_ENERGY, //!< CPU core energy consumption (microjoules)
RDC_FI_CPU_SOCKET_ENERGY, //!< CPU socket energy consumption (microjoules)
RDC_FI_CPU_THREADS_PER_CORE, //!< Number of threads per CPU core
RDC_FI_CPU_HSMP_DRIVER_VERSION, //!< HSMP driver version
RDC_FI_CPU_SMU_FW_VERSION, //!< SMU firmware version
RDC_FI_CPU_HSMP_PROTO_VERSION, //!< HSMP protocol version
RDC_FI_CPU_PROCHOT_STATUS, //!< CPU PROCHOT status
RDC_FI_CPU_FCLK_FREQUENCY, //!< CPU fabric clock frequency (MHz)
RDC_FI_CPU_MCLK_FREQUENCY, //!< CPU memory clock frequency (MHz)
RDC_FI_CPU_CCLK_LIMIT, //!< CPU core clock limit (MHz)
RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT, //!< CPU socket active frequency limit (MHz)
RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC, //!< CPU socket frequency limit source type
RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX, //!< CPU socket maximum frequency range (MHz)
RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN, //!< CPU socket minimum frequency range (MHz)
RDC_FI_CPU_CORE_FREQ_LIMIT, //!< CPU core current frequency limit (MHz)
RDC_FI_CPU_CORE_BOOST_LIMIT, //!< CPU core boost limit (MHz)
RDC_FI_CPU_SOCKET_C0_RESIDENCY, //!< CPU socket C0 residency percentage
RDC_FI_CPU_DDR_BW_MAX_BW, //!< CPU DDR maximum bandwidth (MB/s)
RDC_FI_CPU_DDR_BW_UTILIZED_BW, //!< CPU DDR utilized bandwidth (MB/s)
RDC_FI_CPU_DDR_BW_UTILIZED_PCT, //!< CPU DDR utilized bandwidth percentage
RDC_FI_CPU_SOCKET_TEMPERATURE, //!< CPU socket temperature (millidegrees Celsius)
RDC_FI_CPU_DIMM_TEMP_RANGE, //!< CPU DIMM temperature range
RDC_FI_CPU_DIMM_REFRESH_RATE, //!< CPU DIMM refresh rate
RDC_FI_CPU_DIMM_POWER_CONSUMPTION, //!< CPU DIMM power consumption (milliwatts)
RDC_FI_CPU_DIMM_THERMAL_SENSOR, //!< CPU DIMM thermal sensor temperature (millidegrees)
RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL, //!< CPU socket LCLK DPM level
RDC_FI_CPU_IO_BANDWIDTH, //!< CPU I/O bandwidth (MB/s)
RDC_FI_CPU_XGMI_BANDWIDTH, //!< CPU XGMI bandwidth (MB/s)
RDC_FI_CPU_HSMP_METRICS_VERSION, //!< HSMP metrics table version
RDC_FI_CPU_HSMP_METRICS_TABLE, //!< HSMP metrics table data
RDC_FI_CPU_FIRST_ONLINE_CORE, //!< First online core on CPU socket
RDC_FI_CPU_FAMILY, //!< CPU family identifier
RDC_FI_CPU_MODEL_ID, //!< CPU model identifier
RDC_FI_CPU_CORES_PER_SOCKET, //!< Number of CPU cores per socket
RDC_FI_CPU_SOCKET_COUNT, //!< Number of CPU sockets

} rdc_field_t;

// even and odd numbers are used for correctable and uncorrectable errors
Expand Down
10 changes: 10 additions & 0 deletions include/rdc_lib/impl/RdcMetricFetcherImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher {
public:
rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value) override;
rdc_status_t fetch_gpu_field_(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value,
amdsmi_processor_handle& processor_handle);
rdc_status_t fetch_gpu_partition_field_(uint32_t gpu_index, rdc_field_t field_id,
rdc_field_value* value);
rdc_status_t fetch_cpu_field_(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
rdc_status_t bulk_fetch_smi_fields(
rdc_gpu_field_t* fields, uint32_t fields_count,
std::vector<rdc_gpu_field_value_t>& results) override; // NOLINT
Expand All @@ -91,6 +96,11 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher {
bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value);
void get_pcie_throughput(const RdcFieldKey& key);

//!< is ESMI/CPU mode enabled?
bool is_cpu_enabled = false;

bool async_fetching = false;

//!< Async metric retrieval
std::map<RdcFieldKey, MetricValue> async_metrics_;
std::map<RdcFieldKey, std::shared_ptr<FieldSMIData>> smi_data_;
Expand Down
4 changes: 2 additions & 2 deletions include/rdc_modules/rdc_rvs/RvsBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ namespace rdc {

inline amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id,
amdsmi_processor_handle* processor_handle) {
uint32_t socket_count;
uint32_t processor_count;
uint32_t socket_count = 0;
uint32_t processor_count = 0;
auto ret = amdsmi_get_socket_handles(&socket_count, nullptr);
if (ret != AMDSMI_STATUS_SUCCESS) {
return ret;
Expand Down
1 change: 1 addition & 0 deletions protos/rdc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ service RdcAPI {
// Discovery API
// rdc_status_t rdc_get_all_devices(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count)
rpc GetAllDevices(Empty) returns (GetAllDevicesResponse) {}

// rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr)
rpc GetDeviceAttributes(GetDeviceAttributesRequest) returns (GetDeviceAttributesResponse) {}

Expand Down
17 changes: 15 additions & 2 deletions python_binding/rdc_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,19 @@ class rdc_field_t(c_int):
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007

RDC_FI_CPU_COUNT = 10000
RDC_FI_CPU_UTIL_TOTAL = 10001
RDC_FI_CPU_UTIL_USER = 10002
RDC_FI_CPU_UTIL_NICE = 10003
RDC_FI_CPU_UTIL_SYS = 10004
RDC_FI_CPU_UTIL_IRQ = 10005
RDC_FI_CPU_TEMP_CURRENT = 10006
RDC_FI_CPU_CLOCK_CURRENT = 10007
RDC_FI_CPU_POWER_UTIL_CURRENT = 10008
RDC_FI_CPU_POWER_LIMIT = 10009
RDC_FI_CPU_VENDOR = 10010
RDC_FI_CPU_MODEL = 10011

_rdc_metric_type_lookup = {
RDC_FI_INVALID: rdc_metric_type_t.INVALID,
RDC_FI_GPU_COUNT: rdc_metric_type_t.LABEL,
Expand Down Expand Up @@ -277,7 +290,7 @@ class rdc_field_t(c_int):
def get_rdc_metric_type(cls, rdc_metric_t):
if isinstance(rdc_metric_t, str):
rdc_metric_t = getattr(cls, rdc_metric_t, None)

# If the metric was found, do the lookup, otherwise default GAUGE
if rdc_metric_t is not None:
return cls._rdc_metric_type_lookup.get(rdc_metric_t, rdc_metric_type_t.GAUGE)
Expand All @@ -288,7 +301,7 @@ def get_field_name(cls, value):
for attr_name, attr_value in cls.__dict__.items():
if isinstance(attr_value, int) and attr_value == value:
return attr_name
return "Unknown field value"
return "Unknown field value"

rdc_handle_t = c_void_p
rdc_gpu_group_t = c_uint32
Expand Down
1 change: 1 addition & 0 deletions rdc_libs/bootstrap/src/RdcBootStrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupI
return static_cast<amd::rdc::RdcHandler*>(p_rdc_handle)->rdc_group_gpu_add(groupId, gpuIndex);
}

// TODO: rewrite get_all to allow different types
rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle,
uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) {
if (!p_rdc_handle || !count) {
Expand Down
15 changes: 14 additions & 1 deletion rdc_libs/rdc/src/RdcEmbeddedHandler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,21 @@ class smi_initializer {
// Initializes the AMD SMI library, requesting GPU support and, when the build
// defines ENABLE_ESMI_LIB, CPU support as well. If initialization with CPUs
// fails, it is retried with GPUs only. Throws amd::rdc::RdcException with
// RDC_ST_FAIL_LOAD_MODULE when the final initialization attempt fails.
smi_initializer() {
  // Make sure smi will not be initialized multiple times
  amdsmi_shut_down();

  // GPUs are always requested; CPUs are added only for ESMI-enabled builds.
  uint64_t init_flag = AMDSMI_INIT_AMD_GPUS;
#ifdef ENABLE_ESMI_LIB
  init_flag |= AMDSMI_INIT_AMD_CPUS;
#endif

  amdsmi_status_t ret = amdsmi_init(init_flag);

#ifdef ENABLE_ESMI_LIB
  if (ret != AMDSMI_STATUS_SUCCESS) {
    // CPU support is best-effort: clear the CPU bit and retry with GPUs only.
    RDC_LOG(RDC_ERROR, "Failed to initialize amdsmi with CPUs enabled.. Disabling CPUs.");
    // Widen before inverting so the mask is explicitly 64-bit.
    init_flag &= ~static_cast<uint64_t>(AMDSMI_INIT_AMD_CPUS);
    ret = amdsmi_init(init_flag);
  }
#endif

  if (ret != AMDSMI_STATUS_SUCCESS) {
    // Keep a separator between the message and the streamed status code.
    RDC_LOG(RDC_ERROR, "SMI FAILED with " << ret);
    throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail");
  }
}
Expand Down
Loading
Loading