From 6bee95a7cfaf6f453f37b00ac5d64b315a8730b1 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Tue, 24 Jun 2025 01:30:45 -0500 Subject: [PATCH 1/6] CPU - Init amdsmi with CPUs enabled Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 5 +++++ common/rdc_field.data | 14 ++++++++++++++ common/rdc_fields_supported.cc | 2 +- include/rdc/rdc.h | 15 +++++++++++++++ protos/rdc.proto | 1 + rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 15 ++++++++++++++- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 21 ++++++++++++++++++--- rdc_libs/rdc/src/RdcSmiLib.cc | 1 + 8 files changed, 69 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e6b4c0b4..b50ca55e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,6 +115,11 @@ option(BUILD_EXAMPLES "Build examples" OFF) # Enable shared libraries for gtest option(BUILD_SHARED_LIBS "Build shared library (.so) or not." ON) +option(BUILD_ESMI "Enable AMDSMI ESMI Library" OFF) +if(BUILD_ESMI) + add_definitions("-DENABLE_ESMI_LIB=1") +endif() + # Enable address sanitizer set(ADDRESS_SANITIZER_DEFAULT OFF) if(DEFINED ENV{ADDRESS_SANITIZER}) diff --git a/common/rdc_field.data b/common/rdc_field.data index 8c8603d1..b1f65c09 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -238,3 +238,17 @@ FLD_DESC_ENT(RDC_HEALTH_RETIRED_PAGE_LIMIT, "Retired page limit", FLD_DESC_ENT(RDC_HEALTH_EEPROM_CONFIG_VALID, "Verify checksum of EEPROM", "EEPROM_CONFIG_VALID", true) FLD_DESC_ENT(RDC_HEALTH_POWER_THROTTLE_TIME, "Power throttle status counter", "POWER_THROTTLE_TIME", true) FLD_DESC_ENT(RDC_HEALTH_THERMAL_THROTTLE_TIME, "Total time(ms) in thermal throttle status", "THERMAL_THROTTLE_TIME", true) + +// CPU-related fields description +FLD_DESC_ENT(RDC_FI_CPU_COUNT, "The number of CPU cores", "CPU_COUNT", false) +FLD_DESC_ENT(RDC_FI_CPU_UTIL_TOTAL, "CPU total percentage of time in use", "CPU_UTIL_TOTAL", false) +FLD_DESC_ENT(RDC_FI_CPU_UTIL_USER, "Percent of time in use by the user", "CPU_UTIL_USER", false) 
+FLD_DESC_ENT(RDC_FI_CPU_UTIL_NICE, "Percent of time in use by low priority programs", "CPU_UTIL_NICE", false) +FLD_DESC_ENT(RDC_FI_CPU_UTIL_SYS, "Percent of time in use by the system", "CPU_UTIL_SYS", false) +FLD_DESC_ENT(RDC_FI_CPU_UTIL_IRQ, "Percent of time in use by interrupts", "CPU_UTIL_IRQ", false) +FLD_DESC_ENT(RDC_FI_CPU_TEMP_CURRENT, "Temperature (Celsius)", "CPU_TEMP_CURRENT", false) +FLD_DESC_ENT(RDC_FI_CPU_CLOCK_CURRENT, "Clock speed (KHz)", "CPU_CLOCK_CURRENT", false) +FLD_DESC_ENT(RDC_FI_CPU_POWER_UTIL_CURRENT, "Power usage (watts)", "CPU_POWER_UTIL_CURRENT", false) +FLD_DESC_ENT(RDC_FI_CPU_POWER_LIMIT, "Power limit (watts)", "CPU_POWER_LIMIT", false) +FLD_DESC_ENT(RDC_FI_CPU_VENDOR, "Name of the vendor", "CPU_VENDOR", false) +FLD_DESC_ENT(RDC_FI_CPU_MODEL, "Name of the model", "CPU_MODEL", false) diff --git a/common/rdc_fields_supported.cc b/common/rdc_fields_supported.cc index 243dab25..5c3f2c33 100644 --- a/common/rdc_fields_supported.cc +++ b/common/rdc_fields_supported.cc @@ -38,7 +38,7 @@ static const fld_id2name_map_t field_id_to_descript = { #define FLD_DESC_ENT(ID, DESC, LABEL, DISPLAY) {#ID, (ID)}, static fld_name2id_map_t field_name_to_id = { -#include "common/rdc_field.data" // NOLINT +#include "common/rdc_field.data" }; #undef FLD_DESC_ENT diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 60383f3c..425418b7 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -424,6 +424,21 @@ typedef enum { RDC_HEALTH_EEPROM_CONFIG_VALID, //!< Reads the EEPROM and verifies the checksums RDC_HEALTH_POWER_THROTTLE_TIME, //!< Power throttle status counter RDC_HEALTH_THERMAL_THROTTLE_TIME, //!< Total time in thermal throttle status (microseconds) + /** + * @brief RDC CPU related fields + */ + RDC_FI_CPU_COUNT = 10001, //!< CPU count + RDC_FI_CPU_UTIL_TOTAL, //!< CPU total percentage of time in use + RDC_FI_CPU_UTIL_USER, //!< Percent of time in use by the user + RDC_FI_CPU_UTIL_NICE, //!< Percent of time in use by low priority programs + 
RDC_FI_CPU_UTIL_SYS, //!< Percent of time in use by the system + RDC_FI_CPU_UTIL_IRQ, //!< Percent of time in use by interrupts + RDC_FI_CPU_TEMP_CURRENT, //!< Temperature (Celsius) + RDC_FI_CPU_CLOCK_CURRENT, //!< Clock speed (KHz) + RDC_FI_CPU_POWER_UTIL_CURRENT, //!< Power usage (watts) + RDC_FI_CPU_POWER_LIMIT, //!< Power limit (watts) + RDC_FI_CPU_VENDOR, //!< Name of the vendor + RDC_FI_CPU_MODEL, //!< Name of the model } rdc_field_t; // even and odd numbers are used for correctable and uncorrectable errors diff --git a/protos/rdc.proto b/protos/rdc.proto index 8af03999..1eb1360e 100755 --- a/protos/rdc.proto +++ b/protos/rdc.proto @@ -53,6 +53,7 @@ service RdcAPI { // Discovery API // rdc_status_t rdc_get_all_devices(uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) rpc GetAllDevices(Empty) returns (GetAllDevicesResponse) {} + // rdc_status_t rdc_get_device_attributes(uint32_t gpu_index, rdc_device_attributes_t* p_rdc_attr) rpc GetDeviceAttributes(GetDeviceAttributesRequest) returns (GetDeviceAttributesResponse) {} diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index f9893458..630713d8 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -49,8 +49,21 @@ class smi_initializer { smi_initializer() { // Make sure smi will not be initialized multiple times amdsmi_shut_down(); - amdsmi_status_t ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS); + amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR; + uint64_t init_flag_ = AMDSMI_INIT_AMD_GPUS; +#ifdef ENABLE_ESMI_LIB + init_flag_ |= AMDSMI_INIT_AMD_CPUS; +#endif + ret = amdsmi_init(init_flag_); +#ifdef ENABLE_ESMI_LIB if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to initalize amdsmi with CPUs enabled.. 
Disabling CPUs."); + init_flag_ &= ~AMDSMI_INIT_AMD_CPUS; + ret = amdsmi_init(init_flag_); + } +#endif + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "SMI FAILED with" << ret); throw amd::rdc::RdcException(RDC_ST_FAIL_LOAD_MODULE, "SMI initialize fail"); } } diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 5a11ec7e..dada777a 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -88,7 +88,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { } uint64_t RdcMetricFetcherImpl::now() { - struct timeval tv {}; + struct timeval tv{}; gettimeofday(&tv, NULL); return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } @@ -485,7 +485,8 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } else { info_str = std::to_string(info.device_index); } - RDC_LOG(RDC_ERROR, "Failed to get processor handle for GPU " << info_str << " error: " << ret); + RDC_LOG(RDC_ERROR, + "Failed to get processor handle for device " << info_str << " error: " << ret); return Smi2RdcError(ret); } @@ -502,7 +503,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field uint16_t num_partitions = 0; amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions); if (st != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Failed to get partition info for GPU " << info.device_index); + RDC_LOG(RDC_ERROR, "Failed to get partition info for device " << info.device_index); return RDC_ST_UNKNOWN_ERROR; } @@ -621,6 +622,10 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->status = RDC_ST_OK; return RDC_ST_OK; } + case RDC_FI_CPU_COUNT: { + // CPU_COUNT is not supported in partitions + return RDC_ST_NO_DATA; + } default: // for now we must let other plugins return valid data for partition metrics @@ -1119,8 +1124,18 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, 
rdc_field if (value->status == AMDSMI_STATUS_SUCCESS) { value->value.l_int = static_cast(gpu_busy_percent); } + break; } + case RDC_FI_CPU_COUNT: { + uint32_t socket_count = 0; + value->status = amdsmi_get_cpu_socket_count(&socket_count); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(socket_count); + } + } break; + default: break; } diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index ef4d5d53..ecf58f5c 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -187,6 +187,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED, RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION, RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT, + RDC_FI_CPU_COUNT, }; // clang-format on std::copy(fields.begin(), fields.end(), field_ids); From 3e16785fc5cd0737b4aad3da09f33e47ee4213ad Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 10 Jul 2025 21:58:25 -0500 Subject: [PATCH 2/6] CPU - WIP Signed-off-by: Galantsev, Dmitrii --- include/rdc/rdc.h | 2 +- include/rdc_lib/impl/RdcMetricFetcherImpl.h | 10 + rdc_libs/rdc/src/RdcEmbeddedHandler.cc | 10 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 436 +++++++++++--------- 4 files changed, 249 insertions(+), 209 deletions(-) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 425418b7..4863e315 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -427,7 +427,7 @@ typedef enum { /** * @brief RDC CPU related fields */ - RDC_FI_CPU_COUNT = 10001, //!< CPU count + RDC_FI_CPU_COUNT = 10000, //!< CPU count RDC_FI_CPU_UTIL_TOTAL, //!< CPU total percentage of time in use RDC_FI_CPU_UTIL_USER, //!< Percent of time in use by the user RDC_FI_CPU_UTIL_NICE, //!< Percent of time in use by low priority programs diff --git a/include/rdc_lib/impl/RdcMetricFetcherImpl.h b/include/rdc_lib/impl/RdcMetricFetcherImpl.h 
index 63745a66..6a5e0254 100644 --- a/include/rdc_lib/impl/RdcMetricFetcherImpl.h +++ b/include/rdc_lib/impl/RdcMetricFetcherImpl.h @@ -71,6 +71,11 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher { public: rdc_status_t fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) override; + rdc_status_t fetch_gpu_field_(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value, + amdsmi_processor_handle& processor_handle); + rdc_status_t fetch_gpu_partition_field_(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value); + rdc_status_t fetch_cpu_field_(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value); rdc_status_t bulk_fetch_smi_fields( rdc_gpu_field_t* fields, uint32_t fields_count, std::vector& results) override; // NOLINT @@ -91,6 +96,11 @@ class RdcMetricFetcherImpl final : public RdcMetricFetcher { bool async_get_pcie_throughput(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value); void get_pcie_throughput(const RdcFieldKey& key); + //!< is ESMI/CPU mode enabled? + bool is_cpu_enabled = false; + + bool async_fetching = false; + //!< Async metric retreive std::map async_metrics_; std::map> smi_data_; diff --git a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc index 630713d8..0c8fff1d 100644 --- a/rdc_libs/rdc/src/RdcEmbeddedHandler.cc +++ b/rdc_libs/rdc/src/RdcEmbeddedHandler.cc @@ -50,16 +50,16 @@ class smi_initializer { // Make sure smi will not be initialized multiple times amdsmi_shut_down(); amdsmi_status_t ret = AMDSMI_STATUS_UNKNOWN_ERROR; - uint64_t init_flag_ = AMDSMI_INIT_AMD_GPUS; + uint64_t init_flag = AMDSMI_INIT_AMD_GPUS; #ifdef ENABLE_ESMI_LIB - init_flag_ |= AMDSMI_INIT_AMD_CPUS; + init_flag |= AMDSMI_INIT_AMD_CPUS; #endif - ret = amdsmi_init(init_flag_); + ret = amdsmi_init(init_flag); #ifdef ENABLE_ESMI_LIB if (ret != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "Failed to initalize amdsmi with CPUs enabled.. 
Disabling CPUs."); - init_flag_ &= ~AMDSMI_INIT_AMD_CPUS; - ret = amdsmi_init(init_flag_); + init_flag &= ~AMDSMI_INIT_AMD_CPUS; + ret = amdsmi_init(init_flag); } #endif if (ret != AMDSMI_STATUS_SUCCESS) { diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index dada777a..4f5bb094 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -25,7 +25,6 @@ THE SOFTWARE. #include #include -#include //NOLINT #include #include #include @@ -88,7 +87,7 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { } uint64_t RdcMetricFetcherImpl::now() { - struct timeval tv{}; + struct timeval tv {}; gettimeofday(&tv, NULL); return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } @@ -96,9 +95,9 @@ uint64_t RdcMetricFetcherImpl::now() { void RdcMetricFetcherImpl::get_ecc(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; - amdsmi_ras_err_state_t err_state; + amdsmi_ras_err_state_t err_state = AMDSMI_RAS_ERR_STATE_INVALID; - amdsmi_processor_handle processor_handle; + amdsmi_processor_handle processor_handle = nullptr; err = get_processor_handle_from_id(gpu_index, &processor_handle); assert(err == AMDSMI_STATUS_SUCCESS); @@ -208,9 +207,9 @@ void RdcMetricFetcherImpl::get_ecc_total(uint32_t gpu_index, rdc_field_t field_i amdsmi_status_t err = AMDSMI_STATUS_SUCCESS; uint64_t correctable_count = 0; uint64_t uncorrectable_count = 0; - amdsmi_ras_err_state_t err_state; + amdsmi_ras_err_state_t err_state = AMDSMI_RAS_ERR_STATE_INVALID; - amdsmi_processor_handle processor_handle; + amdsmi_processor_handle processor_handle = nullptr; err = get_processor_handle_from_id(gpu_index, &processor_handle); if (!value) { @@ -279,10 +278,10 @@ bool RdcMetricFetcherImpl::async_get_pcie_throughput(uint32_t gpu_index, rdc_fie void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { uint32_t gpu_index = key.first; - uint64_t sent, 
received, max_pkt_sz; - amdsmi_status_t ret; + uint64_t sent = 0, received = 0, max_pkt_sz = 0; + amdsmi_status_t ret = AMDSMI_STATUS_INVAL; - amdsmi_processor_handle processor_handle; + amdsmi_processor_handle processor_handle = nullptr; ret = get_processor_handle_from_id(gpu_index, &processor_handle); // Return if the cache does not expire yet @@ -298,7 +297,7 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { ret = amdsmi_get_gpu_pci_throughput(processor_handle, &sent, &received, &max_pkt_sz); uint64_t curTime = now(); - MetricValue value; + MetricValue value{}; value.cache_ttl = 30 * 1000; // cache 30 seconds value.value.type = INTEGER; do { @@ -437,11 +436,11 @@ constexpr double kGig = 1000000000.0; static uint64_t sum_xgmi_read(const amdsmi_gpu_metrics_t& gpu_metrics) { uint64_t total = 0; const auto not_supported_metrics_data = std::numeric_limits::max(); - for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) { - if (gpu_metrics.xgmi_read_data_acc[i] == not_supported_metrics_data) { + for (unsigned long i : gpu_metrics.xgmi_read_data_acc) { + if (i == not_supported_metrics_data) { continue; } - total += gpu_metrics.xgmi_read_data_acc[i]; + total += i; } if (total == 0) { return not_supported_metrics_data; @@ -452,11 +451,11 @@ static uint64_t sum_xgmi_read(const amdsmi_gpu_metrics_t& gpu_metrics) { static uint64_t sum_xgmi_write(const amdsmi_gpu_metrics_t& gpu_metrics) { uint64_t total = 0; const auto not_supported_metrics_data = std::numeric_limits::max(); - for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) { - if (gpu_metrics.xgmi_write_data_acc[i] == not_supported_metrics_data) { + for (unsigned long i : gpu_metrics.xgmi_write_data_acc) { + if (i == not_supported_metrics_data) { continue; } - total += gpu_metrics.xgmi_write_data_acc[i]; + total += i; } if (total == 0) { return not_supported_metrics_data; @@ -464,178 +463,20 @@ static uint64_t sum_xgmi_write(const amdsmi_gpu_metrics_t& gpu_metrics) { return total; } -rdc_status_t 
RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, - rdc_field_value* value) { - if (!value) { - return RDC_ST_BAD_PARAMETER; - } - bool async_fetching = false; - std::shared_ptr smi_data; - - amdsmi_processor_handle processor_handle = {}; - - rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); - - amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &processor_handle); - if (ret != AMDSMI_STATUS_SUCCESS) { - std::string info_str; - if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { - info_str = - "g" + std::to_string(info.device_index) + "." + std::to_string(info.instance_index); - } else { - info_str = std::to_string(info.device_index); - } - RDC_LOG(RDC_ERROR, - "Failed to get processor handle for device " << info_str << " error: " << ret); - return Smi2RdcError(ret); - } +std::shared_ptr RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) { + auto r_info = smi_data_.find(key); - if (!is_field_valid(field_id)) { - RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported"); - return RDC_ST_NOT_SUPPORTED; + if (r_info != smi_data_.end()) { + return r_info->second; } + return nullptr; +} - value->ts = now(); - value->field_id = field_id; - value->status = AMDSMI_STATUS_NOT_SUPPORTED; - - if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { - uint16_t num_partitions = 0; - amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions); - if (st != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Failed to get partition info for device " << info.device_index); - return RDC_ST_UNKNOWN_ERROR; - } - - amdsmi_processor_handle processor_handle = {}; - amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); - if (ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Cannot get processor handle for partition " << info.instance_index); - return Smi2RdcError(ret); - } - - amdsmi_gpu_metrics_t gpu_metrics = {}; - ret = 
amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); - if (ret != AMDSMI_STATUS_SUCCESS) { - RDC_LOG(RDC_ERROR, "Failed to get GPU metrics info for partition " << info.instance_index); - return Smi2RdcError(ret); - } - - switch (field_id) { - case RDC_FI_GPU_CLOCK: { - const uint16_t* clock_array = gpu_metrics.current_gfxclks; - std::vector valid_clocks; - valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS); - - for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) { - uint16_t clk = clock_array[i]; - if (clk != 0 && clk != 0xFFFF) { - valid_clocks.push_back(clk); - } - } - - uint32_t vc = static_cast(valid_clocks.size()); - uint32_t pCount = static_cast(num_partitions); - uint32_t partIdx = info.instance_index; - - if (valid_clocks.empty() || vc < num_partitions) { - RDC_LOG(RDC_ERROR, "No valid clocks, or less than total partitions"); - return RDC_ST_NO_DATA; - } - - if (vc == num_partitions) { - value->value.l_int = static_cast(clock_array[info.instance_index]) * 1000000; - value->type = INTEGER; - value->status = RDC_ST_OK; - return RDC_ST_OK; - } - - uint32_t chunk_size = vc / pCount; - uint32_t start_idx = partIdx * chunk_size; - uint32_t end_idx = start_idx + chunk_size; - - // Average partition clocks - uint64_t sum = 0; - for (uint32_t i = start_idx; i < end_idx; i++) { - sum += valid_clocks[i]; - } - uint32_t count = end_idx - start_idx; - if (count == 0) { - return RDC_ST_NO_DATA; - } - uint64_t avg_clock = sum / count; - - value->value.l_int = avg_clock * 1000000; - value->type = INTEGER; - value->status = RDC_ST_OK; - return RDC_ST_OK; - } - - case RDC_FI_GPU_UTIL: { - uint32_t p = info.instance_index; - if (p >= AMDSMI_MAX_NUM_XCP) { - return RDC_ST_NO_DATA; - } - const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; - - uint64_t sum = 0; - uint32_t count = 0; - for (uint32_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { - uint32_t busy = xcp.gfx_busy_inst[i]; - if (busy != UINT32_MAX) { - sum += busy; - count++; - } - } - if (count == 0) { - 
return RDC_ST_NO_DATA; - } - uint64_t avg_busy = sum / count; - value->value.l_int = avg_busy; - value->type = INTEGER; - value->status = RDC_ST_OK; - return RDC_ST_OK; - } - - case RDC_FI_GPU_MM_DEC_UTIL: { - uint32_t p = info.instance_index; - if (p >= AMDSMI_MAX_NUM_XCP) { - return RDC_ST_NO_DATA; - } - const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; - - uint64_t sum = 0; - uint32_t count = 0; - for (uint32_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { - uint16_t vcn = xcp.vcn_busy[i]; - if (vcn != UINT16_MAX) { - sum += vcn; - count++; - } - } - if (count == 0) { - return RDC_ST_NO_DATA; - } - uint64_t avg_decode = sum / count; - value->value.l_int = avg_decode; - value->type = INTEGER; - value->status = RDC_ST_OK; - return RDC_ST_OK; - } - case RDC_FI_CPU_COUNT: { - // CPU_COUNT is not supported in partitions - return RDC_ST_NO_DATA; - } - - default: - // for now we must let other plugins return valid data for partition metrics - - // TODO: All other fields => N/A for partition IN AMDSMI - // RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id) - // << " not supported => NO_DATA."); - break; - } - } // end if partition +rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value, + amdsmi_processor_handle& processor_handle) { + std::shared_ptr smi_data; + amdsmi_status_t ret = AMDSMI_STATUS_INVAL; auto read_smi_counter = [&](void) { RdcFieldKey f_key(gpu_index, field_id); @@ -650,7 +491,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->type = INTEGER; }; - auto read_gpu_metrics_uint64_t = [&](void) { + auto read_gpu_metrics_uint64_t = [&]() { amdsmi_gpu_metrics_t gpu_metrics; value->status = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); RDC_LOG(RDC_DEBUG, "Read the gpu metrics:" << value->status); @@ -859,15 +700,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } 
break; } - case RDC_FI_GPU_PAGE_RETRIED: - uint32_t num_pages; - amdsmi_retired_page_record_t info; - value->status = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, &info); + case RDC_FI_GPU_PAGE_RETRIED: { + uint32_t num_pages = 0; + amdsmi_retired_page_record_t page_record; + value->status = amdsmi_get_gpu_bad_page_info(processor_handle, &num_pages, &page_record); value->type = INTEGER; if (value->status == AMDSMI_STATUS_SUCCESS) { value->value.l_int = num_pages; } break; + } case RDC_FI_OAM_ID: case RDC_FI_DEV_ID: case RDC_FI_REV_ID: @@ -1139,6 +981,203 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field default: break; } +} + +rdc_status_t RdcMetricFetcherImpl::fetch_gpu_partition_field_(uint32_t gpu_index, + rdc_field_t field_id, + rdc_field_value* value) { + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); + uint16_t num_partitions = 0; + amdsmi_status_t st = get_num_partition(info.device_index, &num_partitions); + if (st != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get partition info for device " << info.device_index); + return RDC_ST_UNKNOWN_ERROR; + } + + amdsmi_processor_handle processor_handle = {}; + amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Cannot get processor handle for partition " << info.instance_index); + return Smi2RdcError(ret); + } + + amdsmi_gpu_metrics_t gpu_metrics = {}; + ret = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Failed to get GPU metrics info for partition " << info.instance_index); + return Smi2RdcError(ret); + } + + switch (field_id) { + case RDC_FI_GPU_CLOCK: { + const uint16_t* clock_array = gpu_metrics.current_gfxclks; + std::vector valid_clocks; + valid_clocks.reserve(AMDSMI_MAX_NUM_GFX_CLKS); + + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_GFX_CLKS; i++) { + uint16_t clk = 
clock_array[i]; + if (clk != 0 && clk != 0xFFFF) { + valid_clocks.push_back(clk); + } + } + + uint32_t vc = static_cast(valid_clocks.size()); + uint32_t pCount = static_cast(num_partitions); + uint32_t partIdx = info.instance_index; + + if (valid_clocks.empty() || vc < num_partitions) { + RDC_LOG(RDC_ERROR, "No valid clocks, or less than total partitions"); + return RDC_ST_NO_DATA; + } + + if (vc == num_partitions) { + value->value.l_int = static_cast(clock_array[info.instance_index]) * 1000000; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + uint32_t chunk_size = vc / pCount; + uint32_t start_idx = partIdx * chunk_size; + uint32_t end_idx = start_idx + chunk_size; + + // Average partition clocks + uint64_t sum = 0; + for (uint32_t i = start_idx; i < end_idx; i++) { + sum += valid_clocks[i]; + } + uint32_t count = end_idx - start_idx; + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_clock = sum / count; + + value->value.l_int = avg_clock * 1000000; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + case RDC_FI_GPU_UTIL: { + uint32_t p = info.instance_index; + if (p >= AMDSMI_MAX_NUM_XCP) { + return RDC_ST_NO_DATA; + } + const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; + + uint64_t sum = 0; + uint32_t count = 0; + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_XCC; i++) { + uint32_t busy = xcp.gfx_busy_inst[i]; + if (busy != UINT32_MAX) { + sum += busy; + count++; + } + } + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_busy = sum / count; + value->value.l_int = avg_busy; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + + case RDC_FI_GPU_MM_DEC_UTIL: { + uint32_t p = info.instance_index; + if (p >= AMDSMI_MAX_NUM_XCP) { + return RDC_ST_NO_DATA; + } + const amdsmi_gpu_xcp_metrics_t& xcp = gpu_metrics.xcp_stats[p]; + + uint64_t sum = 0; + uint32_t count = 0; + for (uint32_t i = 0; i < AMDSMI_MAX_NUM_VCN; i++) { + uint16_t vcn = 
xcp.vcn_busy[i]; + if (vcn != UINT16_MAX) { + sum += vcn; + count++; + } + } + if (count == 0) { + return RDC_ST_NO_DATA; + } + uint64_t avg_decode = sum / count; + value->value.l_int = avg_decode; + value->type = INTEGER; + value->status = RDC_ST_OK; + return RDC_ST_OK; + } + case RDC_FI_CPU_COUNT: { + // CPU_COUNT is not supported in partitions + return RDC_ST_NO_DATA; + } + + default: + // for now we must let other plugins return valid data for partition metrics + + // TODO: All other fields => N/A for partition IN AMDSMI + // RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id) + // << " not supported => NO_DATA."); + break; + } +} + +rdc_status_t RdcMetricFetcherImpl::fetch_cpu_field_(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) {} + +rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, + rdc_field_value* value) { + if (!value) { + return RDC_ST_BAD_PARAMETER; + } + + amdsmi_processor_handle processor_handle = {}; + rdc_status_t status = RDC_ST_UNKNOWN_ERROR; + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); + + amdsmi_status_t ret = get_processor_handle_from_id(info.device_index, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + std::string info_str; + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + info_str = + "g" + std::to_string(info.device_index) + "." 
+ std::to_string(info.instance_index); + } else { + info_str = std::to_string(info.device_index); + } + RDC_LOG(RDC_ERROR, + "Failed to get processor handle for device " << info_str << " error: " << ret); + return Smi2RdcError(ret); + } + + if ((field_id > RDC_FI_CPU_COUNT) && (info.device_type != RDC_DEVICE_TYPE_CPU)) { + RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " because of incorrect device type"); + return RDC_ST_NOT_SUPPORTED; + } + + if (is_field_valid(field_id) == false) { + RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " which is not supported"); + return RDC_ST_NOT_SUPPORTED; + } + + value->ts = now(); + value->field_id = field_id; + value->status = AMDSMI_STATUS_NOT_SUPPORTED; + if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { + status = fetch_gpu_partition_field_(gpu_index, field_id, value); + } else if (info.device_type == RDC_DEVICE_TYPE_GPU) { + status = fetch_gpu_field_(gpu_index, field_id, value, processor_handle); + } else if (info.device_type == RDC_DEVICE_TYPE_CPU) { + status = fetch_cpu_field_(gpu_index, field_id, value); + } else { + RDC_LOG(RDC_ERROR, "Unsupported device type for fetching field: " << field_id_string(field_id)); + return RDC_ST_NOT_SUPPORTED; + } + + if (status != RDC_ST_OK) { + return status; + } int64_t latency = now() - value->ts; if (value->status != AMDSMI_STATUS_SUCCESS) { @@ -1163,25 +1202,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field return value->status == AMDSMI_STATUS_SUCCESS ? 
RDC_ST_OK : RDC_ST_SMI_ERROR; } -std::shared_ptr RdcMetricFetcherImpl::get_smi_data(RdcFieldKey key) { - std::map>::iterator r_info = smi_data_.find(key); - - if (r_info != smi_data_.end()) { - return r_info->second; - } - return nullptr; -} - static rdc_status_t init_smi_counter(RdcFieldKey fk, amdsmi_event_group_t grp, amdsmi_event_handle_t* handle) { - amdsmi_status_t ret; - uint32_t counters_available; + amdsmi_status_t ret = AMDSMI_STATUS_INVAL; + uint32_t counters_available = 0; uint32_t dv_ind = fk.first; rdc_field_t f = fk.second; assert(handle != nullptr); - amdsmi_processor_handle processor_handle; + amdsmi_processor_handle processor_handle = nullptr; ret = get_processor_handle_from_id(dv_ind, &processor_handle); ret = amdsmi_gpu_counter_group_supported(processor_handle, grp); @@ -1227,7 +1257,7 @@ static rdc_status_t init_smi_counter(RdcFieldKey fk, amdsmi_event_group_t grp, } rdc_status_t RdcMetricFetcherImpl::delete_smi_handle(RdcFieldKey fk) { - amdsmi_status_t ret; + amdsmi_status_t ret = AMDSMI_STATUS_INVAL; switch (fk.second) { case RDC_EVNT_XGMI_0_NOP_TX: From dfd846ccd5d9e7e3bf636af4ee1f1417acd7938b Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 14 Jul 2025 19:29:35 -0500 Subject: [PATCH 3/6] CPU - Fix enum Signed-off-by: Galantsev, Dmitrii --- python_binding/rdc_bootstrap.py | 17 +++++++++++++++-- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 16 +++++----------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index c7e7c889..51aa1da6 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -250,6 +250,19 @@ class rdc_field_t(c_int): RDC_HEALTH_POWER_THROTTLE_TIME = 3006 RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007 + RDC_FI_CPU_COUNT = 10000 + RDC_FI_CPU_UTIL_TOTAL = 10001 + RDC_FI_CPU_UTIL_USER = 10002 + RDC_FI_CPU_UTIL_NICE = 10003 + RDC_FI_CPU_UTIL_SYS = 10004 + RDC_FI_CPU_UTIL_IRQ = 10005 + RDC_FI_CPU_TEMP_CURRENT = 10006 + 
RDC_FI_CPU_CLOCK_CURRENT = 10007 + RDC_FI_CPU_POWER_UTIL_CURRENT = 10008 + RDC_FI_CPU_POWER_LIMIT = 10009 + RDC_FI_CPU_VENDOR = 10010 + RDC_FI_CPU_MODEL = 10011 + _rdc_metric_type_lookup = { RDC_FI_INVALID: rdc_metric_type_t.INVALID, RDC_FI_GPU_COUNT: rdc_metric_type_t.LABEL, @@ -277,7 +290,7 @@ class rdc_field_t(c_int): def get_rdc_metric_type(cls, rdc_metric_t): if isinstance(rdc_metric_t, str): rdc_metric_t = getattr(cls, rdc_metric_t, None) - + # If the metric was found, do the lookup, otherwise default GAUGE if rdc_metric_t is not None: return cls._rdc_metric_type_lookup.get(rdc_metric_t, rdc_metric_type_t.GAUGE) @@ -288,7 +301,7 @@ def get_field_name(cls, value): for attr_name, attr_value in cls.__dict__.items(): if isinstance(attr_value, int) and attr_value == value: return attr_name - return "Unknown field value" + return "Unknown field value" rdc_handle_t = c_void_p rdc_gpu_group_t = c_uint32 diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 4f5bb094..acbcaae7 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -612,14 +612,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel } break; case RDC_FI_POWER_USAGE: { amdsmi_power_info_t power_info = {}; -// Handle API breaking change in amdsmi commit dc4a16da6fb45d581a6e23c78d340172989418a0 -// Breaking change is only in rocm 6.4.0 (amdsmi 25.2) -// It is reverted to old signature in 6.4.1 (amdsmi 25.3) -#if (((AMDSMI_LIB_VERSION_MAJOR) == 25) && ((AMDSMI_LIB_VERSION_MINOR) == 2)) - value->status = amdsmi_get_power_info(processor_handle, 0, &power_info); -#else value->status = amdsmi_get_power_info(processor_handle, &power_info); -#endif value->type = INTEGER; if (value->status != AMDSMI_STATUS_SUCCESS) { RDC_LOG(RDC_ERROR, "amdsmi_get_power_info failed!"); @@ -873,12 +866,12 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel break; case 
RDC_HEALTH_XGMI_ERROR: { - amdsmi_xgmi_status_t status; - ret = amdsmi_gpu_xgmi_error_status(processor_handle, &status); + amdsmi_xgmi_status_t xgmi_status = AMDSMI_XGMI_STATUS_NO_ERRORS; + ret = amdsmi_gpu_xgmi_error_status(processor_handle, &xgmi_status); value->status = Smi2RdcError(ret); value->type = INTEGER; if (value->status == AMDSMI_STATUS_SUCCESS) { - value->value.l_int = static_cast(status); + value->value.l_int = static_cast(xgmi_status); } break; } @@ -981,6 +974,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel default: break; } + return Smi2RdcError(static_cast(value->status)); } rdc_status_t RdcMetricFetcherImpl::fetch_gpu_partition_field_(uint32_t gpu_index, @@ -1176,7 +1170,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } if (status != RDC_ST_OK) { - return status; + RDC_LOG(RDC_ERROR, "Fetch status is not ok error: " << status); } int64_t latency = now() - value->ts; From c47b3a1be65df06ce2977805c8e72d15794f17aa Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 14 Jul 2025 20:10:53 -0500 Subject: [PATCH 4/6] CPU - WIP Signed-off-by: Galantsev, Dmitrii --- CMakeLists.txt | 2 +- rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 20 +++++++++++++++++++- rdc_libs/rdc/src/RdcSmiLib.cc | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b50ca55e..3ec3c281 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -115,7 +115,7 @@ option(BUILD_EXAMPLES "Build examples" OFF) # Enable shared libraries for gtest option(BUILD_SHARED_LIBS "Build shared library (.so) or not." 
ON) -option(BUILD_ESMI "Enable AMDSMI ESMI Library" OFF) +option(BUILD_ESMI "Enable AMDSMI ESMI Library" ON) if(BUILD_ESMI) add_definitions("-DENABLE_ESMI_LIB=1") endif() diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index acbcaae7..13354298 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -1119,7 +1119,25 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_partition_field_(uint32_t gpu_index } rdc_status_t RdcMetricFetcherImpl::fetch_cpu_field_(uint32_t gpu_index, rdc_field_t field_id, - rdc_field_value* value) {} + rdc_field_value* value) { + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); + + amdsmi_processor_handle processor_handle = {}; + amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + + switch (field_id) { + case RDC_FI_CPU_MODEL: { + amdsmi_cpu_info_t cpu_info = {}; + value->status = amdsmi_get_cpu_model_name(processor_handle, &cpu_info); + memcpy(value->value.str, cpu_info.model_name, sizeof(cpu_info.model_name)); + value->type = STRING; + break; + } + default: + value->status = AMDSMI_STATUS_NOT_SUPPORTED; + } + return Smi2RdcError(static_cast(value->status)); +} rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index ecf58f5c..95f6f4a5 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -187,7 +187,7 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED, RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION, RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT, - RDC_FI_CPU_COUNT, + RDC_FI_CPU_COUNT }; // clang-format on std::copy(fields.begin(), fields.end(), field_ids); From 3c0b55f629ae52ec9a4a3f13cc4809001d209946 Mon Sep 17 
00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 16 Jul 2025 21:19:36 -0500 Subject: [PATCH 5/6] CPU - WIP - Add a lot of fields Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 39 ++ include/rdc/rdc.h | 40 ++ rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 485 ++++++++++++++++++++++- rdc_libs/rdc/src/RdcSmiLib.cc | 40 +- 4 files changed, 602 insertions(+), 2 deletions(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index b1f65c09..8ba85c5f 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -252,3 +252,42 @@ FLD_DESC_ENT(RDC_FI_CPU_POWER_UTIL_CURRENT, "Power usage (watts)", FLD_DESC_ENT(RDC_FI_CPU_POWER_LIMIT, "Power limit (watts)", "CPU_POWER_LIMIT", false) FLD_DESC_ENT(RDC_FI_CPU_VENDOR, "Name of the vendor", "CPU_VENDOR", false) FLD_DESC_ENT(RDC_FI_CPU_MODEL, "Name of the model", "CPU_MODEL", false) + +// AI generated: +// Field Definitions +FLD_DESC_ENT(RDC_FI_CPU_CORE_ENERGY, "CPU core energy consumption (microjoules)", "CPU_CORE_ENERGY", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_ENERGY, "CPU socket energy consumption (microjoules)", "CPU_SOCKET_ENERGY", false) +FLD_DESC_ENT(RDC_FI_CPU_THREADS_PER_CORE, "Number of threads per CPU core", "CPU_THREADS_PER_CORE", false) +FLD_DESC_ENT(RDC_FI_CPU_HSMP_DRIVER_VERSION, "HSMP driver version", "CPU_HSMP_DRIVER_VERSION", false) +FLD_DESC_ENT(RDC_FI_CPU_SMU_FW_VERSION, "SMU firmware version", "CPU_SMU_FW_VERSION", false) +FLD_DESC_ENT(RDC_FI_CPU_HSMP_PROTO_VERSION, "HSMP protocol version", "CPU_HSMP_PROTO_VERSION", false) +FLD_DESC_ENT(RDC_FI_CPU_PROCHOT_STATUS, "CPU PROCHOT status", "CPU_PROCHOT_STATUS", false) +FLD_DESC_ENT(RDC_FI_CPU_FCLK_FREQUENCY, "CPU fabric clock frequency (MHz)", "CPU_FCLK_FREQUENCY", false) +FLD_DESC_ENT(RDC_FI_CPU_MCLK_FREQUENCY, "CPU memory clock frequency (MHz)", "CPU_MCLK_FREQUENCY", false) +FLD_DESC_ENT(RDC_FI_CPU_CCLK_LIMIT, "CPU core clock limit (MHz)", "CPU_CCLK_LIMIT", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT, "CPU socket active 
frequency limit (MHz)", "CPU_SOCKET_ACTIVE_FREQ_LIMIT", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC, "CPU socket frequency limit source type", "CPU_SOCKET_FREQ_LIMIT_SRC", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX, "CPU socket maximum frequency range (MHz)", "CPU_SOCKET_FREQ_RANGE_MAX", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN, "CPU socket minimum frequency range (MHz)", "CPU_SOCKET_FREQ_RANGE_MIN", false) +FLD_DESC_ENT(RDC_FI_CPU_CORE_FREQ_LIMIT, "CPU core current frequency limit (MHz)", "CPU_CORE_FREQ_LIMIT", false) +FLD_DESC_ENT(RDC_FI_CPU_CORE_BOOST_LIMIT, "CPU core boost limit (MHz)", "CPU_CORE_BOOST_LIMIT", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_C0_RESIDENCY, "CPU socket C0 residency percentage", "CPU_SOCKET_C0_RESIDENCY", false) +FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_MAX_BW, "CPU DDR maximum bandwidth (MB/s)", "CPU_DDR_BW_MAX_BW", false) +FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_UTILIZED_BW, "CPU DDR utilized bandwidth (MB/s)", "CPU_DDR_BW_UTILIZED_BW", false) +FLD_DESC_ENT(RDC_FI_CPU_DDR_BW_UTILIZED_PCT, "CPU DDR utilized bandwidth percentage", "CPU_DDR_BW_UTILIZED_PCT", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_TEMPERATURE, "CPU socket temperature (millidegrees Celsius)", "CPU_SOCKET_TEMPERATURE", false) +FLD_DESC_ENT(RDC_FI_CPU_DIMM_TEMP_RANGE, "CPU DIMM temperature range", "CPU_DIMM_TEMP_RANGE", false) +FLD_DESC_ENT(RDC_FI_CPU_DIMM_REFRESH_RATE, "CPU DIMM refresh rate", "CPU_DIMM_REFRESH_RATE", false) +FLD_DESC_ENT(RDC_FI_CPU_DIMM_POWER_CONSUMPTION, "CPU DIMM power consumption (milliwatts)", "CPU_DIMM_POWER_CONSUMPTION", false) +FLD_DESC_ENT(RDC_FI_CPU_DIMM_THERMAL_SENSOR, "CPU DIMM thermal sensor temperature (millidegrees)", "CPU_DIMM_THERMAL_SENSOR", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL, "CPU socket LCLK DPM level", "CPU_SOCKET_LCLK_DPM_LEVEL", false) +FLD_DESC_ENT(RDC_FI_CPU_IO_BANDWIDTH, "CPU I/O bandwidth (MB/s)", "CPU_IO_BANDWIDTH", false) +FLD_DESC_ENT(RDC_FI_CPU_XGMI_BANDWIDTH, "CPU XGMI bandwidth (MB/s)", 
"CPU_XGMI_BANDWIDTH", false) +FLD_DESC_ENT(RDC_FI_CPU_HSMP_METRICS_VERSION, "HSMP metrics table version", "CPU_HSMP_METRICS_VERSION", false) +FLD_DESC_ENT(RDC_FI_CPU_HSMP_METRICS_TABLE, "HSMP metrics table data", "CPU_HSMP_METRICS_TABLE", false) +FLD_DESC_ENT(RDC_FI_CPU_FIRST_ONLINE_CORE, "First online core on CPU socket", "CPU_FIRST_ONLINE_CORE", false) +FLD_DESC_ENT(RDC_FI_CPU_FAMILY, "CPU family identifier", "CPU_FAMILY", false) +FLD_DESC_ENT(RDC_FI_CPU_MODEL_ID, "CPU model identifier", "CPU_MODEL_ID", false) +FLD_DESC_ENT(RDC_FI_CPU_CORES_PER_SOCKET, "Number of CPU cores per socket", "CPU_CORES_PER_SOCKET", false) +FLD_DESC_ENT(RDC_FI_CPU_SOCKET_COUNT, "Number of CPU sockets", "CPU_SOCKET_COUNT", false) + diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index 4863e315..c4f1f93c 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -439,6 +439,46 @@ typedef enum { RDC_FI_CPU_POWER_LIMIT, //!< Power limit (watts) RDC_FI_CPU_VENDOR, //!< Name of the vendor RDC_FI_CPU_MODEL, //!< Name of the model + + /** TODO: Clean up duplicates + * @brief Below are AI generated from functions + */ + RDC_FI_CPU_CORE_ENERGY, //!< CPU core energy consumption (microjoules) + RDC_FI_CPU_SOCKET_ENERGY, //!< CPU socket energy consumption (microjoules) + RDC_FI_CPU_THREADS_PER_CORE, //!< Number of threads per CPU core + RDC_FI_CPU_HSMP_DRIVER_VERSION, //!< HSMP driver version + RDC_FI_CPU_SMU_FW_VERSION, //!< SMU firmware version + RDC_FI_CPU_HSMP_PROTO_VERSION, //!< HSMP protocol version + RDC_FI_CPU_PROCHOT_STATUS, //!< CPU PROCHOT status + RDC_FI_CPU_FCLK_FREQUENCY, //!< CPU fabric clock frequency (MHz) + RDC_FI_CPU_MCLK_FREQUENCY, //!< CPU memory clock frequency (MHz) + RDC_FI_CPU_CCLK_LIMIT, //!< CPU core clock limit (MHz) + RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT, //!< CPU socket active frequency limit (MHz) + RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC, //!< CPU socket frequency limit source type + RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX, //!< CPU socket maximum frequency range (MHz) + 
RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN, //!< CPU socket minimum frequency range (MHz) + RDC_FI_CPU_CORE_FREQ_LIMIT, //!< CPU core current frequency limit (MHz) + RDC_FI_CPU_CORE_BOOST_LIMIT, //!< CPU core boost limit (MHz) + RDC_FI_CPU_SOCKET_C0_RESIDENCY, //!< CPU socket C0 residency percentage + RDC_FI_CPU_DDR_BW_MAX_BW, //!< CPU DDR maximum bandwidth (MB/s) + RDC_FI_CPU_DDR_BW_UTILIZED_BW, //!< CPU DDR utilized bandwidth (MB/s) + RDC_FI_CPU_DDR_BW_UTILIZED_PCT, //!< CPU DDR utilized bandwidth percentage + RDC_FI_CPU_SOCKET_TEMPERATURE, //!< CPU socket temperature (millidegrees Celsius) + RDC_FI_CPU_DIMM_TEMP_RANGE, //!< CPU DIMM temperature range + RDC_FI_CPU_DIMM_REFRESH_RATE, //!< CPU DIMM refresh rate + RDC_FI_CPU_DIMM_POWER_CONSUMPTION, //!< CPU DIMM power consumption (milliwatts) + RDC_FI_CPU_DIMM_THERMAL_SENSOR, //!< CPU DIMM thermal sensor temperature (millidegrees) + RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL, //!< CPU socket LCLK DPM level + RDC_FI_CPU_IO_BANDWIDTH, //!< CPU I/O bandwidth (MB/s) + RDC_FI_CPU_XGMI_BANDWIDTH, //!< CPU XGMI bandwidth (MB/s) + RDC_FI_CPU_HSMP_METRICS_VERSION, //!< HSMP metrics table version + RDC_FI_CPU_HSMP_METRICS_TABLE, //!< HSMP metrics table data + RDC_FI_CPU_FIRST_ONLINE_CORE, //!< First online core on CPU socket + RDC_FI_CPU_FAMILY, //!< CPU family identifier + RDC_FI_CPU_MODEL_ID, //!< CPU model identifier + RDC_FI_CPU_CORES_PER_SOCKET, //!< Number of CPU cores per socket + RDC_FI_CPU_SOCKET_COUNT, //!< Number of CPU sockets + } rdc_field_t; // even and odd numbers are used for correctable and uncorrectable errors diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 13354298..8f527788 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -28,6 +28,7 @@ THE SOFTWARE. 
#include #include #include +#include #include #include "amd_smi/amdsmi.h" @@ -1118,6 +1119,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_partition_field_(uint32_t gpu_index } } +// Switch Case Implementation for fetch_cpu_field_ function rdc_status_t RdcMetricFetcherImpl::fetch_cpu_field_(uint32_t gpu_index, rdc_field_t field_id, rdc_field_value* value) { rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); @@ -1125,17 +1127,491 @@ rdc_status_t RdcMetricFetcherImpl::fetch_cpu_field_(uint32_t gpu_index, rdc_fiel amdsmi_processor_handle processor_handle = {}; amdsmi_status_t ret = get_processor_handle_from_id(gpu_index, &processor_handle); + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Cannot get processor handle for CPU " << gpu_index); + return Smi2RdcError(ret); + } + switch (field_id) { case RDC_FI_CPU_MODEL: { amdsmi_cpu_info_t cpu_info = {}; value->status = amdsmi_get_cpu_model_name(processor_handle, &cpu_info); - memcpy(value->value.str, cpu_info.model_name, sizeof(cpu_info.model_name)); value->type = STRING; + if (value->status == AMDSMI_STATUS_SUCCESS) { + memcpy(value->value.str, cpu_info.model_name, sizeof(cpu_info.model_name)); + } + break; + } + + case RDC_FI_CPU_CORE_ENERGY: { + uint64_t energy = 0; + value->status = amdsmi_get_cpu_core_energy(processor_handle, &energy); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(energy); + } + break; + } + + case RDC_FI_CPU_SOCKET_ENERGY: { + uint64_t energy = 0; + value->status = amdsmi_get_cpu_socket_energy(processor_handle, &energy); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(energy); + } + break; + } + + case RDC_FI_CPU_THREADS_PER_CORE: { + uint32_t threads_per_core = 0; + value->status = amdsmi_get_threads_per_core(&threads_per_core); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = 
static_cast(threads_per_core); + } + break; + } + + case RDC_FI_CPU_HSMP_DRIVER_VERSION: { + amdsmi_hsmp_driver_version_t hsmp_driver_ver = {}; + value->status = amdsmi_get_cpu_hsmp_driver_version(processor_handle, &hsmp_driver_ver); + value->type = STRING; + if (value->status == AMDSMI_STATUS_SUCCESS) { + snprintf(value->value.str, sizeof(value->value.str), "%u.%u", hsmp_driver_ver.major, + hsmp_driver_ver.minor); + } + break; + } + + case RDC_FI_CPU_SMU_FW_VERSION: { + amdsmi_smu_fw_version_t smu_fw = {}; + value->status = amdsmi_get_cpu_smu_fw_version(processor_handle, &smu_fw); + value->type = STRING; + if (value->status == AMDSMI_STATUS_SUCCESS) { + snprintf(value->value.str, sizeof(value->value.str), "%u.%u.%u", smu_fw.major, smu_fw.minor, + smu_fw.debug); + } + break; + } + + case RDC_FI_CPU_HSMP_PROTO_VERSION: { + uint32_t proto_ver = 0; + value->status = amdsmi_get_cpu_hsmp_proto_ver(processor_handle, &proto_ver); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(proto_ver); + } + break; + } + + case RDC_FI_CPU_PROCHOT_STATUS: { + uint32_t prochot = 0; + value->status = amdsmi_get_cpu_prochot_status(processor_handle, &prochot); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(prochot); + } + break; + } + + case RDC_FI_CPU_FCLK_FREQUENCY: + case RDC_FI_CPU_MCLK_FREQUENCY: { + uint32_t fclk = 0, mclk = 0; + value->status = amdsmi_get_cpu_fclk_mclk(processor_handle, &fclk, &mclk); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + if (field_id == RDC_FI_CPU_FCLK_FREQUENCY) { + value->value.l_int = static_cast(fclk); + } else { + value->value.l_int = static_cast(mclk); + } + } + break; + } + + case RDC_FI_CPU_CCLK_LIMIT: { + uint32_t cclk = 0; + value->status = amdsmi_get_cpu_cclk_limit(processor_handle, &cclk); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = 
static_cast(cclk); + } + break; + } + + case RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT: + case RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC: { + uint16_t freq = 0; + char* src_type = nullptr; + value->status = + amdsmi_get_cpu_socket_current_active_freq_limit(processor_handle, &freq, &src_type); + if (field_id == RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT) { + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(freq); + } + } else { + value->type = STRING; + if (value->status == AMDSMI_STATUS_SUCCESS && src_type != nullptr) { + strncpy(value->value.str, src_type, sizeof(value->value.str) - 1); + value->value.str[sizeof(value->value.str) - 1] = '\0'; + } + } + break; + } + + case RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX: + case RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN: { + uint16_t fmax = 0, fmin = 0; + value->status = amdsmi_get_cpu_socket_freq_range(processor_handle, &fmax, &fmin); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + if (field_id == RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX) { + value->value.l_int = static_cast(fmax); + } else { + value->value.l_int = static_cast(fmin); + } + } + break; + } + + case RDC_FI_CPU_CORE_FREQ_LIMIT: { + uint32_t freq = 0; + value->status = amdsmi_get_cpu_core_current_freq_limit(processor_handle, &freq); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(freq); + } + break; + } + + case RDC_FI_CPU_CORE_BOOST_LIMIT: { + uint32_t boostlimit = 0; + value->status = amdsmi_get_cpu_core_boostlimit(processor_handle, &boostlimit); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(boostlimit); + } + break; + } + + case RDC_FI_CPU_SOCKET_C0_RESIDENCY: { + uint32_t c0_residency = 0; + value->status = amdsmi_get_cpu_socket_c0_residency(processor_handle, &c0_residency); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(c0_residency); + } 
+ break; + } + + case RDC_FI_CPU_DDR_BW_MAX_BW: + case RDC_FI_CPU_DDR_BW_UTILIZED_BW: + case RDC_FI_CPU_DDR_BW_UTILIZED_PCT: { + amdsmi_ddr_bw_metrics_t ddr_bw = {}; + value->status = amdsmi_get_cpu_ddr_bw(processor_handle, &ddr_bw); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + switch (field_id) { + case RDC_FI_CPU_DDR_BW_MAX_BW: + value->value.l_int = static_cast(ddr_bw.max_bw); + break; + case RDC_FI_CPU_DDR_BW_UTILIZED_BW: + value->value.l_int = static_cast(ddr_bw.utilized_bw); + break; + case RDC_FI_CPU_DDR_BW_UTILIZED_PCT: + value->value.l_int = static_cast(ddr_bw.utilized_pct); + break; + // no default case needed, all fields handled above + default: + value->status = AMDSMI_STATUS_INVAL; + } + } + break; + } + + case RDC_FI_CPU_SOCKET_TEMPERATURE: { + uint32_t temperature = 0; + value->status = amdsmi_get_cpu_socket_temperature(processor_handle, &temperature); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(temperature); + } + break; + } + + case RDC_FI_CPU_DIMM_TEMP_RANGE: + case RDC_FI_CPU_DIMM_REFRESH_RATE: { + // Note: This requires dimm_addr parameter, using 0 as default + uint8_t dimm_addr = 0; + amdsmi_temp_range_refresh_rate_t rate = {}; + value->status = + amdsmi_get_cpu_dimm_temp_range_and_refresh_rate(processor_handle, dimm_addr, &rate); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + if (field_id == RDC_FI_CPU_DIMM_TEMP_RANGE) { + value->value.l_int = static_cast(rate.range); + } else { + value->value.l_int = static_cast(rate.ref_rate); + } + } + break; + } + + case RDC_FI_CPU_DIMM_POWER_CONSUMPTION: { + uint8_t dimm_addr = 0; + amdsmi_dimm_power_t dimm_pow = {}; + value->status = amdsmi_get_cpu_dimm_power_consumption(processor_handle, dimm_addr, &dimm_pow); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(dimm_pow.power); + } + break; + } + + case 
RDC_FI_CPU_DIMM_THERMAL_SENSOR: { + uint8_t dimm_addr = 0; + amdsmi_dimm_thermal_t dimm_temp = {}; + value->status = amdsmi_get_cpu_dimm_thermal_sensor(processor_handle, dimm_addr, &dimm_temp); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(dimm_temp.sensor); + } + break; + } + + case RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL: { + uint8_t nbio_id = 0; + amdsmi_dpm_level_t nbio = {}; + value->status = amdsmi_get_cpu_socket_lclk_dpm_level(processor_handle, nbio_id, &nbio); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(nbio.max_dpm_level); + } + break; + } + + case RDC_FI_CPU_IO_BANDWIDTH: { + amdsmi_link_id_bw_type_t link = {}; + uint32_t io_bw = 0; + value->status = amdsmi_get_cpu_current_io_bandwidth(processor_handle, link, &io_bw); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(io_bw); + } + break; + } + + case RDC_FI_CPU_XGMI_BANDWIDTH: { + amdsmi_link_id_bw_type_t link = {}; + uint32_t xgmi_bw = 0; + value->status = amdsmi_get_cpu_current_xgmi_bw(processor_handle, link, &xgmi_bw); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(xgmi_bw); + } + break; + } + + case RDC_FI_CPU_HSMP_METRICS_VERSION: { + uint32_t metrics_version = 0; + value->status = amdsmi_get_hsmp_metrics_table_version(processor_handle, &metrics_version); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(metrics_version); + } + break; + } + + case RDC_FI_CPU_HSMP_METRICS_TABLE: { + amdsmi_hsmp_metrics_table_t metrics_table = {}; + value->status = amdsmi_get_hsmp_metrics_table(processor_handle, &metrics_table); + value->type = STRING; + if (value->status == AMDSMI_STATUS_SUCCESS) { + std::ostringstream oss; + + // Basic counters and timestamps + oss << "accumulation_counter:" << 
metrics_table.accumulation_counter << ";"; + oss << "timestamp:" << metrics_table.timestamp << ";"; + + // Temperature metrics + oss << "max_socket_temperature:" << metrics_table.max_socket_temperature << ";"; + oss << "max_vr_temperature:" << metrics_table.max_vr_temperature << ";"; + oss << "max_hbm_temperature:" << metrics_table.max_hbm_temperature << ";"; + oss << "max_socket_temperature_acc:" << metrics_table.max_socket_temperature_acc << ";"; + oss << "max_vr_temperature_acc:" << metrics_table.max_vr_temperature_acc << ";"; + oss << "max_hbm_temperature_acc:" << metrics_table.max_hbm_temperature_acc << ";"; + + // Power metrics + oss << "socket_power_limit:" << metrics_table.socket_power_limit << ";"; + oss << "max_socket_power_limit:" << metrics_table.max_socket_power_limit << ";"; + oss << "socket_power:" << metrics_table.socket_power << ";"; + + // Energy accumulators + oss << "socket_energy_acc:" << metrics_table.socket_energy_acc << ";"; + oss << "ccd_energy_acc:" << metrics_table.ccd_energy_acc << ";"; + oss << "xcd_energy_acc:" << metrics_table.xcd_energy_acc << ";"; + oss << "aid_energy_acc:" << metrics_table.aid_energy_acc << ";"; + oss << "hbm_energy_acc:" << metrics_table.hbm_energy_acc << ";"; + + // Frequency limits and current frequencies + oss << "cclk_frequency_limit:" << metrics_table.cclk_frequency_limit << ";"; + oss << "gfxclk_frequency_limit:" << metrics_table.gfxclk_frequency_limit << ";"; + oss << "fclk_frequency:" << metrics_table.fclk_frequency << ";"; + oss << "uclk_frequency:" << metrics_table.uclk_frequency << ";"; + + // Per-AID frequencies + for (int i = 0; i < 4; i++) { + oss << "socclk_frequency[" << i << "]:" << metrics_table.socclk_frequency[i] << ";"; + oss << "vclk_frequency[" << i << "]:" << metrics_table.vclk_frequency[i] << ";"; + oss << "dclk_frequency[" << i << "]:" << metrics_table.dclk_frequency[i] << ";"; + oss << "lclk_frequency[" << i << "]:" << metrics_table.lclk_frequency[i] << ";"; + } + + // Frequency 
accumulators + for (int i = 0; i < 8; i++) { + oss << "gfxclk_frequency_acc[" << i << "]:" << metrics_table.gfxclk_frequency_acc[i] + << ";"; + oss << "gfxclk_frequency[" << i << "]:" << metrics_table.gfxclk_frequency[i] << ";"; + } + + for (int i = 0; i < 96; i++) { + oss << "cclk_frequency_acc[" << i << "]:" << metrics_table.cclk_frequency_acc[i] << ";"; + } + + // Min/Max frequency support + oss << "max_cclk_frequency:" << metrics_table.max_cclk_frequency << ";"; + oss << "min_cclk_frequency:" << metrics_table.min_cclk_frequency << ";"; + oss << "max_gfxclk_frequency:" << metrics_table.max_gfxclk_frequency << ";"; + oss << "min_gfxclk_frequency:" << metrics_table.min_gfxclk_frequency << ";"; + + // Frequency tables + for (int i = 0; i < 4; i++) { + oss << "fclk_frequency_table[" << i << "]:" << metrics_table.fclk_frequency_table[i] + << ";"; + oss << "uclk_frequency_table[" << i << "]:" << metrics_table.uclk_frequency_table[i] + << ";"; + oss << "socclk_frequency_table[" << i << "]:" << metrics_table.socclk_frequency_table[i] + << ";"; + oss << "vclk_frequency_table[" << i << "]:" << metrics_table.vclk_frequency_table[i] + << ";"; + oss << "dclk_frequency_table[" << i << "]:" << metrics_table.dclk_frequency_table[i] + << ";"; + oss << "lclk_frequency_table[" << i << "]:" << metrics_table.lclk_frequency_table[i] + << ";"; + } + + // DPM ranges + oss << "max_lclk_dpm_range:" << metrics_table.max_lclk_dpm_range << ";"; + oss << "min_lclk_dpm_range:" << metrics_table.min_lclk_dpm_range << ";"; + + // XGMI metrics + oss << "xgmi_width:" << metrics_table.xgmi_width << ";"; + oss << "xgmi_bitrate:" << metrics_table.xgmi_bitrate << ";"; + + for (int i = 0; i < 8; i++) { + oss << "xgmi_read_bandwidth_acc[" << i << "]:" << metrics_table.xgmi_read_bandwidth_acc[i] + << ";"; + oss << "xgmi_write_bandwidth_acc[" << i + << "]:" << metrics_table.xgmi_write_bandwidth_acc[i] << ";"; + } + + // Utilization and bandwidth metrics + oss << "socket_c0_residency:" << 
metrics_table.socket_c0_residency << ";"; + oss << "socket_gfx_busy:" << metrics_table.socket_gfx_busy << ";"; + oss << "dram_bandwidth_utilization:" << metrics_table.dram_bandwidth_utilization << ";"; + oss << "socket_c0_residency_acc:" << metrics_table.socket_c0_residency_acc << ";"; + oss << "socket_gfx_busy_acc:" << metrics_table.socket_gfx_busy_acc << ";"; + oss << "dram_bandwidth_acc:" << metrics_table.dram_bandwidth_acc << ";"; + oss << "max_dram_bandwidth:" << metrics_table.max_dram_bandwidth << ";"; + oss << "dram_bandwidth_utilization_acc:" << metrics_table.dram_bandwidth_utilization_acc + << ";"; + + // PCIe bandwidth + for (int i = 0; i < 4; i++) { + oss << "pcie_bandwidth_acc[" << i << "]:" << metrics_table.pcie_bandwidth_acc[i] << ";"; + } + + // Throttling residency accumulators + oss << "prochot_residency_acc:" << metrics_table.prochot_residency_acc << ";"; + oss << "ppt_residency_acc:" << metrics_table.ppt_residency_acc << ";"; + oss << "socket_thm_residency_acc:" << metrics_table.socket_thm_residency_acc << ";"; + oss << "vr_thm_residency_acc:" << metrics_table.vr_thm_residency_acc << ";"; + oss << "hbm_thm_residency_acc:" << metrics_table.hbm_thm_residency_acc << ";"; + oss << "spare:" << metrics_table.spare; + + std::string result = oss.str(); + strncpy(value->value.str, result.c_str(), sizeof(value->value.str) - 1); + value->value.str[sizeof(value->value.str) - 1] = '\0'; + } break; } + + case RDC_FI_CPU_FIRST_ONLINE_CORE: { + uint32_t core_ind = 0; + value->status = amdsmi_first_online_core_on_cpu_socket(processor_handle, &core_ind); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(core_ind); + } + break; + } + + case RDC_FI_CPU_FAMILY: { + uint32_t cpu_family = 0; + value->status = amdsmi_get_cpu_family(&cpu_family); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(cpu_family); + } + break; + } + + case RDC_FI_CPU_MODEL_ID: { 
+ uint32_t cpu_model = 0; + value->status = amdsmi_get_cpu_model(&cpu_model); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(cpu_model); + } + break; + } + + case RDC_FI_CPU_CORES_PER_SOCKET: { + uint32_t sock_count = 0; + amdsmi_status_t ret_count = amdsmi_get_cpu_socket_count(&sock_count); + if (ret_count == AMDSMI_STATUS_SUCCESS && sock_count > 0) { + auto* soc_info = new amdsmi_sock_info_t[sock_count]; + value->status = amdsmi_get_cpu_cores_per_socket(sock_count, soc_info); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + // Return cores for the first socket or specific socket based on processor_handle + value->value.l_int = static_cast(soc_info[0].cores_per_socket); + } + delete[] soc_info; + } else { + value->status = ret_count; + } + break; + } + + case RDC_FI_CPU_SOCKET_COUNT: { + uint32_t sock_count = 0; + value->status = amdsmi_get_cpu_socket_count(&sock_count); + value->type = INTEGER; + if (value->status == AMDSMI_STATUS_SUCCESS) { + value->value.l_int = static_cast(sock_count); + } + break; + } + default: value->status = AMDSMI_STATUS_NOT_SUPPORTED; + RDC_LOG(RDC_DEBUG, "CPU field " << field_id << " not supported"); + break; } + return Smi2RdcError(static_cast(value->status)); } @@ -1163,6 +1639,13 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field return Smi2RdcError(ret); } + // temporarily force cpu type + // TODO: Remove once entities support CPUs + if (field_id > RDC_FI_CPU_COUNT) { + info.device_type = RDC_DEVICE_TYPE_CPU; + RDC_LOG(RDC_ERROR, "Forcing device type to CPU for field " << field_id_string(field_id)); + } + if ((field_id > RDC_FI_CPU_COUNT) && (info.device_type != RDC_DEVICE_TYPE_CPU)) { RDC_LOG(RDC_ERROR, "Fail to fetch field " << field_id << " because of incorrect device type"); return RDC_ST_NOT_SUPPORTED; diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index 95f6f4a5..50e56048 100644 
--- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -187,7 +187,45 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_BUSY_PERCENT, RDC_FI_GPU_PAGE_RETRIED, RDC_FI_DEV_ID, RDC_FI_REV_ID, RDC_FI_TARGET_GRAPHICS_VERSION, RDC_FI_NUM_OF_COMPUTE_UNITS, RDC_FI_UUID, RDC_FI_GPU_PARTITION_COUNT, - RDC_FI_CPU_COUNT + RDC_FI_CPU_COUNT, + /** TODO: Clean up duplicates + * @brief Below are AI generated from functions + */ + RDC_FI_CPU_CORE_ENERGY, //!< CPU core energy consumption (microjoules) + RDC_FI_CPU_SOCKET_ENERGY, //!< CPU socket energy consumption (microjoules) + RDC_FI_CPU_THREADS_PER_CORE, //!< Number of threads per CPU core + RDC_FI_CPU_HSMP_DRIVER_VERSION, //!< HSMP driver version + RDC_FI_CPU_SMU_FW_VERSION, //!< SMU firmware version + RDC_FI_CPU_HSMP_PROTO_VERSION, //!< HSMP protocol version + RDC_FI_CPU_PROCHOT_STATUS, //!< CPU PROCHOT status + RDC_FI_CPU_FCLK_FREQUENCY, //!< CPU fabric clock frequency (MHz) + RDC_FI_CPU_MCLK_FREQUENCY, //!< CPU memory clock frequency (MHz) + RDC_FI_CPU_CCLK_LIMIT, //!< CPU core clock limit (MHz) + RDC_FI_CPU_SOCKET_ACTIVE_FREQ_LIMIT, //!< CPU socket active frequency limit (MHz) + RDC_FI_CPU_SOCKET_FREQ_LIMIT_SRC, //!< CPU socket frequency limit source type + RDC_FI_CPU_SOCKET_FREQ_RANGE_MAX, //!< CPU socket maximum frequency range (MHz) + RDC_FI_CPU_SOCKET_FREQ_RANGE_MIN, //!< CPU socket minimum frequency range (MHz) + RDC_FI_CPU_CORE_FREQ_LIMIT, //!< CPU core current frequency limit (MHz) + RDC_FI_CPU_CORE_BOOST_LIMIT, //!< CPU core boost limit (MHz) + RDC_FI_CPU_SOCKET_C0_RESIDENCY, //!< CPU socket C0 residency percentage + RDC_FI_CPU_DDR_BW_MAX_BW, //!< CPU DDR maximum bandwidth (MB/s) + RDC_FI_CPU_DDR_BW_UTILIZED_BW, //!< CPU DDR utilized bandwidth (MB/s) + RDC_FI_CPU_DDR_BW_UTILIZED_PCT, //!< CPU DDR utilized bandwidth percentage + RDC_FI_CPU_SOCKET_TEMPERATURE, //!< CPU socket temperature (millidegrees Celsius) + RDC_FI_CPU_DIMM_TEMP_RANGE, //!< CPU DIMM 
temperature range + RDC_FI_CPU_DIMM_REFRESH_RATE, //!< CPU DIMM refresh rate + RDC_FI_CPU_DIMM_POWER_CONSUMPTION, //!< CPU DIMM power consumption (milliwatts) + RDC_FI_CPU_DIMM_THERMAL_SENSOR, //!< CPU DIMM thermal sensor temperature (millidegrees) + RDC_FI_CPU_SOCKET_LCLK_DPM_LEVEL, //!< CPU socket LCLK DPM level + RDC_FI_CPU_IO_BANDWIDTH, //!< CPU I/O bandwidth (MB/s) + RDC_FI_CPU_XGMI_BANDWIDTH, //!< CPU XGMI bandwidth (MB/s) + RDC_FI_CPU_HSMP_METRICS_VERSION, //!< HSMP metrics table version + RDC_FI_CPU_HSMP_METRICS_TABLE, //!< HSMP metrics table data + RDC_FI_CPU_FIRST_ONLINE_CORE, //!< First online core on CPU socket + RDC_FI_CPU_FAMILY, //!< CPU family identifier + RDC_FI_CPU_MODEL_ID, //!< CPU model identifier + RDC_FI_CPU_CORES_PER_SOCKET, //!< Number of CPU cores per socket + RDC_FI_CPU_SOCKET_COUNT, //!< Number of CPU sockets }; // clang-format on std::copy(fields.begin(), fields.end(), field_ids); From 278ea26937c409367dc606602f10cd7ab5c06161 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Wed, 23 Jul 2025 14:15:07 -0500 Subject: [PATCH 6/6] Apply linting fixes Signed-off-by: Galantsev, Dmitrii --- CPPLINT.cfg | 3 - include/rdc_modules/rdc_rvs/RvsBase.h | 4 +- rdc_libs/bootstrap/src/RdcBootStrap.cc | 1 + rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 34 +++++++--- rdc_libs/rdc/src/SmiUtils.cc | 21 +++--- rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc | 4 +- rdci/src/RdciDiscoverySubSystem.cc | 1 + rdci/src/RdciDmonSubSystem.cc | 70 +++++++++----------- 8 files changed, 72 insertions(+), 66 deletions(-) delete mode 100644 CPPLINT.cfg diff --git a/CPPLINT.cfg b/CPPLINT.cfg deleted file mode 100644 index b63692c6..00000000 --- a/CPPLINT.cfg +++ /dev/null @@ -1,3 +0,0 @@ -set noparent -linelength=100 -filter=-build/include_subdir,-legal/copyright,-runtime/printf,-build/c++11,-runtime/int,-build/header_guard diff --git a/include/rdc_modules/rdc_rvs/RvsBase.h b/include/rdc_modules/rdc_rvs/RvsBase.h index 913e50f5..5fb6a932 100644 --- 
a/include/rdc_modules/rdc_rvs/RvsBase.h +++ b/include/rdc_modules/rdc_rvs/RvsBase.h @@ -71,8 +71,8 @@ namespace rdc { inline amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, amdsmi_processor_handle* processor_handle) { - uint32_t socket_count; - uint32_t processor_count; + uint32_t socket_count = 0; + uint32_t processor_count = 0; auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); if (ret != AMDSMI_STATUS_SUCCESS) { return ret; diff --git a/rdc_libs/bootstrap/src/RdcBootStrap.cc b/rdc_libs/bootstrap/src/RdcBootStrap.cc index 757dd626..657584a6 100644 --- a/rdc_libs/bootstrap/src/RdcBootStrap.cc +++ b/rdc_libs/bootstrap/src/RdcBootStrap.cc @@ -164,6 +164,7 @@ rdc_status_t rdc_group_gpu_add(rdc_handle_t p_rdc_handle, rdc_gpu_group_t groupI return static_cast(p_rdc_handle)->rdc_group_gpu_add(groupId, gpuIndex); } +// TODO: rewrite get_all to allow different types rdc_status_t rdc_device_get_all(rdc_handle_t p_rdc_handle, uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES], uint32_t* count) { if (!p_rdc_handle || !count) { diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 8f527788..2badc903 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -25,8 +25,8 @@ THE SOFTWARE. 
#include #include -#include #include +#include #include #include #include @@ -88,8 +88,8 @@ RdcMetricFetcherImpl::~RdcMetricFetcherImpl() { } uint64_t RdcMetricFetcherImpl::now() { - struct timeval tv {}; - gettimeofday(&tv, NULL); + struct timeval tv{}; + gettimeofday(&tv, nullptr); return static_cast(tv.tv_sec) * 1000 + tv.tv_usec / 1000; } @@ -299,7 +299,7 @@ void RdcMetricFetcherImpl::get_pcie_throughput(const RdcFieldKey& key) { uint64_t curTime = now(); MetricValue value{}; - value.cache_ttl = 30 * 1000; // cache 30 seconds + value.cache_ttl = static_cast(30) * 1000; // cache 30 seconds value.value.type = INTEGER; do { std::lock_guard guard(task_mutex_); @@ -370,7 +370,7 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields( auto ite = bulk_fields.begin(); for (; ite != bulk_fields.end(); ite++) { amdsmi_gpu_metrics_t gpu_metrics; - amdsmi_processor_handle processor_handle; + amdsmi_processor_handle processor_handle = nullptr; rs = get_processor_handle_from_id(ite->first, &processor_handle); rs = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); @@ -758,7 +758,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_field_(uint32_t gpu_index, rdc_fiel constexpr uint32_t kUTILIZATION_COUNTERS(1); amdsmi_utilization_counter_t utilization_counters[kUTILIZATION_COUNTERS]; utilization_counters[0].type = AMDSMI_COARSE_DECODER_ACTIVITY; - uint64_t timestamp; + uint64_t timestamp = 0; value->status = amdsmi_get_utilization_count(processor_handle, utilization_counters, kUTILIZATION_COUNTERS, ×tamp); @@ -1115,6 +1115,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_gpu_partition_field_(uint32_t gpu_index // TODO: All other fields => N/A for partition IN AMDSMI // RDC_LOG(RDC_DEBUG, "Partition " << gpu_index << ": Field " << field_id_string(field_id) // << " not supported => NO_DATA."); + return RDC_ST_NOT_SUPPORTED; break; } } @@ -1132,6 +1133,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_cpu_field_(uint32_t gpu_index, rdc_fiel return Smi2RdcError(ret); } + 
processor_type_t processor_type = AMDSMI_PROCESSOR_TYPE_UNKNOWN; + ret = amdsmi_get_processor_type(processor_handle, &processor_type); + + if (ret != AMDSMI_STATUS_SUCCESS) { + RDC_LOG(RDC_ERROR, "Cannot get processor type for CPU " << gpu_index); + return Smi2RdcError(ret); + } + + RDC_LOG(RDC_DEBUG, "Processor type for CPU " << gpu_index << ": " << processor_type); + switch (field_id) { case RDC_FI_CPU_MODEL: { amdsmi_cpu_info_t cpu_info = {}; @@ -1642,8 +1653,9 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field // temporarily force cpu type // TODO: Remove once entities support CPUs if (field_id > RDC_FI_CPU_COUNT) { + RDC_LOG(RDC_ERROR, "Forcing device type to CPU for field " + << field_id_string(field_id) << " current type is " << info.device_type); info.device_type = RDC_DEVICE_TYPE_CPU; - RDC_LOG(RDC_ERROR, "Forcing device type to CPU for field " << field_id_string(field_id)); } if ((field_id > RDC_FI_CPU_COUNT) && (info.device_type != RDC_DEVICE_TYPE_CPU)) { @@ -1769,7 +1781,7 @@ rdc_status_t RdcMetricFetcherImpl::delete_smi_handle(RdcFieldKey fk) { case RDC_EVNT_XGMI_3_THRPUT: case RDC_EVNT_XGMI_4_THRPUT: case RDC_EVNT_XGMI_5_THRPUT: { - amdsmi_event_handle_t h; + amdsmi_event_handle_t h = 0L; if (smi_data_.find(fk) == smi_data_.end()) { return RDC_ST_NOT_SUPPORTED; } @@ -1802,8 +1814,8 @@ rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) { rdc_status_t ret = RDC_ST_OK; auto get_evnt_handle = [&](amdsmi_event_group_t grp) { - amdsmi_event_handle_t handle; - rdc_status_t result; + amdsmi_event_handle_t handle = 0L; + rdc_status_t result = RDC_ST_UNKNOWN_ERROR; if (get_smi_data(fk) != nullptr) { // This event has already been initialized. @@ -1815,7 +1827,7 @@ rdc_status_t RdcMetricFetcherImpl::acquire_smi_handle(RdcFieldKey fk) { RDC_LOG(RDC_ERROR, "Failed to init SMI counter. 
Return:" << result); return result; } - auto fsh = std::shared_ptr(new FieldSMIData); + auto fsh = std::make_shared(); if (fsh == nullptr) { return RDC_ST_INSUFF_RESOURCES; diff --git a/rdc_libs/rdc/src/SmiUtils.cc b/rdc_libs/rdc/src/SmiUtils.cc index 2fb5023d..2f92d15e 100644 --- a/rdc_libs/rdc/src/SmiUtils.cc +++ b/rdc_libs/rdc/src/SmiUtils.cc @@ -28,7 +28,6 @@ THE SOFTWARE. #include "amd_smi/amdsmi.h" #include "rdc/rdc.h" -#include "rdc_lib/RdcLogger.h" namespace amd { namespace rdc { @@ -108,13 +107,13 @@ amdsmi_status_t get_processor_handle_from_id(uint32_t gpu_id, return ret; } - for (auto& proc : procs) { - processor_type_t proc_type = {}; - ret = amdsmi_get_processor_type(proc, &proc_type); - if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { - return AMDSMI_STATUS_NOT_SUPPORTED; - } - } + // for (auto& proc : procs) { + // processor_type_t proc_type = {}; + // ret = amdsmi_get_processor_type(proc, &proc_type); + // if (proc_type != AMDSMI_PROCESSOR_TYPE_AMD_GPU) { + // return AMDSMI_STATUS_NOT_SUPPORTED; + // } + // } procs_by_socket[s] = procs; } @@ -169,7 +168,7 @@ amdsmi_status_t get_gpu_id_from_processor_handle(amdsmi_processor_handle process amdsmi_status_t get_processor_count(uint32_t& all_processor_count) { uint32_t total_processor_count = 0; - uint32_t socket_count; + uint32_t socket_count = 0; auto ret = amdsmi_get_socket_handles(&socket_count, nullptr); if (ret != AMDSMI_STATUS_SUCCESS) { return ret; @@ -177,7 +176,7 @@ amdsmi_status_t get_processor_count(uint32_t& all_processor_count) { std::vector sockets(socket_count); ret = amdsmi_get_socket_handles(&socket_count, sockets.data()); for (auto& socket : sockets) { - uint32_t processor_count; + uint32_t processor_count = 0; ret = amdsmi_get_processor_handles(socket, &processor_count, nullptr); if (ret != AMDSMI_STATUS_SUCCESS) { return ret; @@ -234,7 +233,7 @@ amdsmi_status_t get_metrics_info(amdsmi_processor_handle proc, amdsmi_gpu_metric amdsmi_status_t get_num_partition(uint32_t index, 
uint16_t* num_partition) { // Get the processor handle for the physical device. - amdsmi_processor_handle proc_handle; + amdsmi_processor_handle proc_handle = nullptr; amdsmi_status_t ret = get_processor_handle_from_id(index, &proc_handle); if (ret != AMDSMI_STATUS_SUCCESS) { return ret; diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 1576f0f1..bf2a9193 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -35,7 +35,6 @@ THE SOFTWARE. #include #include #include -#include #include #include @@ -45,7 +44,6 @@ THE SOFTWARE. #include "rdc_lib/RdcLogger.h" #include "rdc_lib/RdcTelemetryLibInterface.h" #include "rdc_lib/impl/SmiUtils.h" -#include "rdc_lib/rdc_common.h" #include "rdc_modules/rdc_rocp/RdcRocpCounterSampler.h" namespace amd { @@ -157,6 +155,7 @@ const char* RdcRocpBase::get_field_id_from_name(rdc_field_t field) { const std::vector RdcRocpBase::get_field_ids() { std::vector field_ids; + field_ids.reserve(field_to_metric.size()); for (auto& [k, v] : field_to_metric) { field_ids.push_back(k); } @@ -297,6 +296,7 @@ void RdcRocpBase::init_rocp_if_not() { } // populate fields + all_fields.reserve(temp_field_map_k.size()); for (const auto& [k, v] : temp_field_map_k) { all_fields.emplace_back(v); } diff --git a/rdci/src/RdciDiscoverySubSystem.cc b/rdci/src/RdciDiscoverySubSystem.cc index c58d5a2d..6dd8384d 100644 --- a/rdci/src/RdciDiscoverySubSystem.cc +++ b/rdci/src/RdciDiscoverySubSystem.cc @@ -111,6 +111,7 @@ void RdciDiscoverySubSystem::show_help() const { void RdciDiscoverySubSystem::show_attributes() { uint32_t gpu_index_list[RDC_MAX_NUM_DEVICES]; uint32_t count = 0; + rdc_status_t result = rdc_device_get_all(rdc_handle_, gpu_index_list, &count); if (result != RDC_ST_OK) { throw RdcException(result, "Fail to get device information"); diff --git a/rdci/src/RdciDmonSubSystem.cc b/rdci/src/RdciDmonSubSystem.cc index 0cb9d108..0a0f383c 100644 
--- a/rdci/src/RdciDmonSubSystem.cc +++ b/rdci/src/RdciDmonSubSystem.cc @@ -26,7 +26,6 @@ THE SOFTWARE. #include #include -#include #include #include #include @@ -35,14 +34,12 @@ THE SOFTWARE. #include #include #include -#include #include #include "common/rdc_fields_supported.h" #include "common/rdc_utils.h" #include "rdc/rdc.h" #include "rdc_lib/RdcException.h" -#include "rdc_lib/rdc_common.h" namespace amd { namespace rdc { @@ -166,16 +163,15 @@ void RdciDmonSubSystem::parse_cmd_opts(int argc, char** argv) { throw RdcException(RDC_ST_BAD_PARAMETER, "Need to specify the fields or field group id"); } else { std::vector vec_ids = split_string(field_ids, ','); - for (uint32_t i = 0; i < vec_ids.size(); i++) { - if (!IsNumber(vec_ids[i])) { + for (const auto& vec_id : vec_ids) { + if (!IsNumber(vec_id)) { rdc_field_t field_id = RDC_FI_INVALID; - if (!amd::rdc::get_field_id_from_name(vec_ids[i], &field_id)) { - throw RdcException(RDC_ST_BAD_PARAMETER, - "The field name " + vec_ids[i] + " is not valid"); + if (!amd::rdc::get_field_id_from_name(vec_id, &field_id)) { + throw RdcException(RDC_ST_BAD_PARAMETER, "The field name " + vec_id + " is not valid"); } field_ids_.push_back(field_id); } else { - field_ids_.push_back(static_cast(std::stoi(vec_ids[i]))); + field_ids_.push_back(static_cast(std::stoi(vec_id))); } } } @@ -259,7 +255,7 @@ void RdciDmonSubSystem::create_temp_group() { } const std::string group_name("rdci-dmon-group"); - rdc_gpu_group_t group_id; + rdc_gpu_group_t group_id = 0; rdc_status_t result = rdc_group_gpu_create(rdc_handle_, RDC_GROUP_EMPTY, group_name.c_str(), &group_id); if (result != RDC_ST_OK) { @@ -267,10 +263,10 @@ void RdciDmonSubSystem::create_temp_group() { } need_cleanup_ = true; - for (uint32_t i = 0; i < gpu_indexes_.size(); i++) { - result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_indexes_[i]); + for (unsigned int gpu_index : gpu_indexes_) { + result = rdc_group_gpu_add(rdc_handle_, group_id, gpu_index); if (result != RDC_ST_OK) { - 
rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_indexes_[i]); + rdc_entity_info_t info = rdc_get_info_from_entity_index(gpu_index); std::string info_str; if (info.entity_role == RDC_DEVICE_ROLE_PARTITION_INSTANCE) { info_str = @@ -290,7 +286,7 @@ void RdciDmonSubSystem::create_temp_field_group() { } const std::string field_group_name("rdci-dmon-field-group"); - rdc_field_grp_t group_id; + rdc_field_grp_t group_id = 0; rdc_field_t field_ids[RDC_MAX_FIELD_IDS_PER_FIELD_GROUP]; for (uint32_t i = 0; i < field_ids_.size(); i++) { field_ids[i] = field_ids_[i]; @@ -321,12 +317,12 @@ void RdciDmonSubSystem::resolve_gpu_indexes() { } std::vector vec_ids = split_string(raw_gpu_indexes_, ','); - for (uint32_t i = 0; i < vec_ids.size(); i++) { - if (rdc_is_partition_string(vec_ids[i].c_str())) { - uint32_t logicalPhysicalGpu; - uint32_t partition; - if (!rdc_parse_partition_string(vec_ids[i].c_str(), &logicalPhysicalGpu, &partition)) { - throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid partition format: " + vec_ids[i]); + for (const auto& vec_id : vec_ids) { + if (rdc_is_partition_string(vec_id.c_str())) { + uint32_t logicalPhysicalGpu = 0; + uint32_t partition = 0; + if (!rdc_parse_partition_string(vec_id.c_str(), &logicalPhysicalGpu, &partition)) { + throw RdcException(RDC_ST_BAD_PARAMETER, "Invalid partition format: " + vec_id); } if (logicalPhysicalGpu >= count) { @@ -365,16 +361,16 @@ void RdciDmonSubSystem::resolve_gpu_indexes() { phys_info.device_type = RDC_DEVICE_TYPE_GPU; uint32_t phys_entity_index = rdc_get_entity_index_from_info(phys_info); gpu_indexes_.push_back(phys_entity_index); - } else if (IsNumber(vec_ids[i])) { - uint32_t logicalIndex = std::stoi(vec_ids[i]); + } else if (IsNumber(vec_id)) { + uint32_t logicalIndex = std::stoi(vec_id); if (logicalIndex >= count) { throw RdcException(RDC_ST_BAD_PARAMETER, "GPU " + std::to_string(logicalIndex) + " is out of range"); } - gpu_indexes_.push_back(std::stoi(vec_ids[i])); + 
gpu_indexes_.push_back(std::stoi(vec_id)); } else { - throw RdcException(RDC_ST_BAD_PARAMETER, "The GPU index " + vec_ids[i] + - " needs to be a number or a valid partition"); + throw RdcException(RDC_ST_BAD_PARAMETER, + "The GPU index " + vec_id + " needs to be a number or a valid partition"); } } } @@ -383,9 +379,9 @@ void RdciDmonSubSystem::show_field_usage() const { std::cout << "Supported fields Ids:" << std::endl; amd::rdc::fld_id2name_map_t& field_id_to_descript = amd::rdc::get_field_id_description_from_id(); - for (auto i = field_id_to_descript.begin(); i != field_id_to_descript.end(); i++) { - if (i->second.do_display || dmon_ops_ == DMON_LIST_ALL_FIELDS) { - std::cout << i->first << " " << i->second.enum_name << " : " << i->second.description << "." + for (const auto& i : field_id_to_descript) { + if (i.second.do_display || dmon_ops_ == DMON_LIST_ALL_FIELDS) { + std::cout << i.first << " " << i.second.enum_name << " : " << i.second.description << "." << std::endl; } } @@ -428,10 +424,10 @@ typedef std::priority_queue, Compa static void collect_new_notifs(rdc_handle_t h, const rdc_group_info_t& group_info, const std::vector& notif_fields, std::vector* notif_ts, field_pq_t* notif_pq) { - rdc_status_t ret; + rdc_status_t ret = RDC_ST_UNKNOWN_ERROR; notif_dev_value value; std::string error_msg; - uint64_t next_ts; + uint64_t next_ts = 0; assert(notif_ts != nullptr); @@ -461,7 +457,7 @@ static void collect_new_notifs(rdc_handle_t h, const rdc_group_info_t& group_inf // ts is milliseconds static std::string ts_string(const time_t ts) { - struct tm* timeinfo; + struct tm* timeinfo = nullptr; time_t tmp_ts = ts / 1000; std::string ret; @@ -504,7 +500,7 @@ void RdciDmonSubSystem::process() { return; } - rdc_status_t result; + rdc_status_t result = RDC_ST_UNKNOWN_ERROR; rdc_group_info_t group_info; rdc_field_group_info_t field_info; @@ -549,9 +545,9 @@ void RdciDmonSubSystem::process() { // keep extra 1 minute data double max_keep_age = options_[OPTIONS_DELAY] / 
1000.0 + 60; const int max_keep_samples = 10; // keep only 10 samples - result = - rdc_field_watch(rdc_handle_, options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], - options_[OPTIONS_DELAY] * 1000, max_keep_age, max_keep_samples); + result = rdc_field_watch( + rdc_handle_, options_[OPTIONS_GROUP_ID], options_[OPTIONS_FIELD_GROUP_ID], + static_cast<uint64_t>(options_[OPTIONS_DELAY]) * 1000, max_keep_age, max_keep_samples); need_cleanup_ = true; std::stringstream ss; @@ -570,8 +566,8 @@ void RdciDmonSubSystem::process() { ss << std::left << std::setw(25) << "TIMESTAMP"; ss << " "; } - for (uint32_t findex = 0; findex < reg_fields.size(); findex++) { - ss << std::left << std::setw(20) << field_id_string(reg_fields[findex]); + for (auto& reg_field : reg_fields) { + ss << std::left << std::setw(20) << field_id_string(reg_field); } ss << std::endl;