From a729bdee3b68f99179394e9b249f1f876111259c Mon Sep 17 00:00:00 2001
From: moosecrap
Date: Sat, 6 Dec 2025 14:23:36 -0800
Subject: [PATCH] Added DCGM metrics for those with Quadro cards. Updated
 include paths for certain CUDA installations

---
Review notes:
 * nvidia-dcgm.c: "%.1f%\n" was an invalid printf conversion (now "%%");
   readings of exactly 100% were discarded; any int64 field (including
   blank samples) was taken as the clock reasons bitmask; init_dcgm() kept
   going with an unusable handle after a failure.
 * nvidia-dcgm.h: added an include guard and a final newline; the static
   tables moved into nvidia-dcgm.c.
 * The header names of the bare "#include" lines were lost before review.
   In the new files they were restored from usage (stdio.h, dcgm_agent.h,
   dcgm_structs.h) - please confirm. The first context line of the first
   evga-icx.c hunk is still a bare "#include" and must be restored from the
   target tree before this patch will apply.
 * The index ids of nvidia-dcgm.c and nvidia-dcgm.h predate these changes.

 Makefile         |  10 +++-
 board-sensors.h  |   6 +--
 evga-icx.c       |  20 +++-
 nvidia-dcgm.c    | 245 +++++++++++++++++++++++++++++++++++++++++++++++
 nvidia-dcgm.h    |  25 +++++
 nvidia-sensors.c |   4 +-
 6 files changed, 301 insertions(+), 9 deletions(-)
 create mode 100644 nvidia-dcgm.c
 create mode 100644 nvidia-dcgm.h

diff --git a/Makefile b/Makefile
index 63170e7..9e4235c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,14 +2,20 @@ OBJS = evga-icx.o evga-card.o icx3.o board-sensors.o zen3-rapl.o
 
 LDLIBS = -li2c -lm
 
-CFLAGS = -MD
+CFLAGS = -MD -I/usr/local/cuda/targets/x86_64-linux/include/
 
 ifdef USE_NVML
-LDLIBS += -lnvidia-ml
+LDLIBS += -lnvidia-ml -L/usr/local/cuda/targets/x86_64-linux/lib/stubs/
 CFLAGS += -DUSE_NVML
 OBJS += nvidia-sensors.o
 endif
 
+ifdef USE_DCGM
+LDLIBS += -ldcgm
+CFLAGS += -DUSE_DCGM -I/usr/include/datacenter-gpu-manager-4/
+OBJS += nvidia-dcgm.o
+endif
+
 ifdef USE_LIBPCI
 LDLIBS += -lpci
 CFLAGS += -DUSE_LIBPCI
diff --git a/board-sensors.h b/board-sensors.h
index b890017..4812cda 100644
--- a/board-sensors.h
+++ b/board-sensors.h
@@ -32,10 +32,10 @@ static struct hwmon_sensor hwmon_sensor_info[] =
     {"asusec", "temp1", "Motherboard ", "CHIP", "°C", 1000.0, -40.0 }, /* Chipset */
     {"asusec", "temp5", "Motherboard ", "VRM", "°C", 1000.0, -40.0 }, /* VRM */
     {"asusec", "temp3", "", "MOBO", "°C", 1000.0, -40.0 }, /* Motherboard */
-    {"asusec", "temp4", "Motherboard ", "SENS", "°C", 1000.0, -40.0 }, /* T_Sensor */
-    {"asusec", "temp6", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_In */
-    {"asusec", "temp7", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_Out */
     {"nvme", "temp1", "NVMe ", "NVME", "°C", 1000.0, -40.0 }, /* NVME Composite */
+    {"asusec", "temp6", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_In */
+    {"asusec", "temp4", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* T_Sensor */
+    {"asusec", "temp7", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_Out */
     {"nct6798", "fan2", "CPU fan", "CPU", "%", 15.0, 0.0 }, /* cpu_fan, cpu mid */
     {"asusec", "fan1", "", "CPU", "%", 15.0, 0.0 }, /* cpu_opt, cpu front */
     {"nct6798", "fan5", "H amp", "CHA", "%", 12.0, 0.0 }, /* h_amp, front fan */
diff --git a/evga-icx.c b/evga-icx.c
index 52bd43a..314049c 100644
--- a/evga-icx.c
+++ b/evga-icx.c
@@ -6,11 +6,15 @@
 #include
 
 #ifdef USE_NVML
-#include "nvidia-sensors.h"
+    #include "nvidia-sensors.h"
+#endif
+
+#ifdef USE_DCGM
+    #include "nvidia-dcgm.h"
 #endif
 
 #ifdef USE_LIBPCI
-#include "gddr6.h"
+    #include "gddr6.h"
 #endif
 
 #include "icx3.h"
@@ -153,7 +157,6 @@ int main (int argc, char **argv)
         return -1;
     } else if (gpu_count == 0) {
         printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
-        return -1;
     }
     /* Check for invalid GPUs */
     if (gpu_num > gpu_count - 1) {
@@ -195,6 +198,10 @@ int main (int argc, char **argv)
     }
 #endif
 
+#ifdef USE_DCGM
+    init_dcgm();
+#endif
+
     /* PCI init for VRAM/hotspot temps */
 #ifdef USE_LIBPCI
     for (int i = 0; i < gpu_count; i++)
@@ -209,6 +216,10 @@ int main (int argc, char **argv)
 
         if (print_board_sensors)
             print_board_info(board_sensors, board_sensor_count);
+
+#ifdef USE_DCGM
+        print_dcgm(compact, overwrite);
+#endif
 
         if (gpu_num == -1) {
             /* No GPU specified on command line, loop over all supported GPUs */
@@ -238,6 +249,9 @@ int main (int argc, char **argv)
 #ifdef USE_NVML
     nvmlShutdown();
 #endif
+#ifdef USE_DCGM
+    shutdown_dcgm();
+#endif
 }
 
 void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)
diff --git a/nvidia-dcgm.c b/nvidia-dcgm.c
new file mode 100644
index 0000000..4946b57
--- /dev/null
+++ b/nvidia-dcgm.c
@@ -0,0 +1,245 @@
+/* DCGM-based utilization metrics and clock event reasons. */
+
+#include <stdio.h>
+
+#include "nvidia-dcgm.h"
+
+/* Yeah I know this is duplicated */
+#define HEADER_COLOR_START "\x1b[36m"
+#define HEADER_COLOR_END "\x1b[39m"
+
+#define ARRAY_COUNT(a) (sizeof(a) / sizeof((a)[0]))
+
+/* A watched floating-point field and the latest value read for it. */
+struct dcgm_metric {
+    unsigned short field_id;
+    const char *short_name;   /* label in compact output */
+    const char *long_name;    /* label in verbose output */
+    double last_reading;      /* ratio; printed multiplied by 100 */
+};
+
+/* Defined here, not in the header, so includers do not each get a copy. */
+static struct dcgm_metric dcgm_metrics[] =
+{
+    {DCGM_FI_PROF_GR_ENGINE_ACTIVE, "GFX", "Graphics Engine Activity"},
+    {DCGM_FI_PROF_SM_ACTIVE, "SM ACT", "SM Activity"},
+    {DCGM_FI_PROF_SM_OCCUPANCY, "OCC", "SM Occupancy"},
+    {DCGM_FI_PROF_PIPE_FP16_ACTIVE, "FP16", "FP16 Engine Activity"},
+    {DCGM_FI_PROF_PIPE_FP32_ACTIVE, "FP32", "FP32 Engine Activity"},
+    {DCGM_FI_PROF_PIPE_FP64_ACTIVE, "FP64", "FP64 Engine Activity"},
+    {DCGM_FI_PROF_PIPE_INT_ACTIVE, "INT", "Integer Activity"},
+    {DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, "TENS", "Tensor Activity"},
+    {DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE, "HMMA", "Half Precision Tensor"},
+    {DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE, "IMMA", "Integer Tensor"},
+    {DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE, "DFMA", "Double Precision Tensor"},
+    {DCGM_FI_PROF_DRAM_ACTIVE, "DRAM", "Memory BW Utilization"}
+};
+
+/* Display names for one bit of the clock event reasons bitmask. */
+struct dcgm_clock_reason {
+    unsigned long long mask;
+    const char *short_name;
+    const char *long_name;
+};
+
+static const struct dcgm_clock_reason dcgm_clock_reason_names[] =
+{
+    {DCGM_CLOCKS_EVENT_REASON_GPU_IDLE, "Idle", "GPU idle"},
+    {DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING, "AppClk", "Application clocks"},
+    {DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP, "Pwr", "Power cap"},
+    {DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN, "HWSlow", "Hardware slowdown"},
+    {DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST, "Sync", "Sync boost"},
+    {DCGM_CLOCKS_EVENT_REASON_SW_THERMAL, "SWTherm", "Software thermal"},
+    {DCGM_CLOCKS_EVENT_REASON_HW_THERMAL, "HWTherm", "Hardware thermal"},
+    {DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE, "HWPower", "Hardware power brake"},
+    {DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS, "DispClk", "Display clock"}
+};
+
+dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
+dcgmFieldGrp_t field_group_id;
+
+unsigned long long clock_reasons = 0;
+
+/* Non-zero once init_dcgm() has fully succeeded; gates printing and shutdown. */
+static int dcgm_ready = 0;
+
+/*
+ * Initialise DCGM, start the embedded engine and watch the metric fields
+ * plus the clock event reasons field.
+ *
+ * On failure the error is printed, whatever was already started is shut
+ * down again, and print_dcgm()/shutdown_dcgm() do nothing.
+ */
+void init_dcgm(void)
+{
+    dcgmReturn_t result;
+    unsigned short field_ids[ARRAY_COUNT(dcgm_metrics) + 1];
+    size_t i;
+
+    /* Init DCGM */
+    result = dcgmInit();
+    if (result != DCGM_ST_OK) {
+        printf("Could not init DCGM: %s\n", errorString(result));
+        return;
+    }
+
+    /* Start embedded */
+    result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
+    if (result != DCGM_ST_OK) {
+        printf("Could not start embedded DCGM engine: %s\n", errorString(result));
+        goto fail_shutdown;
+    }
+
+    /* Create metric group: every metric, then the clock reasons bitmask */
+    for (i = 0; i < ARRAY_COUNT(dcgm_metrics); i++) {
+        field_ids[i] = dcgm_metrics[i].field_id;
+        dcgm_metrics[i].last_reading = 0.0;
+    }
+    field_ids[i] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;
+    result = dcgmFieldGroupCreate(dcgm_handle, (int)ARRAY_COUNT(field_ids), field_ids, (char *)"evga_icx_fields", &field_group_id);
+    if (result != DCGM_ST_OK) {
+        printf("Could not create DCGM metric group: %s\n", errorString(result));
+        goto fail_stop;
+    }
+
+    /* Start recording */
+    result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, 1000000, 30, 1);
+    if (result != DCGM_ST_OK) {
+        printf("Could not record DCGM metrics: %s\n", errorString(result));
+        goto fail_stop;
+    }
+
+    dcgm_ready = 1;
+    return;
+
+fail_stop:
+    dcgmStopEmbedded(dcgm_handle);
+fail_shutdown:
+    dcgmShutdown();
+}
+
+/* Stop the embedded engine and release DCGM, if init_dcgm() succeeded. */
+void shutdown_dcgm(void)
+{
+    if (!dcgm_ready)
+        return;
+
+    dcgm_ready = 0;
+    dcgmStopEmbedded(dcgm_handle);
+    dcgmShutdown();
+}
+
+/*
+ * Refresh the watched fields and print them.
+ *
+ * compact:   non-zero prints one line with coloured short names; zero prints
+ *            a "name: value%" line per metric.
+ * overwrite: non-zero moves the cursor back to column 1 afterwards.
+ *
+ * If the refresh fails the error is printed and the previous readings are
+ * shown again.
+ */
+void print_dcgm(int compact, int overwrite)
+{
+    dcgmReturn_t result;
+    int single_reason = 1;
+    size_t i;
+
+    if (!dcgm_ready)
+        return;
+
+    result = dcgmUpdateAllFields(dcgm_handle, 1);
+    if (result == DCGM_ST_OK)
+        result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
+    if (result != DCGM_ST_OK)
+        printf("Could not read DCGM metrics: %s\n", errorString(result));
+
+    /* Print the float metrics */
+    for (i = 0; i < ARRAY_COUNT(dcgm_metrics); i++) {
+        if (compact)
+            printf("%s%s%s%s %4.1f",
+                   i == 0 ? "" : " ",
+                   HEADER_COLOR_START,
+                   dcgm_metrics[i].short_name,
+                   HEADER_COLOR_END,
+                   dcgm_metrics[i].last_reading * 100);
+        else
+            printf("%s: %.1f%%\n", dcgm_metrics[i].long_name, dcgm_metrics[i].last_reading * 100);
+    }
+    if (compact)
+        printf("%%");
+
+    /* Print clock reasons */
+    if (compact)
+        printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
+    else
+        printf("Clock reasons: ");
+
+    for (i = 0; i < ARRAY_COUNT(dcgm_clock_reason_names); i++) {
+        if (!(clock_reasons & dcgm_clock_reason_names[i].mask))
+            continue;
+
+        /* Separate every reason after the first */
+        if (!single_reason)
+            printf("%s", compact ? "," : ", ");
+        single_reason = 0;
+
+        if (compact)
+            printf("%s", dcgm_clock_reason_names[i].short_name);
+        else
+            printf("%s", dcgm_clock_reason_names[i].long_name);
+    }
+
+    if (single_reason)
+        printf("None");
+
+    if (!compact)
+        printf(" (0x%llx)", clock_reasons);
+    else
+        printf("%-15s", " ");
+
+    if (overwrite)
+        printf("\x1b[1G");
+}
+
+/*
+ * Callback for dcgmGetLatestValues(): copies the reported values into the
+ * metric table and clock_reasons. Always returns 0.
+ *
+ * Values with an error status or DCGM's blank marker, and ratios above 1.0,
+ * are stored as 0.
+ *
+ * NOTE(review): readings are not kept per GPU; if this runs for several GPUs
+ * the values seen last win - confirm that is acceptable.
+ */
+int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
+{
+    (void)gpu_id;
+    (void)userdata;
+
+    for (int j = 0; j < num_values; j++) {
+        const dcgmFieldValue_v1 *value = &values[j];
+        int usable = (value->status == DCGM_ST_OK);
+
+        if (value->fieldType == DCGM_FT_DOUBLE) {
+            /* A double reading so read it into our readings array */
+            double ratio = value->value.dbl;
+
+            /* Exactly 1.0 (100%) is a valid reading, so only reject above it */
+            if (!usable || DCGM_FP64_IS_BLANK(ratio) || ratio > 1.0)
+                ratio = 0.0;
+
+            for (size_t i = 0; i < ARRAY_COUNT(dcgm_metrics); i++) {
+                if (dcgm_metrics[i].field_id == value->fieldId)
+                    dcgm_metrics[i].last_reading = ratio;
+            }
+        } else if (value->fieldType == DCGM_FT_INT64 && value->fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
+            /* Only this field is the clock reasons bitmask */
+            if (!usable || DCGM_INT64_IS_BLANK(value->value.i64))
+                clock_reasons = 0;
+            else
+                clock_reasons = (unsigned long long)value->value.i64;
+        }
+    }
+    return 0;
+}
diff --git a/nvidia-dcgm.h b/nvidia-dcgm.h
new file mode 100644
index 0000000..ffd049d
--- /dev/null
+++ b/nvidia-dcgm.h
@@ -0,0 +1,25 @@
+/* DCGM-based utilization metrics and clock event reasons. */
+#ifndef NVIDIA_DCGM_H
+#define NVIDIA_DCGM_H
+
+#include <dcgm_agent.h>
+#include <dcgm_structs.h>
+
+#include "evga-card.h"
+
+/* Start the embedded DCGM engine and begin watching the metric fields. */
+void init_dcgm(void);
+
+/* Stop the embedded DCGM engine again; does nothing if init_dcgm() failed. */
+void shutdown_dcgm(void);
+
+/*
+ * Refresh and print the metrics and clock event reasons. compact selects the
+ * one-line format; overwrite returns the cursor to column 1 afterwards.
+ */
+void print_dcgm(int compact, int overwrite);
+
+/* Callback handed to dcgmGetLatestValues(); stores the values it is given. */
+int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata);
+
+#endif /* NVIDIA_DCGM_H */
diff --git a/nvidia-sensors.c b/nvidia-sensors.c
index e8855ed..b098681 100644
--- a/nvidia-sensors.c
+++ b/nvidia-sensors.c
@@ -36,7 +36,7 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
             single_reason = 0;
 
             if (compact)
-                printf("%-15s", clock_reason_names[i].short_name);
+                printf("%s", clock_reason_names[i].short_name);
             else
                 printf("%s", clock_reason_names[i].long_name);
         }
@@ -47,6 +47,8 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
 
     if (!compact)
         printf(" (0x%llx)", reasons);
+    else
+        printf("%-15s", " ");
 }
 
 unsigned int get_nvml_temp(struct card_info *card)