Added DCGM metrics for those with Quadro cards.

Updated include paths for certain CUDA installations
This commit is contained in:
moosecrap 2025-12-06 14:23:36 -08:00
parent af2ceec405
commit a729bdee3b
6 changed files with 212 additions and 9 deletions

View File

@ -2,14 +2,20 @@
OBJS = evga-icx.o evga-card.o icx3.o board-sensors.o zen3-rapl.o
LDLIBS = -li2c -lm
CFLAGS = -MD
CFLAGS = -MD -I/usr/local/cuda/targets/x86_64-linux/include/
ifdef USE_NVML
LDLIBS += -lnvidia-ml
LDLIBS += -lnvidia-ml -L/usr/local/cuda/targets/x86_64-linux/lib/stubs/
CFLAGS += -DUSE_NVML
OBJS += nvidia-sensors.o
endif
ifdef USE_DCGM
LDLIBS += -ldcgm
CFLAGS += -DUSE_DCGM -I/usr/include/datacenter-gpu-manager-4/
OBJS += nvidia-dcgm.o
endif
ifdef USE_LIBPCI
LDLIBS += -lpci
CFLAGS += -DUSE_LIBPCI

View File

@ -32,10 +32,10 @@ static struct hwmon_sensor hwmon_sensor_info[] =
{"asusec", "temp1", "Motherboard ", "CHIP", "°C", 1000.0, -40.0 }, /* Chipset */
{"asusec", "temp5", "Motherboard ", "VRM", "°C", 1000.0, -40.0 }, /* VRM */
{"asusec", "temp3", "", "MOBO", "°C", 1000.0, -40.0 }, /* Motherboard */
{"asusec", "temp4", "Motherboard ", "SENS", "°C", 1000.0, -40.0 }, /* T_Sensor */
{"asusec", "temp6", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_In */
{"asusec", "temp7", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_Out */
{"nvme", "temp1", "NVMe ", "NVME", "°C", 1000.0, -40.0 }, /* NVME Composite */
{"asusec", "temp6", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_In */
{"asusec", "temp4", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* T_Sensor */
{"asusec", "temp7", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_Out */
{"nct6798", "fan2", "CPU fan", "CPU", "%", 15.0, 0.0 }, /* cpu_fan, cpu mid */
{"asusec", "fan1", "", "CPU", "%", 15.0, 0.0 }, /* cpu_opt, cpu front */
{"nct6798", "fan5", "H amp", "CHA", "%", 12.0, 0.0 }, /* h_amp, front fan */

View File

@ -6,11 +6,15 @@
#include <stdlib.h>
#ifdef USE_NVML
#include "nvidia-sensors.h"
#include "nvidia-sensors.h"
#endif
#ifdef USE_DCGM
#include "nvidia-dcgm.h"
#endif
#ifdef USE_LIBPCI
#include "gddr6.h"
#include "gddr6.h"
#endif
#include "icx3.h"
@ -153,7 +157,6 @@ int main (int argc, char **argv)
return -1;
} else if (gpu_count == 0) {
printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
return -1;
}
/* Check for invalid GPUs */
if (gpu_num > gpu_count - 1) {
@ -195,6 +198,10 @@ int main (int argc, char **argv)
}
#endif
#ifdef USE_DCGM
init_dcgm();
#endif
/* PCI init for VRAM/hotspot temps */
#ifdef USE_LIBPCI
for (int i = 0; i < gpu_count; i++)
@ -210,6 +217,10 @@ int main (int argc, char **argv)
if (print_board_sensors)
print_board_info(board_sensors, board_sensor_count);
#ifdef USE_DCGM
print_dcgm(compact, overwrite);
#endif
if (gpu_num == -1) {
/* No GPU specified on command line, loop over all supported GPUs */
for (int i = 0; i < gpu_count; i++){
@ -238,6 +249,9 @@ int main (int argc, char **argv)
#ifdef USE_NVML
nvmlShutdown();
#endif
#ifdef USE_DCGM
shutdown_dcgm();
#endif
}
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)

130
nvidia-dcgm.c Normal file
View File

@ -0,0 +1,130 @@
#include <stdio.h>
#include "nvidia-dcgm.h"
/*
 * ANSI SGR escape sequences wrapped around the short labels in compact
 * output: 36 selects a cyan foreground, 39 restores the default foreground.
 * The original author notes that these macros are duplicated
 * (NOTE(review): presumably from the main program source -- confirm).
 */
#define HEADER_COLOR_START "\x1b[36m"
#define HEADER_COLOR_END "\x1b[39m"
/* Handle filled in by dcgmStartEmbedded() in init_dcgm() and passed to the other DCGM calls in this file. */
dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
/* Field group created by dcgmFieldGroupCreate() in init_dcgm(); watched there and read in print_dcgm(). */
dcgmFieldGrp_t field_group_id;
/* Bitmask last stored by update_field_values() from the DCGM integer field; print_dcgm() tests it against clock_reason_names[].mask. */
unsigned long long clock_reasons = 0;
/*
 * Bring up an embedded DCGM engine and start watching the metrics this tool
 * prints.
 *
 * Steps, in order:
 *   1. initialise the DCGM library;
 *   2. start the embedded engine in manual operation mode (handle is stored
 *      in dcgm_handle);
 *   3. create a field group containing every metric_readings[] field plus
 *      DCGM_FI_DEV_CLOCKS_EVENT_REASONS (id is stored in field_group_id);
 *   4. start watching that group on all GPUs.
 *
 * Each step needs the result of the previous one, so the first failure is
 * reported on stdout and the function returns instead of calling further
 * DCGM APIs with an unusable handle or field group. Nothing is returned to
 * the caller.
 */
void init_dcgm()
{
    /* One slot per profiling metric plus one for the clock event reasons. */
    enum { NUM_METRICS = sizeof(metric_readings) / sizeof(metric_readings[0]) };
    unsigned short field_ids[NUM_METRICS + 1];
    dcgmReturn_t result;

    /* Init DCGM */
    result = dcgmInit();
    if (result != DCGM_ST_OK) {
        printf("Could not init DCGM: %s\n", errorString(result));
        return;
    }

    /* Start embedded */
    result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
    if (result != DCGM_ST_OK) {
        printf("Could not start embedded DCGM engine: %s\n", errorString(result));
        return;
    }

    /* Create metric group; every cached reading starts from zero. */
    for (int i = 0; i < NUM_METRICS; i++) {
        field_ids[i] = metric_readings[i].field_id;
        metric_readings[i].last_reading = 0.0;
    }
    field_ids[NUM_METRICS] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;

    result = dcgmFieldGroupCreate(dcgm_handle, NUM_METRICS + 1, &field_ids[0],
                                  (char *)"evga_icx_fields", &field_group_id);
    if (result != DCGM_ST_OK) {
        printf("Could not create DCGM metric group: %s\n", errorString(result));
        return;
    }

    /*
     * Start recording. Per the DCGM API the trailing arguments are the update
     * interval in microseconds (1000000 = 1 s), the maximum sample age in
     * seconds, and the maximum number of samples to keep.
     */
    result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, 1000000, 30, 1);
    if (result != DCGM_ST_OK)
        printf("Could not record DCGM metrics: %s\n", errorString(result));
}
/*
 * Tear down DCGM: stop the embedded engine referenced by dcgm_handle, then
 * shut the library down. Neither return code is checked.
 */
void shutdown_dcgm()
{
    (void)dcgmStopEmbedded(dcgm_handle);
    (void)dcgmShutdown();
}
void print_dcgm(int compact, int overwrite)
{
dcgmReturn_t result;
dcgmUpdateAllFields(dcgm_handle, 1);
result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
if (result != DCGM_ST_OK)
printf("Could not read DCGM metrics: %s\n", errorString(result));
/* Print the float metrics */
for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) {
if (compact)
printf("%s%s%s%s %4.1f",
i==0 ? "": " ",
HEADER_COLOR_START,
metric_readings[i].short_name,
HEADER_COLOR_END,
metric_readings[i].last_reading * 100);
else
printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100);
}
if (compact)
printf("%%");
/* Print clock reasons */
int single_reason = 1;
if (compact)
printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
else
printf("Clock reasons: ");
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
if (clock_reasons & clock_reason_names[i].mask) {
if (!single_reason) {
if (compact)
printf(",");
else
printf(", ");
}
single_reason = 0;
if (compact)
printf("%s", clock_reason_names[i].short_name);
else
printf("%s", clock_reason_names[i].long_name);
}
}
if (single_reason)
printf("None");
if (!compact)
printf(" (0x%llx)", clock_reasons);
else
printf("%-15s", " ");
if (overwrite)
printf("\x1b[1G");
}
/*
 * Callback passed to dcgmGetLatestValues() by print_dcgm(): copies the
 * delivered field values into the file-level state that print_dcgm() prints.
 *
 * gpu_id:     entity the values belong to (unused; values from every call
 *             land in the same storage, so the most recent call wins).
 * values:     array of field values delivered by DCGM.
 * num_values: number of entries in values.
 * userdata:   opaque pointer given to dcgmGetLatestValues() (unused).
 *
 * Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
{
    const size_t num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);

    /* Required by the callback signature but not needed here. */
    (void)gpu_id;
    (void)userdata;

    for (int j = 0; j < num_values; j++) {
        const dcgmFieldValue_v1 *value = &values[j];

        if (value->fieldType == DCGM_FT_DOUBLE) {
            /* A double reading: store it in the matching metric_readings[] slot. */
            for (size_t i = 0; i < num_metrics; i++) {
                if (metric_readings[i].field_id != value->fieldId)
                    continue;

                /*
                 * Values of 1 or more are stored as 0.0.
                 * NOTE(review): this presumably filters DCGM's large "blank"
                 * sentinel, but it also zeroes a reading of exactly 1.0
                 * (which would print as 100%) -- confirm the intent.
                 */
                double val = value->value.dbl;
                metric_readings[i].last_reading = (val >= 1) ? 0.0 : val;
            }
        } else if (value->fieldType == DCGM_FT_INT64 &&
                   value->fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
            /*
             * Only the clock event reasons field is a bitmask. A blank
             * sentinel (no data available) must not be decoded as a set of
             * reasons, so it is stored as "no reasons".
             */
            long long reasons = value->value.i64;
            clock_reasons = DCGM_INT64_IS_BLANK(reasons) ? 0 : (unsigned long long)reasons;
        }
    }
    return 0;
}

51
nvidia-dcgm.h Normal file
View File

@ -0,0 +1,51 @@
#include <dcgm_agent.h>
#include <dcgm_structs.h>
#include "evga-card.h"
/*
 * One DCGM metric shown by print_dcgm().
 * The name members point at string literals, hence const.
 */
struct metric_reading {
    unsigned short field_id;   /* DCGM field identifier added to the watched field group */
    const char *short_name;    /* label used in compact output */
    const char *long_name;     /* label used in the one-metric-per-line output */
    double last_reading;       /* latest value stored by update_field_values(); printed multiplied by 100 */
};
/*
 * Profiling metrics requested from DCGM, in the order print_dcgm() prints
 * them. last_reading is left out of each initializer and therefore starts at
 * zero; update_field_values() overwrites it, so the table must stay mutable.
 * Because the array is static and defined in a header, every source file
 * that includes this header gets its own private copy.
 */
static struct metric_reading metric_readings[] =
{
    { .field_id = DCGM_FI_PROF_GR_ENGINE_ACTIVE,        .short_name = "GFX",    .long_name = "Graphics Engine Activity" },
    { .field_id = DCGM_FI_PROF_SM_ACTIVE,               .short_name = "SM ACT", .long_name = "SM Activity" },
    { .field_id = DCGM_FI_PROF_SM_OCCUPANCY,            .short_name = "OCC",    .long_name = "SM Occupancy" },
    { .field_id = DCGM_FI_PROF_PIPE_FP16_ACTIVE,        .short_name = "FP16",   .long_name = "FP16 Engine Activity" },
    { .field_id = DCGM_FI_PROF_PIPE_FP32_ACTIVE,        .short_name = "FP32",   .long_name = "FP32 Engine Activity" },
    { .field_id = DCGM_FI_PROF_PIPE_FP64_ACTIVE,        .short_name = "FP64",   .long_name = "FP64 Engine Activity" },
    { .field_id = DCGM_FI_PROF_PIPE_INT_ACTIVE,         .short_name = "INT",    .long_name = "Integer Activity" },
    { .field_id = DCGM_FI_PROF_PIPE_TENSOR_ACTIVE,      .short_name = "TENS",   .long_name = "Tensor Activity" },
    { .field_id = DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE, .short_name = "HMMA",   .long_name = "Half Precision Tensor" },
    { .field_id = DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE, .short_name = "IMMA",   .long_name = "Integer Tensor" },
    { .field_id = DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE, .short_name = "DFMA",   .long_name = "Double Precision Tensor" },
    { .field_id = DCGM_FI_PROF_DRAM_ACTIVE,             .short_name = "DRAM",   .long_name = "Memory BW Utilization" }
};
/*
 * Maps one bit of the clock event reasons bitmask to printable names.
 * The name members point at string literals, hence const.
 */
struct clock_reason {
    unsigned long long mask;   /* bit(s) tested against the clock reasons value */
    const char *short_name;    /* label used in compact output */
    const char *long_name;     /* label used in the verbose output */
};
/*
 * Lookup table from clock event reason bits to their display names, in the
 * order print_dcgm() lists them. The table is only ever read, so it is const.
 */
static const struct clock_reason clock_reason_names[] =
{
    {DCGM_CLOCKS_EVENT_REASON_GPU_IDLE,       "Idle",    "GPU idle"},
    {DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING, "AppClk",  "Application clocks"},
    {DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP,   "Pwr",     "Power cap"},
    {DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN,    "HWSlow",  "Hardware slowdown"},
    {DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST,     "Sync",    "Sync boost"},
    {DCGM_CLOCKS_EVENT_REASON_SW_THERMAL,     "SWTherm", "Software thermal"},
    {DCGM_CLOCKS_EVENT_REASON_HW_THERMAL,     "HWTherm", "Hardware thermal"},
    {DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE, "HWPower", "Hardware power brake"},
    {DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS, "DispClk", "Display clock"}
};
/*
 * Initialise DCGM, start the embedded engine and begin watching the metrics.
 * Errors are printed; nothing is returned.
 * "(void)" makes this a real prototype so calls with arguments are rejected.
 */
void init_dcgm(void);

/* Stop the embedded engine and shut the DCGM library down. */
void shutdown_dcgm(void);

/*
 * Refresh and print the DCGM metrics and clock reasons.
 * compact:   non-zero selects the single-line output with short names.
 * overwrite: non-zero moves the cursor back to column 1 after printing.
 */
void print_dcgm(int compact, int overwrite);

/*
 * Callback handed to dcgmGetLatestValues(); stores the num_values entries of
 * values into the module's cached readings. Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata);

View File

@ -36,7 +36,7 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
single_reason = 0;
if (compact)
printf("%-15s", clock_reason_names[i].short_name);
printf("%s", clock_reason_names[i].short_name);
else
printf("%s", clock_reason_names[i].long_name);
}
@ -47,6 +47,8 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
if (!compact)
printf(" (0x%llx)", reasons);
else
printf("%-15s", " ");
}
unsigned int get_nvml_temp(struct card_info *card)