evga-icx/nvidia-dcgm.c
moosecrap a729bdee3b Added DCGM metrics for those with Quadro cards.
Updated include paths for certain CUDA installations
2025-12-06 14:23:36 -08:00

131 lines
3.6 KiB
C

#include <stdio.h>
#include "nvidia-dcgm.h"
/*
 * ANSI SGR escape sequences wrapped around the short labels in compact output:
 * "\x1b[36m" selects a cyan foreground, "\x1b[39m" restores the default one.
 * Yeah I know this is duplicated
 */
#define HEADER_COLOR_START "\x1b[36m"
#define HEADER_COLOR_END "\x1b[39m"
/* DCGM handle; NULL until init_dcgm() passes it to dcgmStartEmbedded(). */
dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
/* Field group filled in by dcgmFieldGroupCreate() in init_dcgm(). */
dcgmFieldGrp_t field_group_id;
/* Clock-reasons bitmask: written by update_field_values(), decoded by print_dcgm(). */
unsigned long long clock_reasons = 0;
/*
 * Bring up DCGM and start sampling the fields this file reports.
 *
 * Steps, in order:
 *   1. initialise the DCGM library,
 *   2. start the embedded engine in manual operation mode (stores dcgm_handle),
 *   3. create a field group containing every field_id in metric_readings plus
 *      DCGM_FI_DEV_CLOCKS_EVENT_REASONS (stores field_group_id),
 *   4. watch that field group on DCGM_GROUP_ALL_GPUS.
 *
 * Takes no arguments and returns nothing. Each step depends on the previous
 * one, so the first failure prints a message to stdout and abandons the
 * remaining steps instead of calling DCGM with an invalid handle or group.
 * Every metric's last_reading is reset to 0.0 before any DCGM call is made.
 */
void init_dcgm()
{
    /* Compile-time sizes so the id buffer is a plain array, not a VLA. */
    enum {
        METRIC_COUNT = sizeof(metric_readings) / sizeof(metric_readings[0]),
        FIELD_COUNT = METRIC_COUNT + 1 /* extra slot: clock-reasons bitmask */
    };
    unsigned short field_ids[FIELD_COUNT];
    dcgmReturn_t result;

    /* Collect the field ids and clear any stale readings. */
    for (int i = 0; i < METRIC_COUNT; i++) {
        field_ids[i] = metric_readings[i].field_id;
        metric_readings[i].last_reading = 0.0;
    }
    field_ids[METRIC_COUNT] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;

    /* Init DCGM */
    result = dcgmInit();
    if (result != DCGM_ST_OK) {
        printf("Could not init DCGM: %s\n", errorString(result));
        return;
    }

    /* Start embedded */
    result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
    if (result != DCGM_ST_OK) {
        printf("Could not start embedded DCGM engine: %s\n", errorString(result));
        return;
    }

    /* Create metric group */
    result = dcgmFieldGroupCreate(dcgm_handle, FIELD_COUNT, &field_ids[0],
                                  (char *)"evga_icx_fields", &field_group_id);
    if (result != DCGM_ST_OK) {
        printf("Could not create DCGM metric group: %s\n", errorString(result));
        return;
    }

    /*
     * Start recording. Per the DCGM API the trailing arguments are the update
     * interval (1000000, in microseconds), the maximum sample age to keep
     * (30, in seconds) and the maximum number of samples to keep (1).
     */
    result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id,
                             1000000, 30, 1);
    if (result != DCGM_ST_OK) {
        printf("Could not record DCGM metrics: %s\n", errorString(result));
        return;
    }
}
/*
 * Tear DCGM down: stop the embedded engine that dcgm_handle refers to, then
 * shut the library down. The status codes of both calls are discarded.
 */
void shutdown_dcgm()
{
    (void)dcgmStopEmbedded(dcgm_handle);
    (void)dcgmShutdown();
}
void print_dcgm(int compact, int overwrite)
{
dcgmReturn_t result;
dcgmUpdateAllFields(dcgm_handle, 1);
result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
if (result != DCGM_ST_OK)
printf("Could not read DCGM metrics: %s\n", errorString(result));
/* Print the float metrics */
for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) {
if (compact)
printf("%s%s%s%s %4.1f",
i==0 ? "": " ",
HEADER_COLOR_START,
metric_readings[i].short_name,
HEADER_COLOR_END,
metric_readings[i].last_reading * 100);
else
printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100);
}
if (compact)
printf("%%");
/* Print clock reasons */
int single_reason = 1;
if (compact)
printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
else
printf("Clock reasons: ");
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
if (clock_reasons & clock_reason_names[i].mask) {
if (!single_reason) {
if (compact)
printf(",");
else
printf(", ");
}
single_reason = 0;
if (compact)
printf("%s", clock_reason_names[i].short_name);
else
printf("%s", clock_reason_names[i].long_name);
}
}
if (single_reason)
printf("None");
if (!compact)
printf(" (0x%llx)", clock_reasons);
else
printf("%-15s", " ");
if (overwrite)
printf("\x1b[1G");
}
/*
 * Callback handed to dcgmGetLatestValues(): copies one batch of samples into
 * the file-scope state that print_dcgm() displays.
 *
 * gpu_id:     id of the GPU the samples belong to (unused; readings are not
 *             kept per GPU, so a later batch overwrites an earlier one).
 * values:     array of samples.
 * num_values: number of entries in values.
 * userdata:   opaque pointer from the caller (unused).
 *
 * Double samples are stored in the metric_readings entry with the same field
 * id. A sample above 1.0 is stored as 0.0, because readings are treated as
 * fractions and DCGM's "blank" double is a very large number; exactly 1.0
 * (100%) is kept as a valid reading.
 *
 * Int64 samples update clock_reasons only when the field id is
 * DCGM_FI_DEV_CLOCKS_EVENT_REASONS. A non-OK or blank sample is stored as 0
 * so it is not decoded as if every reason bit were set.
 *
 * Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
{
    const size_t metric_count = sizeof(metric_readings) / sizeof(metric_readings[0]);

    (void)gpu_id;
    (void)userdata;

    for (int j = 0; j < num_values; j++) {
        const dcgmFieldValue_v1 *sample = &values[j];

        if (sample->fieldType == DCGM_FT_DOUBLE) {
            /* A double reading so read it into our readings array */
            double val = sample->value.dbl;
            double reading = (val > 1.0) ? 0.0 : val;

            for (size_t i = 0; i < metric_count; i++) {
                if (metric_readings[i].field_id == sample->fieldId) {
                    metric_readings[i].last_reading = reading;
                }
            }
        } else if (sample->fieldType == DCGM_FT_INT64 &&
                   sample->fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
            /* The bitmask of our clock reasons */
            if (sample->status == DCGM_ST_OK && !DCGM_INT64_IS_BLANK(sample->value.i64)) {
                clock_reasons = (unsigned long long)sample->value.i64;
            } else {
                clock_reasons = 0;
            }
        }
    }
    return 0;
}