#include #include "nvidia-dcgm.h" /* Yeah I know this is duplicated */ #define HEADER_COLOR_START "\x1b[36m" #define HEADER_COLOR_END "\x1b[39m" dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL; dcgmFieldGrp_t field_group_id; unsigned long long clock_reasons = 0; void init_dcgm() { /* Init DCGM */ dcgmReturn_t result; result = dcgmInit(); if (result != DCGM_ST_OK) printf("Could not init DCGM: %s\n", errorString(result)); /* Start embedded */ result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle); if (result != DCGM_ST_OK) printf("Could not start embedded DCGM engine: %s\n", errorString(result)); /* Create metric group */ int num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]); unsigned short field_ids[num_metrics + 1]; for (int i = 0; i < num_metrics; i++) { field_ids[i] = metric_readings[i].field_id; metric_readings[i].last_reading = 0.0; } field_ids[num_metrics] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS; result = dcgmFieldGroupCreate(dcgm_handle, num_metrics + 1, &field_ids[0], (char *)"evga_icx_fields", &field_group_id); if (result != DCGM_ST_OK) printf("Could not create DCGM metric group: %s\n", errorString(result)); /* Start recording */ result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS , field_group_id, 1000000, 30, 1); if (result != DCGM_ST_OK) printf("Could not record DCGM metrics: %s\n", errorString(result)); } void shutdown_dcgm() { dcgmStopEmbedded(dcgm_handle); dcgmShutdown(); } void print_dcgm(int compact, int overwrite) { dcgmReturn_t result; dcgmUpdateAllFields(dcgm_handle, 1); result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL); if (result != DCGM_ST_OK) printf("Could not read DCGM metrics: %s\n", errorString(result)); /* Print the float metrics */ for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) { if (compact) printf("%s%s%s%s %4.1f", i==0 ? "": " ", HEADER_COLOR_START, metric_readings[i].short_name, HEADER_COLOR_END, metric_readings[i].last_reading * 100); else printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100); } if (compact) printf("%%"); /* Print clock reasons */ int single_reason = 1; if (compact) printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END); else printf("Clock reasons: "); for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) { if (clock_reasons & clock_reason_names[i].mask) { if (!single_reason) { if (compact) printf(","); else printf(", "); } single_reason = 0; if (compact) printf("%s", clock_reason_names[i].short_name); else printf("%s", clock_reason_names[i].long_name); } } if (single_reason) printf("None"); if (!compact) printf(" (0x%llx)", clock_reasons); else printf("%-15s", " "); if (overwrite) printf("\x1b[1G"); } int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata) { for (int j = 0; j < num_values; j++) { if (values[j].fieldType == DCGM_FT_DOUBLE) { /* A double reading so read it into our readings array */ for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) { if (metric_readings[i].field_id == values[j].fieldId) { double val = values[j].value.dbl; if (val >= 1) metric_readings[i].last_reading = 0.0; else metric_readings[i].last_reading = values[j].value.dbl; } } } else if (values[j].fieldType == DCGM_FT_INT64) { /* Int is probably the bitmask of our clock reasons */ clock_reasons = values[j].value.i64; } } return 0; }