131 lines
3.6 KiB
C
131 lines
3.6 KiB
C
#include <stdio.h>
|
|
|
|
#include "nvidia-dcgm.h"
|
|
|
|
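/* ANSI escape sequences wrapped around header labels: switch the foreground to
 * cyan, then back to the default color.  Yes, these are duplicated elsewhere. */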
|
|
#define HEADER_COLOR_START "\x1b[36m"
|
|
#define HEADER_COLOR_END "\x1b[39m"
|
|
|
|
/* Handle of the embedded DCGM engine.  Starts out NULL; init_dcgm() passes its
 * address to dcgmStartEmbedded(), and every later DCGM call in this file uses it. */
dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
|
|
/* Id of the field group created in init_dcgm() by dcgmFieldGroupCreate(); it is
 * the group that gets watched there and read back in print_dcgm(). */
dcgmFieldGrp_t field_group_id;
|
|
|
|
/* Bitmask of clock event reasons most recently stored by update_field_values();
 * print_dcgm() decodes it against clock_reason_names and prints it. */
unsigned long long clock_reasons = 0;
|
|
|
|
void init_dcgm()
|
|
{
|
|
/* Init DCGM */
|
|
dcgmReturn_t result;
|
|
result = dcgmInit();
|
|
if (result != DCGM_ST_OK)
|
|
printf("Could not init DCGM: %s\n", errorString(result));
|
|
|
|
/* Start embedded */
|
|
result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
|
|
if (result != DCGM_ST_OK)
|
|
printf("Could not start embedded DCGM engine: %s\n", errorString(result));
|
|
|
|
/* Create metric group */
|
|
int num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);
|
|
unsigned short field_ids[num_metrics + 1];
|
|
for (int i = 0; i < num_metrics; i++) {
|
|
field_ids[i] = metric_readings[i].field_id;
|
|
metric_readings[i].last_reading = 0.0;
|
|
}
|
|
field_ids[num_metrics] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;
|
|
result = dcgmFieldGroupCreate(dcgm_handle, num_metrics + 1, &field_ids[0], (char *)"evga_icx_fields", &field_group_id);
|
|
if (result != DCGM_ST_OK)
|
|
printf("Could not create DCGM metric group: %s\n", errorString(result));
|
|
|
|
/* Start recording */
|
|
result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS , field_group_id, 1000000, 30, 1);
|
|
if (result != DCGM_ST_OK)
|
|
printf("Could not record DCGM metrics: %s\n", errorString(result));
|
|
}
|
|
|
|
void shutdown_dcgm()
|
|
{
|
|
dcgmStopEmbedded(dcgm_handle);
|
|
dcgmShutdown();
|
|
}
|
|
|
|
void print_dcgm(int compact, int overwrite)
|
|
{
|
|
dcgmReturn_t result;
|
|
dcgmUpdateAllFields(dcgm_handle, 1);
|
|
result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
|
|
if (result != DCGM_ST_OK)
|
|
printf("Could not read DCGM metrics: %s\n", errorString(result));
|
|
|
|
/* Print the float metrics */
|
|
for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) {
|
|
if (compact)
|
|
printf("%s%s%s%s %4.1f",
|
|
i==0 ? "": " ",
|
|
HEADER_COLOR_START,
|
|
metric_readings[i].short_name,
|
|
HEADER_COLOR_END,
|
|
metric_readings[i].last_reading * 100);
|
|
else
|
|
printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100);
|
|
}
|
|
if (compact)
|
|
printf("%%");
|
|
|
|
/* Print clock reasons */
|
|
int single_reason = 1;
|
|
if (compact)
|
|
printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
|
|
else
|
|
printf("Clock reasons: ");
|
|
|
|
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
|
|
if (clock_reasons & clock_reason_names[i].mask) {
|
|
if (!single_reason) {
|
|
if (compact)
|
|
printf(",");
|
|
else
|
|
printf(", ");
|
|
}
|
|
single_reason = 0;
|
|
|
|
if (compact)
|
|
printf("%s", clock_reason_names[i].short_name);
|
|
else
|
|
printf("%s", clock_reason_names[i].long_name);
|
|
}
|
|
}
|
|
|
|
if (single_reason)
|
|
printf("None");
|
|
|
|
if (!compact)
|
|
printf(" (0x%llx)", clock_reasons);
|
|
else
|
|
printf("%-15s", " ");
|
|
|
|
if (overwrite)
|
|
printf("\x1b[1G");
|
|
}
|
|
|
|
/*
 * Value callback handed to dcgmGetLatestValues() by print_dcgm(): copies the
 * reported field values into metric_readings and clock_reasons.
 *
 * gpu_id     - GPU the values belong to.  Unused: values from every GPU are
 *              written to the same globals, so the last one reported wins.
 * values     - array of num_values field values.
 * num_values - number of entries in values.
 * userdata   - unused.
 *
 * Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
{
    const size_t metric_count = sizeof(metric_readings) / sizeof(metric_readings[0]);

    (void)gpu_id;
    (void)userdata;

    for (int j = 0; j < num_values; j++) {
        if (values[j].fieldType == DCGM_FT_DOUBLE) {
            /* A double reading: store it in every metric slot with this field id. */
            const double reading = values[j].value.dbl;

            for (size_t i = 0; i < metric_count; i++) {
                if (metric_readings[i].field_id != values[j].fieldId) {
                    continue;
                }
                /* Readings of 1 or more are stored as 0.0.
                 * NOTE(review): presumably this filters out DCGM's large
                 * "blank" placeholder values, but it also discards a reading
                 * of exactly 1.0 - confirm that is intended. */
                metric_readings[i].last_reading = (reading >= 1) ? 0.0 : reading;
            }
        } else if (values[j].fieldType == DCGM_FT_INT64 &&
                   values[j].fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
            /* Only the clock event reasons field feeds the bitmask; any other
             * integer field is ignored rather than clobbering it. */
            clock_reasons = (unsigned long long)values[j].value.i64;
        }
    }

    return 0;
}
|