Added DCGM metrics for those with Quadro cards.
Updated include paths for certain CUDA installations
This commit is contained in:
parent
af2ceec405
commit
a729bdee3b
10
Makefile
10
Makefile
@ -2,14 +2,20 @@
|
|||||||
|
|
||||||
OBJS = evga-icx.o evga-card.o icx3.o board-sensors.o zen3-rapl.o
|
OBJS = evga-icx.o evga-card.o icx3.o board-sensors.o zen3-rapl.o
|
||||||
LDLIBS = -li2c -lm
|
LDLIBS = -li2c -lm
|
||||||
CFLAGS = -MD
|
CFLAGS = -MD -I/usr/local/cuda/targets/x86_64-linux/include/
|
||||||
|
|
||||||
ifdef USE_NVML
|
ifdef USE_NVML
|
||||||
LDLIBS += -lnvidia-ml
|
LDLIBS += -lnvidia-ml -L/usr/local/cuda/targets/x86_64-linux/lib/stubs/
|
||||||
CFLAGS += -DUSE_NVML
|
CFLAGS += -DUSE_NVML
|
||||||
OBJS += nvidia-sensors.o
|
OBJS += nvidia-sensors.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef USE_DCGM
|
||||||
|
LDLIBS += -ldcgm
|
||||||
|
CFLAGS += -DUSE_DCGM -I/usr/include/datacenter-gpu-manager-4/
|
||||||
|
OBJS += nvidia-dcgm.o
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef USE_LIBPCI
|
ifdef USE_LIBPCI
|
||||||
LDLIBS += -lpci
|
LDLIBS += -lpci
|
||||||
CFLAGS += -DUSE_LIBPCI
|
CFLAGS += -DUSE_LIBPCI
|
||||||
|
|||||||
@ -32,10 +32,10 @@ static struct hwmon_sensor hwmon_sensor_info[] =
|
|||||||
{"asusec", "temp1", "Motherboard ", "CHIP", "°C", 1000.0, -40.0 }, /* Chipset */
|
{"asusec", "temp1", "Motherboard ", "CHIP", "°C", 1000.0, -40.0 }, /* Chipset */
|
||||||
{"asusec", "temp5", "Motherboard ", "VRM", "°C", 1000.0, -40.0 }, /* VRM */
|
{"asusec", "temp5", "Motherboard ", "VRM", "°C", 1000.0, -40.0 }, /* VRM */
|
||||||
{"asusec", "temp3", "", "MOBO", "°C", 1000.0, -40.0 }, /* Motherboard */
|
{"asusec", "temp3", "", "MOBO", "°C", 1000.0, -40.0 }, /* Motherboard */
|
||||||
{"asusec", "temp4", "Motherboard ", "SENS", "°C", 1000.0, -40.0 }, /* T_Sensor */
|
|
||||||
{"asusec", "temp6", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_In */
|
|
||||||
{"asusec", "temp7", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_Out */
|
|
||||||
{"nvme", "temp1", "NVMe ", "NVME", "°C", 1000.0, -40.0 }, /* NVME Composite */
|
{"nvme", "temp1", "NVMe ", "NVME", "°C", 1000.0, -40.0 }, /* NVME Composite */
|
||||||
|
{"asusec", "temp6", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_In */
|
||||||
|
{"asusec", "temp4", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* T_Sensor */
|
||||||
|
{"asusec", "temp7", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_Out */
|
||||||
{"nct6798", "fan2", "CPU fan", "CPU", "%", 15.0, 0.0 }, /* cpu_fan, cpu mid */
|
{"nct6798", "fan2", "CPU fan", "CPU", "%", 15.0, 0.0 }, /* cpu_fan, cpu mid */
|
||||||
{"asusec", "fan1", "", "CPU", "%", 15.0, 0.0 }, /* cpu_opt, cpu front */
|
{"asusec", "fan1", "", "CPU", "%", 15.0, 0.0 }, /* cpu_opt, cpu front */
|
||||||
{"nct6798", "fan5", "H amp", "CHA", "%", 12.0, 0.0 }, /* h_amp, front fan */
|
{"nct6798", "fan5", "H amp", "CHA", "%", 12.0, 0.0 }, /* h_amp, front fan */
|
||||||
|
|||||||
20
evga-icx.c
20
evga-icx.c
@ -6,11 +6,15 @@
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#ifdef USE_NVML
|
#ifdef USE_NVML
|
||||||
#include "nvidia-sensors.h"
|
#include "nvidia-sensors.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef USE_DCGM
|
||||||
|
#include "nvidia-dcgm.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE_LIBPCI
|
#ifdef USE_LIBPCI
|
||||||
#include "gddr6.h"
|
#include "gddr6.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "icx3.h"
|
#include "icx3.h"
|
||||||
@ -153,7 +157,6 @@ int main (int argc, char **argv)
|
|||||||
return -1;
|
return -1;
|
||||||
} else if (gpu_count == 0) {
|
} else if (gpu_count == 0) {
|
||||||
printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
|
printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
/* Check for invalid GPUs */
|
/* Check for invalid GPUs */
|
||||||
if (gpu_num > gpu_count - 1) {
|
if (gpu_num > gpu_count - 1) {
|
||||||
@ -195,6 +198,10 @@ int main (int argc, char **argv)
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef USE_DCGM
|
||||||
|
init_dcgm();
|
||||||
|
#endif
|
||||||
|
|
||||||
/* PCI init for VRAM/hotspot temps */
|
/* PCI init for VRAM/hotspot temps */
|
||||||
#ifdef USE_LIBPCI
|
#ifdef USE_LIBPCI
|
||||||
for (int i = 0; i < gpu_count; i++)
|
for (int i = 0; i < gpu_count; i++)
|
||||||
@ -209,6 +216,10 @@ int main (int argc, char **argv)
|
|||||||
|
|
||||||
if (print_board_sensors)
|
if (print_board_sensors)
|
||||||
print_board_info(board_sensors, board_sensor_count);
|
print_board_info(board_sensors, board_sensor_count);
|
||||||
|
|
||||||
|
#ifdef USE_DCGM
|
||||||
|
print_dcgm(compact, overwrite);
|
||||||
|
#endif
|
||||||
|
|
||||||
if (gpu_num == -1) {
|
if (gpu_num == -1) {
|
||||||
/* No GPU specified on command line, loop over all supported GPUs */
|
/* No GPU specified on command line, loop over all supported GPUs */
|
||||||
@ -238,6 +249,9 @@ int main (int argc, char **argv)
|
|||||||
#ifdef USE_NVML
|
#ifdef USE_NVML
|
||||||
nvmlShutdown();
|
nvmlShutdown();
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef USE_DCGM
|
||||||
|
shutdown_dcgm();
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)
|
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)
|
||||||
|
|||||||
130
nvidia-dcgm.c
Normal file
130
nvidia-dcgm.c
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "nvidia-dcgm.h"
|
||||||
|
|
||||||
|
/* Yeah I know this is duplicated */
|
||||||
|
#define HEADER_COLOR_START "\x1b[36m"
|
||||||
|
#define HEADER_COLOR_END "\x1b[39m"
|
||||||
|
|
||||||
|
dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
|
||||||
|
dcgmFieldGrp_t field_group_id;
|
||||||
|
|
||||||
|
unsigned long long clock_reasons = 0;
|
||||||
|
|
||||||
|
void init_dcgm()
|
||||||
|
{
|
||||||
|
/* Init DCGM */
|
||||||
|
dcgmReturn_t result;
|
||||||
|
result = dcgmInit();
|
||||||
|
if (result != DCGM_ST_OK)
|
||||||
|
printf("Could not init DCGM: %s\n", errorString(result));
|
||||||
|
|
||||||
|
/* Start embedded */
|
||||||
|
result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
|
||||||
|
if (result != DCGM_ST_OK)
|
||||||
|
printf("Could not start embedded DCGM engine: %s\n", errorString(result));
|
||||||
|
|
||||||
|
/* Create metric group */
|
||||||
|
int num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);
|
||||||
|
unsigned short field_ids[num_metrics + 1];
|
||||||
|
for (int i = 0; i < num_metrics; i++) {
|
||||||
|
field_ids[i] = metric_readings[i].field_id;
|
||||||
|
metric_readings[i].last_reading = 0.0;
|
||||||
|
}
|
||||||
|
field_ids[num_metrics] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;
|
||||||
|
result = dcgmFieldGroupCreate(dcgm_handle, num_metrics + 1, &field_ids[0], (char *)"evga_icx_fields", &field_group_id);
|
||||||
|
if (result != DCGM_ST_OK)
|
||||||
|
printf("Could not create DCGM metric group: %s\n", errorString(result));
|
||||||
|
|
||||||
|
/* Start recording */
|
||||||
|
result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS , field_group_id, 1000000, 30, 1);
|
||||||
|
if (result != DCGM_ST_OK)
|
||||||
|
printf("Could not record DCGM metrics: %s\n", errorString(result));
|
||||||
|
}
|
||||||
|
|
||||||
|
void shutdown_dcgm()
|
||||||
|
{
|
||||||
|
dcgmStopEmbedded(dcgm_handle);
|
||||||
|
dcgmShutdown();
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_dcgm(int compact, int overwrite)
|
||||||
|
{
|
||||||
|
dcgmReturn_t result;
|
||||||
|
dcgmUpdateAllFields(dcgm_handle, 1);
|
||||||
|
result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
|
||||||
|
if (result != DCGM_ST_OK)
|
||||||
|
printf("Could not read DCGM metrics: %s\n", errorString(result));
|
||||||
|
|
||||||
|
/* Print the float metrics */
|
||||||
|
for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) {
|
||||||
|
if (compact)
|
||||||
|
printf("%s%s%s%s %4.1f",
|
||||||
|
i==0 ? "": " ",
|
||||||
|
HEADER_COLOR_START,
|
||||||
|
metric_readings[i].short_name,
|
||||||
|
HEADER_COLOR_END,
|
||||||
|
metric_readings[i].last_reading * 100);
|
||||||
|
else
|
||||||
|
printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100);
|
||||||
|
}
|
||||||
|
if (compact)
|
||||||
|
printf("%%");
|
||||||
|
|
||||||
|
/* Print clock reasons */
|
||||||
|
int single_reason = 1;
|
||||||
|
if (compact)
|
||||||
|
printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
|
||||||
|
else
|
||||||
|
printf("Clock reasons: ");
|
||||||
|
|
||||||
|
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
|
||||||
|
if (clock_reasons & clock_reason_names[i].mask) {
|
||||||
|
if (!single_reason) {
|
||||||
|
if (compact)
|
||||||
|
printf(",");
|
||||||
|
else
|
||||||
|
printf(", ");
|
||||||
|
}
|
||||||
|
single_reason = 0;
|
||||||
|
|
||||||
|
if (compact)
|
||||||
|
printf("%s", clock_reason_names[i].short_name);
|
||||||
|
else
|
||||||
|
printf("%s", clock_reason_names[i].long_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (single_reason)
|
||||||
|
printf("None");
|
||||||
|
|
||||||
|
if (!compact)
|
||||||
|
printf(" (0x%llx)", clock_reasons);
|
||||||
|
else
|
||||||
|
printf("%-15s", " ");
|
||||||
|
|
||||||
|
if (overwrite)
|
||||||
|
printf("\x1b[1G");
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Callback handed to dcgmGetLatestValues(): copy freshly read field values
 * into this module's state.
 *
 * gpu_id     - unused.
 * values     - array of field values delivered by DCGM.
 * num_values - number of entries in values.
 * userdata   - unused.
 *
 * Double-typed values are matched by field id against metric_readings[] and
 * stored in last_reading. Values above 1.0 are stored as 0.0; a value of
 * exactly 1.0 (displayed as 100 %) is kept. The INT64 value whose field id
 * is DCGM_FI_DEV_CLOCKS_EVENT_REASONS is stored in clock_reasons; other
 * INT64 fields are ignored.
 *
 * NOTE(review): the watch is on DCGM_GROUP_ALL_GPUS, so this is presumably
 * invoked once per GPU with every call overwriting the same globals -
 * confirm whether per-GPU storage is wanted.
 *
 * Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
{
    const size_t num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);

    (void)gpu_id;
    (void)userdata;

    for (int j = 0; j < num_values; j++) {
        if (values[j].fieldType == DCGM_FT_DOUBLE) {
            /* A double reading so read it into our readings array */
            double val = values[j].value.dbl;

            for (size_t i = 0; i < num_metrics; i++) {
                if (metric_readings[i].field_id != values[j].fieldId)
                    continue;

                /* Strictly greater than 1.0: full activity must not be
                   thrown away together with out-of-range values. */
                metric_readings[i].last_reading = (val > 1.0) ? 0.0 : val;
            }
        } else if (values[j].fieldType == DCGM_FT_INT64 &&
                   values[j].fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
            /* Bitmask of the current clock event reasons */
            clock_reasons = (unsigned long long)values[j].value.i64;
        }
    }
    return 0;
}
|
||||||
51
nvidia-dcgm.h
Normal file
51
nvidia-dcgm.h
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
 * DCGM metric and clock-reason support: lookup tables plus the entry points
 * implemented in nvidia-dcgm.c.
 */
#ifndef NVIDIA_DCGM_H
#define NVIDIA_DCGM_H

#include <dcgm_agent.h>
#include <dcgm_structs.h>

#include "evga-card.h"

/* One DCGM metric this tool watches, together with its display names. */
struct metric_reading {
    unsigned short field_id;    /* DCGM field identifier */
    const char *short_name;     /* Label used in compact output */
    const char *long_name;      /* Label used in full output */
    double last_reading;        /* Most recent stored value; printed multiplied by 100 */
};

/*
 * Metrics to watch. last_reading is not listed in the initialisers and so
 * starts at zero.
 *
 * The table is `static`, so every translation unit that includes this header
 * gets its own private copy; readings stored by nvidia-dcgm.c are only
 * visible inside nvidia-dcgm.c.
 */
static struct metric_reading metric_readings[] =
{
    {DCGM_FI_PROF_GR_ENGINE_ACTIVE,        "GFX",    "Graphics Engine Activity"},
    {DCGM_FI_PROF_SM_ACTIVE,               "SM ACT", "SM Activity"},
    {DCGM_FI_PROF_SM_OCCUPANCY,            "OCC",    "SM Occupancy"},
    {DCGM_FI_PROF_PIPE_FP16_ACTIVE,        "FP16",   "FP16 Engine Activity"},
    {DCGM_FI_PROF_PIPE_FP32_ACTIVE,        "FP32",   "FP32 Engine Activity"},
    {DCGM_FI_PROF_PIPE_FP64_ACTIVE,        "FP64",   "FP64 Engine Activity"},
    {DCGM_FI_PROF_PIPE_INT_ACTIVE,         "INT",    "Integer Activity"},
    {DCGM_FI_PROF_PIPE_TENSOR_ACTIVE,      "TENS",   "Tensor Activity"},
    {DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE, "HMMA",   "Half Precision Tensor"},
    {DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE, "IMMA",   "Integer Tensor"},
    {DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE, "DFMA",   "Double Precision Tensor"},
    {DCGM_FI_PROF_DRAM_ACTIVE,             "DRAM",   "Memory BW Utilization"}
};

/* Maps one bit of the clock event reasons bitmask to its display names. */
struct clock_reason {
    unsigned long long mask;    /* Bit tested against the reasons bitmask */
    const char *short_name;     /* Label used in compact output */
    const char *long_name;      /* Label used in full output */
};

/* Read-only table of the clock event reasons that are reported by name. */
static const struct clock_reason clock_reason_names[] =
{
    {DCGM_CLOCKS_EVENT_REASON_GPU_IDLE,       "Idle",    "GPU idle"},
    {DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING, "AppClk",  "Application clocks"},
    {DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP,   "Pwr",     "Power cap"},
    {DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN,    "HWSlow",  "Hardware slowdown"},
    {DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST,     "Sync",    "Sync boost"},
    {DCGM_CLOCKS_EVENT_REASON_SW_THERMAL,     "SWTherm", "Software thermal"},
    {DCGM_CLOCKS_EVENT_REASON_HW_THERMAL,     "HWTherm", "Hardware thermal"},
    {DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE, "HWPower", "Hardware power brake"},
    {DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS, "DispClk", "Display clock"}
};

/* Initialise DCGM, start the embedded engine and begin watching the metrics. */
void init_dcgm(void);

/* Stop the embedded engine and shut the DCGM library down. */
void shutdown_dcgm(void);

/* Refresh and print the metrics and clock reasons; compact selects the
   one-line format, overwrite returns the cursor to column 1 afterwards. */
void print_dcgm(int compact, int overwrite);

/* Callback for dcgmGetLatestValues(): stores the delivered values. Returns 0. */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata);

#endif /* NVIDIA_DCGM_H */
|
||||||
@ -36,7 +36,7 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
|
|||||||
single_reason = 0;
|
single_reason = 0;
|
||||||
|
|
||||||
if (compact)
|
if (compact)
|
||||||
printf("%-15s", clock_reason_names[i].short_name);
|
printf("%s", clock_reason_names[i].short_name);
|
||||||
else
|
else
|
||||||
printf("%s", clock_reason_names[i].long_name);
|
printf("%s", clock_reason_names[i].long_name);
|
||||||
}
|
}
|
||||||
@ -47,6 +47,8 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
|
|||||||
|
|
||||||
if (!compact)
|
if (!compact)
|
||||||
printf(" (0x%llx)", reasons);
|
printf(" (0x%llx)", reasons);
|
||||||
|
else
|
||||||
|
printf("%-15s", " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int get_nvml_temp(struct card_info *card)
|
unsigned int get_nvml_temp(struct card_info *card)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user