Added DCGM metrics for those with Quadro cards.
Updated include paths for certain CUDA installations.
This commit is contained in:
parent
af2ceec405
commit
a729bdee3b
10
Makefile
10
Makefile
@ -2,14 +2,20 @@
|
||||
|
||||
OBJS = evga-icx.o evga-card.o icx3.o board-sensors.o zen3-rapl.o
|
||||
LDLIBS = -li2c -lm
|
||||
CFLAGS = -MD
|
||||
CFLAGS = -MD -I/usr/local/cuda/targets/x86_64-linux/include/
|
||||
|
||||
ifdef USE_NVML
|
||||
LDLIBS += -lnvidia-ml
|
||||
LDLIBS += -lnvidia-ml -L/usr/local/cuda/targets/x86_64-linux/lib/stubs/
|
||||
CFLAGS += -DUSE_NVML
|
||||
OBJS += nvidia-sensors.o
|
||||
endif
|
||||
|
||||
ifdef USE_DCGM
|
||||
LDLIBS += -ldcgm
|
||||
CFLAGS += -DUSE_DCGM -I/usr/include/datacenter-gpu-manager-4/
|
||||
OBJS += nvidia-dcgm.o
|
||||
endif
|
||||
|
||||
ifdef USE_LIBPCI
|
||||
LDLIBS += -lpci
|
||||
CFLAGS += -DUSE_LIBPCI
|
||||
|
||||
@ -32,10 +32,10 @@ static struct hwmon_sensor hwmon_sensor_info[] =
|
||||
{"asusec", "temp1", "Motherboard ", "CHIP", "°C", 1000.0, -40.0 }, /* Chipset */
|
||||
{"asusec", "temp5", "Motherboard ", "VRM", "°C", 1000.0, -40.0 }, /* VRM */
|
||||
{"asusec", "temp3", "", "MOBO", "°C", 1000.0, -40.0 }, /* Motherboard */
|
||||
{"asusec", "temp4", "Motherboard ", "SENS", "°C", 1000.0, -40.0 }, /* T_Sensor */
|
||||
{"asusec", "temp6", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_In */
|
||||
{"asusec", "temp7", "Motherboard ", "H2O", "°C", 1000.0, -40.0 }, /* Water_Out */
|
||||
{"nvme", "temp1", "NVMe ", "NVME", "°C", 1000.0, -40.0 }, /* NVME Composite */
|
||||
{"asusec", "temp6", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_In */
|
||||
{"asusec", "temp4", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* T_Sensor */
|
||||
{"asusec", "temp7", "Motherboard ", "AIR", "°C", 1000.0, -40.0 }, /* Water_Out */
|
||||
{"nct6798", "fan2", "CPU fan", "CPU", "%", 15.0, 0.0 }, /* cpu_fan, cpu mid */
|
||||
{"asusec", "fan1", "", "CPU", "%", 15.0, 0.0 }, /* cpu_opt, cpu front */
|
||||
{"nct6798", "fan5", "H amp", "CHA", "%", 12.0, 0.0 }, /* h_amp, front fan */
|
||||
|
||||
20
evga-icx.c
20
evga-icx.c
@ -6,11 +6,15 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef USE_NVML
|
||||
#include "nvidia-sensors.h"
|
||||
#include "nvidia-sensors.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_DCGM
|
||||
#include "nvidia-dcgm.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_LIBPCI
|
||||
#include "gddr6.h"
|
||||
#include "gddr6.h"
|
||||
#endif
|
||||
|
||||
#include "icx3.h"
|
||||
@ -153,7 +157,6 @@ int main (int argc, char **argv)
|
||||
return -1;
|
||||
} else if (gpu_count == 0) {
|
||||
printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
|
||||
return -1;
|
||||
}
|
||||
/* Check for invalid GPUs */
|
||||
if (gpu_num > gpu_count - 1) {
|
||||
@ -195,6 +198,10 @@ int main (int argc, char **argv)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef USE_DCGM
|
||||
init_dcgm();
|
||||
#endif
|
||||
|
||||
/* PCI init for VRAM/hotspot temps */
|
||||
#ifdef USE_LIBPCI
|
||||
for (int i = 0; i < gpu_count; i++)
|
||||
@ -210,6 +217,10 @@ int main (int argc, char **argv)
|
||||
if (print_board_sensors)
|
||||
print_board_info(board_sensors, board_sensor_count);
|
||||
|
||||
#ifdef USE_DCGM
|
||||
print_dcgm(compact, overwrite);
|
||||
#endif
|
||||
|
||||
if (gpu_num == -1) {
|
||||
/* No GPU specified on command line, loop over all supported GPUs */
|
||||
for (int i = 0; i < gpu_count; i++){
|
||||
@ -238,6 +249,9 @@ int main (int argc, char **argv)
|
||||
#ifdef USE_NVML
|
||||
nvmlShutdown();
|
||||
#endif
|
||||
#ifdef USE_DCGM
|
||||
shutdown_dcgm();
|
||||
#endif
|
||||
}
|
||||
|
||||
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)
|
||||
|
||||
130
nvidia-dcgm.c
Normal file
130
nvidia-dcgm.c
Normal file
@ -0,0 +1,130 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#include "nvidia-dcgm.h"
|
||||
|
||||
/* Yeah I know this is duplicated */
|
||||
#define HEADER_COLOR_START "\x1b[36m"
|
||||
#define HEADER_COLOR_END "\x1b[39m"
|
||||
|
||||
dcgmHandle_t dcgm_handle = (dcgmHandle_t)NULL;
|
||||
dcgmFieldGrp_t field_group_id;
|
||||
|
||||
unsigned long long clock_reasons = 0;
|
||||
|
||||
void init_dcgm()
|
||||
{
|
||||
/* Init DCGM */
|
||||
dcgmReturn_t result;
|
||||
result = dcgmInit();
|
||||
if (result != DCGM_ST_OK)
|
||||
printf("Could not init DCGM: %s\n", errorString(result));
|
||||
|
||||
/* Start embedded */
|
||||
result = dcgmStartEmbedded(DCGM_OPERATION_MODE_MANUAL, &dcgm_handle);
|
||||
if (result != DCGM_ST_OK)
|
||||
printf("Could not start embedded DCGM engine: %s\n", errorString(result));
|
||||
|
||||
/* Create metric group */
|
||||
int num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);
|
||||
unsigned short field_ids[num_metrics + 1];
|
||||
for (int i = 0; i < num_metrics; i++) {
|
||||
field_ids[i] = metric_readings[i].field_id;
|
||||
metric_readings[i].last_reading = 0.0;
|
||||
}
|
||||
field_ids[num_metrics] = DCGM_FI_DEV_CLOCKS_EVENT_REASONS;
|
||||
result = dcgmFieldGroupCreate(dcgm_handle, num_metrics + 1, &field_ids[0], (char *)"evga_icx_fields", &field_group_id);
|
||||
if (result != DCGM_ST_OK)
|
||||
printf("Could not create DCGM metric group: %s\n", errorString(result));
|
||||
|
||||
/* Start recording */
|
||||
result = dcgmWatchFields(dcgm_handle, DCGM_GROUP_ALL_GPUS , field_group_id, 1000000, 30, 1);
|
||||
if (result != DCGM_ST_OK)
|
||||
printf("Could not record DCGM metrics: %s\n", errorString(result));
|
||||
}
|
||||
|
||||
void shutdown_dcgm()
|
||||
{
|
||||
dcgmStopEmbedded(dcgm_handle);
|
||||
dcgmShutdown();
|
||||
}
|
||||
|
||||
void print_dcgm(int compact, int overwrite)
|
||||
{
|
||||
dcgmReturn_t result;
|
||||
dcgmUpdateAllFields(dcgm_handle, 1);
|
||||
result = dcgmGetLatestValues(dcgm_handle, DCGM_GROUP_ALL_GPUS, field_group_id, &update_field_values, NULL);
|
||||
if (result != DCGM_ST_OK)
|
||||
printf("Could not read DCGM metrics: %s\n", errorString(result));
|
||||
|
||||
/* Print the float metrics */
|
||||
for (int i = 0; i < sizeof(metric_readings) / sizeof(metric_readings[0]); i++) {
|
||||
if (compact)
|
||||
printf("%s%s%s%s %4.1f",
|
||||
i==0 ? "": " ",
|
||||
HEADER_COLOR_START,
|
||||
metric_readings[i].short_name,
|
||||
HEADER_COLOR_END,
|
||||
metric_readings[i].last_reading * 100);
|
||||
else
|
||||
printf("%s: %.1f%\n", metric_readings[i].long_name, metric_readings[i].last_reading * 100);
|
||||
}
|
||||
if (compact)
|
||||
printf("%%");
|
||||
|
||||
/* Print clock reasons */
|
||||
int single_reason = 1;
|
||||
if (compact)
|
||||
printf("%s CLK %s", HEADER_COLOR_START, HEADER_COLOR_END);
|
||||
else
|
||||
printf("Clock reasons: ");
|
||||
|
||||
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
|
||||
if (clock_reasons & clock_reason_names[i].mask) {
|
||||
if (!single_reason) {
|
||||
if (compact)
|
||||
printf(",");
|
||||
else
|
||||
printf(", ");
|
||||
}
|
||||
single_reason = 0;
|
||||
|
||||
if (compact)
|
||||
printf("%s", clock_reason_names[i].short_name);
|
||||
else
|
||||
printf("%s", clock_reason_names[i].long_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (single_reason)
|
||||
printf("None");
|
||||
|
||||
if (!compact)
|
||||
printf(" (0x%llx)", clock_reasons);
|
||||
else
|
||||
printf("%-15s", " ");
|
||||
|
||||
if (overwrite)
|
||||
printf("\x1b[1G");
|
||||
}
|
||||
|
||||
/*
 * Callback passed to dcgmGetLatestValues(): store the latest field values.
 *
 * gpu_id     - id of the GPU the values belong to (unused; readings from
 *              every GPU are written to the same globals).
 * values     - array of num_values field values.
 * num_values - number of entries in values.
 * userdata   - opaque pointer from the caller (unused).
 *
 * DCGM_FT_DOUBLE values are copied into the metric_readings[] entry with the
 * same field id.  The DCGM_FI_DEV_CLOCKS_EVENT_REASONS value is stored in
 * clock_reasons; other integer fields are ignored so an unrelated INT64 field
 * can never be misread as the clock-reason bitmask.
 *
 * Always returns 0.
 */
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata)
{
    const size_t num_metrics = sizeof(metric_readings) / sizeof(metric_readings[0]);

    (void)gpu_id;
    (void)userdata;

    for (int j = 0; j < num_values; j++) {
        const dcgmFieldValue_v1 *value = &values[j];

        if (value->fieldType == DCGM_FT_DOUBLE) {
            /* A double reading so read it into our readings array */
            for (size_t i = 0; i < num_metrics; i++) {
                if (metric_readings[i].field_id != value->fieldId) {
                    continue;
                }

                /*
                 * Readings of 1 or more are stored as 0.0.
                 * NOTE(review): presumably this filters DCGM blank/sentinel
                 * values; it also zeroes a genuine reading of exactly 1.0 -
                 * confirm that is intended.
                 */
                double val = value->value.dbl;
                metric_readings[i].last_reading = (val >= 1) ? 0.0 : val;
            }
        } else if (value->fieldType == DCGM_FT_INT64 &&
                   value->fieldId == DCGM_FI_DEV_CLOCKS_EVENT_REASONS) {
            /* Bitmask of our clock reasons */
            clock_reasons = (unsigned long long)value->value.i64;
        }
    }
    return 0;
}
|
||||
51
nvidia-dcgm.h
Normal file
51
nvidia-dcgm.h
Normal file
@ -0,0 +1,51 @@
|
||||
#include <dcgm_agent.h>
|
||||
#include <dcgm_structs.h>
|
||||
|
||||
#include "evga-card.h"
|
||||
|
||||
struct metric_reading {
|
||||
unsigned short field_id;
|
||||
char *short_name;
|
||||
char *long_name;
|
||||
double last_reading;
|
||||
};
|
||||
|
||||
static struct metric_reading metric_readings[] =
|
||||
{
|
||||
{DCGM_FI_PROF_GR_ENGINE_ACTIVE, "GFX", "Graphics Engine Activity"},
|
||||
{DCGM_FI_PROF_SM_ACTIVE, "SM ACT", "SM Activity"},
|
||||
{DCGM_FI_PROF_SM_OCCUPANCY, "OCC", "SM Occupancy"},
|
||||
{DCGM_FI_PROF_PIPE_FP16_ACTIVE, "FP16", "FP16 Engine Activity"},
|
||||
{DCGM_FI_PROF_PIPE_FP32_ACTIVE, "FP32", "FP32 Engine Activity"},
|
||||
{DCGM_FI_PROF_PIPE_FP64_ACTIVE, "FP64", "FP64 Engine Activity"},
|
||||
{DCGM_FI_PROF_PIPE_INT_ACTIVE, "INT", "Integer Activity"},
|
||||
{DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, "TENS", "Tensor Activity"},
|
||||
{DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE, "HMMA", "Half Precision Tensor"},
|
||||
{DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE, "IMMA", "Integer Tensor"},
|
||||
{DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE, "DFMA", "Double Precision Tensor"},
|
||||
{DCGM_FI_PROF_DRAM_ACTIVE, "DRAM", "Memory BW Utilization"}
|
||||
};
|
||||
|
||||
struct clock_reason {
|
||||
unsigned long long mask;
|
||||
char *short_name;
|
||||
char *long_name;
|
||||
};
|
||||
|
||||
static struct clock_reason clock_reason_names[] =
|
||||
{
|
||||
{DCGM_CLOCKS_EVENT_REASON_GPU_IDLE, "Idle", "GPU idle"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_CLOCKS_SETTING, "AppClk", "Application clocks"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_SW_POWER_CAP, "Pwr", "Power cap"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_HW_SLOWDOWN, "HWSlow", "Hardware slowdown"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_SYNC_BOOST, "Sync", "Sync boost"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_SW_THERMAL, "SWTherm", "Software thermal"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_HW_THERMAL, "HWTherm", "Hardware thermal"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_HW_POWER_BRAKE, "HWPower", "Hardware power brake"},
|
||||
{DCGM_CLOCKS_EVENT_REASON_DISPLAY_CLOCKS, "DispClk", "Display clock"}
|
||||
};
|
||||
|
||||
void init_dcgm();
|
||||
void shutdown_dcgm();
|
||||
void print_dcgm(int compact, int overwrite);
|
||||
int update_field_values(unsigned int gpu_id, dcgmFieldValue_v1 *values, int num_values, void *userdata);
|
||||
@ -36,7 +36,7 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
|
||||
single_reason = 0;
|
||||
|
||||
if (compact)
|
||||
printf("%-15s", clock_reason_names[i].short_name);
|
||||
printf("%s", clock_reason_names[i].short_name);
|
||||
else
|
||||
printf("%s", clock_reason_names[i].long_name);
|
||||
}
|
||||
@ -47,6 +47,8 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
|
||||
|
||||
if (!compact)
|
||||
printf(" (0x%llx)", reasons);
|
||||
else
|
||||
printf("%-15s", " ");
|
||||
}
|
||||
|
||||
unsigned int get_nvml_temp(struct card_info *card)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user