Added support for some NVML sensors.

GPU temp sensor and clock reasons.
This commit is contained in:
moosecrap 2025-02-02 21:42:30 -08:00
parent 4cb9ef42a6
commit 4c5321bc4e
6 changed files with 185 additions and 22 deletions

View File

@ -4,13 +4,20 @@ OBJS = evga-icx.o evga-card.o icx3.o
LDLIBS = -li2c LDLIBS = -li2c
CFLAGS = -MD CFLAGS = -MD
ifdef USE_NVML
LDLIBS += -lnvidia-ml
CFLAGS += -DUSE_NVML
OBJS += nvidia-sensors.o
endif
evga-icx : $(OBJS) evga-icx : $(OBJS)
debug : CFLAGS += -g -O0 debug : CFLAGS += -g -O0
debug : evga-icx debug : evga-icx
clean : clean :
rm evga-icx $(OBJS) rm evga-icx
rm *.o
rm *.d rm *.d
-include $(OBJS:.o=.d) -include $(OBJS:.o=.d)

View File

@ -24,6 +24,11 @@ Access to the `/dev/i2c` device files, which means either:
## Building ## Building
`make` `make`
## Optional features
### NVML support
Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason.
## Usage ## Usage
Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode. Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode.

View File

@ -5,6 +5,10 @@
#include <fcntl.h> #include <fcntl.h>
#include <stdlib.h> #include <stdlib.h>
#ifdef USE_NVML
#include "nvidia-sensors.h"
#endif
#include "icx3.h" #include "icx3.h"
#include "evga-card.h" #include "evga-card.h"
@ -32,7 +36,7 @@ int main (int argc, char **argv)
int print_info = 0; int print_info = 0;
int compact = 0; int compact = 0;
int gpu_num = -1; /* Card to control */ int gpu_num = -1; /* Card to control */
int watch = -1; unsigned int watch = 0;
char *fan_speed[ICX3_MAX_FANS] = {NULL}; char *fan_speed[ICX3_MAX_FANS] = {NULL};
/* Input parsing */ /* Input parsing */
@ -125,33 +129,57 @@ int main (int argc, char **argv)
} }
} }
/* NVML init */
#ifdef USE_NVML
int nvml_ok = init_nvml();
nvmlDevice_t* device[MAX_GPUS];
#endif
/* print sensor info */ /* print sensor info */
print:
if (print_info) { if (print_info) {
if (gpu_num == -1) { do {
for (int i = 0; i < gpu_count; i++){ if (gpu_num == -1) {
print_gpu_info(i, gpus, compact); for (int i = 0; i < gpu_count; i++){
} print_gpu_info(i, &gpus[i], compact);
} else if (gpu_num <= gpu_count - 1) { }
print_gpu_info(gpu_num, gpus, compact); } else if (gpu_num <= gpu_count - 1) {
} print_gpu_info(gpu_num, &gpus[gpu_num], compact);
} }
if (watch > 0) { sleep(watch);
sleep(watch); } while (watch > 0);
goto print;
} }
#ifdef USE_NVML
nvmlShutdown();
#endif
} }
void print_gpu_info(int gpu_num, struct card_info gpus[], int compact) { void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
if (compact) { if (compact) {
/* One line per GPU */
printf("#%d ", gpu_num); printf("#%d ", gpu_num);
print_icx3_fans_oneline(&gpus[gpu_num]); print_icx3_fans_oneline(gpu);
print_icx3_temps_oneline(&gpus[gpu_num]); printf(" GPU");
#ifdef USE_NVML
print_nvml_temp(1, gpu);
#endif
print_icx3_temps_oneline(gpu);
printf("°C");
#ifdef USE_NVML
print_nvml_clock_reason(1, gpu);
#endif
printf("\n"); printf("\n");
} else { } else {
printf("#%d: %s (%s) @ %s\n", gpu_num, gpus[gpu_num].card_name, gpus[gpu_num].i2c_dev_path, gpus[gpu_num].pci_id); /* One line per GPU sensor */
print_icx3_fans(&gpus[gpu_num]); printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id);
print_icx3_temps(&gpus[gpu_num]); print_icx3_fans(gpu);
#ifdef USE_NVML
print_nvml_temp(0, gpu);
#endif
print_icx3_temps(gpu);
#ifdef USE_NVML
print_nvml_clock_reason(0, gpu);
#endif
printf("\n"); printf("\n");
} }

4
icx3.c
View File

@ -89,11 +89,11 @@ void print_icx3_temps_oneline(struct card_info *card)
get_temp_sensors(temps, card); get_temp_sensors(temps, card);
for (int i=0; i<ICX3_NUM_TEMP_SENSORS; i++) { for (int i=0; i<ICX3_NUM_TEMP_SENSORS; i++) {
if (i == 0 || strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3)) /* If this math seems a little jank, it's so we can optionally inject the NVML and vram temps into the oneline */
if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
printf(" %.3s", icx3_temp_sensor_names[i]); printf(" %.3s", icx3_temp_sensor_names[i]);
printf(" %3.0f", temps[i]); printf(" %3.0f", temps[i]);
} }
printf("°C");
} }
void get_available_fans(char *fans_avail, struct card_info *card) void get_available_fans(char *fans_avail, struct card_info *card)

95
nvidia-sensors.c Normal file
View File

@ -0,0 +1,95 @@
#include <stdio.h>
#include "nvidia-sensors.h"
/*
 * Initialize the NVML library.
 *
 * Returns 1 when nvmlInit_v2() succeeds. On failure the NVML error string
 * is printed and 0 is returned.
 */
int init_nvml(void)
{
    nvmlReturn_t result = nvmlInit_v2();

    if (result != NVML_SUCCESS) {
        printf("Could not init NVML: %s\n", nvmlErrorString(result));
        return 0;
    }

    /* Report success explicitly so the caller's stored result is well defined */
    return 1;
}
/*
 * Print the GPU temperature obtained from get_nvml_temp() for one card.
 *
 * compact: non-zero prints only the value, right-aligned in a 3-wide field
 *          after a leading space and without a newline; zero prints a
 *          complete "GPU1: " line with an explicit sign and unit.
 * card:    card passed through to get_nvml_temp().
 */
void print_nvml_temp(int compact, struct card_info *card)
{
    /*
     * get_nvml_temp() returns unsigned int, while both formats below use a
     * signed conversion (the '+' flag only exists for signed values).
     * Convert once so the argument type matches the conversion specifier.
     */
    int temp = (int)get_nvml_temp(card);

    if (compact)
        printf(" %3d", temp);
    else
        printf("GPU1: %+d°C\n", temp);
}
void print_nvml_clock_reason(int compact, struct card_info *card)
{
unsigned long long reasons = get_nvml_clock_reasons(card);
int single_reason = 1;
if (compact)
printf(" CLK ");
else
printf("Clock reasons: ");
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
if (reasons & clock_reason_names[i].mask) {
if (!single_reason) {
if (compact)
printf(",");
else
printf(", ");
}
single_reason = 0;
if (compact)
printf("%s", clock_reason_names[i].short_name);
else
printf("%s", clock_reason_names[i].long_name);
}
}
if (single_reason)
printf("None");
if (!compact)
printf(" (0x%llx)\n", reasons);
}
/*
 * Read the NVML_TEMPERATURE_GPU sensor of the card identified by
 * card->pci_id.
 *
 * Returns the value filled in by nvmlDeviceGetTemperature(). If either the
 * device handle lookup or the temperature query fails, the NVML error string
 * is printed and 0 is returned.
 */
unsigned int get_nvml_temp(struct card_info *card)
{
    nvmlDevice_t handle;
    nvmlReturn_t status = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &handle);

    if (status == NVML_SUCCESS) {
        unsigned int reading;

        status = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU, &reading);
        if (status == NVML_SUCCESS)
            return reading;

        printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(status));
        return 0;
    }

    printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(status));
    return 0;
}
/*
 * Fetch the current clock event reasons bitmask of the card identified by
 * card->pci_id.
 *
 * Returns the mask filled in by nvmlDeviceGetCurrentClocksEventReasons().
 * If either the device handle lookup or the query fails, the NVML error
 * string is printed and 0 is returned.
 */
unsigned long long get_nvml_clock_reasons(struct card_info *card)
{
    nvmlDevice_t handle;
    nvmlReturn_t status = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &handle);

    if (status == NVML_SUCCESS) {
        unsigned long long mask;

        status = nvmlDeviceGetCurrentClocksEventReasons(handle, &mask);
        if (status == NVML_SUCCESS)
            return mask;

        printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(status));
        return 0;
    }

    printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(status));
    return 0;
}

28
nvidia-sensors.h Normal file
View File

@ -0,0 +1,28 @@
#include <nvml.h>
#include "evga-card.h"
/* Pairs one clock reason bit with the labels used when printing it */
struct clock_reason {
    unsigned long long mask;    /* bit tested against the reasons bitmask */
    const char *short_name;     /* label used in compact output */
    const char *long_name;      /* label used in the long output */
};
/*
 * Lookup table of the clock reasons that get a printable name.
 *
 * Defined static in this header, so every translation unit that includes it
 * gets its own private copy. It is const because the table is only ever read.
 */
static const struct clock_reason clock_reason_names[] =
{
    {nvmlClocksEventReasonGpuIdle, "Idle", "GPU idle"},
    {nvmlClocksEventReasonApplicationsClocksSetting, "AppClk", "Application clocks"},
    {nvmlClocksEventReasonSwPowerCap, "Pwr", "Power cap"},
    {nvmlClocksThrottleReasonHwSlowdown, "HWSlow", "Hardware slowdown"},
    {nvmlClocksEventReasonSyncBoost, "Sync", "Sync boost"},
    {nvmlClocksEventReasonSwThermalSlowdown, "SWTherm", "Software thermal"},
    {nvmlClocksThrottleReasonHwThermalSlowdown, "HWTherm", "Hardware thermal"},
    {nvmlClocksThrottleReasonHwPowerBrakeSlowdown, "HWPower", "Hardware power brake"},
    {nvmlClocksEventReasonDisplayClockSetting, "DispClk", "Display clock"}
};
/* Initialize NVML; prints the NVML error string and returns 0 on failure. */
int init_nvml(void);

/*
 * Print the NVML GPU temperature for a card. A non-zero `compact` prints the
 * bare value with no newline; zero prints a full "GPU1: " line.
 */
void print_nvml_temp(int compact, struct card_info *card);

/*
 * Print the names of the clock reasons currently set for a card. A non-zero
 * `compact` uses the short names with no newline; zero uses the long names
 * followed by the raw mask in hex and a newline.
 */
void print_nvml_clock_reason(int compact, struct card_info *card);

/*
 * Return the NVML GPU temperature for the card at card->pci_id. On failure an
 * error is printed and 0 is returned.
 */
unsigned int get_nvml_temp(struct card_info *card);

/*
 * Return the current clock reasons bitmask for the card at card->pci_id. On
 * failure an error is printed and 0 is returned.
 */
unsigned long long get_nvml_clock_reasons(struct card_info *card);