From 4c5321bc4eb4ed5e55d56d7186ae9a6bd0eabf36 Mon Sep 17 00:00:00 2001 From: moosecrap Date: Sun, 2 Feb 2025 21:42:30 -0800 Subject: [PATCH] Added support for some NVML sensors. GPU temp sensor and clock reasons. --- makefile => Makefile | 9 ++++- README.md | 5 +++ evga-icx.c | 66 +++++++++++++++++++++--------- icx3.c | 4 +- nvidia-sensors.c | 95 ++++++++++++++++++++++++++++++++++++++++++++ nvidia-sensors.h | 28 +++++++++++++ 6 files changed, 185 insertions(+), 22 deletions(-) rename makefile => Makefile (55%) create mode 100644 nvidia-sensors.c create mode 100644 nvidia-sensors.h diff --git a/makefile b/Makefile similarity index 55% rename from makefile rename to Makefile index 4444197..b2f91d9 100644 --- a/makefile +++ b/Makefile @@ -4,13 +4,20 @@ OBJS = evga-icx.o evga-card.o icx3.o LDLIBS = -li2c CFLAGS = -MD +ifdef USE_NVML +LDLIBS += -lnvidia-ml +CFLAGS += -DUSE_NVML +OBJS += nvidia-sensors.o +endif + evga-icx : $(OBJS) debug : CFLAGS += -g -O0 debug : evga-icx clean : - rm evga-icx $(OBJS) + rm evga-icx + rm *.o rm *.d -include $(OBJS:.o=.d) \ No newline at end of file diff --git a/README.md b/README.md index e5f4911..24e4840 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,11 @@ Access to the `/dev/i2c` device files, which means either: ## Building `make` +## Optional features + +### NVML support +Add the make flag `USE_NVML=1` and the tool will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. + ## Usage Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode. 
diff --git a/evga-icx.c b/evga-icx.c index 0598dc0..2b12a7d 100644 --- a/evga-icx.c +++ b/evga-icx.c @@ -5,6 +5,10 @@ #include #include +#ifdef USE_NVML +#include "nvidia-sensors.h" +#endif + #include "icx3.h" #include "evga-card.h" @@ -32,7 +36,7 @@ int main (int argc, char **argv) int print_info = 0; int compact = 0; int gpu_num = -1; /* Card to control */ - int watch = -1; + unsigned int watch = 0; char *fan_speed[ICX3_MAX_FANS] = {NULL}; /* Input parsing */ @@ -125,33 +129,57 @@ int main (int argc, char **argv) } } + /* NVML init */ +#ifdef USE_NVML + int nvml_ok = init_nvml(); + nvmlDevice_t* device[MAX_GPUS]; +#endif + /* print sensor info */ -print: if (print_info) { - if (gpu_num == -1) { - for (int i = 0; i < gpu_count; i++){ - print_gpu_info(i, gpus, compact); - } - } else if (gpu_num <= gpu_count - 1) { - print_gpu_info(gpu_num, gpus, compact); - } - } - if (watch > 0) { - sleep(watch); - goto print; + do { + if (gpu_num == -1) { + for (int i = 0; i < gpu_count; i++){ + print_gpu_info(i, &gpus[i], compact); + } + } else if (gpu_num <= gpu_count - 1) { + print_gpu_info(gpu_num, &gpus[gpu_num], compact); + } + sleep(watch); + } while (watch > 0); } + +#ifdef USE_NVML + nvmlShutdown(); +#endif } -void print_gpu_info(int gpu_num, struct card_info gpus[], int compact) { +void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) { if (compact) { + /* One line per GPU */ printf("#%d ", gpu_num); - print_icx3_fans_oneline(&gpus[gpu_num]); - print_icx3_temps_oneline(&gpus[gpu_num]); + print_icx3_fans_oneline(gpu); + printf(" GPU"); +#ifdef USE_NVML + print_nvml_temp(1, gpu); +#endif + print_icx3_temps_oneline(gpu); + printf("°C"); +#ifdef USE_NVML + print_nvml_clock_reason(1, gpu); +#endif printf("\n"); } else { - printf("#%d: %s (%s) @ %s\n", gpu_num, gpus[gpu_num].card_name, gpus[gpu_num].i2c_dev_path, gpus[gpu_num].pci_id); - print_icx3_fans(&gpus[gpu_num]); - print_icx3_temps(&gpus[gpu_num]); + /* One line per GPU sensor */ + printf("#%d: %s (%s) @ 
%s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id); + print_icx3_fans(gpu); +#ifdef USE_NVML + print_nvml_temp(0, gpu); +#endif + print_icx3_temps(gpu); +#ifdef USE_NVML + print_nvml_clock_reason(0, gpu); +#endif printf("\n"); } diff --git a/icx3.c b/icx3.c index 7c460d1..dadc94d 100644 --- a/icx3.c +++ b/icx3.c @@ -89,11 +89,11 @@ void print_icx3_temps_oneline(struct card_info *card) get_temp_sensors(temps, card); for (int i=0; i 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3)) printf(" %.3s", icx3_temp_sensor_names[i]); printf(" %3.0f", temps[i]); } - printf("°C"); } void get_available_fans(char *fans_avail, struct card_info *card) diff --git a/nvidia-sensors.c b/nvidia-sensors.c new file mode 100644 index 0000000..c2d7f2a --- /dev/null +++ b/nvidia-sensors.c @@ -0,0 +1,95 @@ +#include + +#include "nvidia-sensors.h" + +int init_nvml() +{ + nvmlReturn_t result; + result = nvmlInit_v2(); + if (result != NVML_SUCCESS) { + printf("Could not init NVML: %s\n", nvmlErrorString(result)); + return 0; + } +} + +void print_nvml_temp(int compact, struct card_info *card) +{ + if (compact) + printf(" %3d", get_nvml_temp(card)); + else + printf("GPU1: %+d°C\n", get_nvml_temp(card)); +} + +void print_nvml_clock_reason(int compact, struct card_info *card) +{ + unsigned long long reasons = get_nvml_clock_reasons(card); + int single_reason = 1; + + if (compact) + printf(" CLK "); + else + printf("Clock reasons: "); + + for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) { + if (reasons & clock_reason_names[i].mask) { + if (!single_reason) { + if (compact) + printf(","); + else + printf(", "); + } + single_reason = 0; + + if (compact) + printf("%s", clock_reason_names[i].short_name); + else + printf("%s", clock_reason_names[i].long_name); + } + } + + if (single_reason) + printf("None"); + + if (!compact) + printf(" (0x%llx)\n", reasons); +} + +unsigned int get_nvml_temp(struct card_info *card) +{ + nvmlReturn_t 
result; + nvmlDevice_t nvml_device; + result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); + if (result != NVML_SUCCESS) { + printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + + unsigned int temp; + result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp); + if (result != NVML_SUCCESS) { + printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + return temp; +} + +unsigned long long get_nvml_clock_reasons(struct card_info *card) +{ + nvmlReturn_t result; + nvmlDevice_t nvml_device; + result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); + if (result != NVML_SUCCESS) { + printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + + unsigned long long reasons; + result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ; + if (result != NVML_SUCCESS) { + printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + + return reasons; +} + diff --git a/nvidia-sensors.h b/nvidia-sensors.h new file mode 100644 index 0000000..8d74a51 --- /dev/null +++ b/nvidia-sensors.h @@ -0,0 +1,28 @@ +#include + +#include "evga-card.h" + +struct clock_reason { + unsigned long long mask; + char *short_name; + char *long_name; +}; + +static struct clock_reason clock_reason_names[] = +{ + {nvmlClocksEventReasonGpuIdle, "Idle", "GPU idle"}, + {nvmlClocksEventReasonApplicationsClocksSetting, "AppClk", "Application clocks"}, + {nvmlClocksEventReasonSwPowerCap, "Pwr", "Power cap"}, + {nvmlClocksThrottleReasonHwSlowdown, "HWSlow", "Hardware slowdown"}, + {nvmlClocksEventReasonSyncBoost, "Sync", "Sync boost"}, + {nvmlClocksEventReasonSwThermalSlowdown, "SWTherm", "Software thermal"}, + {nvmlClocksThrottleReasonHwThermalSlowdown, "HWTherm", "Hardware thermal"}, + 
{nvmlClocksThrottleReasonHwPowerBrakeSlowdown, "HWPower", "Hardware power brake"}, + {nvmlClocksEventReasonDisplayClockSetting, "DispClk", "Display clock"} +}; + +int init_nvml(); +void print_nvml_temp(int compact, struct card_info *card); +void print_nvml_clock_reason(int compact, struct card_info *card); +unsigned int get_nvml_temp(struct card_info *card); +unsigned long long get_nvml_clock_reasons(struct card_info *card); \ No newline at end of file