From c4e7f7e8b965b79a4289d89410378f4d2becdcf0 Mon Sep 17 00:00:00 2001 From: moosecrap Date: Fri, 7 Feb 2025 08:23:46 -0800 Subject: [PATCH] Added NVML memory controller load percentage. --- LICENSE | 2 ++ README.md | 35 ++++++++++++++++++----------------- evga-icx.c | 4 ++++ nvidia-sensors.c | 45 ++++++++++++++++++++++++++++++++------------- nvidia-sensors.h | 4 +++- 5 files changed, 59 insertions(+), 31 deletions(-) diff --git a/LICENSE b/LICENSE index d1c763b..2240d34 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +MIT No Attribution + Copyright 2025 admin@long-cat.net Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. diff --git a/README.md b/README.md index 155adbe..498d103 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Access to the `/dev/i2c` device files, which means either: ## Optional features ### NVML support -Add the make flag `USE_NVML=1` and the it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed. +Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason and memory controller utilization. This requires the NVIDIA management library (NVML) to be installed. ### VRAM and Hotspot temperature Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. 
These sensors are **extremely** undocumented so I can't say anything about their accuracy. @@ -59,29 +59,30 @@ Read sensors: ```text $ ./evga-icx #0: EVGA GeForce RTX 3090 FTW3 Ultra v2 (/dev/i2c-3) @ c:00.0 -Fan 0: 2133 RPM (71/0%, Auto) -Fan 1: 2123 RPM (70/0%, Auto) -Fan 2: 2122 RPM (70/0%, Offset) +Fan 0: 1751 RPM (58/0%, Auto) +Fan 1: 1730 RPM (57/0%, Auto) +Fan 2: 1712 RPM (57/0%, Offset) Ext. fan: 0 RPM (0/0%, Offset) -GPU1: +73°C -GPU2: +70.6°C -VRAM: +86°C -MEM1: +68.9°C -MEM2: +65.1°C -MEM3: +70.2°C -PWR1: +60.9°C -PWR2: +66.4°C -PWR3: +73.0°C -PWR4: +71.5°C -PWR5: +65.6°C -HotSpot: +84°C +GPU1: +65°C +GPU2: +57.8°C +VRAM: +74°C +MEM1: +56.1°C +MEM2: +53.5°C +MEM3: +55.5°C +PWR1: +48.2°C +PWR2: +53.2°C +PWR3: +59.6°C +PWR4: +58.0°C +PWR5: +51.1°C +HotSpot: +75°C +Mem util: 43% Clock reasons: Power cap (0x4) ``` Compact one-line mode: ```text $ ./evga-icx --compact -#0 FAN 72 72 72 0% GPU 76 73 MEM 90 72 67 72 PWR 63 69 76 74 68 HOT 86°C CLK Pwr +#0 FAN 59 59 58 0% GPU 66 60 MEM 74 58 55 58 PWR 49 55 61 60 53 HOT 77°C MEM 42% CLK Pwr ``` Set external fan to follow Nvidia driver controlled speed with a -500 RPM offset: diff --git a/evga-icx.c b/evga-icx.c index 028b39a..38ae0be 100644 --- a/evga-icx.c +++ b/evga-icx.c @@ -226,6 +226,8 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) { printf("°C "); #ifdef USE_NVML + printf("%s MEM %s", header_start, header_end); + printf("%3d%%", get_nvml_mem_util(gpu)); printf("%s CLK %s", header_start, header_end); print_nvml_clock_reason(1, gpu); #endif @@ -256,8 +258,10 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) { #endif #ifdef USE_NVML + printf("Mem util: %d%%\n", get_nvml_mem_util(gpu)); printf("Clock reasons: "); print_nvml_clock_reason(0, gpu); + printf("\n"); #endif } diff --git a/nvidia-sensors.c b/nvidia-sensors.c index 90aeb44..1b258df 100644 --- a/nvidia-sensors.c +++ b/nvidia-sensors.c @@ -10,6 +10,17 @@ void init_nvml() printf("Could not init NVML: %s\n", 
nvmlErrorString(result)); } +int get_nvml_handle(nvmlDevice_t *device, struct card_info *card) +{ + nvmlReturn_t result; + result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, device); + if (result != NVML_SUCCESS) { + printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + return 1; +} + void print_nvml_clock_reason(int compact, struct card_info *card) { unsigned long long reasons = get_nvml_clock_reasons(card); @@ -36,21 +47,17 @@ void print_nvml_clock_reason(int compact, struct card_info *card) printf("None"); if (!compact) - printf(" (0x%llx)\n", reasons); + printf(" (0x%llx)", reasons); } unsigned int get_nvml_temp(struct card_info *card) { - nvmlReturn_t result; nvmlDevice_t nvml_device; - result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); - if (result != NVML_SUCCESS) { - printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + if (!get_nvml_handle(&nvml_device, card)) return 0; - } unsigned int temp; - result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp); + nvmlReturn_t result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp); if (result != NVML_SUCCESS) { printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); return 0; @@ -60,16 +67,12 @@ unsigned int get_nvml_temp(struct card_info *card) unsigned long long get_nvml_clock_reasons(struct card_info *card) { - nvmlReturn_t result; nvmlDevice_t nvml_device; - result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); - if (result != NVML_SUCCESS) { - printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + if (!get_nvml_handle(&nvml_device, card)) return 0; - } unsigned long long reasons; - result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ; + nvmlReturn_t result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) 
; if (result != NVML_SUCCESS) { printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); return 0; @@ -78,3 +81,19 @@ unsigned long long get_nvml_clock_reasons(struct card_info *card) return reasons; } +unsigned int get_nvml_mem_util(struct card_info *card) +{ + nvmlDevice_t nvml_device; + if (!get_nvml_handle(&nvml_device, card)) + return 0; + + nvmlUtilization_t util; + nvmlReturn_t result = nvmlDeviceGetUtilizationRates(nvml_device, &util); + if (result != NVML_SUCCESS) { + printf("Failed to get memory utilization for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + return 0; + } + + return util.memory; +} + diff --git a/nvidia-sensors.h b/nvidia-sensors.h index c31e4ce..f9b0301 100644 --- a/nvidia-sensors.h +++ b/nvidia-sensors.h @@ -22,6 +22,8 @@ static struct clock_reason clock_reason_names[] = }; void init_nvml(); +int get_nvml_handle(nvmlDevice_t *device, struct card_info *card); void print_nvml_clock_reason(int compact, struct card_info *card); unsigned int get_nvml_temp(struct card_info *card); -unsigned long long get_nvml_clock_reasons(struct card_info *card); \ No newline at end of file +unsigned long long get_nvml_clock_reasons(struct card_info *card); +unsigned int get_nvml_mem_util(struct card_info *card); \ No newline at end of file