Added NVML memory controller load percentage.

This commit is contained in:
moosecrap 2025-02-07 08:23:46 -08:00
parent 442a524243
commit c4e7f7e8b9
5 changed files with 59 additions and 31 deletions

View File

@ -1,3 +1,5 @@
MIT No Attribution
Copyright 2025 admin@long-cat.net Copyright 2025 admin@long-cat.net
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so.

View File

@ -29,7 +29,7 @@ Access to the `/dev/i2c` device files, which means either:
## Optional features ## Optional features
### NVML support ### NVML support
Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed. Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason and memory controller utilization. This requires the NVIDIA management library (NVML) to be installed.
### VRAM and Hotspot temperature ### VRAM and Hotspot temperature
Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. These sensors are **extremely** undocumented so I can't say anything about their accuracy. Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. These sensors are **extremely** undocumented so I can't say anything about their accuracy.
@ -59,29 +59,30 @@ Read sensors:
```text ```text
$ ./evga-icx $ ./evga-icx
#0: EVGA GeForce RTX 3090 FTW3 Ultra v2 (/dev/i2c-3) @ c:00.0 #0: EVGA GeForce RTX 3090 FTW3 Ultra v2 (/dev/i2c-3) @ c:00.0
Fan 0: 2133 RPM (71/0%, Auto) Fan 0: 1751 RPM (58/0%, Auto)
Fan 1: 2123 RPM (70/0%, Auto) Fan 1: 1730 RPM (57/0%, Auto)
Fan 2: 2122 RPM (70/0%, Offset) Fan 2: 1712 RPM (57/0%, Offset)
Ext. fan: 0 RPM (0/0%, Offset) Ext. fan: 0 RPM (0/0%, Offset)
GPU1: +73°C GPU1: +65°C
GPU2: +70.6°C GPU2: +57.8°C
VRAM: +86°C VRAM: +74°C
MEM1: +68.9°C MEM1: +56.1°C
MEM2: +65.1°C MEM2: +53.5°C
MEM3: +70.2°C MEM3: +55.5°C
PWR1: +60.9°C PWR1: +48.2°C
PWR2: +66.4°C PWR2: +53.2°C
PWR3: +73.0°C PWR3: +59.6°C
PWR4: +71.5°C PWR4: +58.0°C
PWR5: +65.6°C PWR5: +51.1°C
HotSpot: +84°C HotSpot: +75°C
Mem util: 43%
Clock reasons: Power cap (0x4) Clock reasons: Power cap (0x4)
``` ```
Compact one-line mode: Compact one-line mode:
```text ```text
$ ./evga-icx --compact $ ./evga-icx --compact
#0 FAN 72 72 72 0% GPU 76 73 MEM 90 72 67 72 PWR 63 69 76 74 68 HOT 86°C CLK Pwr #0 FAN 59 59 58 0% GPU 66 60 MEM 74 58 55 58 PWR 49 55 61 60 53 HOT 77°C MEM 42% CLK Pwr
``` ```
Set external fan to follow Nvidia driver controlled speed with a -500 RPM offset: Set external fan to follow Nvidia driver controlled speed with a -500 RPM offset:

View File

@ -226,6 +226,8 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
printf("°C "); printf("°C ");
#ifdef USE_NVML #ifdef USE_NVML
printf("%s MEM %s", header_start, header_end);
printf("%3d%%", get_nvml_mem_util(gpu));
printf("%s CLK %s", header_start, header_end); printf("%s CLK %s", header_start, header_end);
print_nvml_clock_reason(1, gpu); print_nvml_clock_reason(1, gpu);
#endif #endif
@ -256,8 +258,10 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
#endif #endif
#ifdef USE_NVML #ifdef USE_NVML
printf("Mem util: %d%%\n", get_nvml_mem_util(gpu));
printf("Clock reasons: "); printf("Clock reasons: ");
print_nvml_clock_reason(0, gpu); print_nvml_clock_reason(0, gpu);
printf("\n");
#endif #endif
} }

View File

@ -10,6 +10,17 @@ void init_nvml()
printf("Could not init NVML: %s\n", nvmlErrorString(result)); printf("Could not init NVML: %s\n", nvmlErrorString(result));
} }
/*
 * Look up the NVML device handle for a card by its PCI bus ID.
 *
 * device: output; written by nvmlDeviceGetHandleByPciBusId_v2.
 * card:   card whose pci_id string identifies the device.
 *
 * Returns 1 when the lookup succeeds. On failure, prints the NVML error
 * string for the card and returns 0.
 */
int get_nvml_handle(nvmlDevice_t *device, struct card_info *card)
{
    nvmlReturn_t status = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, device);

    if (status == NVML_SUCCESS)
        return 1;

    printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(status));
    return 0;
}
void print_nvml_clock_reason(int compact, struct card_info *card) void print_nvml_clock_reason(int compact, struct card_info *card)
{ {
unsigned long long reasons = get_nvml_clock_reasons(card); unsigned long long reasons = get_nvml_clock_reasons(card);
@ -36,21 +47,17 @@ void print_nvml_clock_reason(int compact, struct card_info *card)
printf("None"); printf("None");
if (!compact) if (!compact)
printf(" (0x%llx)\n", reasons); printf(" (0x%llx)", reasons);
} }
unsigned int get_nvml_temp(struct card_info *card) unsigned int get_nvml_temp(struct card_info *card)
{ {
nvmlReturn_t result;
nvmlDevice_t nvml_device; nvmlDevice_t nvml_device;
result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); if (!get_nvml_handle(&nvml_device, card))
if (result != NVML_SUCCESS) {
printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
return 0; return 0;
}
unsigned int temp; unsigned int temp;
result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp); nvmlReturn_t result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp);
if (result != NVML_SUCCESS) { if (result != NVML_SUCCESS) {
printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
return 0; return 0;
@ -60,16 +67,12 @@ unsigned int get_nvml_temp(struct card_info *card)
unsigned long long get_nvml_clock_reasons(struct card_info *card) unsigned long long get_nvml_clock_reasons(struct card_info *card)
{ {
nvmlReturn_t result;
nvmlDevice_t nvml_device; nvmlDevice_t nvml_device;
result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device); if (!get_nvml_handle(&nvml_device, card))
if (result != NVML_SUCCESS) {
printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
return 0; return 0;
}
unsigned long long reasons; unsigned long long reasons;
result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ; nvmlReturn_t result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ;
if (result != NVML_SUCCESS) { if (result != NVML_SUCCESS) {
printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
return 0; return 0;
@ -78,3 +81,19 @@ unsigned long long get_nvml_clock_reasons(struct card_info *card)
return reasons; return reasons;
} }
/*
 * Query NVML for the card's memory utilization.
 *
 * card: card whose pci_id string is used to look up the NVML device.
 *
 * Returns the `memory` field of the nvmlUtilization_t filled in by
 * nvmlDeviceGetUtilizationRates (callers print it as a percentage).
 * Returns 0 if the device handle cannot be obtained or the query fails;
 * in the latter case an error message is printed. Note that 0 is also a
 * possible successful reading, so callers cannot distinguish the two.
 */
unsigned int get_nvml_mem_util(struct card_info *card)
{
    nvmlDevice_t nvml_device;
    if (!get_nvml_handle(&nvml_device, card))
        return 0;

    nvmlUtilization_t util;
    nvmlReturn_t result = nvmlDeviceGetUtilizationRates(nvml_device, &util);
    if (result != NVML_SUCCESS) {
        /* Name the query that actually failed, not the clock-reasons one. */
        printf("Failed to get memory utilization for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
        return 0;
    }

    return util.memory;
}

View File

@ -22,6 +22,8 @@ static struct clock_reason clock_reason_names[] =
}; };
void init_nvml(); void init_nvml();
int get_nvml_handle(nvmlDevice_t *device, struct card_info *card);
void print_nvml_clock_reason(int compact, struct card_info *card); void print_nvml_clock_reason(int compact, struct card_info *card);
unsigned int get_nvml_temp(struct card_info *card); unsigned int get_nvml_temp(struct card_info *card);
unsigned long long get_nvml_clock_reasons(struct card_info *card); unsigned long long get_nvml_clock_reasons(struct card_info *card);
unsigned int get_nvml_mem_util(struct card_info *card);