From 8d4a8f7ee1b01a99fa0e79b620e5f9757154983b Mon Sep 17 00:00:00 2001 From: moosecrap Date: Mon, 3 Feb 2025 05:57:38 -0800 Subject: [PATCH] Added support for reading VRAM and HotSpot temperatures. --- Makefile | 6 +++ README.md | 4 ++ evga-card.c | 1 + evga-card.h | 13 +++--- evga-icx.c | 56 ++++++++++++++++++++++--- gddr6.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++ gddr6.h | 26 ++++++++++++ icx3.c | 13 ------ icx3.h | 1 - nvidia-sensors.c | 8 ---- nvidia-sensors.h | 1 - 11 files changed, 200 insertions(+), 34 deletions(-) create mode 100644 gddr6.c create mode 100644 gddr6.h diff --git a/Makefile b/Makefile index b2f91d9..ebb8082 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,12 @@ CFLAGS += -DUSE_NVML OBJS += nvidia-sensors.o endif +ifdef USE_LIBPCI +LDLIBS += -lpci +CFLAGS += -DUSE_LIBPCI +OBJS += gddr6.o +endif + evga-icx : $(OBJS) debug : CFLAGS += -g -O0 diff --git a/README.md b/README.md index efa97c9..cd5b03d 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ Access to the `/dev/i2c` device files, which means either: ## Dependencies * libi2c-dev * libnvidia-ml-dev (if building with `USE_NVML=1`) +* libpci-dev (if building with `USE_LIBPCI=1`) ## Building `make` @@ -30,6 +31,9 @@ Access to the `/dev/i2c` device files, which means either: ### NVML support Add the make flag `USE_NVML=1` and the it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed. +### VRAM and Hotspot temperature +Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. These sensors are **extremely** undocumented so I can't say anything about their accuracy. 
+ ## Usage Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode. diff --git a/evga-card.c b/evga-card.c index b256a12..41f8cd7 100644 --- a/evga-card.c +++ b/evga-card.c @@ -67,6 +67,7 @@ int find_evga_gpu_i2cs(struct card_info *infos, int max_gpus) /* Write our card info into the provided struct array */ infos[num_gpus].card_name = evga_pci_ids[i].card_name; infos[num_gpus].pci_id = pci_addr; + infos[num_gpus].pci_device_id = pci_device; infos[num_gpus].i2c_dev_path = calloc(strlen(dev_file) + 1, sizeof(char)); strcpy(infos[num_gpus].i2c_dev_path, dev_file); num_gpus++; diff --git a/evga-card.h b/evga-card.h index c4a8fe2..87353b2 100644 --- a/evga-card.h +++ b/evga-card.h @@ -84,12 +84,13 @@ #define EVGA_RTX3090TI_FTW3_ULTRA_GAMING_SUB_DEV 0x4985 struct card_info { - char *card_name; - char *pci_id; - unsigned short pci_device_id; - char *i2c_dev_path; - int i2c_fd; - int product_id; + char *card_name; /* The 'nice' name of the card */ + char *pci_id; /* PCI bus address in domain:bus:device.function format. May be shortened (e.g. c:00.0) */ + unsigned short pci_device_id; /* The device ID of the card, i.e. 
corresponds to the NVIDIA model number */ + char *i2c_dev_path; /* Path to the i2c device file */ + int i2c_fd; /* File descriptor for the i2c device file, for re-use */ + int product_id; /* EVGA internal product ID, as reported by the iCX3 controller */ + unsigned int bar0; /* Low 32 bits of the address held in the card's PCI BAR0; set by init_gddr6() and used as the base that the sensor register offsets are added to */ }; struct gpu_pci_info { diff --git a/evga-icx.c b/evga-icx.c index 077186a..ea3bf63 100644 --- a/evga-icx.c +++ b/evga-icx.c @@ -9,6 +9,10 @@ #include "nvidia-sensors.h" #endif +#ifdef USE_LIBPCI +#include "gddr6.h" +#endif + #include "icx3.h" #include "evga-card.h" @@ -141,6 +145,12 @@ int main (int argc, char **argv) #ifdef USE_NVML init_nvml(); #endif + + /* PCI init for VRAM/hotspot temps */ +#ifdef USE_LIBPCI + for (int i = 0; i < gpu_count; i++) + init_gddr6(&gpus[i]); +#endif /* print sensor info */ if (print_info) { @@ -183,22 +193,58 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) { printf("#%d ", gpu_num); print_icx3_fans_oneline(gpu); printf(" GPU"); + #ifdef USE_NVML - print_nvml_temp(1, gpu); + printf(" %3d", get_nvml_temp(gpu)); #endif - print_icx3_temps_oneline(gpu); - printf("°C"); + + float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {}; + get_temp_sensors(icx_temp_sensors, gpu); + for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) { + if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3)) + printf(" %.3s", icx3_temp_sensor_names[i]); +#ifdef USE_LIBPCI + if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0) + printf(" %3.0f", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */ +#endif + printf(" %3.0f", icx_temp_sensors[i]); + } + +#ifdef USE_LIBPCI + printf(" HOT %3.0f", get_hotspot_temp(gpu)); +#endif + + printf("°C "); + #ifdef USE_NVML print_nvml_clock_reason(1, gpu); #endif + + } else { /* One line per GPU sensor */ printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id); print_icx3_fans(gpu); + #ifdef 
USE_NVML - print_nvml_temp(0, gpu); + printf("GPU1: %+d°C\n", get_nvml_temp(gpu)); #endif - print_icx3_temps(gpu); + float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {}; + get_temp_sensors(icx_temp_sensors, gpu); + for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) { +#ifdef USE_LIBPCI + if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0) + printf("VRAM: +%.0f°C\n", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */ +#endif + printf("%s: %+.1f°C\n", + icx3_temp_sensor_names[i], + icx_temp_sensors[i]); + } + +#ifdef USE_LIBPCI + printf("HotSpot: +%.0f°C\n", get_hotspot_temp(gpu)); +#endif + #ifdef USE_NVML print_nvml_clock_reason(0, gpu); #endif
diff --git a/gddr6.c b/gddr6.c
new file mode 100644
index 0000000..d0f59ea
--- /dev/null
+++ b/gddr6.c
@@ -0,0 +1,138 @@
+/* Reads the VRAM and hotspot temperatures from GPU registers mapped through /dev/mem. */
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <pci/pci.h>
+
+#include "gddr6.h"
+
+#define NUM_DEVICE_OFFSETS (sizeof(device_offset_info) / sizeof(device_offset_info[0]))
+
+/* Find the register offsets for this card's PCI device ID, or NULL if the model is not in the table. */
+static const struct device_offset *find_device_offsets(const struct card_info *card)
+{
+    for (size_t i = 0; i < NUM_DEVICE_OFFSETS; i++) {
+        if (card->pci_device_id == device_offset_info[i].device_id)
+            return &device_offset_info[i];
+    }
+
+    return NULL;
+}
+
+/*
+ * Read the 32-bit register at card->bar0 + reg_offset through /dev/mem.
+ * `what` names the sensor in the mmap failure message.
+ * Returns 0 and stores the register in *value on success, -1 on failure.
+ */
+static int read_gpu_register(const struct card_info *card, unsigned int reg_offset, const char *what, uint32_t *value)
+{
+    long page_size = sysconf(_SC_PAGE_SIZE);
+    if (page_size <= 0)
+        return -1;
+
+    /* A PROT_READ mapping only needs a read-only descriptor */
+    int fd = open("/dev/mem", O_RDONLY | O_SYNC);
+    if (fd == -1) {
+        printf("Can't read memory. If you are root, enable kernel parameter iomem=relaxed\n");
+        return -1;
+    }
+
+    /* mmap offsets must be page aligned: map the page holding the register and index into it */
+    unsigned int phys_addr = card->bar0 + reg_offset;
+    unsigned int page_base = phys_addr & ~((unsigned int)page_size - 1);
+
+    void *map_base = mmap(NULL, (size_t)page_size, PROT_READ, MAP_SHARED, fd, (off_t)page_base);
+    close(fd); /* The mapping, if any, does not depend on the descriptor staying open */
+    if (map_base == MAP_FAILED) {
+        printf("Can't read memory for %s. If you are root, enable kernel parameter iomem=relaxed\n", what);
+        return -1;
+    }
+
+    /* volatile so the compiler performs exactly one real load from device memory */
+    *value = *(volatile uint32_t *)((char *)map_base + (phys_addr - page_base));
+
+    munmap(map_base, (size_t)page_size);
+    return 0;
+}
+
+/*
+ * Look up the card's BAR0 through libpci and store it in card->bar0.
+ * card->pci_id may be shortened (e.g. "c:00.0"). On any failure card->bar0 is left at 0.
+ */
+void init_gddr6(struct card_info *card)
+{
+    /* Right-align the address over a fully-qualified template so sscanf always sees four fields */
+    char pci_address[] = "00000000:00:00.0";
+    unsigned int domain = 0, bus = 0, dev = 0, func = 0;
+
+    card->bar0 = 0;
+
+    if (card->pci_id == NULL)
+        return;
+
+    size_t len = strlen(card->pci_id);
+    if (len >= sizeof(pci_address)) {
+        fprintf(stderr, "PCI address '%s' is too long, VRAM/hotspot temperatures unavailable\n", card->pci_id);
+        return;
+    }
+    memcpy(&pci_address[sizeof(pci_address) - 1 - len], card->pci_id, len);
+
+    if (sscanf(pci_address, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
+        fprintf(stderr, "Can't parse PCI address '%s', VRAM/hotspot temperatures unavailable\n", card->pci_id);
+        return;
+    }
+
+    struct pci_access *pacc = pci_alloc();
+    pci_init(pacc);
+
+    struct pci_dev *pci_dev = pci_get_dev(pacc, (int)domain, (int)bus, (int)dev, (int)func);
+    if (pci_dev != NULL) {
+        pci_fill_info(pci_dev, PCI_FILL_IDENT | PCI_FILL_BASES | PCI_FILL_CLASS);
+        /* Strip the BAR flag bits; only the low 32 bits of the address are kept */
+        card->bar0 = (unsigned int)(pci_dev->base_addr[0] & PCI_ADDR_MEM_MASK & 0xFFFFFFFF);
+        /* A device obtained with pci_get_dev() is owned by the caller and must be freed explicitly */
+        pci_free_dev(pci_dev);
+    }
+
+    pci_cleanup(pacc);
+}
+
+/*
+ * Read the VRAM temperature for the card.
+ * Returns 0 if the model is unknown, init_gddr6() found no BAR0, or the register can't be read.
+ */
+float get_vram_temp(struct card_info *card)
+{
+    const struct device_offset *offsets = find_device_offsets(card);
+    uint32_t raw = 0;
+
+    if (offsets == NULL || card->bar0 == 0)
+        return 0.0f;
+    if (read_gpu_register(card, (unsigned int)offsets->vram_offset, "VRAM temperature", &raw) != 0)
+        return 0.0f;
+
+    /* Low 12 bits, integer-divided by 0x20 (any fraction is discarded) */
+    return (float)((raw & 0x00000fff) / 0x20);
+}
+
+/*
+ * Read the hotspot temperature for the card.
+ * Returns 0 if the model is unknown, init_gddr6() found no BAR0, or the register can't be read.
+ */
+float get_hotspot_temp(struct card_info *card)
+{
+    const struct device_offset *offsets = find_device_offsets(card);
+    uint32_t raw = 0;
+
+    if (offsets == NULL || card->bar0 == 0)
+        return 0.0f;
+    if (read_gpu_register(card, (unsigned int)offsets->hotspot_offset, "hotspot", &raw) != 0)
+        return 0.0f;
+
+    /* The temperature is the second byte (bits 8-15) of the register */
+    return (float)((raw >> 8) & 0xff);
+}
diff --git a/gddr6.h b/gddr6.h
new file mode 100644
index 0000000..839f177
--- /dev/null
+++ b/gddr6.h
@@ -0,0 +1,27 @@
+#include "evga-card.h"
+
+/* Register offsets, relative to BAR0, of the temperature sensors for one GPU model */
+struct device_offset {
+    unsigned short device_id;
+    int vram_offset;
+    int hotspot_offset;
+};
+
+/* Per-model sensor offsets; entries marked TODO:check still need to be checked */
+static struct device_offset device_offset_info[] =
+{
+    {.device_id = NVIDIA_RTX3090TI_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 Ti */
+    {.device_id = NVIDIA_RTX3090_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 */
+    {.device_id = NVIDIA_RTX3080TI_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 Ti */
+    {.device_id = NVIDIA_RTX3080_12G_LHR_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 12G LHR */
+    {.device_id = NVIDIA_RTX3080_LHR_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 LHR */
+    {.device_id = NVIDIA_RTX3080_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 */
+    {.device_id = NVIDIA_RTX3070TI_GA102_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti GA102 TODO:check */
+    {.device_id = NVIDIA_RTX3070TI_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti TODO:check */
+    {.device_id = NVIDIA_RTX3070_LHR_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 LHR */
+    {.device_id = NVIDIA_RTX3070_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 */
+};
+
+void init_gddr6(struct card_info *card);
+float get_vram_temp(struct card_info *card);
+float get_hotspot_temp(struct 
card_info *card); diff --git a/icx3.c b/icx3.c index dadc94d..289f558 100644 --- a/icx3.c +++ b/icx3.c @@ -69,19 +69,6 @@ void print_icx3_fans_oneline(struct card_info *card) printf("%%"); } -void print_icx3_temps(struct card_info *card) -{ - float temps[ICX3_NUM_TEMP_SENSORS]; - - get_temp_sensors(temps, card); - - for (int i=0; i