Added support for reading VRAM and HotSpot temperatures.

This commit is contained in:
moosecrap 2025-02-03 05:57:38 -08:00
parent 54885e16f0
commit 8d4a8f7ee1
11 changed files with 200 additions and 34 deletions

View File

@ -10,6 +10,12 @@ CFLAGS += -DUSE_NVML
OBJS += nvidia-sensors.o OBJS += nvidia-sensors.o
endif endif
ifdef USE_LIBPCI
LDLIBS += -lpci
CFLAGS += -DUSE_LIBPCI
OBJS += gddr6.o
endif
evga-icx : $(OBJS) evga-icx : $(OBJS)
debug : CFLAGS += -g -O0 debug : CFLAGS += -g -O0

View File

@ -21,6 +21,7 @@ Access to the `/dev/i2c` device files, which means either:
## Dependencies ## Dependencies
* libi2c-dev * libi2c-dev
* libnvidia-ml-dev (if building with `USE_NVML=1`) * libnvidia-ml-dev (if building with `USE_NVML=1`)
* libpci-dev (if building with `USE_LIBPCI=1`)
## Building ## Building
`make` `make`
@ -30,6 +31,9 @@ Access to the `/dev/i2c` device files, which means either:
### NVML support ### NVML support
Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed. Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed.
### VRAM and Hotspot temperature
Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. These sensors are **extremely** undocumented so I can't say anything about their accuracy.
## Usage ## Usage
Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode. Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode.

View File

@ -67,6 +67,7 @@ int find_evga_gpu_i2cs(struct card_info *infos, int max_gpus)
/* Write our card info into the provided struct array */ /* Write our card info into the provided struct array */
infos[num_gpus].card_name = evga_pci_ids[i].card_name; infos[num_gpus].card_name = evga_pci_ids[i].card_name;
infos[num_gpus].pci_id = pci_addr; infos[num_gpus].pci_id = pci_addr;
infos[num_gpus].pci_device_id = pci_device;
infos[num_gpus].i2c_dev_path = calloc(strlen(dev_file) + 1, sizeof(char)); infos[num_gpus].i2c_dev_path = calloc(strlen(dev_file) + 1, sizeof(char));
strcpy(infos[num_gpus].i2c_dev_path, dev_file); strcpy(infos[num_gpus].i2c_dev_path, dev_file);
num_gpus++; num_gpus++;

View File

@ -84,12 +84,13 @@
#define EVGA_RTX3090TI_FTW3_ULTRA_GAMING_SUB_DEV 0x4985 #define EVGA_RTX3090TI_FTW3_ULTRA_GAMING_SUB_DEV 0x4985
struct card_info { struct card_info {
char *card_name; char *card_name; /* The 'nice' name of the card */
char *pci_id; char *pci_id; /* PCI bus address in domain:bus:device.function format. May be shortened (e.g. c:00.0) */
unsigned short pci_device_id; unsigned short pci_device_id; /* The device ID of the card, i.e. corresponds to the NVIDIA model number */
char *i2c_dev_path; char *i2c_dev_path; /* Path to the i2c device file */
int i2c_fd; int i2c_fd; /* File descriptor for the i2c device file, for re-use */
int product_id; int product_id; /* EVGA internal product ID, as reported by the iCX3 controller */
unsigned int bar0; /* Address of the card's PCI base address register */
}; };
struct gpu_pci_info { struct gpu_pci_info {

View File

@ -9,6 +9,10 @@
#include "nvidia-sensors.h" #include "nvidia-sensors.h"
#endif #endif
#ifdef USE_LIBPCI
#include "gddr6.h"
#endif
#include "icx3.h" #include "icx3.h"
#include "evga-card.h" #include "evga-card.h"
@ -141,6 +145,12 @@ int main (int argc, char **argv)
#ifdef USE_NVML #ifdef USE_NVML
init_nvml(); init_nvml();
#endif #endif
/* PCI init for VRAM/hotspot temps */
#ifdef USE_LIBPCI
for (int i = 0; i < gpu_count; i++)
init_gddr6(&gpus[i]);
#endif
/* print sensor info */ /* print sensor info */
if (print_info) { if (print_info) {
@ -183,22 +193,58 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
printf("#%d ", gpu_num); printf("#%d ", gpu_num);
print_icx3_fans_oneline(gpu); print_icx3_fans_oneline(gpu);
printf(" GPU"); printf(" GPU");
#ifdef USE_NVML #ifdef USE_NVML
print_nvml_temp(1, gpu); printf(" %3d", get_nvml_temp(gpu));
#endif #endif
print_icx3_temps_oneline(gpu);
printf("°C"); float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
get_temp_sensors(icx_temp_sensors, gpu);
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
printf(" %.3s", icx3_temp_sensor_names[i]);
#ifdef USE_LIBPCI
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
printf(" %3.0f", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
#endif
printf(" %3.0f", icx_temp_sensors[i]);
}
#ifdef USE_LIBPCI
printf(" HOT %3.0f", get_hotspot_temp(gpu));
#endif
printf("°C ");
#ifdef USE_NVML #ifdef USE_NVML
print_nvml_clock_reason(1, gpu); print_nvml_clock_reason(1, gpu);
#endif #endif
} else { } else {
/* One line per GPU sensor */ /* One line per GPU sensor */
printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id); printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id);
print_icx3_fans(gpu); print_icx3_fans(gpu);
#ifdef USE_NVML #ifdef USE_NVML
print_nvml_temp(0, gpu); printf("GPU1: %+d°C\n", get_nvml_temp(gpu));
#endif #endif
print_icx3_temps(gpu); float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
get_temp_sensors(icx_temp_sensors, gpu);
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
#ifdef USE_LIBPCI
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
printf("VRAM: +%.0f°C\n", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
#endif
printf("%s: %+.1f°C\n",
icx3_temp_sensor_names[i],
icx_temp_sensors[i]);
}
#ifdef USE_LIBPCI
printf("HotSpot: +%.0f°C\n", get_hotspot_temp(gpu));
#endif
#ifdef USE_NVML #ifdef USE_NVML
print_nvml_clock_reason(0, gpu); print_nvml_clock_reason(0, gpu);
#endif #endif

105
gddr6.c Normal file
View File

@ -0,0 +1,105 @@
#include <string.h>
#include <stdio.h>
#include <pci/pci.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#define PG_SZ sysconf(_SC_PAGE_SIZE)
#include "gddr6.h"
/*
 * Look up the card's PCI device through libpci and cache the low 32 bits of
 * its first base address register in card->bar0.
 *
 * card->pci_id may be a shortened address (e.g. "c:00.0"); it is right-aligned
 * over a zero-filled "domain:bus:device.function" template before parsing.
 *
 * On any failure (missing/over-long/unparseable address, device lookup
 * failure) a message is printed and card->bar0 is left at 0.
 */
void init_gddr6(struct card_info *card)
{
    char pci_address[] = "00000000:00:00.0";
    const size_t max_len = sizeof(pci_address) - 1;
    unsigned int domain = 0;
    unsigned int bus = 0;
    unsigned int dev = 0;
    unsigned int func = 0;

    card->bar0 = 0;

    /* Reject addresses that would not fit: copying a longer string would
       write before the start of the template buffer */
    size_t len = (card->pci_id != NULL) ? strlen(card->pci_id) : 0;
    if (len == 0 || len > max_len) {
        printf("Invalid PCI address, can't look up VRAM/hotspot sensors\n");
        return;
    }

    /* Overlay the address on the tail of the template; the template's own
       terminator is kept, so only the characters themselves are copied */
    memcpy(&pci_address[max_len - len], card->pci_id, len);

    if (sscanf(pci_address, "%x:%x:%x.%x", &domain, &bus, &dev, &func) != 4) {
        printf("Could not parse PCI address %s\n", card->pci_id);
        return;
    }

    struct pci_access *pacc = pci_alloc();
    pci_init(pacc);

    struct pci_dev *pci_dev = pci_get_dev(pacc, (int)domain, (int)bus, (int)dev, (int)func);
    if (pci_dev != NULL) {
        pci_fill_info(pci_dev, PCI_FILL_IDENT | PCI_FILL_BASES | PCI_FILL_CLASS);
        card->bar0 = (pci_dev->base_addr[0] & 0xFFFFFFFF);
        /* Devices obtained with pci_get_dev are owned by the caller */
        pci_free_dev(pci_dev);
    } else {
        printf("Could not find PCI device %s\n", card->pci_id);
    }

    pci_cleanup(pacc);
}
/*
 * Read the VRAM temperature for a card by mapping one page of /dev/mem at
 * card->bar0 plus the per-model vram_offset from device_offset_info.
 *
 * The value is derived from the low 12 bits of the 32-bit register divided
 * by 0x20 (integer division, so the result is a whole number).
 *
 * Returns 0 when /dev/mem cannot be opened or mapped, when card->bar0 has not
 * been set, or when the card's device ID has no entry in device_offset_info.
 */
float get_vram_temp(struct card_info *card)
{
    const size_t num_devices = sizeof(device_offset_info) / sizeof(device_offset_info[0]);
    float temp = 0.0f;
    int fd;

    /* Without a base address there is nothing meaningful to map */
    if (card->bar0 == 0)
        return 0;

    if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1) {
        printf("Can't read memory. If you are root, enable kernel parameter iomem=relaxed\n");
        return 0;
    }

    const long page_size = PG_SZ;

    for (size_t i = 0; i < num_devices; i++) {
        if (card->pci_device_id != device_offset_info[i].device_id)
            continue;

        /* mmap offsets must be page aligned, so map the containing page and
           index into it */
        unsigned int phys_addr = (card->bar0 + device_offset_info[i].vram_offset);
        unsigned int base_offset = phys_addr & ~(page_size - 1);
        void *map_base = mmap(NULL, (size_t)page_size, PROT_READ, MAP_SHARED, fd, base_offset);
        if (map_base == MAP_FAILED) {
            /* Must not fall through: the mapping is unusable, and fd is
               closed exactly once below */
            printf("Can't read memory for VRAM temperature. If you are root, enable kernel parameter iomem=relaxed\n");
            break;
        }

        /* volatile: this is a device register, the read must really happen */
        const volatile unsigned int *reg =
            (const volatile unsigned int *)((char *)map_base + (phys_addr - base_offset));
        unsigned int read_result = *reg;
        temp = ((read_result & 0x00000fff) / 0x20);

        munmap(map_base, (size_t)page_size);
        break; /* device IDs are unique per card; no need to keep scanning */
    }

    close(fd);
    return temp;
}
/*
 * Read the GPU "hotspot" temperature for a card by mapping one page of
 * /dev/mem at card->bar0 plus the per-model hotspot_offset from
 * device_offset_info.
 *
 * The value is taken from bits 8..15 of the 32-bit register.
 *
 * Returns 0 when /dev/mem cannot be opened or mapped, when card->bar0 has not
 * been set, or when the card's device ID has no entry in device_offset_info.
 */
float get_hotspot_temp(struct card_info *card)
{
    const size_t num_devices = sizeof(device_offset_info) / sizeof(device_offset_info[0]);
    float temp = 0.0f;
    int fd;

    /* Without a base address there is nothing meaningful to map */
    if (card->bar0 == 0)
        return 0;

    if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1) {
        printf("Can't read memory. If you are root, enable kernel parameter iomem=relaxed\n");
        return 0;
    }

    const long page_size = PG_SZ;

    for (size_t i = 0; i < num_devices; i++) {
        if (card->pci_device_id != device_offset_info[i].device_id)
            continue;

        /* mmap offsets must be page aligned, so map the containing page and
           index into it */
        unsigned int phys_addr = (card->bar0 + device_offset_info[i].hotspot_offset);
        unsigned int base_offset = phys_addr & ~(page_size - 1);
        void *map_base = mmap(NULL, (size_t)page_size, PROT_READ, MAP_SHARED, fd, base_offset);
        if (map_base == MAP_FAILED) {
            /* Must not fall through: the mapping is unusable, and fd is
               closed exactly once below */
            printf("Can't read memory for hotspot. If you are root, enable kernel parameter iomem=relaxed\n");
            break;
        }

        /* volatile: this is a device register, the read must really happen */
        const volatile unsigned int *reg =
            (const volatile unsigned int *)((char *)map_base + (phys_addr - base_offset));
        unsigned int read_result = *reg;
        temp = (read_result >> 8) & 0xff;

        munmap(map_base, (size_t)page_size);
        break; /* device IDs are unique per card; no need to keep scanning */
    }

    close(fd);
    return temp;
}

26
gddr6.h Normal file
View File

@ -0,0 +1,26 @@
#include "evga-card.h"
/* Per-model register locations, expressed as byte offsets that are added to
   the card's BAR0 address before reading */
struct device_offset {
    unsigned short device_id;   /* PCI device ID this entry applies to */
    int vram_offset;            /* Offset of the register holding the VRAM temperature */
    int hotspot_offset;         /* Offset of the register holding the hotspot temperature */
};
/* Lookup table of temperature register offsets, keyed by NVIDIA PCI device ID.
   The table is only ever read, so it is const. Cards whose device ID is not
   listed here get no VRAM/hotspot reading. */
static const struct device_offset device_offset_info[] =
{
    {.device_id = NVIDIA_RTX3090TI_DEV,         .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 Ti */
    {.device_id = NVIDIA_RTX3090_DEV,           .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 */
    {.device_id = NVIDIA_RTX3080TI_DEV,         .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 Ti */
    {.device_id = NVIDIA_RTX3080_12G_LHR_DEV,   .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 12G LHR */
    {.device_id = NVIDIA_RTX3080_LHR_DEV,       .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 LHR */
    {.device_id = NVIDIA_RTX3080_DEV,           .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 */
    {.device_id = NVIDIA_RTX3070TI_GA102_DEV,   .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti GA102 TODO:check */
    {.device_id = NVIDIA_RTX3070TI_DEV,         .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti TODO:check */
    {.device_id = NVIDIA_RTX3070_LHR_DEV,       .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 LHR */
    {.device_id = NVIDIA_RTX3070_DEV,           .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 */
};
/* Looks up the card's PCI device from card->pci_id and stores its first base
   address register in card->bar0. Call once per card before reading temps. */
void init_gddr6(struct card_info *card);
/* Returns the VRAM temperature read via /dev/mem at card->bar0 plus the
   model's vram_offset. Returns 0 if /dev/mem cannot be opened or the card's
   device ID is not in device_offset_info. */
float get_vram_temp(struct card_info *card);
/* Returns the hotspot temperature read via /dev/mem at card->bar0 plus the
   model's hotspot_offset. Returns 0 if /dev/mem cannot be opened or the
   card's device ID is not in device_offset_info. */
float get_hotspot_temp(struct card_info *card);

13
icx3.c
View File

@ -69,19 +69,6 @@ void print_icx3_fans_oneline(struct card_info *card)
printf("%%"); printf("%%");
} }
/* Print every iCX3 temperature sensor for the card, one "name: value" line
   per sensor */
void print_icx3_temps(struct card_info *card)
{
    float readings[ICX3_NUM_TEMP_SENSORS];
    int sensor = 0;

    get_temp_sensors(readings, card);

    while (sensor < ICX3_NUM_TEMP_SENSORS) {
        const char *label = icx3_temp_sensor_names[sensor];
        printf("%s: %+.1f°C\n", label, readings[sensor]);
        sensor++;
    }
}
void print_icx3_temps_oneline(struct card_info *card) void print_icx3_temps_oneline(struct card_info *card)
{ {
float temps[ICX3_NUM_TEMP_SENSORS]; float temps[ICX3_NUM_TEMP_SENSORS];

1
icx3.h
View File

@ -119,7 +119,6 @@ static char *icx3_temp_sensor_names[] = {
int icx3_init(struct card_info *card); int icx3_init(struct card_info *card);
void print_icx3_fans(struct card_info *card); void print_icx3_fans(struct card_info *card);
void print_icx3_fans_oneline(struct card_info *card); void print_icx3_fans_oneline(struct card_info *card);
void print_icx3_temps(struct card_info *card);
void print_icx3_temps_oneline(struct card_info *card); void print_icx3_temps_oneline(struct card_info *card);
void get_available_fans(char *fans_avail, struct card_info *card); void get_available_fans(char *fans_avail, struct card_info *card);
void get_fan_status(struct icx3_fan_control *fans, struct card_info *card); void get_fan_status(struct icx3_fan_control *fans, struct card_info *card);

View File

@ -10,14 +10,6 @@ void init_nvml()
printf("Could not init NVML: %s\n", nvmlErrorString(result)); printf("Could not init NVML: %s\n", nvmlErrorString(result));
} }
/* Print the GPU temperature reported by NVML: a bare right-aligned number in
   compact mode, otherwise a labelled "GPU1" line */
void print_nvml_temp(int compact, struct card_info *card)
{
    if (!compact) {
        printf("GPU1: %+d°C\n", get_nvml_temp(card));
        return;
    }

    printf(" %3d", get_nvml_temp(card));
}
void print_nvml_clock_reason(int compact, struct card_info *card) void print_nvml_clock_reason(int compact, struct card_info *card)
{ {
unsigned long long reasons = get_nvml_clock_reasons(card); unsigned long long reasons = get_nvml_clock_reasons(card);

View File

@ -22,7 +22,6 @@ static struct clock_reason clock_reason_names[] =
}; };
void init_nvml(); void init_nvml();
void print_nvml_temp(int compact, struct card_info *card);
void print_nvml_clock_reason(int compact, struct card_info *card); void print_nvml_clock_reason(int compact, struct card_info *card);
unsigned int get_nvml_temp(struct card_info *card); unsigned int get_nvml_temp(struct card_info *card);
unsigned long long get_nvml_clock_reasons(struct card_info *card); unsigned long long get_nvml_clock_reasons(struct card_info *card);