Added support for reading VRAM and HotSpot temperatures.
This commit is contained in:
parent
54885e16f0
commit
8d4a8f7ee1
6
Makefile
6
Makefile
@ -10,6 +10,12 @@ CFLAGS += -DUSE_NVML
|
||||
OBJS += nvidia-sensors.o
|
||||
endif
|
||||
|
||||
ifdef USE_LIBPCI
|
||||
LDLIBS += -lpci
|
||||
CFLAGS += -DUSE_LIBPCI
|
||||
OBJS += gddr6.o
|
||||
endif
|
||||
|
||||
evga-icx : $(OBJS)
|
||||
|
||||
debug : CFLAGS += -g -O0
|
||||
|
@ -21,6 +21,7 @@ Access to the `/dev/i2c` device files, which means either:
|
||||
## Dependencies
|
||||
* libi2c-dev
|
||||
* libnvidia-ml-dev (if building with `USE_NVML=1`)
|
||||
* libpci-dev (if building with `USE_LIBPCI=1`)
|
||||
|
||||
## Building
|
||||
`make`
|
||||
@ -30,6 +31,9 @@ Access to the `/dev/i2c` device files, which means either:
|
||||
### NVML support
|
||||
Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver, along with the performance cap/clock reason. This requires the NVIDIA management library (NVML) to be installed.
|
||||
|
||||
### VRAM and Hotspot temperature
|
||||
Add the make flag `USE_LIBPCI=1` and you can also read the VRAM and "hotspot" temperatures. These require direct memory access to the PCI device so you must run as root and also enable the kernel parameter `iomem=relaxed`. These sensors are **extremely** undocumented so I can't say anything about their accuracy.
|
||||
|
||||
## Usage
|
||||
Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode.
|
||||
|
||||
|
@ -67,6 +67,7 @@ int find_evga_gpu_i2cs(struct card_info *infos, int max_gpus)
|
||||
/* Write our card info into the provided struct array */
|
||||
infos[num_gpus].card_name = evga_pci_ids[i].card_name;
|
||||
infos[num_gpus].pci_id = pci_addr;
|
||||
infos[num_gpus].pci_device_id = pci_device;
|
||||
infos[num_gpus].i2c_dev_path = calloc(strlen(dev_file) + 1, sizeof(char));
|
||||
strcpy(infos[num_gpus].i2c_dev_path, dev_file);
|
||||
num_gpus++;
|
||||
|
13
evga-card.h
13
evga-card.h
@ -84,12 +84,13 @@
|
||||
#define EVGA_RTX3090TI_FTW3_ULTRA_GAMING_SUB_DEV 0x4985
|
||||
|
||||
/* Per-card state shared by the iCX3, NVML and PCI sensor code */
struct card_info {
    char *card_name;              /* The 'nice' name of the card */
    char *pci_id;                 /* PCI bus address in domain:bus:device.function format. May be shortened (e.g. c:00.0) */
    unsigned short pci_device_id; /* The device ID of the card, i.e. corresponds to the NVIDIA model number */
    char *i2c_dev_path;           /* Path to the i2c device file */
    int i2c_fd;                   /* File descriptor for the i2c device file, for re-use */
    int product_id;               /* EVGA internal product ID, as reported by the iCX3 controller */
    unsigned int bar0;            /* Address of the card's PCI base address register (low 32 bits of BAR0) */
};
|
||||
|
||||
struct gpu_pci_info {
|
||||
|
54
evga-icx.c
54
evga-icx.c
@ -9,6 +9,10 @@
|
||||
#include "nvidia-sensors.h"
|
||||
#endif
|
||||
|
||||
#ifdef USE_LIBPCI
|
||||
#include "gddr6.h"
|
||||
#endif
|
||||
|
||||
#include "icx3.h"
|
||||
#include "evga-card.h"
|
||||
|
||||
@ -142,6 +146,12 @@ int main (int argc, char **argv)
|
||||
init_nvml();
|
||||
#endif
|
||||
|
||||
/* PCI init for VRAM/hotspot temps */
|
||||
#ifdef USE_LIBPCI
|
||||
for (int i = 0; i < gpu_count; i++)
|
||||
init_gddr6(&gpus[i]);
|
||||
#endif
|
||||
|
||||
/* print sensor info */
|
||||
if (print_info) {
|
||||
do {
|
||||
@ -183,22 +193,58 @@ void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
|
||||
printf("#%d ", gpu_num);
|
||||
print_icx3_fans_oneline(gpu);
|
||||
printf(" GPU");
|
||||
|
||||
#ifdef USE_NVML
|
||||
print_nvml_temp(1, gpu);
|
||||
printf(" %3d", get_nvml_temp(gpu));
|
||||
#endif
|
||||
print_icx3_temps_oneline(gpu);
|
||||
|
||||
float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
|
||||
get_temp_sensors(icx_temp_sensors, gpu);
|
||||
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
|
||||
if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
|
||||
printf(" %.3s", icx3_temp_sensor_names[i]);
|
||||
#ifdef USE_LIBPCI
|
||||
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
|
||||
printf(" %3.0f", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
|
||||
#endif
|
||||
printf(" %3.0f", icx_temp_sensors[i]);
|
||||
}
|
||||
|
||||
#ifdef USE_LIBPCI
|
||||
printf(" HOT %3.0f", get_hotspot_temp(gpu));
|
||||
#endif
|
||||
|
||||
printf("°C ");
|
||||
|
||||
#ifdef USE_NVML
|
||||
print_nvml_clock_reason(1, gpu);
|
||||
#endif
|
||||
|
||||
|
||||
} else {
|
||||
/* One line per GPU sensor */
|
||||
printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id);
|
||||
print_icx3_fans(gpu);
|
||||
|
||||
#ifdef USE_NVML
|
||||
print_nvml_temp(0, gpu);
|
||||
printf("GPU1: %+d°C\n", get_nvml_temp(gpu));
|
||||
#endif
|
||||
print_icx3_temps(gpu);
|
||||
float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
|
||||
get_temp_sensors(icx_temp_sensors, gpu);
|
||||
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
|
||||
#ifdef USE_LIBPCI
|
||||
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
|
||||
printf("VRAM: +%.0f°C\n", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
|
||||
#endif
|
||||
printf("%s: %+.1f°C\n",
|
||||
icx3_temp_sensor_names[i],
|
||||
icx_temp_sensors[i]);
|
||||
}
|
||||
|
||||
#ifdef USE_LIBPCI
|
||||
printf("HotSpot: +%.0f°C\n", get_hotspot_temp(gpu));
|
||||
#endif
|
||||
|
||||
#ifdef USE_NVML
|
||||
print_nvml_clock_reason(0, gpu);
|
||||
#endif
|
||||
|
105
gddr6.c
Normal file
105
gddr6.c
Normal file
@ -0,0 +1,105 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <pci/pci.h>
|
||||
#include <sys/mman.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define PG_SZ sysconf(_SC_PAGE_SIZE)
|
||||
|
||||
#include "gddr6.h"
|
||||
|
||||
void init_gddr6(struct card_info *card)
|
||||
{
|
||||
/* Parse the address of the card to get the PCI info */
|
||||
char pci_address[] = "00000000:00:00.0";
|
||||
int len = strlen(card->pci_id);
|
||||
strcpy(&pci_address[sizeof(pci_address) - len - 1], card->pci_id);
|
||||
|
||||
int domain = 0;
|
||||
int bus = 0;
|
||||
int dev = 0;
|
||||
int func = 0;
|
||||
|
||||
sscanf(pci_address, "%x:%x:%x.%x", &domain, &bus, &dev, &func);
|
||||
|
||||
struct pci_access *pacc = NULL;
|
||||
struct pci_dev *pci_dev = NULL;
|
||||
|
||||
pacc = pci_alloc();
|
||||
pci_init(pacc);
|
||||
|
||||
pci_dev = pci_get_dev(pacc, domain, bus, dev, func);
|
||||
|
||||
pci_fill_info(pci_dev, PCI_FILL_IDENT | PCI_FILL_BASES | PCI_FILL_CLASS);
|
||||
|
||||
card->bar0 = (pci_dev->base_addr[0] & 0xFFFFFFFF);
|
||||
|
||||
pci_cleanup(pacc);
|
||||
}
|
||||
|
||||
float get_vram_temp(struct card_info *card)
|
||||
{
|
||||
int fd;
|
||||
float temp = 0.0;
|
||||
|
||||
if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1)
|
||||
{
|
||||
printf("Can't read memory. If you are root, enable kernel parameter iomem=relaxed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < sizeof(device_offset_info) / sizeof(struct device_offset); i++) {
|
||||
if (card->pci_device_id == device_offset_info[i].device_id){
|
||||
unsigned int phys_addr = (card->bar0 + device_offset_info[i].vram_offset);
|
||||
unsigned int base_offset = phys_addr & ~(PG_SZ-1);
|
||||
void *map_base = mmap(0, PG_SZ, PROT_READ, MAP_SHARED, fd, base_offset);
|
||||
if(map_base == (void *) -1)
|
||||
{
|
||||
if (fd != -1)
|
||||
close(fd);
|
||||
printf("Can't read memory for VRAM temperature. If you are root, enable kernel parameter iomem=relaxed\n");
|
||||
}
|
||||
void *virt_addr = (char *) map_base + (phys_addr - base_offset);
|
||||
int read_result = *((unsigned int *) virt_addr);
|
||||
temp = ((read_result & 0x00000fff) / 0x20);
|
||||
munmap(map_base, PG_SZ);
|
||||
}
|
||||
}
|
||||
|
||||
close(fd);
|
||||
return temp;
|
||||
}
|
||||
|
||||
float get_hotspot_temp(struct card_info *card)
|
||||
{
|
||||
int fd;
|
||||
float temp = 0.0;
|
||||
|
||||
if ((fd = open("/dev/mem", O_RDWR | O_SYNC)) == -1)
|
||||
{
|
||||
printf("Can't read memory. If you are root, enable kernel parameter iomem=relaxed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < sizeof(device_offset_info) / sizeof(struct device_offset); i++) {
|
||||
if (card->pci_device_id == device_offset_info[i].device_id){
|
||||
unsigned int phys_addr = (card->bar0 + device_offset_info[i].hotspot_offset);
|
||||
unsigned int base_offset = phys_addr & ~(PG_SZ-1);
|
||||
void *map_base = mmap(0, PG_SZ, PROT_READ, MAP_SHARED, fd, base_offset);
|
||||
if(map_base == (void *) -1)
|
||||
{
|
||||
if (fd != -1)
|
||||
close(fd);
|
||||
printf("Can't read memory for hotspot. If you are root, enable kernel parameter iomem=relaxed\n");
|
||||
}
|
||||
void *virt_addr = (char *) map_base + (phys_addr - base_offset);
|
||||
int read_result = *((unsigned int *) virt_addr);
|
||||
temp = (read_result >> 8) & 0xff;
|
||||
munmap(map_base, PG_SZ);
|
||||
}
|
||||
}
|
||||
|
||||
close(fd);
|
||||
return temp;
|
||||
}
|
26
gddr6.h
Normal file
26
gddr6.h
Normal file
@ -0,0 +1,26 @@
|
||||
#include "evga-card.h"
|
||||
|
||||
/* Where one GPU model keeps its VRAM and hotspot temperature registers */
struct device_offset {
    unsigned short device_id;   /* PCI device ID this entry applies to */
    int vram_offset;            /* Added to BAR0 to locate the VRAM temperature register */
    int hotspot_offset;         /* Added to BAR0 to locate the hotspot temperature register */
};
|
||||
|
||||
static struct device_offset device_offset_info[] =
|
||||
{
|
||||
{.device_id = NVIDIA_RTX3090TI_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 Ti */
|
||||
{.device_id = NVIDIA_RTX3090_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3090 */
|
||||
{.device_id = NVIDIA_RTX3080TI_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 Ti */
|
||||
{.device_id = NVIDIA_RTX3080_12G_LHR_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 12G LHR */
|
||||
{.device_id = NVIDIA_RTX3080_LHR_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 LHR */
|
||||
{.device_id = NVIDIA_RTX3080_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 */
|
||||
{.device_id = NVIDIA_RTX3080_DEV, .vram_offset = 0x0000E2A8, .hotspot_offset = 0x0002046c}, /* RTX 3080 */
|
||||
{.device_id = NVIDIA_RTX3070TI_GA102_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti GA102 TODO:check */
|
||||
{.device_id = NVIDIA_RTX3070TI_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 Ti TODO:check */
|
||||
{.device_id = NVIDIA_RTX3070_LHR_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 */
|
||||
{.device_id = NVIDIA_RTX3070_DEV, .vram_offset = 0x0000EE50, .hotspot_offset = 0x0002046c}, /* RTX 3070 LHR */
|
||||
};
|
||||
|
||||
void init_gddr6(struct card_info *card);
|
||||
float get_vram_temp(struct card_info *card);
|
||||
float get_hotspot_temp(struct card_info *card);
|
13
icx3.c
13
icx3.c
@ -69,19 +69,6 @@ void print_icx3_fans_oneline(struct card_info *card)
|
||||
printf("%%");
|
||||
}
|
||||
|
||||
void print_icx3_temps(struct card_info *card)
|
||||
{
|
||||
float temps[ICX3_NUM_TEMP_SENSORS];
|
||||
|
||||
get_temp_sensors(temps, card);
|
||||
|
||||
for (int i=0; i<ICX3_NUM_TEMP_SENSORS; i++) {
|
||||
printf("%s: %+.1f°C\n",
|
||||
icx3_temp_sensor_names[i],
|
||||
temps[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void print_icx3_temps_oneline(struct card_info *card)
|
||||
{
|
||||
float temps[ICX3_NUM_TEMP_SENSORS];
|
||||
|
1
icx3.h
1
icx3.h
@ -119,7 +119,6 @@ static char *icx3_temp_sensor_names[] = {
|
||||
int icx3_init(struct card_info *card);
|
||||
void print_icx3_fans(struct card_info *card);
|
||||
void print_icx3_fans_oneline(struct card_info *card);
|
||||
void print_icx3_temps(struct card_info *card);
|
||||
void print_icx3_temps_oneline(struct card_info *card);
|
||||
void get_available_fans(char *fans_avail, struct card_info *card);
|
||||
void get_fan_status(struct icx3_fan_control *fans, struct card_info *card);
|
||||
|
@ -10,14 +10,6 @@ void init_nvml()
|
||||
printf("Could not init NVML: %s\n", nvmlErrorString(result));
|
||||
}
|
||||
|
||||
/*
 * Print the GPU temperature reported by get_nvml_temp().
 * compact != 0: a bare 3-wide number for the one-line view.
 * compact == 0: a labelled "GPU1" line of its own.
 */
void print_nvml_temp(int compact, struct card_info *card)
{
    if (!compact) {
        printf("GPU1: %+d°C\n", get_nvml_temp(card));
        return;
    }

    printf(" %3d", get_nvml_temp(card));
}
|
||||
|
||||
void print_nvml_clock_reason(int compact, struct card_info *card)
|
||||
{
|
||||
unsigned long long reasons = get_nvml_clock_reasons(card);
|
||||
|
@ -22,7 +22,6 @@ static struct clock_reason clock_reason_names[] =
|
||||
};
|
||||
|
||||
void init_nvml();
|
||||
void print_nvml_temp(int compact, struct card_info *card);
|
||||
void print_nvml_clock_reason(int compact, struct card_info *card);
|
||||
unsigned int get_nvml_temp(struct card_info *card);
|
||||
unsigned long long get_nvml_clock_reasons(struct card_info *card);
|
Loading…
x
Reference in New Issue
Block a user