From b7d22ed9ac510442c8fa346812939d98b68fd34d Mon Sep 17 00:00:00 2001 From: moosecrap Date: Fri, 21 Feb 2025 10:59:31 -0800 Subject: [PATCH] Reduced number of NVML function calls --- README.md | 2 +- evga-card.h | 1 + evga-icx.c | 4 +++- nvidia-sensors.c | 24 ++++++++++-------------- nvidia-sensors.h | 2 +- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index e3f480d..c4fc66f 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Note that when controlling fans directly through iCX3 they will fall offline fro ```text Available options: ---i2c N : Only probe I2C bus N instead of all (may help with stuttering with --watch) +--i2c N : Only probe I2C bus N instead of all (may help with stuttering or freezing when probing I2C devices) --gpu N : Control only GPU N instead of all supported cards --fan SPEED : Set all fans at once to SPEED (see below) --fanN SPEED : Set fan N (0-3) to SPEED diff --git a/evga-card.h b/evga-card.h index 79189bb..b6adc94 100644 --- a/evga-card.h +++ b/evga-card.h @@ -91,6 +91,7 @@ struct card_info { int i2c_fd; /* File descriptor for the i2c device file, for re-use */ int product_id; /* EVGA internal product ID, as reported by the iCX3 controller */ unsigned int bar0; /* Address of the card's PCI base address register */ + void *nvml_device; /* NVML device handle (an nvmlDevice_t stored as void *), NULL if no handle could be obtained */ }; struct gpu_pci_info { diff --git a/evga-icx.c b/evga-icx.c index 0b92c12..328ddc9 100644 --- a/evga-icx.c +++ b/evga-icx.c @@ -24,7 +24,7 @@ char *header_start = ""; char *header_end = ""; static const char helpstring[] = "Available options:\n" - "--i2c N : Only probe I2C bus N instead of all (may help with stuttering with --watch)\n" + "--i2c N : Only probe I2C bus N instead of all (may help with stuttering or freezing when probing I2C devices)\n" "--gpu N : Control only GPU N instead of all supported cards\n" "--fan SPEED : Set all fans at once to SPEED (see below)\n" "--fanN SPEED : Set fan N (0-3) to 
SPEED\n" @@ -163,6 +163,8 @@ int main (int argc, char **argv) /* NVML init */ #ifdef USE_NVML init_nvml(); + for (int i = 0; i < gpu_count; i++) + get_nvml_handle(&gpus[i]); #endif /* PCI init for VRAM/hotspot temps */ diff --git a/nvidia-sensors.c b/nvidia-sensors.c index 1b258df..3b6eaf9 100644 --- a/nvidia-sensors.c +++ b/nvidia-sensors.c @@ -10,15 +10,14 @@ void init_nvml() printf("Could not init NVML: %s\n", nvmlErrorString(result)); } -int get_nvml_handle(nvmlDevice_t *device, struct card_info *card) +void get_nvml_handle(struct card_info *card) { nvmlReturn_t result; - result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, device); + result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, (nvmlDevice_t *)&card->nvml_device); /* nvmlDevice_t is an opaque pointer type: let NVML write the handle itself into the void * slot rather than through an unallocated pointer */ if (result != NVML_SUCCESS) { - printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); - return 0; + printf("Failed to get NVML device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); + card->nvml_device = NULL; } - return 1; } void print_nvml_clock_reason(int compact, struct card_info *card) @@ -52,12 +51,11 @@ void print_nvml_clock_reason(int compact, struct card_info *card) unsigned int get_nvml_temp(struct card_info *card) { - nvmlDevice_t nvml_device; - if (!get_nvml_handle(&nvml_device, card)) + if (card->nvml_device == NULL) return 0; unsigned int temp; - nvmlReturn_t result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp); + nvmlReturn_t result = nvmlDeviceGetTemperature((nvmlDevice_t)card->nvml_device, NVML_TEMPERATURE_GPU, &temp); if (result != NVML_SUCCESS) { printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); return 0; @@ -67,12 +65,11 @@ unsigned int get_nvml_temp(struct card_info *card) unsigned long long get_nvml_clock_reasons(struct card_info *card) { - nvmlDevice_t nvml_device; - if (!get_nvml_handle(&nvml_device, card)) + if (card->nvml_device == NULL) return 0; unsigned long long reasons; - nvmlReturn_t 
result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ; + nvmlReturn_t result = nvmlDeviceGetCurrentClocksEventReasons((nvmlDevice_t)card->nvml_device, &reasons) ; if (result != NVML_SUCCESS) { printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); return 0; @@ -83,12 +80,11 @@ unsigned long long get_nvml_clock_reasons(struct card_info *card) unsigned int get_nvml_mem_util(struct card_info *card) { - nvmlDevice_t nvml_device; - if (!get_nvml_handle(&nvml_device, card)) + if (card->nvml_device == NULL) return 0; nvmlUtilization_t util; - nvmlReturn_t result = nvmlDeviceGetUtilizationRates(nvml_device, &util); + nvmlReturn_t result = nvmlDeviceGetUtilizationRates((nvmlDevice_t)card->nvml_device, &util); if (result != NVML_SUCCESS) { printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result)); return 0; diff --git a/nvidia-sensors.h b/nvidia-sensors.h index f9b0301..155813f 100644 --- a/nvidia-sensors.h +++ b/nvidia-sensors.h @@ -22,7 +22,7 @@ static struct clock_reason clock_reason_names[] = }; void init_nvml(); -int get_nvml_handle(nvmlDevice_t *device, struct card_info *card); +void get_nvml_handle(struct card_info *card); void print_nvml_clock_reason(int compact, struct card_info *card); unsigned int get_nvml_temp(struct card_info *card); unsigned long long get_nvml_clock_reasons(struct card_info *card);