Added support for some NVML sensors.
GPU temp sensor and clock reasons.
This commit is contained in:
parent
4cb9ef42a6
commit
4c5321bc4e
@ -4,13 +4,20 @@ OBJS = evga-icx.o evga-card.o icx3.o
|
||||
LDLIBS = -li2c
|
||||
CFLAGS = -MD
|
||||
|
||||
ifdef USE_NVML
|
||||
LDLIBS += -lnvidia-ml
|
||||
CFLAGS += -DUSE_NVML
|
||||
OBJS += nvidia-sensors.o
|
||||
endif
|
||||
|
||||
evga-icx : $(OBJS)
|
||||
|
||||
debug : CFLAGS += -g -O0
|
||||
debug : evga-icx
|
||||
|
||||
clean :
|
||||
rm evga-icx $(OBJS)
|
||||
rm evga-icx
|
||||
rm *.o
|
||||
rm *.d
|
||||
|
||||
-include $(OBJS:.o=.d)
|
@ -24,6 +24,11 @@ Access to the `/dev/i2c` device files, which means either:
|
||||
## Building
|
||||
`make`
|
||||
|
||||
## Optional features
|
||||
|
||||
### NVML support
|
||||
Add the make flag `USE_NVML=1` and it will also display the main GPU temperature ("GPU1") as reported by the NVIDIA driver. It will also display the performance cap/clock reason.
|
||||
|
||||
## Usage
|
||||
Note that when controlling fans directly through iCX3 they will fall offline from the Nvidia driver and show as 0 RPM until you return them to automatic mode.
|
||||
|
||||
|
66
evga-icx.c
66
evga-icx.c
@ -5,6 +5,10 @@
|
||||
#include <fcntl.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef USE_NVML
|
||||
#include "nvidia-sensors.h"
|
||||
#endif
|
||||
|
||||
#include "icx3.h"
|
||||
#include "evga-card.h"
|
||||
|
||||
@ -32,7 +36,7 @@ int main (int argc, char **argv)
|
||||
int print_info = 0;
|
||||
int compact = 0;
|
||||
int gpu_num = -1; /* Card to control */
|
||||
int watch = -1;
|
||||
unsigned int watch = 0;
|
||||
char *fan_speed[ICX3_MAX_FANS] = {NULL};
|
||||
|
||||
/* Input parsing */
|
||||
@ -125,33 +129,57 @@ int main (int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
/* NVML init */
|
||||
#ifdef USE_NVML
|
||||
int nvml_ok = init_nvml();
|
||||
nvmlDevice_t* device[MAX_GPUS];
|
||||
#endif
|
||||
|
||||
/* print sensor info */
|
||||
print:
|
||||
if (print_info) {
|
||||
if (gpu_num == -1) {
|
||||
for (int i = 0; i < gpu_count; i++){
|
||||
print_gpu_info(i, gpus, compact);
|
||||
}
|
||||
} else if (gpu_num <= gpu_count - 1) {
|
||||
print_gpu_info(gpu_num, gpus, compact);
|
||||
}
|
||||
}
|
||||
if (watch > 0) {
|
||||
sleep(watch);
|
||||
goto print;
|
||||
do {
|
||||
if (gpu_num == -1) {
|
||||
for (int i = 0; i < gpu_count; i++){
|
||||
print_gpu_info(i, &gpus[i], compact);
|
||||
}
|
||||
} else if (gpu_num <= gpu_count - 1) {
|
||||
print_gpu_info(gpu_num, &gpus[gpu_num], compact);
|
||||
}
|
||||
sleep(watch);
|
||||
} while (watch > 0);
|
||||
}
|
||||
|
||||
#ifdef USE_NVML
|
||||
nvmlShutdown();
|
||||
#endif
|
||||
}
|
||||
|
||||
void print_gpu_info(int gpu_num, struct card_info gpus[], int compact) {
|
||||
void print_gpu_info(int gpu_num, struct card_info *gpu, int compact) {
|
||||
if (compact) {
|
||||
/* One line per GPU */
|
||||
printf("#%d ", gpu_num);
|
||||
print_icx3_fans_oneline(&gpus[gpu_num]);
|
||||
print_icx3_temps_oneline(&gpus[gpu_num]);
|
||||
print_icx3_fans_oneline(gpu);
|
||||
printf(" GPU");
|
||||
#ifdef USE_NVML
|
||||
print_nvml_temp(1, gpu);
|
||||
#endif
|
||||
print_icx3_temps_oneline(gpu);
|
||||
printf("°C");
|
||||
#ifdef USE_NVML
|
||||
print_nvml_clock_reason(1, gpu);
|
||||
#endif
|
||||
printf("\n");
|
||||
} else {
|
||||
printf("#%d: %s (%s) @ %s\n", gpu_num, gpus[gpu_num].card_name, gpus[gpu_num].i2c_dev_path, gpus[gpu_num].pci_id);
|
||||
print_icx3_fans(&gpus[gpu_num]);
|
||||
print_icx3_temps(&gpus[gpu_num]);
|
||||
/* One line per GPU sensor */
|
||||
printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id);
|
||||
print_icx3_fans(gpu);
|
||||
#ifdef USE_NVML
|
||||
print_nvml_temp(0, gpu);
|
||||
#endif
|
||||
print_icx3_temps(gpu);
|
||||
#ifdef USE_NVML
|
||||
print_nvml_clock_reason(0, gpu);
|
||||
#endif
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
|
4
icx3.c
4
icx3.c
@ -89,11 +89,11 @@ void print_icx3_temps_oneline(struct card_info *card)
|
||||
get_temp_sensors(temps, card);
|
||||
|
||||
for (int i=0; i<ICX3_NUM_TEMP_SENSORS; i++) {
|
||||
if (i == 0 || strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
|
||||
/* If this math seems a little jank, it's so we can optionally inject the NVML and vram temps into the oneline */
|
||||
if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
|
||||
printf(" %.3s", icx3_temp_sensor_names[i]);
|
||||
printf(" %3.0f", temps[i]);
|
||||
}
|
||||
printf("°C");
|
||||
}
|
||||
|
||||
void get_available_fans(char *fans_avail, struct card_info *card)
|
||||
|
95
nvidia-sensors.c
Normal file
95
nvidia-sensors.c
Normal file
@ -0,0 +1,95 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#include "nvidia-sensors.h"
|
||||
|
||||
int init_nvml()
|
||||
{
|
||||
nvmlReturn_t result;
|
||||
result = nvmlInit_v2();
|
||||
if (result != NVML_SUCCESS) {
|
||||
printf("Could not init NVML: %s\n", nvmlErrorString(result));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Print the temperature returned by get_nvml_temp() for the given card.
 *
 * compact != 0: prints " NNN" with no label and no newline, so it can be
 *               embedded in a one-line summary.
 * compact == 0: prints a full "GPU1: +NN°C" line.
 */
void print_nvml_temp(int compact, struct card_info *card)
{
    unsigned int temp = get_nvml_temp(card);

    if (compact) {
        /* %u matches the unsigned value; output is the same as before. */
        printf(" %3u", temp);
    } else {
        /* The '+' flag only applies to signed conversions, so convert
         * explicitly rather than passing an unsigned int to %d. */
        printf("GPU1: %+d°C\n", (int)temp);
    }
}
|
||||
|
||||
void print_nvml_clock_reason(int compact, struct card_info *card)
|
||||
{
|
||||
unsigned long long reasons = get_nvml_clock_reasons(card);
|
||||
int single_reason = 1;
|
||||
|
||||
if (compact)
|
||||
printf(" CLK ");
|
||||
else
|
||||
printf("Clock reasons: ");
|
||||
|
||||
for (int i = 0; i < (sizeof(clock_reason_names) / sizeof(struct clock_reason)); i++) {
|
||||
if (reasons & clock_reason_names[i].mask) {
|
||||
if (!single_reason) {
|
||||
if (compact)
|
||||
printf(",");
|
||||
else
|
||||
printf(", ");
|
||||
}
|
||||
single_reason = 0;
|
||||
|
||||
if (compact)
|
||||
printf("%s", clock_reason_names[i].short_name);
|
||||
else
|
||||
printf("%s", clock_reason_names[i].long_name);
|
||||
}
|
||||
}
|
||||
|
||||
if (single_reason)
|
||||
printf("None");
|
||||
|
||||
if (!compact)
|
||||
printf(" (0x%llx)\n", reasons);
|
||||
}
|
||||
|
||||
unsigned int get_nvml_temp(struct card_info *card)
|
||||
{
|
||||
nvmlReturn_t result;
|
||||
nvmlDevice_t nvml_device;
|
||||
result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int temp;
|
||||
result = nvmlDeviceGetTemperature(nvml_device, NVML_TEMPERATURE_GPU, &temp);
|
||||
if (result != NVML_SUCCESS) {
|
||||
printf("Failed to get temperature for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
|
||||
return 0;
|
||||
}
|
||||
return temp;
|
||||
}
|
||||
|
||||
unsigned long long get_nvml_clock_reasons(struct card_info *card)
|
||||
{
|
||||
nvmlReturn_t result;
|
||||
nvmlDevice_t nvml_device;
|
||||
result = nvmlDeviceGetHandleByPciBusId_v2(card->pci_id, &nvml_device);
|
||||
if (result != NVML_SUCCESS) {
|
||||
printf("Failed to get device handle for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned long long reasons;
|
||||
result = nvmlDeviceGetCurrentClocksEventReasons(nvml_device, &reasons) ;
|
||||
if (result != NVML_SUCCESS) {
|
||||
printf("Failed to get clock reasons for card at %s: %s\n", card->pci_id, nvmlErrorString(result));
|
||||
return 0;
|
||||
}
|
||||
|
||||
return reasons;
|
||||
}
|
||||
|
28
nvidia-sensors.h
Normal file
28
nvidia-sensors.h
Normal file
@ -0,0 +1,28 @@
|
||||
#include <nvml.h>
|
||||
|
||||
#include "evga-card.h"
|
||||
|
||||
/*
 * One entry of the clock-reason lookup table: a bitmask value paired with
 * the two labels used when printing it.
 */
struct clock_reason {
    unsigned long long mask;    /* Bit(s) tested against the reasons bitmask */
    const char *short_name;     /* Label used in compact (one-line) output */
    const char *long_name;      /* Label used in full output */
};
|
||||
|
||||
static struct clock_reason clock_reason_names[] =
|
||||
{
|
||||
{nvmlClocksEventReasonGpuIdle, "Idle", "GPU idle"},
|
||||
{nvmlClocksEventReasonApplicationsClocksSetting, "AppClk", "Application clocks"},
|
||||
{nvmlClocksEventReasonSwPowerCap, "Pwr", "Power cap"},
|
||||
{nvmlClocksThrottleReasonHwSlowdown, "HWSlow", "Hardware slowdown"},
|
||||
{nvmlClocksEventReasonSyncBoost, "Sync", "Sync boost"},
|
||||
{nvmlClocksEventReasonSwThermalSlowdown, "SWTherm", "Software thermal"},
|
||||
{nvmlClocksThrottleReasonHwThermalSlowdown, "HWTherm", "Hardware thermal"},
|
||||
{nvmlClocksThrottleReasonHwPowerBrakeSlowdown, "HWPower", "Hardware power brake"},
|
||||
{nvmlClocksEventReasonDisplayClockSetting, "DispClk", "Display clock"}
|
||||
};
|
||||
|
||||
/* Initialise NVML. Returns 0 if NVML could not be initialised. */
int init_nvml(void);

/* Print the NVML GPU temperature for a card; compact selects one-line output. */
void print_nvml_temp(int compact, struct card_info *card);

/* Print the active clock reasons for a card; compact selects one-line output. */
void print_nvml_clock_reason(int compact, struct card_info *card);

/* Return the NVML GPU temperature for a card, or 0 if the query fails. */
unsigned int get_nvml_temp(struct card_info *card);

/* Return the NVML clock reasons bitmask for a card, or 0 if the query fails. */
unsigned long long get_nvml_clock_reasons(struct card_info *card);
|
Loading…
x
Reference in New Issue
Block a user