/*
 * evga-icx/evga-icx.c
 *
 * (Source listing metadata: 391 lines, 12 KiB, C)
 */

#include <unistd.h>
#include <dirent.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#ifdef USE_NVML
#include "nvidia-sensors.h"
#endif
#ifdef USE_LIBPCI
#include "gddr6.h"
#endif
#include "icx3.h"
#include "evga-card.h"
#include "board-sensors.h"
#include "zen3-rapl.h"
/* Capacities of the fixed-size GPU and board-sensor arrays declared in main() */
#define MAX_GPUS 16
#define MAX_BOARD_SENSORS 256
/* ANSI SGR escape sequences: cyan foreground / restore default foreground */
#define HEADER_COLOR_START "\x1b[36m"
#define HEADER_COLOR_END "\x1b[39m"
/* Text printed before/after each compact-mode header. Empty unless --color is
   given. These only ever point at string literals, hence pointer-to-const. */
static const char *header_start = "";
static const char *header_end = "";
static int zen3_rapl_sensor = -1; /* Index into the board sensor array of the RAPL sensor, -1 if none */
static int compact = 0; /* Compact one-line per GPU display */
static int no_reasons = 0; /* Don't probe or display NVML clock reasons */
/* Usage text shown when an option is missing its argument or is not recognized.
   It has no trailing newline.
   NOTE: the text contains a literal '%' ("manual % speed"), so it has to be
   written out as plain data (fputs, or printf with a "%s" format) and never be
   used as a printf format string itself. */
static const char helpstring[] = "Available options:\n"
"--i2c N : Only probe I2C bus N instead of all (may help with stuttering or freezing when probing I2C devices)\n"
"--gpu N : Control only GPU N instead of all supported cards\n"
"--fan SPEED : Set all fans at once to SPEED (see below)\n"
"--fanN SPEED : Set fan N (0-3) to SPEED\n"
" SPEED may be one of the following:\n"
" 'auto' to return the fan to its default control mode\n"
" N to set the fan to that manual % speed\n"
" [+/-]N to set that fan to an RPM offset from the GPU-controlled speed\n"
"--reset : Reset all fans to their default mode\n"
"--sensors : Print sensor readings even if setting a fan speed \n"
"--compact : Print sensor reading in a compact one-line per card format\n"
"--watch N : Keep printing output every N seconds\n"
"--overwrite : Overwrite previously displayed info with --watch and --compact instead of continuously logging\n"
"--color : Print headers in color in --compact mode for better readability\n"
"--no-reasons : Do not query NVML for clock reasons (can cause stuttering)\n"
"--board : Also print temperatures from the CPU, motherboard, and other sensors";
/* Print the readings of one GPU. gpu_num is only the number shown in the
   output; gpu points at the single card to read (not at an array of cards). */
void print_gpu_info(int gpu_num, struct card_info *gpu);
/* Print the readings of the num_sensors board sensors in board_sensors. */
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors);
/* Write the usage text verbatim. helpstring contains a literal '%', so it must
   not be handed to printf() as the format string. */
static void print_help(void)
{
    fputs(helpstring, stdout);
}

/*
 * Program entry point.
 *
 * Parses the command line, scans for supported GPUs, applies any requested fan
 * settings and then, unless only fan commands were given, prints the sensor
 * readings once or repeatedly (--watch).
 *
 * Returns 0 on success and after printing the help for an unrecognized option;
 * returns -1 when an option lacks its argument, a fan or GPU number is out of
 * range, the I2C scan fails or finds no supported GPU, or (NVML builds) a
 * device handle cannot be allocated.
 */
int main (int argc, char **argv)
{
    struct card_info gpus[MAX_GPUS];
    struct hwmon_avail_sensor board_sensors[MAX_BOARD_SENSORS];
    int gpu_count;
    int board_sensor_count = 0;
    int print_info = 0;
    int gpu_num = -1;               /* Card to control, -1 for all */
    int i2c_bus = -1;               /* Specific i2c bus to probe instead of all */
    int overwrite = 0;              /* Overwrite printed console info in compact mode */
    unsigned int watch = 0;         /* Refresh display every this many seconds */
    int print_board_sensors = 0;    /* Print CPU/motherboard/other sensors as well */
    char *fan_speed[ICX3_MAX_FANS] = {NULL};    /* Requested setting per fan, NULL = leave alone */

    /* Input parsing */
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--i2c") == 0) {
            if (++i >= argc) {
                print_help();
                return -1;
            }
            i2c_bus = atoi(argv[i]);
        } else if (strcmp(argv[i], "--gpu") == 0) {
            if (++i >= argc) {
                print_help();
                return -1;
            }
            gpu_num = atoi(argv[i]);
        } else if (strcmp(argv[i], "--fan") == 0) {
            if (++i >= argc) {
                print_help();
                return -1;
            }
            for (int j = 0; j < ICX3_MAX_FANS; j++)
                fan_speed[j] = argv[i];
        } else if (strncmp(argv[i], "--fan", 5) == 0) {
            /* --fanN: N must be a plain number that is a valid index into
               fan_speed[], otherwise we would write outside the array */
            const char *digits = argv[i] + 5;
            char *end = NULL;
            long fan_num = strtol(digits, &end, 10);

            if (end == digits || *end != '\0' || fan_num < 0 || fan_num >= ICX3_MAX_FANS) {
                printf("Invalid fan option %s (fan number must be 0-%d)\n", argv[i], (int)(ICX3_MAX_FANS - 1));
                return -1;
            }
            if (++i >= argc) {
                print_help();
                return -1;
            }
            fan_speed[fan_num] = argv[i];
        } else if (strcmp(argv[i], "--reset") == 0) {
            for (int j = 0; j < ICX3_MAX_FANS; j++)
                fan_speed[j] = "auto";
        } else if (strcmp(argv[i], "--sensors") == 0) {
            print_info = 1;
        } else if (strcmp(argv[i], "--compact") == 0) {
            compact = 1;
        } else if (strcmp(argv[i], "--watch") == 0) {
            if (++i >= argc) {
                print_help();
                return -1;
            }
            /* A negative interval would wrap to a huge unsigned sleep; treat it as "print once" */
            int seconds = atoi(argv[i]);
            watch = (seconds > 0) ? (unsigned int)seconds : 0;
        } else if (strcmp(argv[i], "--overwrite") == 0) {
            overwrite = 1;
        } else if (strcmp(argv[i], "--color") == 0) {
            header_start = HEADER_COLOR_START;
            header_end = HEADER_COLOR_END;
        } else if (strcmp(argv[i], "--no-reasons") == 0) {
            no_reasons = 1;
        } else if (strcmp(argv[i], "--board") == 0) {
            print_board_sensors = 1;
        } else {
            print_help();
            return 0;
        }
    }

    if (print_info == 0) {
        /* Check for no fan commands given, so display info by default */
        print_info = 1;
        for (int i = 0; i < ICX3_MAX_FANS; i++) {
            if (fan_speed[i] != NULL)
                print_info = 0;
        }
    }

    /* Don't use overwrite mode unless set to compact (we can't tell how many lines the output will be per GPU) */
    if (overwrite && !compact)
        overwrite = 0;

    /* Scan for supported GPUs */
    gpu_count = find_evga_gpu_i2cs(gpus, MAX_GPUS, i2c_bus);

    /* Check for no GPUs found or other errors */
    if (gpu_count == -1) {
        printf("Error scanning I2C devices\n");
        return -1;
    } else if (gpu_count == 0) {
        printf("No supported GPUs found.\nAre you root or do you have udev access to i2c devices?\nDo you need to run `modprobe i2c-dev`?\n");
        return -1;
    }

    /* Check for invalid GPUs: only -1 (all cards) or an index of a found card
       is acceptable, anything else would index outside gpus[] below */
    if (gpu_num < -1 || gpu_num > gpu_count - 1) {
        printf("Invalid GPU number specified (%d, max %d)\n", gpu_num, gpu_count - 1);
        return -1;
    }

    /* Scan for motherboard/CPU/other sensors */
    if (print_board_sensors) {
        board_sensor_count = find_board_sensors(board_sensors, MAX_BOARD_SENSORS);
        /* NOTE(review): assumes a negative return would signal a scan error; never use it as an index */
        if (board_sensor_count < 0)
            board_sensor_count = 0;
        if (init_rapl() && board_sensor_count < MAX_BOARD_SENSORS) {
            /* Append the RAPL power sensor and remember where it lives */
            board_sensors[board_sensor_count] = rapl_sensor;
            zen3_rapl_sensor = board_sensor_count;
            board_sensor_count++;
        }
    }

    /* execute fan commands */
    if (gpu_num == -1) {
        for (int i = 0; i < gpu_count; i++) {
            for (int j = 0; j < ICX3_MAX_FANS; j++) {
                if (fan_speed[j] != NULL)
                    set_fan(j, fan_speed[j], &gpus[i]);
            }
        }
    } else {
        for (int j = 0; j < ICX3_MAX_FANS; j++) {
            /* First argument is the fan index, not the GPU number */
            if (fan_speed[j] != NULL)
                set_fan(j, fan_speed[j], &gpus[gpu_num]);
        }
    }

    /* NVML init */
#ifdef USE_NVML
    init_nvml();
    for (int i = 0; i < gpu_count; i++) {
        gpus[i].nvml_device = malloc(sizeof(nvmlDevice_t));
        if (gpus[i].nvml_device == NULL) {
            printf("Out of memory allocating NVML device handle\n");
            nvmlShutdown();
            return -1;
        }
        get_nvml_handle(&gpus[i]);
    }
#endif

    /* PCI init for VRAM/hotspot temps */
#ifdef USE_LIBPCI
    for (int i = 0; i < gpu_count; i++)
        init_gddr6(&gpus[i]);
#endif

    /* print sensor info */
    if (print_info) {
        /* Number of GPU lines one refresh prints in compact mode */
        const int shown_gpus = (gpu_num == -1) ? gpu_count : 1;

        do {
            if (overwrite)
                printf("\x1b[K"); /* Clear current console line */

            if (print_board_sensors)
                print_board_info(board_sensors, board_sensor_count);

            if (gpu_num == -1) {
                /* No GPU specified on command line, loop over all supported GPUs */
                for (int i = 0; i < gpu_count; i++) {
                    if (i > 0)
                        printf("\n");
                    print_gpu_info(i, &gpus[i]);
                }
            } else {
                print_gpu_info(gpu_num, &gpus[gpu_num]);
            }

            if (!overwrite)
                printf("\n"); /* Print line break at the end for continuous output */

            if (overwrite && compact) {
                /* The cursor sits on the last printed line: go up one line per
                   additional GPU line, plus one for the board sensor line */
                const int lines_up = shown_gpus - 1 + print_board_sensors;

                printf("\x1b[1G"); /* Move cursor back to column 1 */
                if (lines_up > 0)
                    printf("\x1b[%dA", lines_up); /* Move cursor back up to the top of gpu list */
            }

            fflush(stdout);
            sleep(watch);
        } while (watch > 0);
    }

#ifdef USE_NVML
    nvmlShutdown();
#endif
    return 0;
}
/*
 * Print one reading for every available board sensor.
 *
 * board_sensors  array of sensors to read (also updated in place for the
 *                RAPL entry, see below)
 * num_sensors    number of valid entries in board_sensors
 *
 * Sensors are emitted in the order of the hwmon_sensor_info table: for each
 * table index, every sensor whose sort_index equals that index is printed.
 * In compact mode everything goes on one line, with the section name and the
 * units only printed when they change; otherwise one line per sensor is used.
 * Sensors that fail to produce a reading are skipped. A single newline is
 * always printed at the end.
 */
void print_board_info(struct hwmon_avail_sensor *board_sensors, int num_sensors)
{
    const size_t num_sensor_types = sizeof(hwmon_sensor_info) / sizeof(struct hwmon_sensor);
    /* zen3_rapl_sensor is -1 when no RAPL sensor was registered; only touch the
       array when it is a real index */
    const int have_rapl = (zen3_rapl_sensor >= 0 && zen3_rapl_sensor < num_sensors);
    /* These allow us to 'summarize' units and categories by only printing them when they change */
    const char *last_short_name = NULL;
    const char *last_units = NULL;

    for (size_t i = 0; i < num_sensor_types; i++) {
        const int sort_index = (int)i;

        /* Inject our Zen RAPL power reading here: give it the position and
           description of the "zen-rapl" table entry */
        if (have_rapl && strcmp(hwmon_sensor_info[i].driver_name, "zen-rapl") == 0) {
            board_sensors[zen3_rapl_sensor].sort_index = sort_index;
            board_sensors[zen3_rapl_sensor].sensor_info = &hwmon_sensor_info[i];
        }

        /* Loop over all sensors, but only output those with the current sort index so they come out sort of sorted.
           Duplicates (e.g. multiple NVMe) will come out in whatever order the directory listing happened to be in */
        for (int j = 0; j < num_sensors; j++) {
            struct hwmon_avail_sensor *current_sensor = &board_sensors[j];
            float current_reading = 0.0f;
            int good_reading;

            if (current_sensor->sort_index != sort_index)
                continue;

            if (j == zen3_rapl_sensor) {
                good_reading = 1;
                current_reading = get_rapl_package_power();
            } else {
                good_reading = get_sensor_reading(current_sensor, &current_reading);
            }
            if (!good_reading)
                continue;

            if (compact) {
                /* Close the previous run of readings with its units when the units change */
                if (last_units != NULL && strcmp(current_sensor->sensor_info->units, last_units))
                    printf("%s", last_units);

                /* Print new section header if needed */
                if (last_short_name == NULL || strcmp(current_sensor->sensor_info->short_name, last_short_name)) {
                    if (last_short_name != NULL) /* Spacer for all headings not the first one */
                        printf(" ");
                    printf("%s%s%s", header_start, current_sensor->sensor_info->short_name, header_end);
                }

                printf(" %3.0f", current_reading);
                last_short_name = current_sensor->sensor_info->short_name;
                last_units = current_sensor->sensor_info->units;
            } else {
                printf("%s%s: %+.1f%s\n",
                       current_sensor->sensor_info->name_prefix,
                       current_sensor->sensor_name,
                       current_reading,
                       current_sensor->sensor_info->units);
            }
        }
    }

    /* Units of the final run of compact readings */
    if (compact && last_units != NULL)
        printf("%s", last_units);

    printf("\n");
}
void print_gpu_info(int gpu_num, struct card_info *gpu)
{
if (compact) {
/* One line per GPU */
printf("%s#%d FAN%s", header_start, gpu_num, header_end);
print_icx3_fans_oneline(gpu);
printf("%s GPU%s", header_start, header_end);
#ifdef USE_NVML
printf(" %3d", get_nvml_temp(gpu));
#endif
float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
get_temp_sensors(icx_temp_sensors, gpu);
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
if (i > 0 && strncmp(icx3_temp_sensor_names[i], icx3_temp_sensor_names[i-1], 3))
printf("%s %.3s%s", header_start, icx3_temp_sensor_names[i], header_end);
#ifdef USE_LIBPCI
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
printf(" %3.0f", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
#endif
printf(" %3.0f", icx_temp_sensors[i]);
}
#ifdef USE_LIBPCI
printf("%s HOT%s %3.0f", header_start, header_end, get_hotspot_temp(gpu));
#endif
printf("°C ");
#ifdef USE_NVML
printf("%s MEM %s", header_start, header_end);
printf("%3d%%", get_nvml_mem_util(gpu));
if (!no_reasons) {
printf("%s CLK %s", header_start, header_end);
print_nvml_clock_reason(1, gpu);
}
#endif
} else {
/* One line per GPU sensor */
printf("#%d: %s (%s) @ %s\n", gpu_num, gpu->card_name, gpu->i2c_dev_path, gpu->pci_id);
print_icx3_fans(gpu);
#ifdef USE_NVML
printf("GPU1: %+d°C\n", get_nvml_temp(gpu));
#endif
float icx_temp_sensors[ICX3_NUM_TEMP_SENSORS] = {};
get_temp_sensors(icx_temp_sensors, gpu);
for (int i = 0; i < ICX3_NUM_TEMP_SENSORS; i++) {
#ifdef USE_LIBPCI
if (strncmp(icx3_temp_sensor_names[i], "MEM1", 4) == 0)
printf("VRAM: +%.0f°C\n", get_vram_temp(gpu)); /* Print the VRAM temp before the rest of the memory sensors */
#endif
printf("%s: %+.1f°C\n",
icx3_temp_sensor_names[i],
icx_temp_sensors[i]);
}
#ifdef USE_LIBPCI
printf("HotSpot: +%.0f°C\n", get_hotspot_temp(gpu));
#endif
#ifdef USE_NVML
printf("Mem util: %d%%\n", get_nvml_mem_util(gpu));
if (!no_reasons) {
printf("Clock reasons: ");
print_nvml_clock_reason(0, gpu);
}
printf("\n");
#endif
}
}