94 lines
3.1 KiB
Plaintext
94 lines
3.1 KiB
Plaintext
// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include <fstream>
|
|
|
|
#include "CUDAWrapper.h"
|
|
#include "JFJochException.h"
|
|
|
|
// Turn a CUDA runtime status code into an exception.
// Returns normally when `val` is cudaSuccess; for any other value throws a
// JFJochException in the GPUCUDAError category whose message is the text
// returned by cudaGetErrorString(val).
inline void cuda_err(cudaError_t val) {
    if (val == cudaSuccess)
        return;

    throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
|
|
|
|
int32_t get_gpu_count() {
|
|
int device_count;
|
|
cudaError_t val = cudaGetDeviceCount(&device_count);
|
|
switch (val) {
|
|
case cudaSuccess:
|
|
return device_count;
|
|
case cudaErrorNoDevice:
|
|
case cudaErrorInsufficientDriver:
|
|
return 0;
|
|
default:
|
|
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
|
|
}
|
|
|
|
}
|
|
|
|
void set_gpu(int32_t dev_id) {
|
|
auto dev_count = get_gpu_count();
|
|
|
|
// Ignore if no GPU present
|
|
if (dev_count > 0) {
|
|
if ((dev_id < 0) || (dev_id >= dev_count))
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Device ID cannot be negative");
|
|
|
|
cuda_err(cudaSetDevice(dev_id));
|
|
}
|
|
}
|
|
|
|
// Return CUDA device PCI Bus ID as "domain:bus:device.function", e.g., "0000:65:00.0"
|
|
static std::string get_cuda_device_pci_bus_id(int dev_id) {
|
|
// CUDA API provides cudaDeviceGetPCIBusId
|
|
char buf[64] = {0};
|
|
cudaDeviceProp prop;
|
|
cudaError_t st = cudaGetDeviceProperties(&prop, dev_id);
|
|
if (st != cudaSuccess) {
|
|
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(st));
|
|
}
|
|
// Prefer cudaDeviceGetPCIBusId for full id including domain and function
|
|
cudaError_t st2 = cudaDeviceGetPCIBusId(buf, static_cast<int>(sizeof(buf)), dev_id);
|
|
if (st2 == cudaSuccess) {
|
|
return std::string(buf);
|
|
}
|
|
// Fallback: synthesize from properties (domain may be missing on very old drivers)
|
|
// Note: function is typically ".0"
|
|
char alt[64];
|
|
std::snprintf(alt, sizeof(alt), "%04x:%02x:%02x.%u",
|
|
prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, 0u);
|
|
return std::string(alt);
|
|
}
|
|
|
|
// Resolve NUMA node from PCI address using Linux sysfs
|
|
// Returns:
|
|
// >=0 NUMA node index
|
|
// -1 if NUMA node is not available/unknown
|
|
int get_gpu_numa_node(int dev_id) {
|
|
auto dev_count = get_gpu_count();
|
|
if (dev_count <= 0) return -1;
|
|
if (dev_id < 0 || dev_id >= dev_count) {
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Invalid CUDA device ID");
|
|
}
|
|
|
|
// We don't need to call cudaSetDevice here; querying by id is sufficient.
|
|
const std::string pci_bus_id = get_cuda_device_pci_bus_id(dev_id); // "dddd:bb:dd.f"
|
|
|
|
// sysfs path for PCI device. Examples:
|
|
// - /sys/bus/pci/devices/0000:65:00.0/numa_node
|
|
const std::string sysfs_path = std::string("/sys/bus/pci/devices/") + pci_bus_id + "/numa_node";
|
|
|
|
std::ifstream f(sysfs_path);
|
|
if (!f.is_open()) {
|
|
// On some systems, the symlink may be via /sys/class/drm or nvidia, but primary path should exist.
|
|
return -1;
|
|
}
|
|
|
|
int numa = -1;
|
|
f >> numa;
|
|
if (!f.good()) {
|
|
return -1;
|
|
}
|
|
return numa;
|
|
}
|