Files
Jungfraujoch/common/CUDAWrapper.cu
2025-10-20 20:43:44 +02:00

94 lines
3.1 KiB
Plaintext

// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include <cctype>
#include <cstdio>
#include <fstream>
#include <string>
#include "CUDAWrapper.h"
#include "JFJochException.h"
// Turn a failing CUDA runtime status into an exception.
// Returns normally when `val` is cudaSuccess; otherwise throws JFJochException
// (category GPUCUDAError) carrying the text from cudaGetErrorString().
inline void cuda_err(cudaError_t val) {
    if (val == cudaSuccess)
        return;

    throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
int32_t get_gpu_count() {
int device_count;
cudaError_t val = cudaGetDeviceCount(&device_count);
switch (val) {
case cudaSuccess:
return device_count;
case cudaErrorNoDevice:
case cudaErrorInsufficientDriver:
return 0;
default:
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
}
// Make `dev_id` the current CUDA device via cudaSetDevice().
// When get_gpu_count() reports no devices the call is silently ignored.
// Throws JFJochException:
//   InputParameterInvalid  if dev_id is negative or not smaller than the device count
//   GPUCUDAError           if cudaSetDevice() fails
void set_gpu(int32_t dev_id) {
    const auto dev_count = get_gpu_count();

    // Ignore if no GPU present
    if (dev_count <= 0)
        return;

    if (dev_id < 0)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Device ID cannot be negative");

    // Upper-bound violation gets its own message (previously reported as "cannot be negative")
    if (dev_id >= dev_count)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Invalid CUDA device ID");

    cuda_err(cudaSetDevice(dev_id));
}
// Return CUDA device PCI Bus ID as "domain:bus:device.function", e.g., "0000:65:00.0"
static std::string get_cuda_device_pci_bus_id(int dev_id) {
// CUDA API provides cudaDeviceGetPCIBusId
char buf[64] = {0};
cudaDeviceProp prop;
cudaError_t st = cudaGetDeviceProperties(&prop, dev_id);
if (st != cudaSuccess) {
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(st));
}
// Prefer cudaDeviceGetPCIBusId for full id including domain and function
cudaError_t st2 = cudaDeviceGetPCIBusId(buf, static_cast<int>(sizeof(buf)), dev_id);
if (st2 == cudaSuccess) {
return std::string(buf);
}
// Fallback: synthesize from properties (domain may be missing on very old drivers)
// Note: function is typically ".0"
char alt[64];
std::snprintf(alt, sizeof(alt), "%04x:%02x:%02x.%u",
prop.pciDomainID, prop.pciBusID, prop.pciDeviceID, 0u);
return std::string(alt);
}
// Resolve NUMA node from PCI address using Linux sysfs
// (/sys/bus/pci/devices/<pci bus id>/numa_node).
// Returns:
//   >=0  NUMA node index
//   -1   if NUMA node is not available/unknown (no GPU, sysfs entry missing,
//        unparsable content, or a negative value in the file)
// Throws JFJochException:
//   InputParameterInvalid  if dev_id is outside [0, device count)
//   GPUCUDAError           if the PCI bus ID cannot be obtained
int get_gpu_numa_node(int dev_id) {
    const auto dev_count = get_gpu_count();
    if (dev_count <= 0)
        return -1;
    if (dev_id < 0 || dev_id >= dev_count) {
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid, "Invalid CUDA device ID");
    }

    // We don't need to call cudaSetDevice here; querying by id is sufficient.
    std::string pci_bus_id = get_cuda_device_pci_bus_id(dev_id); // "dddd:bb:dd.f"

    // sysfs names PCI devices with lowercase hex digits, while the CUDA runtime may
    // report them in uppercase (e.g. "0000:AF:00.0"); normalize so the path matches.
    for (char &c : pci_bus_id)
        c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));

    // sysfs path for PCI device. Examples:
    // - /sys/bus/pci/devices/0000:65:00.0/numa_node
    const std::string sysfs_path = std::string("/sys/bus/pci/devices/") + pci_bus_id + "/numa_node";

    std::ifstream f(sysfs_path);
    if (!f.is_open()) {
        // On some systems, the symlink may be via /sys/class/drm or nvidia, but primary path should exist.
        return -1;
    }

    // Test the extraction itself: good() would also be false after a successful read
    // that merely reached end-of-file (content without trailing newline).
    int numa = -1;
    if (!(f >> numa))
        return -1;

    // Any negative value is reported as the documented "unknown" value
    return (numa >= 0) ? numa : -1;
}