// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute
// SPDX-License-Identifier: GPL-3.0-only

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>

#include <cuda_runtime.h>

#include "CUDAWrapper.h"
#include "JFJochException.h"

// Throws JFJochException (category GPUCUDAError) carrying the CUDA error string
// when `val` is anything other than cudaSuccess; otherwise does nothing.
inline void cuda_err(cudaError_t val) {
    if (val != cudaSuccess)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}

// Returns the number of CUDA devices reported by cudaGetDeviceCount.
// cudaErrorNoDevice and cudaErrorInsufficientDriver are reported as 0 devices
// rather than as errors; any other failure throws JFJochException (GPUCUDAError).
int32_t get_gpu_count() {
    int device_count = 0;
    cudaError_t val = cudaGetDeviceCount(&device_count);
    switch (val) {
        case cudaSuccess:
            return device_count;
        case cudaErrorNoDevice:
        case cudaErrorInsufficientDriver:
            return 0;
        default:
            throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
    }
}

// Selects the CUDA device `dev_id` via cudaSetDevice.
// The call is a no-op when get_gpu_count() reports no device.
// Throws JFJochException (InputParameterInvalid) when dev_id is outside
// [0, device count), and (GPUCUDAError) when cudaSetDevice fails.
void set_gpu(int32_t dev_id) {
    auto dev_count = get_gpu_count();

    // Ignore if no GPU present
    if (dev_count > 0) {
        if (dev_id < 0)
            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                                  "Device ID cannot be negative");
        if (dev_id >= dev_count)
            throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                                  "Device ID exceeds number of available GPUs");
        cuda_err(cudaSetDevice(dev_id));
    }
}

// Return CUDA device PCI Bus ID as "domain:bus:device.function", e.g., "0000:65:00.0"
// Throws JFJochException (GPUCUDAError) if neither the direct query nor the
// property-based fallback succeeds.
static std::string get_cuda_device_pci_bus_id(int dev_id) {
    // Prefer cudaDeviceGetPCIBusId for full id including domain and function
    char buf[64] = {0};
    if (cudaDeviceGetPCIBusId(buf, static_cast<int>(sizeof(buf)), dev_id) == cudaSuccess)
        return std::string(buf);

    // Fallback: synthesize from properties (domain may be missing on very old drivers).
    // The properties are queried only here, since the preferred path does not need them.
    cudaDeviceProp prop;
    cuda_err(cudaGetDeviceProperties(&prop, dev_id));

    // Note: function is typically ".0"
    char alt[64];
    std::snprintf(alt, sizeof(alt), "%04x:%02x:%02x.%u",
                  static_cast<unsigned>(prop.pciDomainID),
                  static_cast<unsigned>(prop.pciBusID),
                  static_cast<unsigned>(prop.pciDeviceID),
                  0u);
    return std::string(alt);
}

// Resolve NUMA node from PCI address using Linux sysfs
// Returns:
//   >=0  NUMA node index
//   -1   if NUMA node is not available/unknown (no GPU, sysfs entry missing or unreadable)
// Throws JFJochException (InputParameterInvalid) when GPUs are present and
// dev_id is outside [0, device count).
int get_gpu_numa_node(int dev_id) {
    auto dev_count = get_gpu_count();
    if (dev_count <= 0)
        return -1;

    if (dev_id < 0 || dev_id >= dev_count) {
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "Invalid CUDA device ID");
    }

    // We don't need to call cudaSetDevice here; querying by id is sufficient.
    std::string pci_bus_id = get_cuda_device_pci_bus_id(dev_id); // "dddd:bb:dd.f"

    // sysfs device directories are named with lower-case hex digits, while the
    // CUDA runtime may report the bus id with upper-case ones (e.g. "0000:3B:00.0"),
    // so normalize before building the path.
    std::transform(pci_bus_id.begin(), pci_bus_id.end(), pci_bus_id.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    // sysfs path for PCI device. Examples:
    // - /sys/bus/pci/devices/0000:65:00.0/numa_node
    const std::string sysfs_path = std::string("/sys/bus/pci/devices/") + pci_bus_id + "/numa_node";

    std::ifstream f(sysfs_path);
    if (!f.is_open()) {
        // On some systems, the symlink may be via /sys/class/drm or nvidia, but primary path should exist.
        return -1;
    }

    int numa = -1;
    // Test for extraction failure only: a successful read that merely reaches
    // end-of-file (no trailing newline) sets eofbit and must still count as valid.
    if (!(f >> numa))
        return -1;

    return numa;
}