Intelcc+OpenMPI update

This commit is contained in:
2023-06-28 15:51:59 +02:00
parent 12ac9e9e47
commit 4de859b7d0
9 changed files with 151 additions and 27 deletions

14
Batchsystem/slurm/build Executable file
View File

@@ -0,0 +1,14 @@
#!/usr/bin/env modbuild
# Build recipe for the Slurm module, interpreted by modbuild (see shebang).
# The pbuild::* helpers used below are not defined in this file.

# Source tarball location on download.schedmd.com.
# NOTE(review): ${P} and ${V} are presumably the package name and version
# supplied by modbuild -- confirm.
pbuild::set_download_url "https://download.schedmd.com/${P}/${P}-${V}.tar.bz2"
# Register the module in the 'Batchsystem' group.
pbuild::add_to_group 'Batchsystem'
# Documentation files from the source tree to be installed with the module.
pbuild::install_docfiles 'AUTHORS' 'INSTALL' 'NEWS' 'README.rst' 'RELEASE_NOTES'
# Registers the configure options that point the Slurm build at the install
# prefixes of PMIx, CUDA (for NVML), hwloc and UCX.
# Globals:   PMIX_PREFIX, CUDA_PREFIX, HWLOC_PREFIX, UCX_PREFIX (read)
# Arguments: none
# Returns:   status of the last pbuild::add_configure_args call
# NOTE(review): presumably invoked by modbuild ahead of the configure step.
pbuild::pre_configure() {
	local slurm_cfg_flag
	for slurm_cfg_flag in \
		"--with-pmix=${PMIX_PREFIX}" \
		"--with-nvml=${CUDA_PREFIX}" \
		"--with-hwloc=${HWLOC_PREFIX}" \
		"--with-ucx=${UCX_PREFIX}"; do
		pbuild::add_configure_args "${slurm_cfg_flag}"
	done
	# netloc support is currently not enabled:
	# pbuild::add_configure_args "--with-netloc=${HWLOC_PREFIX}"
}

View File

@@ -0,0 +1 @@
slurm/22.05.9 unstable b:pmix/4.2.4 b:cuda/12.1.1 b:ucx/1.14.1_slurm

View File

@@ -0,0 +1,28 @@
#%Module1.0
# Modulefile for the Slurm workload manager.
# The module-* commands below are not defined in this file; they are
# presumably provided by the site's module framework -- confirm.

# One-line summary of the module.
module-whatis "Slurm Workload Manager"
# Upstream project home page.
module-url "https://slurm.schedmd.com/"
# Pointer to the license text in the upstream repository.
module-license "See https://github.com/SchedMD/slurm/blob/master/LICENSE.OpenSSL"
# Contact for this module.
module-maintainer "Marc Caubet <marc.caubet@psi.ch>"
# Long description of the module.
module-help "
Slurm is an open source, fault-tolerant, and highly scalable cluster
management and job scheduling system for large and small Linux clusters.
Slurm requires no kernel modifications for its operation and is relatively
self-contained. As a cluster workload manager, Slurm has three key functions:
* First, it allocates exclusive and/or non-exclusive access to resources
(compute nodes) to users for some duration of time so they can perform
work.
* Second, it provides a framework for starting, executing, and monitoring
work (normally a parallel job) on the set of allocated nodes.
* Finally, it arbitrates contention for resources by managing a queue of
pending work.
Optional plugins can be used for accounting, advanced reservation, gang
scheduling (time sharing for parallel jobs), backfill scheduling, topology
optimized resource selection, resource limits by user or bank account, and
sophisticated multifactor job prioritization algorithms.
"
# Same group name the build recipe registers with pbuild::add_to_group.
module-addgroup Batchsystem

View File

@@ -13,28 +13,43 @@ pbuild::install_docfiles 'AUTHORS' 'LICENSE' 'NEWS' 'README'
pbuild::pre_configure() {
if [[ -n "${CUDA_VERSION}" ]]; then
pbuild::add_configure_args "--with-cuda=${CUDA_HOME}"
fi
pbuild::add_configure_args "--prefix=${PREFIX}"
pbuild::add_configure_args "--enable-mpi-cxx"
pbuild::add_configure_args "--enable-mpi-cxx-seek"
pbuild::add_configure_args "--enable-orterun-prefix-by-default"
pbuild::add_configure_args "--enable-shared"
pbuild::add_configure_args "--enable-static"
pbuild::add_configure_args "--with-hwloc=internal"
pbuild::add_configure_args "--with-slurm=yes"
if [[ -n "${CUDA_VERSION}" ]]; then
pbuild::add_configure_args "--with-cuda=${CUDA_HOME}"
fi
if [[ -n "${HWLOC_VERSION}" ]]; then
unset HWLOC_VERSION
pbuild::add_configure_args "--with-hwloc=${HWLOC_PREFIX}"
else
pbuild::add_configure_args "--with-hwloc=internal"
fi
if [[ -n "${LIBEVENT_VERSION}" ]]; then
pbuild::add_configure_args "--with-libevent=${LIBEVENT_PREFIX}"
fi
if [[ -n "${PMIX_VERSION}" ]]; then
unset PMIX_VERSION
pbuild::add_configure_args "--with-pmix=${PMIX_PREFIX}"
fi
if [[ -n "${LIBFABRIC_VERSION}" ]]; then
pbuild::add_configure_args "--with-ofi=${LIBFABRIC_PREFIX}"
# pbuild::add_configure_args "--with-ofi-libdir=${LIBFABRIC_LIBRARY_DIR}"
fi
if [[ -n "${UCX_VERSION}" ]]; then
pbuild::add_configure_args "--with-ucx=${UCX_PREFIX}"
fi
if [[ -n "${INTEL_VERSION}" ]]; then
pbuild::add_configure_args "CC=icc"
pbuild::add_configure_args "CXX=icpc"
@@ -46,16 +61,10 @@ pbuild::pre_configure() {
pbuild::add_configure_args "--enable-debug"
fi
if [[ -n "${LIBEVENT_VERSION}" ]]; then
pbuild::add_configure_args "--with-libevent=${LIBEVENT_PREFIX}"
fi
if [[ -n "${UCX_VERSION}" ]]; then
pbuild::add_configure_args "--with-ucx=${UCX_PREFIX}"
fi
if pbuild::use_flag slurm || pbuild::use_flag dgx || pbuild::use_flag merlin6; then
pbuild::add_configure_args "--with-pmi"
pbuild::add_configure_args "--with-pmi-libdir=/usr/lib64/"
pbuild::add_configure_args "--with-gpfs=/usr/lpp/mmfs"
# pbuild::add_configure_args "--with-pmi-libdir=/usr/lib64/"
if pbuild::use_flag "libpmix"; then
pbuild::add_configure_args "--enable-install-libpmix"
@@ -76,18 +85,29 @@ pbuild::pre_configure() {
}
pbuild::post_install() {
mkdir -p "${PREFIX}/lib/fallback"
local -r binary=$(ls "${PREFIX}"/lib/libmpi.so.*.*.*)
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuc[mpst].so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuct_ib.so.0'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libnuma.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libibverbs.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/librdmacm.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
if [[ -n "${CUDA_VERSION}" ]]; then
echo "opal_warn_on_missing_libcuda = 0" >> ${PREFIX}/etc/openmpi-mca-params.conf
if ! pbuild::use_flag slurm && ! pbuild::use_flag dgx && ! pbuild::use_flag merlin6; then
mkdir -p "${PREFIX}/lib/fallback"
local -r binary=$(ls "${PREFIX}"/lib/libmpi.so.*.*.*)
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuc[mpst].so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuct_ib.so.0'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libnuma.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libibverbs.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/librdmacm.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
fi
if [[ -n "${CUDA_VERSION}" ]]; then
echo "opal_warn_on_missing_libcuda = 0" >> ${PREFIX}/etc/openmpi-mca-params.conf
fi
for FILE in $(find $PREFIX -type f \( ! -name "*.a" -and ! -name "*.mod" \) -exec grep -IL . "{}" \;)
do
OLD_RPATH=$(objdump -a -x $FILE | grep RPATH | awk '{print $2}')
NEW_RPATH=$(echo $OLD_RPATH | sed 's/:\/usr\/lib64:/:/g')
if [[ "${OLD_RPATH}" != "${NEW_RPATH}" ]]; then
patchelf --force-rpath --set-rpath "${NEW_RPATH}" "${FILE}"
fi
done
}

View File

@@ -41,7 +41,7 @@ openmpi/4.1.3_slurm stable gcc/{9.3.0,10.3.0,11.2.0} cuda/11.5.1 b:ucx/1
openmpi/4.1.4_slurm stable gcc/10.4.0 cuda/11.5.1 b:ucx/1.12.1_slurm
openmpi/4.1.5_slurm unstable gcc/10.4.0 cuda/12.1.1 b:ucx/1.14.1_slurm
openmpi/4.1.5_slurm unstable intelcc/22.2 cuda/12.1.1 b:pmix/4.2.4 b:ucx/1.14.1_slurm b:hwloc/2.9.1
openmpi/4.1.5_slurm unstable intelcc/22.2 b:cuda/12.1.1 b:pmix/4.2.4 b:ucx/1.14.1_slurm b:libfabric/1.18.0 b:hwloc/2.9.1 b:patchelf/0.14.5
openmpi/4.0.5-1_dgx deprecated gcc/{8.4.0,9.3.0,10.2.0} cuda/11.2.2 b:ucx/1.10.0-1_dgx
openmpi/4.1.0-1_dgx deprecated gcc/10.2.0 cuda/11.2.2 b:ucx/1.10.0-1_dgx

7
Libraries/libfabric/build Executable file
View File

@@ -0,0 +1,7 @@
#!/usr/bin/env modbuild
# Build recipe for the libfabric module, interpreted by modbuild (see
# shebang). Only the download location, module group and documentation files
# are declared in the lines shown here.

# Release tarball from the ofiwg/libfabric GitHub releases page.
# NOTE(review): $P and $V are presumably the package name and version
# supplied by modbuild -- confirm.
pbuild::set_download_url "https://github.com/ofiwg/libfabric/releases/download/v$V/$P-$V.tar.bz2"
# Register the module in the 'Libraries' group.
pbuild::add_to_group 'Libraries'
# Documentation files from the source tree to be installed with the module.
pbuild::install_docfiles 'COPYING' 'AUTHORS' 'README' 'NEWS.md'

View File

@@ -0,0 +1 @@
libfabric/1.18.0 unstable b:gcc/10.4.0

View File

@@ -0,0 +1,53 @@
#%Module1.0
# Modulefile for libfabric (Open Fabrics Interfaces): descriptive metadata
# and help text.
# The module-* commands below are not defined in this file; they are
# presumably provided by the site's module framework -- confirm.

# One-line summary of the module.
module-whatis "Open Fabrics Interfaces (OFI)"
# Upstream project home page.
module-url "https://ofiwg.github.io/libfabric/"
# Pointer to the license text in the upstream repository.
module-license "Open source, see https://github.com/ofiwg/libfabric/blob/main/COPYING"
# Contact for this module.
module-maintainer "Marc Caubet Serrabou <marc.caubet@psi.ch>"
# Long description of the module.
module-help "
The Open Fabrics Interfaces (OFI) is a framework focused on exporting
fabric communication services to applications.
Libfabric, also known as Open Fabrics Interfaces (OFI), defines a
communication API for high-performance parallel and distributed
applications. It is a low-level communication library that abstracts
diverse networking technologies. Libfabric is developed by the OFI
Working Group (OFIWG, pronounced “o-fee-wig”), a subgroup of the
OpenFabrics Alliance - OFA.
Participation in the OFIWG is open to anyone, and not restricted to
members of OFA.
The goal of libfabric is to define interfaces that enable a tight
semantic map between applications and underlying fabric services.
Specifically, libfabric software interfaces have been co-designed with
fabric hardware providers and application developers, with a focus on
the needs of HPC users. Libfabric supports multiple communication
semantics, is fabric and hardware implementation agnostic, and leverages
and expands the existing RDMA open source community.
Libfabric is designed to minimize the impedance mismatch between
applications, including middleware such as MPI, SHMEM, data storage, and
PGAS, and fabric communication hardware. Its interfaces target
high-bandwidth, low-latency NICs, with a goal to scale to tens of
thousands of nodes.
Libfabric targets support for the Linux, Free BSD, Windows, and OS X.
A reasonable effort is made to support all major, modern Linux
distributions; however, validation is limited to the most recent 2-3
releases of Red Hat Enterprise Linux (RHEL) and SUSE Linux Enterprise
Server (SLES). Support for a particular operating system version or
distribution is vendor specific. The exceptions are the tcp and udp
based socket providers are available on all platforms.
"
# Environment setup: the library directory exported by this module is
# $PREFIX/lib64, so the lib-based settings are withdrawn first and
# LIBFABRIC_LIBRARY_DIR is then pointed at lib64.
# NOTE(review): the lib-based values being withdrawn are presumably defaults
# added by the module framework -- confirm.
#
# The value passed to unsetenv is the one restored when the module is
# unloaded. It is derived from $PREFIX instead of a fixed
# /opt/psi/Libraries/libfabric/1.18.0 path so the file stays correct for
# other libfabric versions and install roots.
unsetenv LIBFABRIC_LIBRARY_DIR $PREFIX/lib
remove-path LIBRARY_PATH $PREFIX/lib
remove-path LD_LIBRARY_PATH $PREFIX/lib
setenv LIBFABRIC_LIBRARY_DIR $PREFIX/lib64
# Not enabled: adding lib64 to the library search paths.
# prepend-path LIBRARY_PATH $PREFIX/lib64
# prepend-path LD_LIBRARY_PATH $PREFIX/lib64
# prepend-path LIBFABRIC_LIBRARY_DIR $PREFIX/lib64

View File

@@ -3,4 +3,4 @@ pmix/2.2.5 unstable libevent/2.1.12
pmix/3.2.3 unstable libevent/2.1.12
pmix/4.1.2 unstable libevent/2.1.12 b:hwloc/2.7.1
pmix/4.2.3 unstable libevent/2.1.12 b:hwloc/2.9.1
pmix/4.2.4 unstable libevent/2.1.12 b:hwloc/2.9.1
pmix/4.2.4 unstable b:gcc/10.4.0 libevent/2.1.12 b:hwloc/2.9.1