diff --git a/Batchsystem/slurm/build b/Batchsystem/slurm/build
new file mode 100755
index 0000000..3800ffa
--- /dev/null
+++ b/Batchsystem/slurm/build
@@ -0,0 +1,27 @@
+#!/usr/bin/env modbuild
+
+pbuild::set_download_url "https://download.schedmd.com/${P}/${P}-${V}.tar.bz2"
+
+pbuild::add_to_group 'Batchsystem'
+pbuild::install_docfiles 'AUTHORS' 'INSTALL' 'NEWS' 'README.rst' 'RELEASE_NOTES'
+
+# Add a '--with-<lib>=<prefix>' configure argument for each dependency
+# whose prefix variable is set in the build environment. An unset or
+# empty prefix is skipped instead of being passed on as an empty
+# '--with-<lib>=' argument.
+pbuild::pre_configure() {
+    if [[ -n "${PMIX_PREFIX:-}" ]]; then
+        pbuild::add_configure_args "--with-pmix=${PMIX_PREFIX}"
+    fi
+    if [[ -n "${CUDA_PREFIX:-}" ]]; then
+        # The NVML location handed to configure is the CUDA prefix.
+        pbuild::add_configure_args "--with-nvml=${CUDA_PREFIX}"
+    fi
+    if [[ -n "${HWLOC_PREFIX:-}" ]]; then
+        pbuild::add_configure_args "--with-hwloc=${HWLOC_PREFIX}"
+    fi
+    if [[ -n "${UCX_PREFIX:-}" ]]; then
+        pbuild::add_configure_args "--with-ucx=${UCX_PREFIX}"
+    fi
+    # pbuild::add_configure_args "--with-netloc=${HWLOC_PREFIX}"
+}
diff --git a/Batchsystem/slurm/files/variants.merlin6 b/Batchsystem/slurm/files/variants.merlin6
new file mode 100644
index 0000000..d30cdae
--- /dev/null
+++ b/Batchsystem/slurm/files/variants.merlin6
@@ -0,0 +1 @@
+slurm/22.05.9 unstable b:pmix/4.2.4 b:cuda/12.1.1 b:ucx/1.14.1_slurm
diff --git a/Batchsystem/slurm/modulefile b/Batchsystem/slurm/modulefile
new file mode 100644
index 0000000..cdd0490
--- /dev/null
+++ b/Batchsystem/slurm/modulefile
@@ -0,0 +1,28 @@
+#%Module1.0
+
+module-whatis "Slurm Workload Manager"
+module-url "https://slurm.schedmd.com/"
+module-license "See https://github.com/SchedMD/slurm/blob/master/LICENSE.OpenSSL"
+module-maintainer "Marc Caubet "
+module-help "
+Slurm is an open source, fault-tolerant, and highly scalable cluster
+management and job scheduling system for large and small Linux clusters.
+
+Slurm requires no kernel modifications for its operation and is relatively
+self-contained. As a cluster workload manager, Slurm has three key functions:
+
+  * First, it allocates exclusive and/or non-exclusive access to resources
+    (compute nodes) to users for some duration of time so they can perform
+    work.
+  * Second, it provides a framework for starting, executing, and monitoring
+    work (normally a parallel job) on the set of allocated nodes.
+  * Finally, it arbitrates contention for resources by managing a queue of
+    pending work.
+
+Optional plugins can be used for accounting, advanced reservation, gang
+scheduling (time sharing for parallel jobs), backfill scheduling, topology
+optimized resource selection, resource limits by user or bank account, and
+sophisticated multifactor job prioritization algorithms.
+"
+
+module-addgroup Batchsystem
diff --git a/Compiler/openmpi/build b/Compiler/openmpi/build
index 389ac11..00a0e58 100755
--- a/Compiler/openmpi/build
+++ b/Compiler/openmpi/build
@@ -13,28 +13,43 @@ pbuild::install_docfiles 'AUTHORS' 'LICENSE' 'NEWS' 'README'
 
 
 pbuild::pre_configure() {
-    if [[ -n "${CUDA_VERSION}" ]]; then
-        pbuild::add_configure_args "--with-cuda=${CUDA_HOME}"
-    fi
     pbuild::add_configure_args "--prefix=${PREFIX}"
     pbuild::add_configure_args "--enable-mpi-cxx"
     pbuild::add_configure_args "--enable-mpi-cxx-seek"
     pbuild::add_configure_args "--enable-orterun-prefix-by-default"
     pbuild::add_configure_args "--enable-shared"
     pbuild::add_configure_args "--enable-static"
-    pbuild::add_configure_args "--with-hwloc=internal"
     pbuild::add_configure_args "--with-slurm=yes"
 
+    if [[ -n "${CUDA_VERSION}" ]]; then
+        pbuild::add_configure_args "--with-cuda=${CUDA_HOME}"
+    fi
+
     if [[ -n "${HWLOC_VERSION}" ]]; then
         unset HWLOC_VERSION
         pbuild::add_configure_args "--with-hwloc=${HWLOC_PREFIX}"
+    else
+        pbuild::add_configure_args "--with-hwloc=internal"
     fi
 
+    if [[ -n "${LIBEVENT_VERSION}" ]]; then
+        pbuild::add_configure_args "--with-libevent=${LIBEVENT_PREFIX}"
+    fi
+
     if [[ -n "${PMIX_VERSION}" ]]; then
         unset PMIX_VERSION
         pbuild::add_configure_args "--with-pmix=${PMIX_PREFIX}"
     fi
 
+    if [[ -n "${LIBFABRIC_VERSION}" ]]; then
+        pbuild::add_configure_args "--with-ofi=${LIBFABRIC_PREFIX}"
+        # pbuild::add_configure_args "--with-ofi-libdir=${LIBFABRIC_LIBRARY_DIR}"
+    fi
+
+    if [[ -n "${UCX_VERSION}" ]]; then
+        pbuild::add_configure_args "--with-ucx=${UCX_PREFIX}"
+    fi
+
     if [[ -n "${INTEL_VERSION}" ]]; then
         pbuild::add_configure_args "CC=icc"
         pbuild::add_configure_args "CXX=icpc"
@@ -46,16 +61,10 @@ pbuild::pre_configure() {
         pbuild::add_configure_args "--enable-debug"
     fi
 
-    if [[ -n "${LIBEVENT_VERSION}" ]]; then
-        pbuild::add_configure_args "--with-libevent=${LIBEVENT_PREFIX}"
-    fi
-
-    if [[ -n "${UCX_VERSION}" ]]; then
-        pbuild::add_configure_args "--with-ucx=${UCX_PREFIX}"
-    fi
     if pbuild::use_flag slurm || pbuild::use_flag dgx || pbuild::use_flag merlin6; then
         pbuild::add_configure_args "--with-pmi"
-        pbuild::add_configure_args "--with-pmi-libdir=/usr/lib64/"
+        pbuild::add_configure_args "--with-gpfs=/usr/lpp/mmfs"
+        # pbuild::add_configure_args "--with-pmi-libdir=/usr/lib64/"
 
         if pbuild::use_flag "libpmix"; then
             pbuild::add_configure_args "--enable-install-libpmix"
@@ -76,18 +85,48 @@ pbuild::pre_configure() {
 }
 
 pbuild::post_install() {
-    mkdir -p "${PREFIX}/lib/fallback"
-    local -r binary=$(ls "${PREFIX}"/lib/libmpi.so.*.*.*)
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuc[mpst].so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuct_ib.so.0'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libnuma.so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libibverbs.so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/librdmacm.so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi.so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
-    pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
-
-    if [[ -n "${CUDA_VERSION}" ]]; then
-        echo "opal_warn_on_missing_libcuda = 0" >> ${PREFIX}/etc/openmpi-mca-params.conf
+    # Fallback copies of the shared libraries below are installed only when
+    # none of the slurm, dgx or merlin6 flags is set.
+    if ! pbuild::use_flag slurm && ! pbuild::use_flag dgx && ! pbuild::use_flag merlin6; then
+        mkdir -p "${PREFIX}/lib/fallback"
+        local -r binary=$(ls "${PREFIX}"/lib/libmpi.so.*.*.*)
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuc[mpst].so'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libuct_ib.so.0'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libnuma.so'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libibverbs.so'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/librdmacm.so'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi.so'
+        pbuild::install_shared_libs "${binary}" "${PREFIX}/lib/fallback" '/libpmi2.so'
     fi
+
+    if [[ -n "${CUDA_VERSION}" ]]; then
+        echo "opal_warn_on_missing_libcuda = 0" >> "${PREFIX}/etc/openmpi-mca-params.conf"
+    fi
+
+    # Drop /usr/lib64 from the RPATH of everything installed under PREFIX.
+    # 'grep -IL .' names the files that grep treats as binary (or that are
+    # empty); '*.a' and '*.mod' files are excluded by name. Names are passed
+    # NUL-separated so unusual paths survive. Needs objdump and patchelf.
+    local file old_rpath new_rpath
+    while IFS= read -r -d '' file; do
+        # Not every candidate is an ELF object, so objdump errors are ignored.
+        old_rpath=$(objdump -x "${file}" 2>/dev/null | awk '$1 == "RPATH" {print $2}')
+        [[ -n "${old_rpath}" ]] || continue
+
+        # Wrap in ':' so leading, trailing and repeated entries match as well.
+        new_rpath=":${old_rpath}:"
+        while [[ "${new_rpath}" == *:/usr/lib64:* ]]; do
+            new_rpath="${new_rpath//:\/usr\/lib64:/:}"
+        done
+        new_rpath="${new_rpath#:}"
+        new_rpath="${new_rpath%:}"
+        [[ "${new_rpath}" != "${old_rpath}" ]] || continue
+
+        if [[ -n "${new_rpath}" ]]; then
+            patchelf --force-rpath --set-rpath "${new_rpath}" "${file}"
+        else
+            # /usr/lib64 was the only entry, so there is nothing left to keep.
+            patchelf --remove-rpath "${file}"
+        fi
+    done < <(find "${PREFIX}" -type f ! -name '*.a' ! -name '*.mod' -exec grep -ILZ . {} +)
 }
diff --git a/Compiler/openmpi/files/variants.merlin6 b/Compiler/openmpi/files/variants.merlin6
index ad00fec..cb2cd16 100644
--- a/Compiler/openmpi/files/variants.merlin6
+++ b/Compiler/openmpi/files/variants.merlin6
@@ -41,7 +41,7 @@ openmpi/4.1.3_slurm stable gcc/{9.3.0,10.3.0,11.2.0} cuda/11.5.1 b:ucx/1
 
 openmpi/4.1.4_slurm stable gcc/10.4.0 cuda/11.5.1 b:ucx/1.12.1_slurm
 openmpi/4.1.5_slurm unstable gcc/10.4.0 cuda/12.1.1 b:ucx/1.14.1_slurm
-openmpi/4.1.5_slurm unstable intelcc/22.2 cuda/12.1.1 b:pmix/4.2.4 b:ucx/1.14.1_slurm b:hwloc/2.9.1
+openmpi/4.1.5_slurm unstable intelcc/22.2 b:cuda/12.1.1 b:pmix/4.2.4 b:ucx/1.14.1_slurm b:libfabric/1.18.0 b:hwloc/2.9.1 b:patchelf/0.14.5
 
 openmpi/4.0.5-1_dgx deprecated gcc/{8.4.0,9.3.0,10.2.0} cuda/11.2.2 b:ucx/1.10.0-1_dgx
 openmpi/4.1.0-1_dgx deprecated gcc/10.2.0 cuda/11.2.2 b:ucx/1.10.0-1_dgx
diff --git a/Libraries/libfabric/build b/Libraries/libfabric/build
new file mode 100755
index 0000000..0580efc
--- /dev/null
+++ b/Libraries/libfabric/build
@@ -0,0 +1,7 @@
+#!/usr/bin/env modbuild
+
+pbuild::set_download_url "https://github.com/ofiwg/libfabric/releases/download/v$V/$P-$V.tar.bz2"
+pbuild::add_to_group 'Libraries'
+
+pbuild::install_docfiles 'COPYING' 'AUTHORS' 'README' 'NEWS.md'
+
diff --git a/Libraries/libfabric/files/variants b/Libraries/libfabric/files/variants
new file mode 100644
index 0000000..ab94958
--- /dev/null
+++ b/Libraries/libfabric/files/variants
@@ -0,0 +1 @@
+libfabric/1.18.0 unstable b:gcc/10.4.0
diff --git a/Libraries/libfabric/modulefile b/Libraries/libfabric/modulefile
new file mode 100644
index 0000000..d7dc74f
--- /dev/null
+++ b/Libraries/libfabric/modulefile
@@ -0,0 +1,53 @@
+#%Module1.0
+
+module-whatis "Open Fabrics Interfaces (OFI)"
+module-url "https://ofiwg.github.io/libfabric/"
+module-license "Open source, see https://github.com/ofiwg/libfabric/blob/main/COPYING"
+module-maintainer "Marc Caubet Serrabou "
+
+module-help "
+The Open Fabrics Interfaces (OFI) is a framework focused on exporting
+fabric communication services to applications.
+
+Libfabric, also known as Open Fabrics Interfaces (OFI), defines a
+communication API for high-performance parallel and distributed
+applications. It is a low-level communication library that abstracts
+diverse networking technologies. Libfabric is developed by the OFI
+Working Group (OFIWG, pronounced “o-fee-wig”), a subgroup of the
+OpenFabrics Alliance - OFA.
+
+Participation in the OFIWG is open to anyone, and not restricted to
+members of OFA.
+
+The goal of libfabric is to define interfaces that enable a tight
+semantic map between applications and underlying fabric services.
+Specifically, libfabric software interfaces have been co-designed with
+fabric hardware providers and application developers, with a focus on
+the needs of HPC users. Libfabric supports multiple communication
+semantics, is fabric and hardware implementation agnostic, and leverages
+and expands the existing RDMA open source community.
+
+Libfabric is designed to minimize the impedance mismatch between
+applications, including middleware such as MPI, SHMEM, data storage, and
+PGAS, and fabric communication hardware. Its interfaces target
+high-bandwidth, low-latency NICs, with a goal to scale to tens of
+thousands of nodes.
+
+Libfabric targets support for the Linux, Free BSD, Windows, and OS X.
+A reasonable effort is made to support all major, modern Linux
+distributions; however, validation is limited to the most recent 2-3
+releases of Red Hat Enterprise Linux (RHEL) and SUSE Linux Enterprise
+Server (SLES). Support for a particular operating system version or
+distribution is vendor specific. The exceptions are the tcp and udp
+based socket providers are available on all platforms.
+"
+
+unsetenv LIBFABRIC_LIBRARY_DIR $PREFIX/lib
+
+remove-path LIBRARY_PATH $PREFIX/lib
+remove-path LD_LIBRARY_PATH $PREFIX/lib
+setenv LIBFABRIC_LIBRARY_DIR $PREFIX/lib64
+
+# prepend-path LIBRARY_PATH $PREFIX/lib64
+# prepend-path LD_LIBRARY_PATH $PREFIX/lib64
+# prepend-path LIBFABRIC_LIBRARY_DIR $PREFIX/lib64
diff --git a/Libraries/pmix/files/variants.merlin6 b/Libraries/pmix/files/variants.merlin6
index ead5082..beb6bdd 100644
--- a/Libraries/pmix/files/variants.merlin6
+++ b/Libraries/pmix/files/variants.merlin6
@@ -3,4 +3,4 @@ pmix/2.2.5 unstable libevent/2.1.12
 pmix/3.2.3 unstable libevent/2.1.12
 pmix/4.1.2 unstable libevent/2.1.12 b:hwloc/2.7.1
 pmix/4.2.3 unstable libevent/2.1.12 b:hwloc/2.9.1
-pmix/4.2.4 unstable libevent/2.1.12 b:hwloc/2.9.1
+pmix/4.2.4 unstable b:gcc/10.4.0 libevent/2.1.12 b:hwloc/2.9.1