public release 4.2.0 - see README.md and CHANGES.md for details

This commit is contained in:
2026-01-08 19:10:45 +01:00
parent ef781e2db4
commit b64beb694c
181 changed files with 39388 additions and 6527 deletions

View File

@@ -1,21 +1,21 @@
"""
@package pmsco.schedule
job schedule interface
Job schedule interface
this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.
This module defines common infrastructure to submit a PMSCO calculation job to a job scheduler such as Slurm.
the schedule can be defined as part of the run-file (see pmsco module).
users may derive sub-classes in a separate module to adapt to their own computing cluster.
The schedule can be defined as part of the run-file (see pmsco module).
Users may derive sub-classes in a separate module to adapt to their own computing cluster.
the basic call sequence is:
1. create a schedule object.
2. initialize its properties with job parameters.
3. validate()
4. submit()
The basic call sequence is:
1. Create a schedule object.
2. Initialize its properties with job parameters.
3. Call validate().
4. Call submit().
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -23,12 +23,20 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
"""
import collections.abc
import commentjson as json
import datetime
import logging
import os
from pathlib import Path
import shutil
import subprocess
import sys
from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Mapping, Optional, Sequence, Set, Tuple, Union
try:
import commentjson as json
except ImportError:
import json
import pmsco.config
logger = logging.getLogger(__name__)
@@ -36,37 +44,82 @@ logger = logging.getLogger(__name__)
class JobSchedule(pmsco.config.ConfigurableObject):
"""
base class for job schedule
Base class for job schedule
this class defines the abstract interface and some utilities.
derived classes may override any method, but should call the inherited method.
Usage:
1. Create object, assigning a project instance.
2. Set attributes.
3. Assign run_dict.
4. Call validate.
5. Call submit.
usage:
1. create object, assigning a project instance.
2. assign run_file.
3. call validate.
4. call submit.
An example can be seen in pmsco.schedule_project.
this class' properties should not be listed in the run file - they will be overwritten.
This class defines the abstract interface, common actions and some utilities.
Derived classes may override any method, but should call the inherited method.
"""
## @var enabled (bool)
#
# this parameter signals whether pmsco should schedule a job or run the calculation.
# it is not directly used by the schedule classes but by the pmsco module.
# it must be defined in the run file and set to true to submit the job to a scheduler.
# it is set to false in the run file copied to the job directory so that the job script starts the calculation.
# This parameter signals whether pmsco should schedule a job or run the calculation.
# It is not directly used by the schedule classes but by the pmsco module.
# It must be defined in the run file and set to true to submit the job to a scheduler.
# It is set to false in the run file copied to the job directory so that the job script starts the calculation.
def __init__(self, project):
super(JobSchedule, self).__init__()
self.project = project
# Specifies whether a job script is created (True) or not (False, default).
self.enabled = False
# Specifies whether the job script is submitted manually by the user (True, default)
# or as a part of this class' execution (False).
self.manual = True
# Specifies whether an existing job directory is overwritten (True).
# Otherwise, an exception is raised if it exists (False, default).
self.overwrite_job_dir = False
# Content of the run-file.
# This must be set by the caller before the object is executed.
self.run_dict = {}
self.job_dir = Path()
self.job_file = Path()
# Path to the destination run-file.
# This value is set by self.validate.
self.run_file = Path()
# directory that contains the pmsco and projects directories
self.pmsco_root = Path(__file__).parent.parent
# Job work directory.
# This value is set by self.validate.
self.job_dir = Path()
# Dest path of the shell script to be submitted to the queue.
# This value is set by self.validate.
self.job_file = Path()
# Directory that contains the pmsco code.
# This value is set by self.validate.
self.pmsco_root_dir = Path(__file__).parent.parent
# Directory that contains the project module
# This value is set by self.validate.
self.project_dir = Path()
# Project files to be copied to the job_dir.
# Paths should be absolute or relative to project_dir.
# The project module is appended by self.validate.
# Other files have to be added by the caller.
self.project_files = []
# Name of the conda environment to activate (`source activate` command).
# Either conda_env or virtual_env should be specified.
# If both are empty, the environment is detected from the environment of the current process.
self.conda_env = ""
# Path of the virtual environment to activate (must contain the `activate` command).
# Either conda_env or virtual_env should be specified.
# If both are empty, the environment is detected from the environment of the current process.
self.virtual_env = ""
def validate(self):
"""
@@ -76,20 +129,35 @@ class JobSchedule(pmsco.config.ConfigurableObject):
@return: None
"""
self.pmsco_root = Path(self.project.directories['pmsco']).parent
assert self.run_dict, "run_dict not set"
self.pmsco_root_dir = Path(self.project.directories['pmsco']).parent
self.project_dir = Path(self.project.directories['project'])
output_dir = Path(self.project.directories['output'])
assert self.pmsco_root.is_dir()
assert (self.pmsco_root / "pmsco").is_dir()
assert (self.pmsco_root / "projects").is_dir()
assert output_dir.is_dir()
assert self.project.job_name
assert self.pmsco_root_dir.is_dir()
assert (self.pmsco_root_dir / "pmsco").is_dir(), "can't find pmsco directory (source code)"
assert self.project_dir.is_dir(), "can't find project directory"
assert output_dir.is_dir(), "can't find output directory"
assert output_dir.is_absolute(), "output directory must be an absolute path"
assert self.project.job_name, "job_name is undefined"
if output_dir.name == self.project.job_name:
self.job_dir = output_dir
else:
self.job_dir = output_dir / self.project.job_name
try:
self.job_dir.mkdir(parents=True, exist_ok=self.overwrite_job_dir)
except FileExistsError:
logger.error("job directory exists - check job name or clean up manually")
raise
self.job_dir = output_dir / self.project.job_name
self.job_dir.mkdir(parents=True, exist_ok=True)
self.job_file = (self.job_dir / self.project.job_name).with_suffix(".sh")
self.run_file = (self.job_dir / self.project.job_name).with_suffix(".json")
self.project_files.append(sys.modules[self.project.__module__].__file__)
def submit(self):
"""
submit the job to the scheduler.
@@ -111,24 +179,16 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
copy the source files to the job directory.
the source_dir and job_dir attributes must be correct.
the job_dir directory must not exist and will be created.
this is a utility method used internally by derived classes.
job_dir/pmsco/pmsco/**
job_dir/pmsco/projects/**
job_dir/job.sh
job_dir/job.json
the files to copy must be listed explicitly in the project_files attribute.
the files are copied to the job_dir directory.
the directory must exist.
@return: None
"""
source = self.pmsco_root
dest = self.job_dir / "pmsco"
ignore = shutil.ignore_patterns(".*", "~*", "*~")
shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
shutil.copytree(source / "projects", dest / "projects", ignore=ignore)
files = set((self.project_dir.joinpath(pf) for pf in self.project_files))
for f in files:
shutil.copy2(f, self.job_dir)
def _fix_run_file(self):
"""
@@ -144,6 +204,7 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
self.run_dict['schedule']['enabled'] = False
self.run_dict['project']['directories']['output'] = str(self.job_dir)
self.run_dict['project']['job_name'] = self.project.job_name
self.run_dict['project']['log_file'] = str((self.job_dir / self.project.job_name).with_suffix(".log"))
def _write_run_file(self):
@@ -169,6 +230,31 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
pass
@staticmethod
def detect_env() -> Dict[str, os.PathLike]:
"""
detect the python environment
determines the current python environment.
examples:
- /das/work/p17/p17274/conda/envs/pmsco310/bin
- /home/user/envs/pmsco-uv
@return: dictionary type -> path containing one or zero items.
type is either 'conda', 'venv' or 'system';
path is the bin directory containing python.
"""
pp = Path(sys.executable).parent
if (pp / 'activate').is_file():
return {"venv": pp}
for parent in pp.parents:
if (parent / "condabin").is_dir():
return {"conda": pp}
else:
return {"system": pp}
class SlurmSchedule(JobSchedule):
"""
@@ -189,10 +275,9 @@ class SlurmSchedule(JobSchedule):
super(SlurmSchedule, self).__init__(project)
self.host = ""
self.nodes = 1
self.tasks_per_node = 8
self.tasks = 8
self.wall_time = datetime.timedelta(hours=1)
self.signal_time = 600
self.manual = True
@staticmethod
def parse_timedelta(td):
@@ -221,13 +306,21 @@ class SlurmSchedule(JobSchedule):
pass
td = datetime.timedelta(**dt)
elif isinstance(td, collections.abc.Mapping):
td = {k: float(v) for k, v in td.items()}
td = datetime.timedelta(**td)
return td
@property
def tasks_per_node(self):
    """
    number of MPI tasks per node.

    derived value: integer division of the total `tasks` by `nodes`.
    """
    return self.tasks // self.nodes

@tasks_per_node.setter
def tasks_per_node(self, count):
    """
    backward-compatible setter: scales the total `tasks` to count * nodes.
    """
    self.tasks = self.nodes * count
def validate(self):
super(SlurmSchedule, self).validate()
self.wall_time = self.parse_timedelta(self.wall_time)
assert self.job_dir.is_absolute()
def submit(self):
"""
@@ -252,8 +345,31 @@ class PsiRaSchedule(SlurmSchedule):
job schedule for the Ra cluster at PSI.
this class selects specific features of the Ra cluster,
such as the partition and node type (24 or 32 cores).
such as the partition and node type.
it also implements the _write_job_file method.
for information about the Ra cluster, see
https://www.psi.ch/en/photon-science-data-services/offline-computing-facility-for-sls-and-swissfel-data-analysis
COMPUTE NODES
| NodeName | Weight | CPUs | RealMemory | MemSpecLimit | Sockets | CoresPerSocket | ThreadsPerCore |
| --- | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| ra-c-[033-048] | 100 | 36 | 256000 | 21502 | 2 | 18 | 1 |
| ra-c-[049-072] | 100 | 40 | 386000 | 21502 | 2 | 20 | 1 |
| ra-c-[073-084] | | 52 | | | 2 | 26 | 1 |
| ra-c-[085-096] | | 56 | | | 2 | 28 | 1 |
PARTITIONS
| PartitionName | Nodes | MaxTime | DefaultTime | Priority |
| --- | :---: | :---: | :---: | :---: |
| hour | | 0-01:00:00 | 0-01:00:00 | 4 |
| day | | 1-00:00:00 | 0-08:00:00 | 2 |
| week | | 8-00:00:00 | 2-00:00:00 | 1 |
The option --ntasks-per-node is meant to be used with the --nodes option.
(For the --ntasks option, the default is one task per node, use the --cpus-per-task option to change this default.)
"""
## @var partition (str)
@@ -261,24 +377,64 @@ class PsiRaSchedule(SlurmSchedule):
# the partition is selected based on wall time and number of tasks by the validate() method.
# it should not be listed in the run file.
## @var modules (list of str)
#
# names of the software modules to load (`module load` command)
def __init__(self, project):
super(PsiRaSchedule, self).__init__(project)
self.partition = "shared"
self.partition = ""
self.modules = []
self.default_env = {}
def validate(self):
"""
check validity of parameters and detect the environment
the validity is not checked in detail - just intercept some common or severe mistakes.
detect whether we are in a conda or virtual environment
so that we can add the necessary activation to the job script.
detect which modules are active. depending on the python environment, LOADEDMODULES is:
- venv: openssl/3.4.1:TclTk/8.6.16:xz/5.8.0:Python/3.12.9:gcc/9.3.0:libfabric/1.18.0:cuda/11.1.0:openmpi/4.0.5_slurm
- conda: miniforge/2025-03-25:gcc/9.3.0:libfabric/1.18.0:cuda/11.1.0:openmpi/4.0.5_slurm
other potentially useful but currently unused environment variables are:
- PMODULES_LOADED_COMPILER: openmpi/4.0.5_slurm
- PMODULES_LOADED_TOOLS: openssl/3.4.1:xz/5.8.0
- PMODULES_LOADED_LIBRARIES: libfabric/1.18.0
- PMODULES_LOADED_PROGRAMMING: TclTk/8.6.16:Python/3.12.9:gcc/9.3.0:cuda/11.1.0
- PMODULES_LOADED_TOMCAT: miniforge/2025-03-25
note: openssl, TclTk, xz and cuda were not requested explicitly.
"""
super(PsiRaSchedule, self).validate()
# check that the submission is sane - the values are not firm
assert self.nodes <= 2
assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
assert self.wall_time.total_seconds() >= 60
assert self.tasks <= 64
assert 30 * 60 <= self.wall_time.total_seconds() <= 8 * 24 * 60 * 60
if self.wall_time.total_seconds() > 24 * 60 * 60:
self.partition = "week"
elif self.tasks_per_node < 24:
self.partition = "shared"
else:
self.partition = "day"
assert self.partition in ["day", "week", "shared"]
assert self.partition in ["day", "week"]
self.default_env = self.detect_env()
if len(self.modules) == 0:
self.modules = os.environ["LOADEDMODULES"].split(":")
def _write_job_file(self):
"""
write a job file for the ra cluster
@return: None
"""
lines = []
lines.append('#!/bin/bash')
@@ -288,21 +444,38 @@ class PsiRaSchedule(SlurmSchedule):
lines.append(f'#SBATCH --time={int(self.wall_time.total_seconds() / 60)}')
lines.append(f'#SBATCH --nodes={self.nodes}')
lines.append(f'#SBATCH --ntasks-per-node={self.tasks_per_node}')
if self.tasks_per_node > 24:
lines.append('#SBATCH --cores-per-socket=16')
# 0 - 65535 seconds
# currently, PMSCO does not react to signals properly
# lines.append(f'#SBATCH --signal=TERM@{self.signal_time}')
lines.append(f'#SBATCH --output="{self.project.job_name}.o.%j"')
lines.append(f'#SBATCH --error="{self.project.job_name}.e.%j"')
lines.append('module load psi-python36/4.4.0')
lines.append('module load gcc/4.8.5')
lines.append('module load openmpi/3.1.3')
lines.append('source activate pmsco')
# environment
if self.modules:
lines.append('module use unstable || true')
lines.append('module use Libraries || true')
for module in self.modules:
lines.append(f'module load {module}')
conda_env = self.conda_env or self.default_env.get("conda")
virtual_env = self.virtual_env or self.default_env.get("venv")
if conda_env:
lines.append(f'source activate {conda_env}')
elif virtual_env:
activate_script = Path(virtual_env) / "activate"
if activate_script.is_file():
lines.append(f'source {activate_script}')
lines.append('env')
lines.append('')
# run
lines.append(f'cd "{self.job_dir}"')
lines.append(f'mpirun python pmsco/pmsco -r {self.run_file.name}')
lines.append(f'mpirun python -m pmsco -r {self.run_file}')
lines.append('')
# clean up
lines.append(f'cd "{self.job_dir}"')
lines.append('rm -rf pmsco')
lines.append('exit 0')
self.job_file.write_text("\n".join(lines))