public release 4.2.0 - see README.md and CHANGES.md for details

This commit is contained in:
2026-01-08 19:10:45 +01:00
parent ef781e2db4
commit b64beb694c
181 changed files with 39388 additions and 6527 deletions

View File

@@ -1,21 +1,21 @@
"""
@package pmsco.schedule
job schedule interface
Job schedule interface
this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.
This module defines common infrastructure to submit a PMSCO calculation job to a job scheduler such as Slurm.
the schedule can be defined as part of the run-file (see pmsco module).
users may derive sub-classes in a separate module to adapt to their own computing cluster.
The schedule can be defined as part of the run-file (see pmsco module).
Users may derive sub-classes in a separate module to adapt to their own computing cluster.
the basic call sequence is:
1. create a schedule object.
2. initialize its properties with job parameters.
3. validate()
4. submit()
The basic call sequence is:
1. Create a schedule object.
2. Initialize its properties with job parameters.
3. Call validate().
4. Call submit().
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
@copyright (c) 2015-21 by Paul Scherrer Institut @n
@copyright (c) 2015-23 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
@@ -23,12 +23,20 @@ Licensed under the Apache License, Version 2.0 (the "License"); @n
"""
import collections.abc
import commentjson as json
import datetime
import logging
import os
from pathlib import Path
import shutil
import subprocess
import sys
from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Mapping, Optional, Sequence, Set, Tuple, Union
try:
import commentjson as json
except ImportError:
import json
import pmsco.config
logger = logging.getLogger(__name__)
@@ -36,37 +44,82 @@ logger = logging.getLogger(__name__)
class JobSchedule(pmsco.config.ConfigurableObject):
"""
base class for job schedule
Base class for job schedule
this class defines the abstract interface and some utilities.
derived classes may override any method, but should call the inherited method.
Usage:
1. Create object, assigning a project instance.
2. Set attributes.
3. Assign run_dict.
4. Call validate.
5. Call submit.
usage:
1. create object, assigning a project instance.
2. assign run_file.
3. call validate.
4. call submit.
An example can be seen in pmsco.schedule_project.
this class' properties should not be listed in the run file - they will be overwritten.
This class defines the abstract interface, common actions and some utilities.
Derived classes may override any method, but should call the inherited method.
"""
## @var enabled (bool)
#
# this parameter signals whether pmsco should schedule a job or run the calculation.
# it is not directly used by the schedule classes but by the pmsco module.
# it must be defined in the run file and set to true to submit the job to a scheduler.
# it is set to false in the run file copied to the job directory so that the job script starts the calculation.
# This parameter signals whether pmsco should schedule a job or run the calculation.
# It is not directly used by the schedule classes but by the pmsco module.
# It must be defined in the run file and set to true to submit the job to a scheduler.
# It is set to false in the run file copied to the job directory so that the job script starts the calculation.
def __init__(self, project):
super(JobSchedule, self).__init__()
self.project = project
# Specifies whether a job script is created (True) or not (False, default).
self.enabled = False
# Specifies whether the job script is submitted manually by the user (True, default)
# or as a part of this class' execution (False).
self.manual = True
# Specifies whether an existing job directory is overwritten (True).
# Otherwise, an exception is raised if it exists (False, default).
self.overwrite_job_dir = False
# Content of the run-file.
# This must be set by the caller before the object is executed.
self.run_dict = {}
self.job_dir = Path()
self.job_file = Path()
# Path to the destination run-file.
# This value is set by self.validate.
self.run_file = Path()
# directory that contains the pmsco and projects directories
self.pmsco_root = Path(__file__).parent.parent
# Job work directory.
# This value is set by self.validate.
self.job_dir = Path()
# Dest path of the shell script to be submitted to the queue.
# This value is set by self.validate.
self.job_file = Path()
# Directory that contains the pmsco code.
# This value is set by self.validate.
self.pmsco_root_dir = Path(__file__).parent.parent
# Directory that contains the project module
# This value is set by self.validate.
self.project_dir = Path()
# Project files to be copied to the job_dir.
# Paths should be absolute or relative to project_dir.
# The project module is appended by self.validate.
# Other files have to be added by the caller.
self.project_files = []
# Name of the conda environment to activate (`source activate` command).
# Either conda_env or virtual_env should be specified.
# If both are empty, the environment is detected from the environment of the current process.
self.conda_env = ""
# Path of the virtual environment to activate (must contain the `activate` command).
# Either conda_env or virtual_env should be specified.
# If both are empty, the environment is detected from the environment of the current process.
self.virtual_env = ""
def validate(self):
"""
@@ -76,20 +129,35 @@ class JobSchedule(pmsco.config.ConfigurableObject):
@return: None
"""
self.pmsco_root = Path(self.project.directories['pmsco']).parent
assert self.run_dict, "run_dict not set"
self.pmsco_root_dir = Path(self.project.directories['pmsco']).parent
self.project_dir = Path(self.project.directories['project'])
output_dir = Path(self.project.directories['output'])
assert self.pmsco_root.is_dir()
assert (self.pmsco_root / "pmsco").is_dir()
assert (self.pmsco_root / "projects").is_dir()
assert output_dir.is_dir()
assert self.project.job_name
assert self.pmsco_root_dir.is_dir()
assert (self.pmsco_root_dir / "pmsco").is_dir(), "can't find pmsco directory (source code)"
assert self.project_dir.is_dir(), "can't find project directory"
assert output_dir.is_dir(), "can't find output directory"
assert output_dir.is_absolute(), "output directory must be an absolute path"
assert self.project.job_name, "job_name is undefined"
if output_dir.name == self.project.job_name:
self.job_dir = output_dir
else:
self.job_dir = output_dir / self.project.job_name
try:
self.job_dir.mkdir(parents=True, exist_ok=self.overwrite_job_dir)
except FileExistsError:
logger.error("job directory exists - check job name or clean up manually")
raise
self.job_dir = output_dir / self.project.job_name
self.job_dir.mkdir(parents=True, exist_ok=True)
self.job_file = (self.job_dir / self.project.job_name).with_suffix(".sh")
self.run_file = (self.job_dir / self.project.job_name).with_suffix(".json")
self.project_files.append(sys.modules[self.project.__module__].__file__)
def submit(self):
"""
submit the job to the scheduler.
@@ -111,24 +179,16 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
copy the source files to the job directory.
the source_dir and job_dir attributes must be correct.
the job_dir directory must not exist and will be created.
this is a utility method used internally by derived classes.
job_dir/pmsco/pmsco/**
job_dir/pmsco/projects/**
job_dir/job.sh
job_dir/job.json
the files to copy must be listed explicitly in the project_files attribute.
the files are copied to the job_dir directory.
the directory must exist.
@return: None
"""
source = self.pmsco_root
dest = self.job_dir / "pmsco"
ignore = shutil.ignore_patterns(".*", "~*", "*~")
shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
shutil.copytree(source / "projects", dest / "projects", ignore=ignore)
files = set((self.project_dir.joinpath(pf) for pf in self.project_files))
for f in files:
shutil.copy2(f, self.job_dir)
def _fix_run_file(self):
"""
@@ -144,6 +204,7 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
self.run_dict['schedule']['enabled'] = False
self.run_dict['project']['directories']['output'] = str(self.job_dir)
self.run_dict['project']['job_name'] = self.project.job_name
self.run_dict['project']['log_file'] = str((self.job_dir / self.project.job_name).with_suffix(".log"))
def _write_run_file(self):
@@ -169,6 +230,31 @@ class JobSchedule(pmsco.config.ConfigurableObject):
"""
pass
@staticmethod
def detect_env() -> Dict[str, os.PathLike]:
"""
detect the python environment
determines the current python environment.
examples:
- /das/work/p17/p17274/conda/envs/pmsco310/bin
- /home/user/envs/pmsco-uv
@return: dictionary type -> path containing one or zero items.
type is either 'conda', 'venv' or 'system';
path is the bin directory containing python.
"""
pp = Path(sys.executable).parent
if (pp / 'activate').is_file():
return {"venv": pp}
for parent in pp.parents:
if (parent / "condabin").is_dir():
return {"conda": pp}
else:
return {"system": pp}
class SlurmSchedule(JobSchedule):
"""
@@ -189,10 +275,9 @@ class SlurmSchedule(JobSchedule):
super(SlurmSchedule, self).__init__(project)
self.host = ""
self.nodes = 1
self.tasks_per_node = 8
self.tasks = 8
self.wall_time = datetime.timedelta(hours=1)
self.signal_time = 600
self.manual = True
@staticmethod
def parse_timedelta(td):
@@ -221,13 +306,21 @@ class SlurmSchedule(JobSchedule):
pass
td = datetime.timedelta(**dt)
elif isinstance(td, collections.abc.Mapping):
td = {k: float(v) for k, v in td.items()}
td = datetime.timedelta(**td)
return td
@property
def tasks_per_node(self):
    """
    number of MPI tasks per node.

    derived value: integer division of the total `tasks` by `nodes`.
    """
    return self.tasks // self.nodes

@tasks_per_node.setter
def tasks_per_node(self, count):
    """
    backward-compatible setter: scales the total `tasks` to count * nodes.
    """
    self.tasks = self.nodes * count
def validate(self):
super(SlurmSchedule, self).validate()
self.wall_time = self.parse_timedelta(self.wall_time)
assert self.job_dir.is_absolute()
def submit(self):
"""
@@ -252,8 +345,31 @@ class PsiRaSchedule(SlurmSchedule):
job schedule for the Ra cluster at PSI.
this class selects specific features of the Ra cluster,
such as the partition and node type (24 or 32 cores).
such as the partition and node type.
it also implements the _write_job_file method.
for information about the Ra cluster, see
https://www.psi.ch/en/photon-science-data-services/offline-computing-facility-for-sls-and-swissfel-data-analysis
COMPUTE NODES
| NodeName | Weight | CPUs | RealMemory | MemSpecLimit | Sockets | CoresPerSocket | ThreadsPerCore |
| --- | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| ra-c-[033-048] | 100 | 36 | 256000 | 21502 | 2 | 18 | 1 |
| ra-c-[049-072] | 100 | 40 | 386000 | 21502 | 2 | 20 | 1 |
| ra-c-[073-084] | | 52 | | | 2 | 26 | 1 |
| ra-c-[085-096] | | 56 | | | 2 | 28 | 1 |
PARTITIONS
| PartitionName | Nodes | MaxTime | DefaultTime | Priority |
| --- | :---: | :---: | :---: | :---: |
| hour | | 0-01:00:00 | 0-01:00:00 | 4 |
| day | | 1-00:00:00 | 0-08:00:00 | 2 |
| week | | 8-00:00:00 | 2-00:00:00 | 1 |
The option --ntasks-per-node is meant to be used with the --nodes option.
(For the --ntasks option, the default is one task per node, use the --cpus-per-task option to change this default.)
"""
## @var partition (str)
@@ -261,24 +377,64 @@ class PsiRaSchedule(SlurmSchedule):
# the partition is selected based on wall time and number of tasks by the validate() method.
# it should not be listed in the run file.
## @var modules (list of str)
#
# names of the software modules to load (`module load` command)
def __init__(self, project):
super(PsiRaSchedule, self).__init__(project)
self.partition = "shared"
self.partition = ""
self.modules = []
self.default_env = {}
def validate(self):
"""
check validity of parameters and detect the environment
the validity is not checked in detail - just intercept some common or severe mistakes.
detect whether we are in a conda or virtual environment
so that we can add the necessary activation to the job script.
detect which modules are active. depending on the python environment, LOADEDMODULES is:
- venv: openssl/3.4.1:TclTk/8.6.16:xz/5.8.0:Python/3.12.9:gcc/9.3.0:libfabric/1.18.0:cuda/11.1.0:openmpi/4.0.5_slurm
- conda: miniforge/2025-03-25:gcc/9.3.0:libfabric/1.18.0:cuda/11.1.0:openmpi/4.0.5_slurm
other potentially useful but currently unused environment variables are:
- PMODULES_LOADED_COMPILER: openmpi/4.0.5_slurm
- PMODULES_LOADED_TOOLS: openssl/3.4.1:xz/5.8.0
- PMODULES_LOADED_LIBRARIES: libfabric/1.18.0
- PMODULES_LOADED_PROGRAMMING: TclTk/8.6.16:Python/3.12.9:gcc/9.3.0:cuda/11.1.0
- PMODULES_LOADED_TOMCAT: miniforge/2025-03-25
note: openssl, TclTk, xz and cuda were not requested explicitly.
"""
super(PsiRaSchedule, self).validate()
# check that the submission is sane - the values are not firm
assert self.nodes <= 2
assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
assert self.wall_time.total_seconds() >= 60
assert self.tasks <= 64
assert 30 * 60 <= self.wall_time.total_seconds() <= 8 * 24 * 60 * 60
if self.wall_time.total_seconds() > 24 * 60 * 60:
self.partition = "week"
elif self.tasks_per_node < 24:
self.partition = "shared"
else:
self.partition = "day"
assert self.partition in ["day", "week", "shared"]
assert self.partition in ["day", "week"]
self.default_env = self.detect_env()
if len(self.modules) == 0:
self.modules = os.environ["LOADEDMODULES"].split(":")
def _write_job_file(self):
"""
write a job file for the ra cluster
@return: None
"""
lines = []
lines.append('#!/bin/bash')
@@ -288,21 +444,38 @@ class PsiRaSchedule(SlurmSchedule):
lines.append(f'#SBATCH --time={int(self.wall_time.total_seconds() / 60)}')
lines.append(f'#SBATCH --nodes={self.nodes}')
lines.append(f'#SBATCH --ntasks-per-node={self.tasks_per_node}')
if self.tasks_per_node > 24:
lines.append('#SBATCH --cores-per-socket=16')
# 0 - 65535 seconds
# currently, PMSCO does not react to signals properly
# lines.append(f'#SBATCH --signal=TERM@{self.signal_time}')
lines.append(f'#SBATCH --output="{self.project.job_name}.o.%j"')
lines.append(f'#SBATCH --error="{self.project.job_name}.e.%j"')
lines.append('module load psi-python36/4.4.0')
lines.append('module load gcc/4.8.5')
lines.append('module load openmpi/3.1.3')
lines.append('source activate pmsco')
# environment
if self.modules:
lines.append('module use unstable || true')
lines.append('module use Libraries || true')
for module in self.modules:
lines.append(f'module load {module}')
conda_env = self.conda_env or self.default_env.get("conda")
virtual_env = self.virtual_env or self.default_env.get("venv")
if conda_env:
lines.append(f'source activate {conda_env}')
elif virtual_env:
activate_script = Path(virtual_env) / "activate"
if activate_script.is_file():
lines.append(f'source {activate_script}')
lines.append('env')
lines.append('')
# run
lines.append(f'cd "{self.job_dir}"')
lines.append(f'mpirun python pmsco/pmsco -r {self.run_file.name}')
lines.append(f'mpirun python -m pmsco -r {self.run_file}')
lines.append('')
# clean up
lines.append(f'cd "{self.job_dir}"')
lines.append('rm -rf pmsco')
lines.append('exit 0')
self.job_file.write_text("\n".join(lines))