public release 3.0.0 - see README and CHANGES for details
This commit is contained in:
309
pmsco/schedule.py
Normal file
309
pmsco/schedule.py
Normal file
@ -0,0 +1,309 @@
|
||||
"""
|
||||
@package pmsco.schedule
|
||||
job schedule interface
|
||||
|
||||
this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.
|
||||
|
||||
the schedule can be defined as part of the run-file (see pmsco module).
|
||||
users may derive sub-classes in a separate module to adapt to their own computing cluster.
|
||||
|
||||
the basic call sequence is:
|
||||
1. create a schedule object.
|
||||
2. initialize its properties with job parameters.
|
||||
3. validate()
|
||||
4. submit()
|
||||
|
||||
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
|
||||
|
||||
@copyright (c) 2015-21 by Paul Scherrer Institut @n
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
"""
|
||||
|
||||
import collections.abc
|
||||
import commentjson as json
|
||||
import datetime
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import subprocess
|
||||
import pmsco.config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobSchedule(pmsco.config.ConfigurableObject):
    """
    base class for job schedule

    this class defines the abstract interface and some utilities.
    derived classes may override any method, but should call the inherited method.

    usage:
    1. create object, assigning a project instance.
    2. assign run_file.
    3. call validate.
    4. call submit.

    this class' properties should not be listed in the run file - they will be overwritten.
    """

    ## @var enabled (bool)
    #
    # this parameter signals whether pmsco should schedule a job or run the calculation.
    # it is not directly used by the schedule classes but by the pmsco module.
    # it must be defined in the run file and set to true to submit the job to a scheduler.
    # it is set to false in the run file copied to the job directory so that the job script starts the calculation.

    def __init__(self, project):
        super(JobSchedule, self).__init__()
        # project object - must provide the directories dict and job_name (see validate).
        self.project = project
        self.enabled = False
        # parsed contents of the run file; a patched copy is written to the job directory by submit().
        self.run_dict = {}
        # the following three paths are placeholders; the real paths are derived in validate().
        self.job_dir = Path()
        self.job_file = Path()
        self.run_file = Path()
        # directory that contains the pmsco and projects directories
        self.pmsco_root = Path(__file__).parent.parent

    def validate(self):
        """
        validate the job parameters.

        make sure all object attributes are correct for submission.

        derives pmsco_root, job_dir, job_file and run_file from the
        project's directories and job_name, and creates the job directory
        (including parents) if it does not exist.

        raises AssertionError if the source or output directories are missing
        or the job name is empty.
        note: assert statements are stripped under python -O.

        @return: None
        """
        self.pmsco_root = Path(self.project.directories['pmsco']).parent
        output_dir = Path(self.project.directories['output'])

        assert self.pmsco_root.is_dir()
        assert (self.pmsco_root / "pmsco").is_dir()
        assert (self.pmsco_root / "projects").is_dir()
        assert output_dir.is_dir()
        assert self.project.job_name

        self.job_dir = output_dir / self.project.job_name
        self.job_dir.mkdir(parents=True, exist_ok=True)
        # job script and patched run file share the job name, e.g. myjob.sh and myjob.json.
        self.job_file = (self.job_dir / self.project.job_name).with_suffix(".sh")
        self.run_file = (self.job_dir / self.project.job_name).with_suffix(".json")

    def submit(self):
        """
        submit the job to the scheduler.

        as of this class, the method does the following:

        1. copy source files.
        2. copy a patched version of the run file.
        3. write the job file (_write_job_file must be implemented by a derived class).

        validate() must have been called before.

        @return: None
        """
        self._copy_source()
        self._fix_run_file()
        self._write_run_file()
        self._write_job_file()

    def _copy_source(self):
        """
        copy the source files to the job directory.

        the pmsco_root and job_dir attributes must be correct.
        the job_dir/pmsco/pmsco and job_dir/pmsco/projects destinations must
        not exist yet - shutil.copytree creates them and fails otherwise.
        hidden files (.*) and backup files (~*, *~) are not copied.

        this is a utility method used internally by derived classes.

        resulting layout:
        job_dir/pmsco/pmsco/**
        job_dir/pmsco/projects/**
        job_dir/job.sh
        job_dir/job.json

        @return: None
        """

        source = self.pmsco_root
        dest = self.job_dir / "pmsco"
        ignore = shutil.ignore_patterns(".*", "~*", "*~")
        shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
        shutil.copytree(source / "projects", dest / "projects", ignore=ignore)

    def _fix_run_file(self):
        """
        fix the run file.

        patch some entries of self.run_dict so that it can be used as run file.
        the following changes are made:
        1. set schedule.enabled to false so that the calculation is run.
        2. set the output directory to the job directory.
        3. set the log file to the job directory.

        assumes run_dict already contains the 'schedule' and 'project' sections
        (KeyError otherwise).

        @return: None
        """
        self.run_dict['schedule']['enabled'] = False
        self.run_dict['project']['directories']['output'] = str(self.job_dir)
        self.run_dict['project']['log_file'] = str((self.job_dir / self.project.job_name).with_suffix(".log"))

    def _write_run_file(self):
        """
        copy the run file.

        this is a JSON dump of self.run_dict to the self.run_file file.
        note: json is the commentjson module here (see imports),
        which writes plain JSON on dump.

        @return: None
        """
        with open(self.run_file, "wt") as f:
            json.dump(self.run_dict, f, indent=2)

    def _write_job_file(self):
        """
        create the job script.

        this method must be implemented by a derived class.
        the script must be written to the self.job_file file.
        don't forget to make the file executable.

        this base implementation is a no-op.

        @return: None
        """
        pass
|
||||
|
||||
|
||||
class SlurmSchedule(JobSchedule):
    """
    job schedule for a slurm scheduler.

    this class implements commonly used features of the slurm scheduler.
    host-specific features and the creation of the job file should be done in a derived class.
    derived classes must, in particular, implement the _write_job_file method.
    they can override other methods, too, but should call the inherited method first.

    1. copy the source trees (pmsco and projects) to the job directory
    2. copy a patched version of the run file.
    3. call the submission command

    the public properties of this class should be assigned from the run file.
    """
    def __init__(self, project):
        super(SlurmSchedule, self).__init__(project)
        # submission host name (informational, assigned from the run file)
        self.host = ""
        self.nodes = 1
        self.tasks_per_node = 8
        # requested wall time - may be assigned a str or dict from the run file,
        # normalized to datetime.timedelta by validate().
        self.wall_time = datetime.timedelta(hours=1)
        # seconds before the time limit at which slurm should signal the job
        self.signal_time = 600
        # if true, generate the job files but do not call sbatch
        self.manual = True

    @staticmethod
    def parse_timedelta(td):
        """
        parse time delta input formats

        converts a string or dictionary from run-file into datetime.timedelta.

        @param td:
            str: [days-]hours[:minutes[:seconds]]
            dict: days, hours, minutes, seconds - at least one needs to be defined. values must be numeric.
            datetime.timedelta - native type
        @return: datetime.timedelta
        @raise ValueError if a component of a string argument is not numeric.
        """
        if isinstance(td, str):
            dt = {}
            d = td.split("-")
            if len(d) > 1:
                dt['days'] = float(d.pop(0))
            # trailing components may be omitted (zip stops at the shorter sequence),
            # but a non-numeric component raises ValueError.
            # (previously, bad values were silently ignored, which could turn a
            # mistyped wall time into a zero timedelta.)
            for key, value in zip(('hours', 'minutes', 'seconds'), d[0].split(":")):
                dt[key] = float(value)
            td = datetime.timedelta(**dt)
        elif isinstance(td, collections.abc.Mapping):
            td = datetime.timedelta(**td)
        return td

    def validate(self):
        """
        validate the job parameters.

        in addition to the inherited checks, normalize wall_time to
        datetime.timedelta and require an absolute job directory.

        @return: None
        """
        super(SlurmSchedule, self).validate()
        self.wall_time = self.parse_timedelta(self.wall_time)
        assert self.job_dir.is_absolute()

    def submit(self):
        """
        call the sbatch command

        if manual is true, the job files are generated but the job is not submitted.

        @return: None
        @raise subprocess.CalledProcessError if sbatch returns a non-zero exit code.
        """
        super(SlurmSchedule, self).submit()
        args = ['sbatch', str(self.job_file)]
        # echo the command so the user can submit manually if desired
        print(" ".join(args))
        if self.manual:
            print("manual run - job files created but not submitted")
        else:
            cp = subprocess.run(args)
            cp.check_returncode()
|
||||
|
||||
|
||||
class PsiRaSchedule(SlurmSchedule):
    """
    job schedule for the Ra cluster at PSI.

    this class selects specific features of the Ra cluster,
    such as the partition and node type (24 or 32 cores).
    it also implements the _write_job_file method.
    """

    ## @var partition (str)
    #
    # the partition is selected based on wall time and number of tasks by the validate() method.
    # it should not be listed in the run file.

    def __init__(self, project):
        super(PsiRaSchedule, self).__init__(project)
        self.partition = "shared"

    def validate(self):
        """
        check Ra-specific limits and pick the slurm partition.

        jobs longer than one day go to "week"; otherwise jobs that do not
        fill a 24-core node go to "shared", full nodes to "day".

        @return: None
        """
        super(PsiRaSchedule, self).validate()
        seconds = self.wall_time.total_seconds()
        assert self.nodes <= 2
        assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
        assert seconds >= 60
        if seconds > 86400:
            self.partition = "week"
        else:
            self.partition = "shared" if self.tasks_per_node < 24 else "day"
        assert self.partition in ["day", "week", "shared"]

    def _write_job_file(self):
        """
        write the slurm job script to self.job_file and make it executable.

        the script loads the Ra software environment, runs pmsco under mpirun
        in the job directory, and removes the copied source tree afterwards.

        @return: None
        """
        minutes = int(self.wall_time.total_seconds() / 60)
        script = [
            '#!/bin/bash',
            '#SBATCH --export=NONE',
            f'#SBATCH --job-name="{self.project.job_name}"',
            f'#SBATCH --partition={self.partition}',
            f'#SBATCH --time={minutes}',
            f'#SBATCH --nodes={self.nodes}',
            f'#SBATCH --ntasks-per-node={self.tasks_per_node}',
        ]
        # 32-core nodes have two 16-core sockets
        if self.tasks_per_node > 24:
            script.append('#SBATCH --cores-per-socket=16')
        # 0 - 65535 seconds
        # currently, PMSCO does not react to signals properly
        # script.append(f'#SBATCH --signal=TERM@{self.signal_time}')
        script.extend([
            f'#SBATCH --output="{self.project.job_name}.o.%j"',
            f'#SBATCH --error="{self.project.job_name}.e.%j"',
            'module load psi-python36/4.4.0',
            'module load gcc/4.8.5',
            'module load openmpi/3.1.3',
            'source activate pmsco',
            f'cd "{self.job_dir}"',
            f'mpirun python pmsco/pmsco -r {self.run_file.name}',
            f'cd "{self.job_dir}"',
            'rm -rf pmsco',
            'exit 0',
        ])
        self.job_file.write_text("\n".join(script))
        self.job_file.chmod(0o755)
|
Reference in New Issue
Block a user