"""
@package pmsco.schedule
job schedule interface

this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.

the schedule can be defined as part of the run-file (see pmsco module).
users may derive sub-classes in a separate module to adapt to their own computing cluster.

the basic call sequence is:
1. create a schedule object.
2. initialize its properties with job parameters.
3. validate()
4. submit()

@author Matthias Muntwiler, matthias.muntwiler@psi.ch

@copyright (c) 2015-21 by Paul Scherrer Institut @n
Licensed under the Apache License, Version 2.0 (the "License"); @n
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
"""
|
|
|
|
import collections.abc
|
|
import commentjson as json
|
|
import datetime
|
|
import logging
|
|
from pathlib import Path
|
|
import shutil
|
|
import subprocess
|
|
import pmsco.config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class JobSchedule(pmsco.config.ConfigurableObject):
    """
    base class for job schedule.

    this class defines the abstract interface and some utilities.
    derived classes may override any method, but should call the inherited method.

    usage:
    1. create object, assigning a project instance.
    2. assign run_file.
    3. call validate.
    4. call submit.

    this class' properties should not be listed in the run file - they will be overwritten.
    """

    ## @var enabled (bool)
    #
    # this parameter signals whether pmsco should schedule a job or run the calculation.
    # it is not directly used by the schedule classes but by the pmsco module.
    # it must be defined in the run file and set to true to submit the job to a scheduler.
    # it is set to false in the run file copied to the job directory so that the job script starts the calculation.

    def __init__(self, project):
        """
        initialize the job schedule.

        @param project: project object that provides job_name and the directories dictionary.
        """
        super(JobSchedule, self).__init__()
        self.project = project
        self.enabled = False
        # parsed contents of the run file - assigned by the caller before submit().
        self.run_dict = {}
        self.job_dir = Path()
        self.job_file = Path()
        self.run_file = Path()
        # directory that contains the pmsco and projects directories
        self.pmsco_root = Path(__file__).parent.parent

    def validate(self):
        """
        validate the job parameters.

        make sure all object attributes are correct for submission,
        and derive the job_dir, job_file and run_file paths.
        the job directory is created if it does not exist.

        @raise AssertionError if the source tree layout, the output directory
            or the job name is invalid.

        @return: None
        """
        self.pmsco_root = Path(self.project.directories['pmsco']).parent
        output_dir = Path(self.project.directories['output'])

        assert self.pmsco_root.is_dir()
        assert (self.pmsco_root / "pmsco").is_dir()
        assert (self.pmsco_root / "projects").is_dir()
        assert output_dir.is_dir()
        assert self.project.job_name

        self.job_dir = output_dir / self.project.job_name
        self.job_dir.mkdir(parents=True, exist_ok=True)
        # append the suffix rather than using Path.with_suffix:
        # with_suffix would truncate a job name that contains a dot, e.g. "job.1" -> "job.sh".
        self.job_file = self.job_dir / (self.project.job_name + ".sh")
        self.run_file = self.job_dir / (self.project.job_name + ".json")

    def submit(self):
        """
        submit the job to the scheduler.

        as of this class, the method does the following:

        1. copy source files
        2. copy a patched version of the run file.
        3. write the job file (_write_job_file must be implemented by a derived class).

        @return: None
        """
        self._copy_source()
        self._fix_run_file()
        self._write_run_file()
        self._write_job_file()

    def _copy_source(self):
        """
        copy the source files to the job directory.

        the pmsco_root and job_dir attributes must be correct.
        the destination trees (job_dir/pmsco/pmsco and job_dir/pmsco/projects)
        must not exist yet - shutil.copytree raises an error if they do.

        this is a utility method used internally by derived classes.

        resulting layout:
        job_dir/pmsco/pmsco/**
        job_dir/pmsco/projects/**
        job_dir/job.sh
        job_dir/job.json

        @return: None
        """
        source = self.pmsco_root
        dest = self.job_dir / "pmsco"
        # skip hidden, temporary and backup files
        ignore = shutil.ignore_patterns(".*", "~*", "*~")
        shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
        shutil.copytree(source / "projects", dest / "projects", ignore=ignore)

    def _fix_run_file(self):
        """
        fix the run file.

        patch some entries of self.run_dict so that it can be used as run file.
        the following changes are made:
        1. set schedule.enabled to false so that the calculation is run.
        2. set the output directory to the job directory.
        3. set the log file to the job directory.

        @return: None
        """
        self.run_dict['schedule']['enabled'] = False
        self.run_dict['project']['directories']['output'] = str(self.job_dir)
        # plain concatenation rather than Path.with_suffix - see validate().
        self.run_dict['project']['log_file'] = str(self.job_dir / (self.project.job_name + ".log"))

    def _write_run_file(self):
        """
        copy the run file.

        this is a JSON dump of self.run_dict to the self.run_file file.

        @return: None
        """
        with open(self.run_file, "wt") as f:
            json.dump(self.run_dict, f, indent=2)

    def _write_job_file(self):
        """
        create the job script.

        this method must be implemented by a derived class.
        the script must be written to the self.job_file file.
        don't forget to make the file executable.

        @return: None
        """
        pass
|
|
|
|
|
|
class SlurmSchedule(JobSchedule):
    """
    job schedule for a slurm scheduler.

    this class implements commonly used features of the slurm scheduler.
    host-specific features and the creation of the job file should be done in a derived class.
    derived classes must, in particular, implement the _write_job_file method.
    they can override other methods, too, but should call the inherited method first.

    1. copy the source trees (pmsco and projects) to the job directory
    2. copy a patched version of the run file.
    3. call the submission command

    the public properties of this class should be assigned from the run file.
    """
    def __init__(self, project):
        """
        initialize the slurm job schedule with default parameters.

        @param project: project object, passed on to JobSchedule.
        """
        super(SlurmSchedule, self).__init__(project)
        self.host = ""
        self.nodes = 1
        self.tasks_per_node = 8
        self.wall_time = datetime.timedelta(hours=1)
        self.signal_time = 600
        # if true, the job files are created but the job is not submitted.
        self.manual = True

    @staticmethod
    def parse_timedelta(td):
        """
        parse time delta input formats

        converts a string or dictionary from run-file into datetime.timedelta.

        @param td:
            str: [days-]hours[:minutes[:seconds]]
            dict: days, hours, minutes, seconds - at least one needs to be defined. values must be numeric.
            datetime.timedelta - native type
        @raise ValueError if a component of a string argument is not numeric.
            (previously, a malformed hours, minutes or seconds field was silently ignored,
            inconsistent with the days field which always raised.)
        @return: datetime.timedelta
        """
        if isinstance(td, str):
            dt = {}
            d = td.split("-")
            if len(d) > 1:
                dt['days'] = float(d.pop(0))
            # missing trailing components default to zero (zip stops at the shorter sequence),
            # but a present, non-numeric component raises ValueError.
            t = d[0].split(":")
            for key, value in zip(('hours', 'minutes', 'seconds'), t):
                dt[key] = float(value)
            td = datetime.timedelta(**dt)
        elif isinstance(td, collections.abc.Mapping):
            td = datetime.timedelta(**td)
        return td

    def validate(self):
        """
        validate the job parameters.

        in addition to the inherited checks, convert wall_time to datetime.timedelta
        and require an absolute job directory (slurm scripts cd into it).

        @return: None
        """
        super(SlurmSchedule, self).validate()
        self.wall_time = self.parse_timedelta(self.wall_time)
        assert self.job_dir.is_absolute()

    def submit(self):
        """
        call the sbatch command

        if manual is true, the job files are generated but the job is not submitted.

        @raise subprocess.CalledProcessError if sbatch returns a non-zero exit code.

        @return: None
        """
        super(SlurmSchedule, self).submit()
        args = ['sbatch', str(self.job_file)]
        print(" ".join(args))
        if self.manual:
            print("manual run - job files created but not submitted")
        else:
            cp = subprocess.run(args)
            cp.check_returncode()
|
|
|
|
|
|
class PsiRaSchedule(SlurmSchedule):
    """
    job schedule for the Ra cluster at PSI.

    this class selects specific features of the Ra cluster,
    such as the partition and node type (24 or 32 cores).
    it also implements the _write_job_file method.
    """

    ## @var partition (str)
    #
    # the partition is selected based on wall time and number of tasks by the validate() method.
    # it should not be listed in the run file.

    def __init__(self, project):
        """
        initialize the schedule with the default partition.

        @param project: project object, passed on to SlurmSchedule.
        """
        super(PsiRaSchedule, self).__init__(project)
        self.partition = "shared"

    def validate(self):
        """
        validate the job parameters and pick a partition.

        jobs longer than one day go to "week", small jobs to "shared",
        full-node jobs to "day".

        @return: None
        """
        super(PsiRaSchedule, self).validate()
        assert self.nodes <= 2
        assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
        seconds = self.wall_time.total_seconds()
        assert seconds >= 60
        one_day = 24 * 60 * 60
        if seconds > one_day:
            self.partition = "week"
        elif self.tasks_per_node < 24:
            self.partition = "shared"
        else:
            self.partition = "day"
        assert self.partition in ["day", "week", "shared"]

    def _write_job_file(self):
        """
        write the executable slurm batch script to self.job_file.

        the script loads the required modules, runs pmsco under mpirun
        in the job directory, and removes the copied source tree afterwards.

        @return: None
        """
        minutes = int(self.wall_time.total_seconds() / 60)
        script = [
            '#!/bin/bash',
            '#SBATCH --export=NONE',
            f'#SBATCH --job-name="{self.project.job_name}"',
            f'#SBATCH --partition={self.partition}',
            f'#SBATCH --time={minutes}',
            f'#SBATCH --nodes={self.nodes}',
            f'#SBATCH --ntasks-per-node={self.tasks_per_node}',
        ]
        if self.tasks_per_node > 24:
            # 32-core nodes have two 16-core sockets
            script.append('#SBATCH --cores-per-socket=16')
        # 0 - 65535 seconds
        # currently, PMSCO does not react to signals properly
        # script.append(f'#SBATCH --signal=TERM@{self.signal_time}')
        script += [
            f'#SBATCH --output="{self.project.job_name}.o.%j"',
            f'#SBATCH --error="{self.project.job_name}.e.%j"',
            'module load psi-python36/4.4.0',
            'module load gcc/4.8.5',
            'module load openmpi/3.1.3',
            'source activate pmsco',
            f'cd "{self.job_dir}"',
            f'mpirun python pmsco/pmsco -r {self.run_file.name}',
            f'cd "{self.job_dir}"',
            'rm -rf pmsco',
            'exit 0',
        ]
        self.job_file.write_text("\n".join(script))
        self.job_file.chmod(0o755)