public release 3.0.0 - see README and CHANGES for details
This commit is contained in:
309
pmsco/schedule.py
Normal file
309
pmsco/schedule.py
Normal file
@ -0,0 +1,309 @@
|
||||
"""
|
||||
@package pmsco.schedule
|
||||
job schedule interface
|
||||
|
||||
this module defines common infrastructure to submit a pmsco calculation job to a job scheduler such as slurm.
|
||||
|
||||
the schedule can be defined as part of the run-file (see pmsco module).
|
||||
users may derive sub-classes in a separate module to adapt to their own computing cluster.
|
||||
|
||||
the basic call sequence is:
|
||||
1. create a schedule object.
|
||||
2. initialize its properties with job parameters.
|
||||
3. validate()
|
||||
4. submit()
|
||||
|
||||
@author Matthias Muntwiler, matthias.muntwiler@psi.ch
|
||||
|
||||
@copyright (c) 2015-21 by Paul Scherrer Institut @n
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); @n
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
"""
|
||||
|
||||
import collections.abc
|
||||
import commentjson as json
|
||||
import datetime
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import subprocess
|
||||
import pmsco.config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class JobSchedule(pmsco.config.ConfigurableObject):
    """
    base class for job schedule

    this class defines the abstract interface and some utilities.
    derived classes may override any method, but should call the inherited method.

    usage:
    1. create object, assigning a project instance.
    2. assign run_file.
    3. call validate.
    4. call submit.

    this class' properties should not be listed in the run file - they will be overwritten.
    """

    ## @var enabled (bool)
    #
    # this parameter signals whether pmsco should schedule a job or run the calculation.
    # it is not directly used by the schedule classes but by the pmsco module.
    # it must be defined in the run file and set to true to submit the job to a scheduler.
    # it is set to false in the run file copied to the job directory so that the job script starts the calculation.

    def __init__(self, project):
        super(JobSchedule, self).__init__()
        # project object - must provide the directories dict and job_name (see validate).
        self.project = project
        self.enabled = False
        # parsed contents of the run file; a patched copy is written to the job directory by submit().
        self.run_dict = {}
        # the following three paths are placeholders; the real paths are derived in validate().
        self.job_dir = Path()
        self.job_file = Path()
        self.run_file = Path()
        # directory that contains the pmsco and projects directories
        self.pmsco_root = Path(__file__).parent.parent

    def validate(self):
        """
        validate the job parameters.

        make sure all object attributes are correct for submission.

        derives pmsco_root, job_dir, job_file and run_file from the
        project's directories and job_name, and creates the job directory
        (including parents) if it does not exist.

        raises AssertionError if the source or output directories are missing
        or the job name is empty.
        note: assert statements are stripped under python -O.

        @return: None
        """
        self.pmsco_root = Path(self.project.directories['pmsco']).parent
        output_dir = Path(self.project.directories['output'])

        assert self.pmsco_root.is_dir()
        assert (self.pmsco_root / "pmsco").is_dir()
        assert (self.pmsco_root / "projects").is_dir()
        assert output_dir.is_dir()
        assert self.project.job_name

        self.job_dir = output_dir / self.project.job_name
        self.job_dir.mkdir(parents=True, exist_ok=True)
        # job script and patched run file share the job name, e.g. myjob.sh and myjob.json.
        self.job_file = (self.job_dir / self.project.job_name).with_suffix(".sh")
        self.run_file = (self.job_dir / self.project.job_name).with_suffix(".json")

    def submit(self):
        """
        submit the job to the scheduler.

        as of this class, the method does the following:

        1. copy source files.
        2. copy a patched version of the run file.
        3. write the job file (_write_job_file must be implemented by a derived class).

        validate() must have been called before.

        @return: None
        """
        self._copy_source()
        self._fix_run_file()
        self._write_run_file()
        self._write_job_file()

    def _copy_source(self):
        """
        copy the source files to the job directory.

        the pmsco_root and job_dir attributes must be correct.
        the job_dir/pmsco/pmsco and job_dir/pmsco/projects destinations must
        not exist yet - shutil.copytree creates them and fails otherwise.
        hidden files (.*) and backup files (~*, *~) are not copied.

        this is a utility method used internally by derived classes.

        resulting layout:
        job_dir/pmsco/pmsco/**
        job_dir/pmsco/projects/**
        job_dir/job.sh
        job_dir/job.json

        @return: None
        """

        source = self.pmsco_root
        dest = self.job_dir / "pmsco"
        ignore = shutil.ignore_patterns(".*", "~*", "*~")
        shutil.copytree(source / "pmsco", dest / "pmsco", ignore=ignore)
        shutil.copytree(source / "projects", dest / "projects", ignore=ignore)

    def _fix_run_file(self):
        """
        fix the run file.

        patch some entries of self.run_dict so that it can be used as run file.
        the following changes are made:
        1. set schedule.enabled to false so that the calculation is run.
        2. set the output directory to the job directory.
        3. set the log file to the job directory.

        assumes run_dict already contains the 'schedule' and 'project' sections
        (KeyError otherwise).

        @return: None
        """
        self.run_dict['schedule']['enabled'] = False
        self.run_dict['project']['directories']['output'] = str(self.job_dir)
        self.run_dict['project']['log_file'] = str((self.job_dir / self.project.job_name).with_suffix(".log"))

    def _write_run_file(self):
        """
        copy the run file.

        this is a JSON dump of self.run_dict to the self.run_file file.
        note: json is the commentjson module here (see imports),
        which writes plain JSON on dump.

        @return: None
        """
        with open(self.run_file, "wt") as f:
            json.dump(self.run_dict, f, indent=2)

    def _write_job_file(self):
        """
        create the job script.

        this method must be implemented by a derived class.
        the script must be written to the self.job_file file.
        don't forget to make the file executable.

        this base implementation is a no-op.

        @return: None
        """
        pass
|
||||
|
||||
|
||||
class SlurmSchedule(JobSchedule):
    """
    job schedule for a slurm scheduler.

    this class implements commonly used features of the slurm scheduler.
    host-specific features and the creation of the job file should be done in a derived class.
    derived classes must, in particular, implement the _write_job_file method.
    they can override other methods, too, but should call the inherited method first.

    1. copy the source trees (pmsco and projects) to the job directory
    2. copy a patched version of the run file.
    3. call the submission command

    the public properties of this class should be assigned from the run file.
    """
    def __init__(self, project):
        super(SlurmSchedule, self).__init__(project)
        # submission host name (informational, assigned from the run file)
        self.host = ""
        self.nodes = 1
        self.tasks_per_node = 8
        # requested wall time - may be assigned a str or dict from the run file,
        # normalized to datetime.timedelta by validate().
        self.wall_time = datetime.timedelta(hours=1)
        # seconds before the time limit at which slurm should signal the job
        self.signal_time = 600
        # if true, generate the job files but do not call sbatch
        self.manual = True

    @staticmethod
    def parse_timedelta(td):
        """
        parse time delta input formats

        converts a string or dictionary from run-file into datetime.timedelta.

        @param td:
            str: [days-]hours[:minutes[:seconds]]
            dict: days, hours, minutes, seconds - at least one needs to be defined. values must be numeric.
            datetime.timedelta - native type
        @return: datetime.timedelta
        @raise ValueError if a component of a string argument is not numeric.
        """
        if isinstance(td, str):
            dt = {}
            d = td.split("-")
            if len(d) > 1:
                dt['days'] = float(d.pop(0))
            # trailing components may be omitted (zip stops at the shorter sequence),
            # but a non-numeric component raises ValueError.
            # (previously, bad values were silently ignored, which could turn a
            # mistyped wall time into a zero timedelta.)
            for key, value in zip(('hours', 'minutes', 'seconds'), d[0].split(":")):
                dt[key] = float(value)
            td = datetime.timedelta(**dt)
        elif isinstance(td, collections.abc.Mapping):
            td = datetime.timedelta(**td)
        return td

    def validate(self):
        """
        validate the job parameters.

        in addition to the inherited checks, normalize wall_time to
        datetime.timedelta and require an absolute job directory.

        @return: None
        """
        super(SlurmSchedule, self).validate()
        self.wall_time = self.parse_timedelta(self.wall_time)
        assert self.job_dir.is_absolute()

    def submit(self):
        """
        call the sbatch command

        if manual is true, the job files are generated but the job is not submitted.

        @return: None
        @raise subprocess.CalledProcessError if sbatch returns a non-zero exit code.
        """
        super(SlurmSchedule, self).submit()
        args = ['sbatch', str(self.job_file)]
        # echo the command so the user can submit manually if desired
        print(" ".join(args))
        if self.manual:
            print("manual run - job files created but not submitted")
        else:
            cp = subprocess.run(args)
            cp.check_returncode()
|
||||
|
||||
|
||||
class PsiRaSchedule(SlurmSchedule):
    """
    job schedule for the Ra cluster at PSI.

    this class selects specific features of the Ra cluster,
    such as the partition and node type (24 or 32 cores).
    it also implements the _write_job_file method.
    """

    ## @var partition (str)
    #
    # the partition is selected based on wall time and number of tasks by the validate() method.
    # it should not be listed in the run file.

    def __init__(self, project):
        super(PsiRaSchedule, self).__init__(project)
        self.partition = "shared"

    def validate(self):
        """
        check Ra-specific limits and pick the slurm partition.

        jobs longer than one day go to "week"; otherwise jobs that do not
        fill a 24-core node go to "shared", full nodes to "day".

        @return: None
        """
        super(PsiRaSchedule, self).validate()
        seconds = self.wall_time.total_seconds()
        assert self.nodes <= 2
        assert self.tasks_per_node <= 24 or self.tasks_per_node == 32
        assert seconds >= 60
        if seconds > 86400:
            self.partition = "week"
        else:
            self.partition = "shared" if self.tasks_per_node < 24 else "day"
        assert self.partition in ["day", "week", "shared"]

    def _write_job_file(self):
        """
        write the slurm job script to self.job_file and make it executable.

        the script loads the Ra software environment, runs pmsco under mpirun
        in the job directory, and removes the copied source tree afterwards.

        @return: None
        """
        minutes = int(self.wall_time.total_seconds() / 60)
        script = [
            '#!/bin/bash',
            '#SBATCH --export=NONE',
            f'#SBATCH --job-name="{self.project.job_name}"',
            f'#SBATCH --partition={self.partition}',
            f'#SBATCH --time={minutes}',
            f'#SBATCH --nodes={self.nodes}',
            f'#SBATCH --ntasks-per-node={self.tasks_per_node}',
        ]
        # 32-core nodes have two 16-core sockets
        if self.tasks_per_node > 24:
            script.append('#SBATCH --cores-per-socket=16')
        # 0 - 65535 seconds
        # currently, PMSCO does not react to signals properly
        # script.append(f'#SBATCH --signal=TERM@{self.signal_time}')
        script.extend([
            f'#SBATCH --output="{self.project.job_name}.o.%j"',
            f'#SBATCH --error="{self.project.job_name}.e.%j"',
            'module load psi-python36/4.4.0',
            'module load gcc/4.8.5',
            'module load openmpi/3.1.3',
            'source activate pmsco',
            f'cd "{self.job_dir}"',
            f'mpirun python pmsco/pmsco -r {self.run_file.name}',
            f'cd "{self.job_dir}"',
            'rm -rf pmsco',
            'exit 0',
        ])
        self.job_file.write_text("\n".join(script))
        self.job_file.chmod(0o755)
|
Reference in New Issue
Block a user