fixed/improved memory efficiency calculations
added parsing from TRES strings
This commit is contained in:
+105
-26
@@ -2,31 +2,16 @@
|
||||
"""
|
||||
slurm-eff-tool.py - Slurm job efficiency reporting and investigation tool.
|
||||
|
||||
Examples:
|
||||
# first get an overview (-U/--aggr-user) and write a cachefile
|
||||
slurm-eff-tool -O sacct.cache -U
|
||||
# now you can read the cachefile for later runs and e.g. sort based on waste_Mem
|
||||
slurm-eff-tool -F sacct.cache -U -s=-Y
|
||||
|
||||
slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
|
||||
slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
|
||||
# supports multiple sort keys
|
||||
slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
|
||||
# cluster jobs by Regexps
|
||||
slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
|
||||
# supports slurm style formatting (will be further improved)
|
||||
slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
|
||||
|
||||
Efficiency definitions:
|
||||
CPU_Efficiency = TotalCPU_seconds / CPUTimeRAW * 100
|
||||
Memory_Efficiency = max(MaxRSS_bytes across non-extern job steps) / requested_memory_bytes * 100
|
||||
Time_Efficiency = ElapsedRaw_seconds / (TimelimitRaw_minutes * 60) * 100
|
||||
|
||||
Memory efficiency intentionally uses the maximum MaxRSS seen across non-extern
|
||||
Slurm job steps. This conceptually matches the practical "peak RSS" style of
|
||||
seff, but it is not a perfect sum of all ranks.
|
||||
Memory_Efficiency = max(TRESUsageInTot mem across job steps) / (mem allocated by system) * 100
|
||||
|
||||
This script intentionally asks sacct for raw parsable fields and caches those rows.
|
||||
TODO: allow caching in a faster, already-parsed binary format. Keep the sacct text option as a
|
||||
debug option
|
||||
|
||||
"""
|
||||
# initial version of script produced by vibe coding of an exact functionality
|
||||
# description using chatgpt-5.5
|
||||
@@ -64,6 +49,8 @@ SACCT_FIELDS = [
|
||||
"CPUTimeRAW",
|
||||
"TotalCPU",
|
||||
"MaxRSS",
|
||||
"AllocTRES",
|
||||
"TRESUsageInTot",
|
||||
]
|
||||
|
||||
# Default columns to print in output, and their order
|
||||
@@ -76,6 +63,8 @@ DEFAULT_COLUMNS = [
|
||||
"CPU_Efficiency",
|
||||
"waste_CPU",
|
||||
"ReqMem",
|
||||
"UsedMem",
|
||||
"AllocMem",
|
||||
"MemPerCPU",
|
||||
"MaxRSS_max",
|
||||
"Memory_Efficiency",
|
||||
@@ -94,6 +83,7 @@ ALIASES = {
|
||||
"c": "CPUs",
|
||||
"N": "Nodes",
|
||||
"m": "ReqMem",
|
||||
"n": "UsedMem",
|
||||
"p": "MemPerCPU",
|
||||
"l": "ReqWalltime",
|
||||
"C": "Count",
|
||||
@@ -109,6 +99,8 @@ NUMERIC_COLUMNS = {
|
||||
"CPUs",
|
||||
"Nodes",
|
||||
"ReqMem",
|
||||
"AllocMem",
|
||||
"UsedMem",
|
||||
"MemPerCPU",
|
||||
"ReqWalltime",
|
||||
"Count",
|
||||
@@ -131,6 +123,8 @@ class JobRecord:
|
||||
nodes: int
|
||||
reqmem_gb: float | None
|
||||
mem_per_cpu_gb: float | None
|
||||
mem_alloc_tres: float | None
|
||||
mem_used_tres: float | None
|
||||
reqwall_hours: float | None
|
||||
reqmem_bytes_total: float | None
|
||||
elapsed_sec: int
|
||||
@@ -149,6 +143,8 @@ class OutputRow:
|
||||
CPUs: int
|
||||
Nodes: int
|
||||
ReqMem: float | None
|
||||
AllocMem: float | None
|
||||
UsedMem: float | None
|
||||
MemPerCPU: float | None
|
||||
ReqWalltime: float | None
|
||||
Count: int
|
||||
@@ -172,6 +168,8 @@ class OutputRow:
|
||||
"CPUs": self.CPUs,
|
||||
"Nodes": self.Nodes,
|
||||
"ReqMem": self.ReqMem,
|
||||
"AllocMem": self.AllocMem,
|
||||
"UsedMem": self.UsedMem,
|
||||
"MaxRSS_max": self.maxrss_max,
|
||||
"MemPerCPU": self.MemPerCPU,
|
||||
"ReqWalltime": self.ReqWalltime,
|
||||
@@ -241,7 +239,7 @@ def format_seconds(seconds: int | float | None) -> str:
|
||||
return f"{hours:02d}:{minutes:02d}:{sec:02d}"
|
||||
|
||||
|
||||
def parse_size_to_bytes(value: str) -> float | None:
|
||||
def parse_size_to_bytes(value: str) -> int | None:
|
||||
"""Parse Slurm memory size fields such as 1024K, 2000M, 8Gn, 4Gc, 1.5T."""
|
||||
s = (value or "").strip()
|
||||
if not s or s in {"Unknown", "0", "0K", "0M", "0G", "0T"}:
|
||||
@@ -255,7 +253,7 @@ def parse_size_to_bytes(value: str) -> float | None:
|
||||
if not m:
|
||||
return None
|
||||
|
||||
num = float(m.group(1))
|
||||
num = int(m.group(1))
|
||||
unit = m.group(2).upper() or "K" # Slurm memory fields are normally KiB when unitless.
|
||||
mult = {
|
||||
"K": 1024,
|
||||
@@ -266,7 +264,30 @@ def parse_size_to_bytes(value: str) -> float | None:
|
||||
}[unit]
|
||||
return num * mult
|
||||
|
||||
def parse_tres_mem_bytes(alloc_tres: str) -> int | None:
    """Extract the ``mem=`` entry of a Slurm TRES string as a byte count.

    The input is a comma-separated list of ``name=value`` pairs, as found
    in sacct fields such as AllocTRES or TRESUsageInTot.  The value of the
    first entry whose name is exactly ``mem`` is handed to
    ``parse_size_to_bytes`` for conversion.

    Examples
    --------
    cpu=16,mem=64G,node=1,billing=16
        -> 68719476736

    billing=64,cpu=64,mem=257698M,node=2
        -> 270215938048

    Returns
    -------
    The memory size in bytes, or None when the string is empty, contains
    no ``mem`` entry, or ``parse_size_to_bytes`` cannot interpret the value.
    """
    if not alloc_tres:
        return None

    for field in alloc_tres.split(","):
        # partition() keeps everything after the first "=" as the value and
        # requires an exact name match, so e.g. "vmem=..." is not picked up.
        key, sep, value = field.strip().partition("=")
        if sep and key == "mem":
            return parse_size_to_bytes(value)

    return None
|
||||
def reqmem_total_bytes(reqmem: str, cpus: int, nodes: int) -> float | None:
|
||||
"""Convert ReqMem into total requested bytes for the whole job allocation."""
|
||||
raw = (reqmem or "").strip()
|
||||
@@ -394,6 +415,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
|
||||
grouped[base_job_id(row.get("JobIDRaw") or row.get("JobID", ""))].append(row)
|
||||
|
||||
records: list[JobRecord] = []
|
||||
# Each group consists of sacct rows belonging to a job
|
||||
for _, group in grouped.items():
|
||||
top = next((r for r in group if is_top_level_row(r)), group[0])
|
||||
if only_user and top.get("User") != only_user:
|
||||
@@ -411,8 +433,21 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
|
||||
if ".extern" not in (r.get("JobID") or r.get("JobIDRaw") or "")
|
||||
]
|
||||
maxrss_values = [parse_size_to_bytes(r.get("MaxRSS", "")) for r in rss_source_rows]
|
||||
maxrss = max([x for x in maxrss_values if x is not None], default=None)
|
||||
# stores the largest maxrss value over all job steps
|
||||
maxrss_cleaned = [x for x in maxrss_values if x is not None]
|
||||
maxrss = max(maxrss_cleaned, default=None)
|
||||
|
||||
# find mem_used_tres over reported job steps. It may be in the
|
||||
# *.interactive or *.0 steps
|
||||
mem_used_tres_values = [parse_tres_mem_bytes(r.get("TRESUsageInTot", "")) \
|
||||
for r in rss_source_rows]
|
||||
# mem_used_tres_cleaned = [x for x in mem_used_tres_values if x is not None]
|
||||
mem_used_tres = max([x for x in mem_used_tres_values if x is not None],
|
||||
default=None)
|
||||
mem_used_tres_gb = None
|
||||
if mem_used_tres is not None:
|
||||
mem_used_tres_gb = mem_used_tres / (1024**3)
|
||||
|
||||
cpus = int(float(top.get("AllocCPUS") or 0))
|
||||
nodes = int(float(top.get("NNodes") or 0))
|
||||
elapsed = int(float(top.get("ElapsedRaw") or 0))
|
||||
@@ -424,22 +459,37 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
|
||||
cputime_raw = float(top.get("CPUTimeRAW") or 0)
|
||||
totalcpu = slurm_duration_to_seconds(top.get("TotalCPU", ""))
|
||||
|
||||
# Memory requested in total by the user
|
||||
reqmem = top.get("ReqMem", "")
|
||||
reqmem_total = reqmem_total_bytes(reqmem, cpus, nodes)
|
||||
|
||||
# Memory allocated by the scheduler, recorded in AllocTRES string
|
||||
mem_alloc_tres = parse_tres_mem_bytes(top.get("AllocTRES") or "")
|
||||
mem_alloc_tres_gb = None
|
||||
if mem_alloc_tres is not None:
|
||||
mem_alloc_tres_gb = mem_alloc_tres / (1024**3)
|
||||
|
||||
reqmem_gb = None
|
||||
if reqmem_total is not None:
|
||||
reqmem_gb = reqmem_total / (1024**3)
|
||||
|
||||
reqwall_hours = timelimit_seconds / 3600.0 if timelimit_seconds > 0 else None
|
||||
mem_per_cpu_gb = reqmem_gb / cpus if reqmem_gb is not None and cpus > 0 else None
|
||||
mem_per_cpu_gb = mem_alloc_tres_gb / cpus if reqmem_gb is not None \
|
||||
and mem_alloc_tres_gb is not None and cpus > 0 else None
|
||||
|
||||
# EFFICIENCY CALCULATIONS
|
||||
#
|
||||
# TODO: find out why cpu_eff for some jobs is > 100%. Probably this is
|
||||
# related to 2 hyperthreads, and cputime_raw should have been doubled
|
||||
# for these cases
|
||||
cpu_eff = pct(totalcpu, cputime_raw)
|
||||
mem_eff = pct(maxrss, reqmem_total)
|
||||
time_eff = pct(elapsed, timelimit_seconds)
|
||||
# Old metric:
|
||||
# mem_eff = (Peak RSS of all job steps) / (user requested memory)
|
||||
# mem_eff = pct(maxrss, reqmem_total)
|
||||
# Better metric:
|
||||
# mem_eff = (largest per-step total memory usage, TRESUsageInTot mem) / (mem allocated by system)
|
||||
mem_eff = pct(mem_used_tres, mem_alloc_tres)
|
||||
|
||||
records.append(
|
||||
JobRecord(
|
||||
@@ -449,7 +499,9 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
|
||||
cpus=cpus,
|
||||
nodes=nodes,
|
||||
reqmem_gb=reqmem_gb,
|
||||
mem_used_tres=mem_used_tres_gb,
|
||||
mem_per_cpu_gb=mem_per_cpu_gb,
|
||||
mem_alloc_tres=mem_alloc_tres_gb,
|
||||
reqwall_hours=reqwall_hours,
|
||||
reqmem_bytes_total=reqmem_total,
|
||||
elapsed_sec=elapsed,
|
||||
@@ -516,6 +568,8 @@ def make_single_row(rec: JobRecord) -> OutputRow:
|
||||
CPUs=rec.cpus,
|
||||
Nodes=rec.nodes,
|
||||
ReqMem=rec.reqmem_gb,
|
||||
AllocMem=rec.mem_alloc_tres,
|
||||
UsedMem=rec.mem_used_tres,
|
||||
MemPerCPU=rec.mem_per_cpu_gb,
|
||||
ReqWalltime=rec.reqwall_hours,
|
||||
Count=1,
|
||||
@@ -545,6 +599,13 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
|
||||
if walltime is not None:
|
||||
walltime /= 3600
|
||||
|
||||
# We average over all jobs' allocated memory. Some jobs could have received
|
||||
# different allocations, even though all of them had the same user-requested
|
||||
# memory. Maybe we should generate a warning. Most of the time, what the user
|
||||
# requested should match what the scheduler gave
|
||||
alloc_mem = mean_or_none([r.mem_alloc_tres for r in records if r.mem_alloc_tres is not None])
|
||||
used_mem = mean_or_none([r.mem_used_tres for r in records if r.mem_used_tres is not None])
|
||||
|
||||
memory_efficiency=mean_or_none(mem_eff_vals)
|
||||
count=len(records)
|
||||
|
||||
@@ -565,6 +626,8 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
|
||||
CPUs=first.cpus,
|
||||
Nodes=first.nodes,
|
||||
ReqMem=first.reqmem_gb,
|
||||
AllocMem=alloc_mem,
|
||||
UsedMem=used_mem,
|
||||
MemPerCPU=first.mem_per_cpu_gb,
|
||||
ReqWalltime=first.reqwall_hours,
|
||||
Count=count,
|
||||
@@ -654,7 +717,7 @@ def format_secs_to_hours(value: float | None) -> str:
|
||||
|
||||
|
||||
def format_value(value: Any, column: str | None = None) -> str:
|
||||
if column in ["ReqMem", "MemPerCPU", "MaxRSS_max"]:
|
||||
if column in ["ReqMem", "UsedMem","MemPerCPU", "MaxRSS_max", "AllocMem"]:
|
||||
return format_gb_value(value)
|
||||
if column in ["ReqWalltime", "Walltime", "Walltime_max"]:
|
||||
return format_secs_to_hours(value)
|
||||
@@ -711,7 +774,23 @@ def print_custom_format(rows: list[OutputRow], fmt: str, sdev: bool) -> None:
|
||||
|
||||
def parse_args(argv: list[str]) -> argparse.Namespace:
|
||||
p = argparse.ArgumentParser(
|
||||
description="Display seff-style CPU, memory and walltime efficiency values from sacct data."
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description="Display seff-style CPU, memory and walltime efficiency values from sacct data.",
|
||||
epilog="""Examples:
|
||||
# first get an overview (-U/--aggr-user) and write a cachefile
|
||||
slurm-eff-tool -O sacct.cache -U
|
||||
# now you can read the cachefile for later runs and e.g. sort based on waste_Mem
|
||||
slurm-eff-tool -F sacct.cache -U -s=-Y
|
||||
|
||||
slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
|
||||
slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
|
||||
# supports multiple sort keys
|
||||
slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
|
||||
# cluster jobs by Regexps
|
||||
slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
|
||||
# supports slurm style formatting (will be further improved)
|
||||
slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
|
||||
"""
|
||||
)
|
||||
|
||||
p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S",
|
||||
|
||||
Reference in New Issue
Block a user