fixed/improved memory efficiency calculations

added parsing from TRES strings
This commit is contained in:
2026-06-03 17:16:39 +02:00
parent 70d3fe7a26
commit d544cde7d9
+105 -26
View File
@@ -2,31 +2,16 @@
"""
slurm-eff-tool.py - Slurm job efficiency reporting and investigation tool.
Examples:
# first get an overview (-U/--aggr-user) and write a cachefile
slurm-eff-tool -O sacct.cache -U
# now you can read the cachefile for later runs and e.g. sort based on waste_Mem
slurm-eff-tool -F sacct.cache -U -s=-Y
slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
# supports multiple sort keys
slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
# cluster jobs by Regexps
slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
# supports slurm style formatting (will be further improved)
slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
Efficiency definitions:
CPU_Efficiency = TotalCPU_seconds / CPUTimeRAW * 100
Memory_Efficiency = max(MaxRSS_bytes across non-extern job steps) / requested_memory_bytes * 100
Time_Efficiency = ElapsedRaw_seconds / (TimelimitRaw_minutes * 60) * 100
Memory efficiency intentionally uses the maximum MaxRSS seen across non-extern
Slurm job steps. This conceptually matches the practical "peak RSS" style of
seff, but it is not a perfect sum of all ranks.
Memory_Efficiency = (sum of peaks over all job steps) / (mem allocated by system) * 100
This script intentionally asks sacct for raw parsable fields and caches those rows.
TODO: allow caching in a faster, already-parsed binary format. Keep the sacct text
option for debugging.
"""
# initial version of script produced by vibe coding of an exact functionality
# description using chatgpt-5.5
@@ -64,6 +49,8 @@ SACCT_FIELDS = [
"CPUTimeRAW",
"TotalCPU",
"MaxRSS",
"AllocTRES",
"TRESUsageInTot",
]
# Default columns to print in output, and their order
@@ -76,6 +63,8 @@ DEFAULT_COLUMNS = [
"CPU_Efficiency",
"waste_CPU",
"ReqMem",
"UsedMem",
"AllocMem",
"MemPerCPU",
"MaxRSS_max",
"Memory_Efficiency",
@@ -94,6 +83,7 @@ ALIASES = {
"c": "CPUs",
"N": "Nodes",
"m": "ReqMem",
"n": "UsedMem",
"p": "MemPerCPU",
"l": "ReqWalltime",
"C": "Count",
@@ -109,6 +99,8 @@ NUMERIC_COLUMNS = {
"CPUs",
"Nodes",
"ReqMem",
"AllocMem",
"UsedMem",
"MemPerCPU",
"ReqWalltime",
"Count",
@@ -131,6 +123,8 @@ class JobRecord:
nodes: int
reqmem_gb: float | None
mem_per_cpu_gb: float | None
mem_alloc_tres: float | None
mem_used_tres: float | None
reqwall_hours: float | None
reqmem_bytes_total: float | None
elapsed_sec: int
@@ -149,6 +143,8 @@ class OutputRow:
CPUs: int
Nodes: int
ReqMem: float | None
AllocMem: float | None
UsedMem: float | None
MemPerCPU: float | None
ReqWalltime: float | None
Count: int
@@ -172,6 +168,8 @@ class OutputRow:
"CPUs": self.CPUs,
"Nodes": self.Nodes,
"ReqMem": self.ReqMem,
"AllocMem": self.AllocMem,
"UsedMem": self.UsedMem,
"MaxRSS_max": self.maxrss_max,
"MemPerCPU": self.MemPerCPU,
"ReqWalltime": self.ReqWalltime,
@@ -241,7 +239,7 @@ def format_seconds(seconds: int | float | None) -> str:
return f"{hours:02d}:{minutes:02d}:{sec:02d}"
def parse_size_to_bytes(value: str) -> float | None:
def parse_size_to_bytes(value: str) -> int | None:
"""Parse Slurm memory size fields such as 1024K, 2000M, 8Gn, 4Gc, 1.5T."""
s = (value or "").strip()
if not s or s in {"Unknown", "0", "0K", "0M", "0G", "0T"}:
@@ -255,7 +253,7 @@ def parse_size_to_bytes(value: str) -> float | None:
if not m:
return None
num = float(m.group(1))
num = int(m.group(1))
unit = m.group(2).upper() or "K" # Slurm memory fields are normally KiB when unitless.
mult = {
"K": 1024,
@@ -266,7 +264,30 @@ def parse_size_to_bytes(value: str) -> float | None:
}[unit]
return num * mult
def parse_tres_mem_bytes(alloc_tres: str) -> int | None:
    """
    Return the value of the ``mem=`` entry of a Slurm TRES string, in bytes.

    The string is a comma-separated list of ``key=value`` entries, as found
    in sacct's AllocTRES column (and other TRES-formatted columns such as
    TRESUsageInTot). The first entry whose key is ``mem`` is handed to
    ``parse_size_to_bytes`` for unit conversion; its result is returned
    unchanged.

    Returns None when the string is empty/None, when it has no ``mem=``
    entry, or when ``parse_size_to_bytes`` itself returns None.

    Examples (assuming parse_size_to_bytes uses 1024-based multipliers)
    --------
    cpu=16,mem=64G,node=1,billing=16
    -> 68719476736
    billing=64,cpu=64,mem=257698M,node=2
    -> 270215938048
    """
    # NOTE(review): a unit-less value is interpreted by parse_size_to_bytes,
    # which treats bare numbers as KiB -- confirm this matches what sacct
    # emits for the TRES columns being parsed.
    prefix = "mem="
    # Normalise a missing string to "" so the scan below simply finds nothing.
    entries = (part.strip() for part in (alloc_tres or "").split(","))
    mem_entry = next((entry for entry in entries if entry.startswith(prefix)), None)
    if mem_entry is None:
        return None
    return parse_size_to_bytes(mem_entry[len(prefix):])
def reqmem_total_bytes(reqmem: str, cpus: int, nodes: int) -> float | None:
"""Convert ReqMem into total requested bytes for the whole job allocation."""
raw = (reqmem or "").strip()
@@ -394,6 +415,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
grouped[base_job_id(row.get("JobIDRaw") or row.get("JobID", ""))].append(row)
records: list[JobRecord] = []
# Each group consists of sacct rows belonging to a job
for _, group in grouped.items():
top = next((r for r in group if is_top_level_row(r)), group[0])
if only_user and top.get("User") != only_user:
@@ -411,8 +433,21 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
if ".extern" not in (r.get("JobID") or r.get("JobIDRaw") or "")
]
maxrss_values = [parse_size_to_bytes(r.get("MaxRSS", "")) for r in rss_source_rows]
maxrss = max([x for x in maxrss_values if x is not None], default=None)
# stores the largest maxrss value over all job steps
maxrss_cleaned = [x for x in maxrss_values if x is not None]
maxrss = max(maxrss_cleaned, default=None)
# find mem_used_tres over reported job steps. It may be in the
# *.interactive or *.0 steps
mem_used_tres_values = [parse_tres_mem_bytes(r.get("TRESUsageInTot", "")) \
for r in rss_source_rows]
# mem_used_tres_cleaned = [x for x in mem_used_tres_values if x is not None]
mem_used_tres = max([x for x in mem_used_tres_values if x is not None],
default=None)
mem_used_tres_gb = None
if mem_used_tres is not None:
mem_used_tres_gb = mem_used_tres / (1024**3)
cpus = int(float(top.get("AllocCPUS") or 0))
nodes = int(float(top.get("NNodes") or 0))
elapsed = int(float(top.get("ElapsedRaw") or 0))
@@ -424,22 +459,37 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
cputime_raw = float(top.get("CPUTimeRAW") or 0)
totalcpu = slurm_duration_to_seconds(top.get("TotalCPU", ""))
# Memory requested in total by the user
reqmem = top.get("ReqMem", "")
reqmem_total = reqmem_total_bytes(reqmem, cpus, nodes)
# Memory allocated by the scheduler, recorded in AllocTRES string
mem_alloc_tres = parse_tres_mem_bytes(top.get("AllocTRES") or "")
mem_alloc_tres_gb = None
if mem_alloc_tres is not None:
mem_alloc_tres_gb = mem_alloc_tres / (1024**3)
reqmem_gb = None
if reqmem_total is not None:
reqmem_gb = reqmem_total / (1024**3)
reqwall_hours = timelimit_seconds / 3600.0 if timelimit_seconds > 0 else None
mem_per_cpu_gb = reqmem_gb / cpus if reqmem_gb is not None and cpus > 0 else None
mem_per_cpu_gb = mem_alloc_tres_gb / cpus if reqmem_gb is not None \
and mem_alloc_tres_gb is not None and cpus > 0 else None
# EFFICIENCY CALCULATIONS
#
# TODO: find out why cpu_eff for some jobs is > 100%. Probably this is
# related to 2 hyperthreads, and cputime_raw should have been doubled
# for these cases
cpu_eff = pct(totalcpu, cputime_raw)
mem_eff = pct(maxrss, reqmem_total)
time_eff = pct(elapsed, timelimit_seconds)
# Old metric:
# mem_eff = (Peak RSS of all job steps) / (user requested memory)
# mem_eff = pct(maxrss, reqmem_total)
# Better metric:
# mem_eff = (sum of peaks over all job steps) / (mem allocated by system)
mem_eff = pct(mem_used_tres, mem_alloc_tres)
records.append(
JobRecord(
@@ -449,7 +499,9 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
cpus=cpus,
nodes=nodes,
reqmem_gb=reqmem_gb,
mem_used_tres=mem_used_tres_gb,
mem_per_cpu_gb=mem_per_cpu_gb,
mem_alloc_tres=mem_alloc_tres_gb,
reqwall_hours=reqwall_hours,
reqmem_bytes_total=reqmem_total,
elapsed_sec=elapsed,
@@ -516,6 +568,8 @@ def make_single_row(rec: JobRecord) -> OutputRow:
CPUs=rec.cpus,
Nodes=rec.nodes,
ReqMem=rec.reqmem_gb,
AllocMem=rec.mem_alloc_tres,
UsedMem=rec.mem_used_tres,
MemPerCPU=rec.mem_per_cpu_gb,
ReqWalltime=rec.reqwall_hours,
Count=1,
@@ -545,6 +599,13 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
if walltime is not None:
walltime /= 3600
# We average over all jobs' allocated memory. Some jobs could have received
# different allocations even though all of them had the same user-requested
# memory. Maybe we should generate a warning. Most of the time, what the user
# requested should match what the scheduler gave.
alloc_mem = mean_or_none([r.mem_alloc_tres for r in records if r.mem_alloc_tres is not None])
used_mem = mean_or_none([r.mem_used_tres for r in records if r.mem_used_tres is not None])
memory_efficiency=mean_or_none(mem_eff_vals)
count=len(records)
@@ -565,6 +626,8 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
CPUs=first.cpus,
Nodes=first.nodes,
ReqMem=first.reqmem_gb,
AllocMem=alloc_mem,
UsedMem=used_mem,
MemPerCPU=first.mem_per_cpu_gb,
ReqWalltime=first.reqwall_hours,
Count=count,
@@ -654,7 +717,7 @@ def format_secs_to_hours(value: float | None) -> str:
def format_value(value: Any, column: str | None = None) -> str:
if column in ["ReqMem", "MemPerCPU", "MaxRSS_max"]:
if column in ["ReqMem", "UsedMem","MemPerCPU", "MaxRSS_max", "AllocMem"]:
return format_gb_value(value)
if column in ["ReqWalltime", "Walltime", "Walltime_max"]:
return format_secs_to_hours(value)
@@ -711,7 +774,23 @@ def print_custom_format(rows: list[OutputRow], fmt: str, sdev: bool) -> None:
def parse_args(argv: list[str]) -> argparse.Namespace:
p = argparse.ArgumentParser(
description="Display seff-style CPU, memory and walltime efficiency values from sacct data."
formatter_class=argparse.RawDescriptionHelpFormatter,
description="Display seff-style CPU, memory and walltime efficiency values from sacct data.",
epilog="""Examples:
# first get an overview (-U/--aggr-user) and write a cachefile
slurm-eff-tool -O sacct.cache -U
# now you can read the cachefile for later runs and e.g. sort based on waste_Mem
slurm-eff-tool -F sacct.cache -U -s=-Y
slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
# supports multiple sort keys
slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
# cluster jobs by Regexps
slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
# supports slurm style formatting (will be further improved)
slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
"""
)
p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S",