diff --git a/slurm-eff-tool.py b/slurm-eff-tool.py index 657b28d..00fd5ac 100755 --- a/slurm-eff-tool.py +++ b/slurm-eff-tool.py @@ -2,31 +2,16 @@ """ slurm-eff-tool.py - Slurm job efficiency reporting and investigation tool. -Examples: - # first get an overview (-U/--aggr-user) and write a cachefile - slurm-eff-tool -O sacct.cache -U - # now you can read the cachefile for later runs and e.g. sort based on waste_Mem - slurm-eff-tool -F sacct.cache -U -s=-Y - - slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich - slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich - # supports multiple sort keys - slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time - # cluster jobs by Regexps - slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json - # supports slurm style formatting (will be further improved) - slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j" - Efficiency definitions: CPU_Efficiency = TotalCPU_seconds / CPUTimeRAW * 100 - Memory_Efficiency = max(MaxRSS_bytes across non-extern job steps) / requested_memory_bytes * 100 Time_Efficiency = ElapsedRaw_seconds / (TimelimitRaw_minutes * 60) * 100 -Memory efficiency intentionally uses the maximum MaxRSS seen across non-extern -Slurm job steps. This conceptually matches the practical "peak RSS" style of -seff, but it is not a perfect sum of all ranks. + Memory_Efficiency = max(TRESUsageInTot mem across non-extern job steps) / AllocTRES mem * 100 This script intentionally asks sacct for raw parsable fields and caches those rows. +TODO: allow to cache a faster already parsed and binary format. 
Keep sacct text option for +debug option + """ # initial version of script produced by vibe coding of an exact functionality # description using chatgpt-5.5 @@ -64,6 +49,8 @@ SACCT_FIELDS = [ "CPUTimeRAW", "TotalCPU", "MaxRSS", + "AllocTRES", + "TRESUsageInTot", ] # Default columns to print in output, and their order @@ -76,6 +63,8 @@ DEFAULT_COLUMNS = [ "CPU_Efficiency", "waste_CPU", "ReqMem", + "UsedMem", + "AllocMem", "MemPerCPU", "MaxRSS_max", "Memory_Efficiency", @@ -94,6 +83,7 @@ ALIASES = { "c": "CPUs", "N": "Nodes", "m": "ReqMem", + "n": "UsedMem", "p": "MemPerCPU", "l": "ReqWalltime", "C": "Count", @@ -109,6 +99,8 @@ NUMERIC_COLUMNS = { "CPUs", "Nodes", "ReqMem", + "AllocMem", + "UsedMem", "MemPerCPU", "ReqWalltime", "Count", @@ -131,6 +123,8 @@ class JobRecord: nodes: int reqmem_gb: float | None mem_per_cpu_gb: float | None + mem_alloc_tres: float | None + mem_used_tres: float | None reqwall_hours: float | None reqmem_bytes_total: float | None elapsed_sec: int @@ -149,6 +143,8 @@ class OutputRow: CPUs: int Nodes: int ReqMem: float | None + AllocMem: float | None + UsedMem: float | None MemPerCPU: float | None ReqWalltime: float | None Count: int @@ -172,6 +168,8 @@ class OutputRow: "CPUs": self.CPUs, "Nodes": self.Nodes, "ReqMem": self.ReqMem, + "AllocMem": self.AllocMem, + "UsedMem": self.UsedMem, "MaxRSS_max": self.maxrss_max, "MemPerCPU": self.MemPerCPU, "ReqWalltime": self.ReqWalltime, @@ -241,7 +239,7 @@ def format_seconds(seconds: int | float | None) -> str: return f"{hours:02d}:{minutes:02d}:{sec:02d}" -def parse_size_to_bytes(value: str) -> float | None: +def parse_size_to_bytes(value: str) -> float | None: """Parse Slurm memory size fields such as 1024K, 2000M, 8Gn, 4Gc, 1.5T.""" s = (value or "").strip() if not s or s in {"Unknown", "0", "0K", "0M", "0G", "0T"}: @@ -255,7 +253,7 @@ def parse_size_to_bytes(value: str) -> float | None: if not m: return None - num = float(m.group(1)) + num = float(m.group(1)) unit = m.group(2).upper() or "K" # Slurm 
memory fields are normally KiB when unitless. mult = { "K": 1024, @@ -266,7 +264,30 @@ def parse_size_to_bytes(value: str) -> float | None: }[unit] return num * mult +def parse_tres_mem_bytes(alloc_tres: str) -> int | None: + """ + Extract the mem= entry, converted to bytes, from a Slurm TRES string such as AllocTRES or TRESUsageInTot. + Examples + -------- + + cpu=16,mem=64G,node=1,billing=16 + -> 68719476736 + + billing=64,cpu=64,mem=257698M,node=2 + -> 270215938048 + """ + + if not alloc_tres: + return None + + for field in alloc_tres.split(","): + field = field.strip() + + if field.startswith("mem="): + return parse_size_to_bytes(field[4:]) + + return None def reqmem_total_bytes(reqmem: str, cpus: int, nodes: int) -> float | None: """Convert ReqMem into total requested bytes for the whole job allocation.""" raw = (reqmem or "").strip() @@ -394,6 +415,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) grouped[base_job_id(row.get("JobIDRaw") or row.get("JobID", ""))].append(row) records: list[JobRecord] = [] + # Each group consists of sacct rows belonging to a job for _, group in grouped.items(): top = next((r for r in group if is_top_level_row(r)), group[0]) if only_user and top.get("User") != only_user: @@ -411,8 +433,21 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) if ".extern" not in (r.get("JobID") or r.get("JobIDRaw") or "") ] maxrss_values = [parse_size_to_bytes(r.get("MaxRSS", "")) for r in rss_source_rows] - maxrss = max([x for x in maxrss_values if x is not None], default=None) + # stores the largest maxrss value over all job steps + maxrss_cleaned = [x for x in maxrss_values if x is not None] + maxrss = max(maxrss_cleaned, default=None) + # find mem_used_tres over reported job steps. 
It may be in the + # *.interactive or *.0 steps + mem_used_tres_values = [parse_tres_mem_bytes(r.get("TRESUsageInTot", "")) \ + for r in rss_source_rows] + # mem_used_tres_cleaned = [x for x in mem_used_tres_values if x is not None] + mem_used_tres = max([x for x in mem_used_tres_values if x is not None], + default=None) + mem_used_tres_gb = None + if mem_used_tres is not None: + mem_used_tres_gb = mem_used_tres / (1024**3) + cpus = int(float(top.get("AllocCPUS") or 0)) nodes = int(float(top.get("NNodes") or 0)) elapsed = int(float(top.get("ElapsedRaw") or 0)) @@ -424,22 +459,37 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) cputime_raw = float(top.get("CPUTimeRAW") or 0) totalcpu = slurm_duration_to_seconds(top.get("TotalCPU", "")) + # Memory requested in total by the user reqmem = top.get("ReqMem", "") reqmem_total = reqmem_total_bytes(reqmem, cpus, nodes) + # Memory allocated by the scheduler, recorded in AllocTRES string + mem_alloc_tres = parse_tres_mem_bytes(top.get("AllocTRES") or "") + mem_alloc_tres_gb = None + if mem_alloc_tres is not None: + mem_alloc_tres_gb = mem_alloc_tres / (1024**3) + reqmem_gb = None if reqmem_total is not None: reqmem_gb = reqmem_total / (1024**3) reqwall_hours = timelimit_seconds / 3600.0 if timelimit_seconds > 0 else None - mem_per_cpu_gb = reqmem_gb / cpus if reqmem_gb is not None and cpus > 0 else None + mem_per_cpu_gb = mem_alloc_tres_gb / cpus if reqmem_gb is not None \ + and mem_alloc_tres_gb is not None and cpus > 0 else None + # EFFICIENCY CALCULATIONS + # # TODO: find out why cpu_eff for some jobs is > 100%. 
Probably this is # related to 2 hyperthreads, and cputime_raw should have been doubled # for these cases cpu_eff = pct(totalcpu, cputime_raw) - mem_eff = pct(maxrss, reqmem_total) time_eff = pct(elapsed, timelimit_seconds) + # Old metric: + # mem_eff = (Peak RSS of all job steps) / (user requested memory) + # mem_eff = pct(maxrss, reqmem_total) + # Better metric: + # mem_eff = (largest TRESUsageInTot mem over non-extern job steps) / (mem allocated by system) + mem_eff = pct(mem_used_tres, mem_alloc_tres) records.append( JobRecord( @@ -449,7 +499,9 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) cpus=cpus, nodes=nodes, reqmem_gb=reqmem_gb, + mem_used_tres=mem_used_tres_gb, mem_per_cpu_gb=mem_per_cpu_gb, + mem_alloc_tres=mem_alloc_tres_gb, reqwall_hours=reqwall_hours, reqmem_bytes_total=reqmem_total, elapsed_sec=elapsed, @@ -516,6 +568,8 @@ def make_single_row(rec: JobRecord) -> OutputRow: CPUs=rec.cpus, Nodes=rec.nodes, ReqMem=rec.reqmem_gb, + AllocMem=rec.mem_alloc_tres, + UsedMem=rec.mem_used_tres, MemPerCPU=rec.mem_per_cpu_gb, ReqWalltime=rec.reqwall_hours, Count=1, @@ -545,6 +599,13 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) -> if walltime is not None: walltime /= 3600 + # We average over all jobs' allocated memory. Some jobs could have received + # different allocations, even though all of them had the same user-requested + # memory. Maybe we should generate a warning. 
Most of the time, what the user + # requested should match what the scheduler gave + alloc_mem = mean_or_none([r.mem_alloc_tres for r in records if r.mem_alloc_tres is not None]) + used_mem = mean_or_none([r.mem_used_tres for r in records if r.mem_used_tres is not None]) + memory_efficiency=mean_or_none(mem_eff_vals) count=len(records) @@ -565,6 +626,8 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) -> CPUs=first.cpus, Nodes=first.nodes, ReqMem=first.reqmem_gb, + AllocMem=alloc_mem, + UsedMem=used_mem, MemPerCPU=first.mem_per_cpu_gb, ReqWalltime=first.reqwall_hours, Count=count, @@ -654,7 +717,7 @@ def format_secs_to_hours(value: float | None) -> str: def format_value(value: Any, column: str | None = None) -> str: - if column in ["ReqMem", "MemPerCPU", "MaxRSS_max"]: + if column in ["ReqMem", "UsedMem","MemPerCPU", "MaxRSS_max", "AllocMem"]: return format_gb_value(value) if column in ["ReqWalltime", "Walltime", "Walltime_max"]: return format_secs_to_hours(value) @@ -711,7 +774,23 @@ def print_custom_format(rows: list[OutputRow], fmt: str, sdev: bool) -> None: def parse_args(argv: list[str]) -> argparse.Namespace: p = argparse.ArgumentParser( - description="Display seff-style CPU, memory and walltime efficiency values from sacct data." + formatter_class=argparse.RawDescriptionHelpFormatter, + description="Display seff-style CPU, memory and walltime efficiency values from sacct data.", + epilog="""Examples: + # first get an overview (-U/--aggr-user) and write a cachefile + slurm-eff-tool -O sacct.cache -U + # now you can read the cachefile for later runs and e.g. 
sort based on waste_Mem + slurm-eff-tool -F sacct.cache -U -s=-Y + + slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich + slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich + # supports multiple sort keys + slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time + # cluster jobs by Regexps + slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json + # supports slurm style formatting (will be further improved) + slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j" + """ ) p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S",