fixed/improved memory efficiency calculations

added parsing from TRES strings
2026-06-03 17:16:39 +02:00
parent 70d3fe7a26
commit d544cde7d9
1 changed files with 105 additions and 26 deletions
@@ -2,31 +2,16 @@
 """
 slurm-eff-tool.py - Slurm job efficiency reporting and investigation tool.

-Examples:
-  # first get an overview (-U/--aggr-user) and write a cachefile 
-  slurm-eff-tool -O sacct.cache -U
-  # now you can read the cachefile for later runs and e.g. sort based on waste_Mem
-  slurm-eff-tool -F sacct.cache -U -s=-Y
-
-  slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
-  slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
-  # supports multiple sort keys
-  slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
-  # cluster jobs by Regexps
-  slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
-  # supports slurm style formatting (will be further improved)
-  slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
-
 Efficiency definitions:
  CPU_Efficiency    = TotalCPU_seconds / CPUTimeRAW * 100
-  Memory_Efficiency = max(MaxRSS_bytes across non-extern job steps) / requested_memory_bytes * 100
  Time_Efficiency   = ElapsedRaw_seconds / (TimelimitRaw_minutes * 60) * 100

-Memory efficiency intentionally uses the maximum MaxRSS seen across non-extern
-Slurm job steps. This conceptually matches the practical "peak RSS" style of
-seff, but it is not a perfect sum of all ranks.
+  Memory_Efficiency = max(TRESUsageInTot mem across non-extern job steps) / AllocTRES mem * 100

 This script intentionally asks sacct for raw parsable fields and caches those rows.
+TODO: allow caching in a faster, already parsed binary format. Keep the sacct text option
+for debugging.
+
 """
 # initial version of script produced by vibe coding of an exact functionality
 # description using chatgpt-5.5
@@ -64,6 +49,8 @@ SACCT_FIELDS = [
    "CPUTimeRAW",
    "TotalCPU",
    "MaxRSS",
+    "AllocTRES",
+    "TRESUsageInTot",
 ]

 # Default columns to print in output, and their order
@@ -76,6 +63,8 @@ DEFAULT_COLUMNS = [
    "CPU_Efficiency",
    "waste_CPU",
    "ReqMem",
+    "UsedMem",
+    "AllocMem",
    "MemPerCPU",
    "MaxRSS_max",
    "Memory_Efficiency",
@@ -94,6 +83,7 @@ ALIASES = {
    "c": "CPUs",
    "N": "Nodes",
    "m": "ReqMem",
+    "n": "UsedMem",
    "p": "MemPerCPU",
    "l": "ReqWalltime",
    "C": "Count",
@@ -109,6 +99,8 @@ NUMERIC_COLUMNS = {
    "CPUs",
    "Nodes",
    "ReqMem",
+    "AllocMem",
+    "UsedMem",
    "MemPerCPU",
    "ReqWalltime",
    "Count",
@@ -131,6 +123,8 @@ class JobRecord:
    nodes: int
    reqmem_gb: float | None
    mem_per_cpu_gb: float | None
+    mem_alloc_tres: float | None
+    mem_used_tres: float | None
    reqwall_hours: float | None
    reqmem_bytes_total: float | None
    elapsed_sec: int
@@ -149,6 +143,8 @@ class OutputRow:
    CPUs: int
    Nodes: int
    ReqMem: float | None
+    AllocMem: float | None
+    UsedMem: float | None
    MemPerCPU: float | None
    ReqWalltime: float | None
    Count: int
@@ -172,6 +168,8 @@ class OutputRow:
            "CPUs": self.CPUs,
            "Nodes": self.Nodes,
            "ReqMem": self.ReqMem,
+            "AllocMem": self.AllocMem,
+            "UsedMem": self.UsedMem,
            "MaxRSS_max": self.maxrss_max,
            "MemPerCPU": self.MemPerCPU,
            "ReqWalltime": self.ReqWalltime,
@@ -241,7 +239,7 @@ def format_seconds(seconds: int | float | None) -> str:
    return f"{hours:02d}:{minutes:02d}:{sec:02d}"


-def parse_size_to_bytes(value: str) -> float | None:
+def parse_size_to_bytes(value: str) -> int | None:
    """Parse Slurm memory size fields such as 1024K, 2000M, 8Gn, 4Gc, 1.5T."""
    s = (value or "").strip()
    if not s or s in {"Unknown", "0", "0K", "0M", "0G", "0T"}:
@@ -255,7 +253,7 @@ def parse_size_to_bytes(value: str) -> float | None:
    if not m:
        return None

-    num = float(m.group(1))
+    num = int(m.group(1))
    unit = m.group(2).upper() or "K"  # Slurm memory fields are normally KiB when unitless.
    mult = {
        "K": 1024,
@@ -266,7 +264,30 @@ def parse_size_to_bytes(value: str) -> float | None:
    }[unit]
    return num * mult

+def parse_tres_mem_bytes(alloc_tres: str) -> int | None:
+    """
+    Extract the memory value (mem=...) in bytes from a Slurm TRES string such as AllocTRES.

+    Examples
+    --------
+
+    cpu=16,mem=64G,node=1,billing=16
+        -> 68719476736
+
+    billing=64,cpu=64,mem=257698M,node=2
+        -> 270215421952
+    """
+
+    if not alloc_tres:
+        return None
+
+    for field in alloc_tres.split(","):
+        field = field.strip()
+
+        if field.startswith("mem="):
+            return parse_size_to_bytes(field[4:])
+
+    return None
 def reqmem_total_bytes(reqmem: str, cpus: int, nodes: int) -> float | None:
    """Convert ReqMem into total requested bytes for the whole job allocation."""
    raw = (reqmem or "").strip()
@@ -394,6 +415,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
        grouped[base_job_id(row.get("JobIDRaw") or row.get("JobID", ""))].append(row)

    records: list[JobRecord] = []
+    # Each group consists of sacct rows belonging to a job
    for _, group in grouped.items():
        top = next((r for r in group if is_top_level_row(r)), group[0])
        if only_user and top.get("User") != only_user:
@@ -411,8 +433,21 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
            if ".extern" not in (r.get("JobID") or r.get("JobIDRaw") or "")
        ]
        maxrss_values = [parse_size_to_bytes(r.get("MaxRSS", "")) for r in rss_source_rows]
-        maxrss = max([x for x in maxrss_values if x is not None], default=None)
+        # stores the largest maxrss value over all job steps
+        maxrss_cleaned = [x for x in maxrss_values if x is not None]
+        maxrss = max(maxrss_cleaned, default=None)

+        # Take the largest TRESUsageInTot mem value over the non-extern job steps.
+        # It may be reported in the *.interactive or *.0 steps.
+        mem_used_tres_values = [parse_tres_mem_bytes(r.get("TRESUsageInTot", "")) \
+                                 for r in rss_source_rows]
+        # mem_used_tres_cleaned = [x for x in mem_used_tres_values if x is not None]
+        mem_used_tres = max([x for x in mem_used_tres_values if x is not None],
+                            default=None)
+        mem_used_tres_gb = None
+        if mem_used_tres is not None:
+            mem_used_tres_gb = mem_used_tres / (1024**3)
+ 
        cpus = int(float(top.get("AllocCPUS") or 0))
        nodes = int(float(top.get("NNodes") or 0))
        elapsed = int(float(top.get("ElapsedRaw") or 0))
@@ -424,22 +459,37 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
        cputime_raw = float(top.get("CPUTimeRAW") or 0)
        totalcpu = slurm_duration_to_seconds(top.get("TotalCPU", ""))

+        # Memory requested in total by the user
        reqmem = top.get("ReqMem", "")
        reqmem_total = reqmem_total_bytes(reqmem, cpus, nodes)

+        # Memory allocated by the scheduler, recorded in AllocTRES string
+        mem_alloc_tres = parse_tres_mem_bytes(top.get("AllocTRES") or "")
+        mem_alloc_tres_gb = None
+        if mem_alloc_tres is not None:
+            mem_alloc_tres_gb = mem_alloc_tres / (1024**3)
+
        reqmem_gb = None
        if reqmem_total is not None:
            reqmem_gb = reqmem_total / (1024**3)

        reqwall_hours = timelimit_seconds / 3600.0 if timelimit_seconds > 0 else None
-        mem_per_cpu_gb = reqmem_gb / cpus if reqmem_gb is not None and cpus > 0 else None
+        mem_per_cpu_gb = mem_alloc_tres_gb / cpus if reqmem_gb is not None \
+            and mem_alloc_tres_gb is not None and cpus > 0 else None

+        # EFFICIENCY CALCULATIONS
+        #
        # TODO: find out why cpu_eff for some jobs is > 100%. Probably this is
        #       related to 2 hyperthreads, and cputime_raw should have been doubled
        #       for these cases
        cpu_eff = pct(totalcpu, cputime_raw)
-        mem_eff = pct(maxrss, reqmem_total)
        time_eff = pct(elapsed, timelimit_seconds)
+        # Old metric:
+        #     mem_eff = (Peak RSS of all job steps) / (user requested memory)
+        # mem_eff = pct(maxrss, reqmem_total)
+        # Better metric:
+        #     mem_eff = (largest TRESUsageInTot mem of any non-extern step) / (AllocTRES mem)
+        mem_eff = pct(mem_used_tres, mem_alloc_tres)

        records.append(
            JobRecord(
@@ -449,7 +499,9 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None)
                cpus=cpus,
                nodes=nodes,
                reqmem_gb=reqmem_gb,
+                mem_used_tres=mem_used_tres_gb,
                mem_per_cpu_gb=mem_per_cpu_gb,
+                mem_alloc_tres=mem_alloc_tres_gb,
                reqwall_hours=reqwall_hours,
                reqmem_bytes_total=reqmem_total,
                elapsed_sec=elapsed,
@@ -516,6 +568,8 @@ def make_single_row(rec: JobRecord) -> OutputRow:
        CPUs=rec.cpus,
        Nodes=rec.nodes,
        ReqMem=rec.reqmem_gb,
+        AllocMem=rec.mem_alloc_tres,
+        UsedMem=rec.mem_used_tres,
        MemPerCPU=rec.mem_per_cpu_gb,
        ReqWalltime=rec.reqwall_hours,
        Count=1,
@@ -545,6 +599,13 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
    if walltime is not None:
        walltime /= 3600

+    # We average the allocated memory over all jobs. Some jobs could have received
+    # different allocations even though all of them had the same user-requested
+    # memory. Maybe this should generate a warning. Most of the time, what the user
+    # requested should match what the scheduler gave.
+    alloc_mem = mean_or_none([r.mem_alloc_tres for r in records if r.mem_alloc_tres is not None])
+    used_mem = mean_or_none([r.mem_used_tres for r in records if r.mem_used_tres is not None])
+
    memory_efficiency=mean_or_none(mem_eff_vals)
    count=len(records)

@@ -565,6 +626,8 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) ->
        CPUs=first.cpus,
        Nodes=first.nodes,
        ReqMem=first.reqmem_gb,
+        AllocMem=alloc_mem,
+        UsedMem=used_mem,
        MemPerCPU=first.mem_per_cpu_gb,
        ReqWalltime=first.reqwall_hours,
        Count=count,
@@ -654,7 +717,7 @@ def format_secs_to_hours(value: float | None) -> str:


 def format_value(value: Any, column: str | None = None) -> str:
-    if column in ["ReqMem", "MemPerCPU", "MaxRSS_max"]:
+    if column in ["ReqMem", "UsedMem","MemPerCPU", "MaxRSS_max", "AllocMem"]:
        return format_gb_value(value)
    if column in ["ReqWalltime", "Walltime", "Walltime_max"]:
        return format_secs_to_hours(value)
@@ -711,7 +774,23 @@ def print_custom_format(rows: list[OutputRow], fmt: str, sdev: bool) -> None:

 def parse_args(argv: list[str]) -> argparse.Namespace:
    p = argparse.ArgumentParser(
-        description="Display seff-style CPU, memory and walltime efficiency values from sacct data."
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description="Display seff-style CPU, memory and walltime efficiency values from sacct data.",
+        epilog="""Examples:
+  # first get an overview (-U/--aggr-user) and write a cachefile 
+  slurm-eff-tool -O sacct.cache -U
+  # now you can read the cachefile for later runs and e.g. sort based on waste_Mem
+  slurm-eff-tool -F sacct.cache -U -s=-Y
+
+  slurm-eff-tool.py -F sacct.cache --start 2026-05-01 --end 2026-05-22 -u dfeich
+  slurm-eff-tool.py -F sacct.cache -S 2026-05-01 -E now -u dfeich
+  # supports multiple sort keys
+  slurm-eff-tool.py -F sacct.cache --aggr-user --sdev -s cpu,-mem,time
+  # cluster jobs by Regexps
+  slurm-eff-tool.py -F sacct.cache -u dfeich -R '^vasp','^gromacs' --json
+  # supports slurm style formatting (will be further improved)
+  slurm-eff-tool.py -F sacct.cache -o "%.12u %c %N %m %.12l %C %.8e %.8M %.8t %.30j"
+        """
    )

    p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S",