diff --git a/slurm-eff-tool.py b/slurm-eff-tool.py index 7acaf70..729fd14 100755 --- a/slurm-eff-tool.py +++ b/slurm-eff-tool.py @@ -62,6 +62,7 @@ DEFAULT_COLUMNS = [ "CPUs", "Nodes", "ReqMem", + "MemPerCPU", "ReqWalltime", "Count", "CPU_Efficiency", @@ -76,6 +77,7 @@ ALIASES = { "c": "CPUs", "N": "Nodes", "m": "ReqMem", + "p": "MemPerCPU", "l": "ReqWalltime", "C": "Count", "e": "CPU_Efficiency", @@ -88,6 +90,7 @@ NUMERIC_COLUMNS = { "CPUs", "Nodes", "ReqMem", + "MemPerCPU", "ReqWalltime", "Count", "CPU_Efficiency", @@ -104,6 +107,7 @@ class JobRecord: cpus: int nodes: int reqmem_gb: float | None + mem_per_cpu_gb: float | None reqwall_hours: float | None reqmem_bytes_total: float | None elapsed_sec: int @@ -121,6 +125,7 @@ class OutputRow: CPUs: int Nodes: int ReqMem: float | None + MemPerCPU: float | None ReqWalltime: float | None Count: int CPU_Efficiency: float | None @@ -137,6 +142,7 @@ class OutputRow: "CPUs": self.CPUs, "Nodes": self.Nodes, "ReqMem": self.ReqMem, + "MemPerCPU": self.MemPerCPU, "ReqWalltime": self.ReqWalltime, "Count": self.Count, "CPU_Efficiency": self.CPU_Efficiency, @@ -385,6 +391,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) reqmem_gb = reqmem_total / (1024**3) reqwall_hours = timelimit_seconds / 3600.0 if timelimit_seconds > 0 else None + mem_per_cpu_gb = reqmem_gb / cpus if reqmem_gb is not None and cpus > 0 else None cpu_eff = pct(totalcpu, cputime_raw) mem_eff = pct(maxrss, reqmem_total) @@ -398,6 +405,7 @@ def build_job_records(rows: list[dict[str, str]], only_user: str | None = None) cpus=cpus, nodes=nodes, reqmem_gb=reqmem_gb, + mem_per_cpu_gb=mem_per_cpu_gb, reqwall_hours=reqwall_hours, reqmem_bytes_total=reqmem_total, elapsed_sec=elapsed, @@ -450,6 +458,7 @@ def make_single_row(rec: JobRecord) -> OutputRow: CPUs=rec.cpus, Nodes=rec.nodes, ReqMem=rec.reqmem_gb, + MemPerCPU=rec.mem_per_cpu_gb, ReqWalltime=rec.reqwall_hours, Count=1, CPU_Efficiency=rec.cpu_eff, @@ -472,6 +481,7 @@ def make_aggregate_row(records: list[JobRecord], username: str, jobname: str) -> CPUs=first.cpus, Nodes=first.nodes, ReqMem=first.reqmem_gb, + MemPerCPU=first.mem_per_cpu_gb, ReqWalltime=first.reqwall_hours, Count=len(records), CPU_Efficiency=mean_or_none(cpu_vals), @@ -510,18 +520,23 @@ def sort_rows(rows: list[OutputRow], spec: str | None) -> list[OutputRow]: asc = t.startswith("+") name = t[1:] if (desc or asc) else t col = resolve_column(name) - if col not in NUMERIC_COLUMNS: - die(f"--sort column must be numeric, got {col}") parsed.append((col, desc)) sorted_rows = rows # Stable-sort from least significant to most significant. for col, desc in reversed(parsed): - sorted_rows = sorted( - sorted_rows, - key=lambda r: float("-inf") if getattr(r, col) is None else getattr(r, col), - reverse=desc, - ) + if col in NUMERIC_COLUMNS: + sorted_rows = sorted( + sorted_rows, + key=lambda r: float("-inf") if getattr(r, col) is None else getattr(r, col), + reverse=desc, + ) + else: + sorted_rows = sorted( + sorted_rows, + key=lambda r: "" if getattr(r, col) is None else str(getattr(r, col)), + reverse=desc, + ) return sorted_rows @@ -551,6 +566,8 @@ def format_reqwall_hours(value: float | None) -> str: def format_value(value: Any, column: str | None = None) -> str: if column == "ReqMem": return format_reqmem_gb(value) + if column == "MemPerCPU": + return format_reqmem_gb(value) if column == "ReqWalltime": return format_reqwall_hours(value) @@ -609,10 +626,14 @@ def parse_args(argv: list[str]) -> argparse.Namespace: description="Display seff-style CPU, memory and walltime efficiency values from sacct data." ) - p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S") - p.add_argument("-E", "--end", help="sacct end time, passed to sacct -E") + p.add_argument("-S", "--start", help="sacct start time, passed to sacct -S", + default="now - 24 hours") + p.add_argument("-E", "--end", help="sacct end time, passed to sacct -E", + default="now") p.add_argument("-u", "--user", help="restrict to one user; passed as sacct -u unless reading from cache") - p.add_argument("--state", "--job-state", dest="state", help="sacct state filter, e.g. COMPLETED,FAILED,TIMEOUT") + p.add_argument("--state", "--job-state", dest="state", + default="COMPLETED", + help="sacct state filter, e.g. COMPLETED,FAILED,TIMEOUT") p.add_argument("-O", "--output-cache", help="write raw sacct output cache to this file") p.add_argument("-F", "--from-cache", help="read raw sacct output cache from this file instead of running sacct") @@ -633,7 +654,7 @@ def parse_args(argv: list[str]) -> argparse.Namespace: "--format", help=( "Slurm-like output format using aliases: " - "u=username,c=CPUs,N=Nodes,m=ReqMem,l=ReqWalltime,C=Count," + "u=username,c=CPUs,N=Nodes,m=ReqMem,p=MemPerCPU,l=ReqWalltime,C=Count," "e=CPU_Efficiency,M=Memory_Efficiency,t=Time_Efficiency,j=jobname" ), )