Initial public release of competence factor extractor

2025-10-29 11:22:58 +01:00
commit 7ff26cb00b
1 changed files with 232 additions and 0 deletions
@@ -0,0 +1,232 @@
+"""
+extract_competence_factor.py
+----------------------------
+
+Purpose:
+    Extract and analyze reviewer competence factors from Excel sheets.
+
+Outputs:
+    - competence_factor_audit.csv   (per-review entries)
+    - competence_rowavg_audit.csv  (per-row averages)
+    - competence_factor_histogram.png
+    - competence_rowavg_histogram.png
+
+Behavior:
+    - Reads all sheets from the specified Excel files.
+    - Detects 'comp:' values in any column labeled 'Review'.
+    - Computes both per-review statistics and per-row averages.
+    - Produces CSVs and histograms without altering the input data.
+"""
+
+#!/usr/bin/env python3
+__version__ = "1.0"
+__author__ = "Andreas Menzel"
+__last_modified__ = "2025-10-29"
+
+# =============================================================================
+# IMPORTS
+# =============================================================================
+import sys, re, math
+from pathlib import Path
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import logging
+
+# suppressing warnings that I will not act upon anyways
+import warnings
+
+warnings.filterwarnings(
+    "ignore",
+    message="Workbook contains no default style",
+    category=UserWarning,
+    module="openpyxl",
+)
+
+
+# =============================================================================
+# FILE NAMES etc.
+# =============================================================================
+AUDIT_ALL_FILE = "competence_factor_audit.csv"
+AUDIT_AVG_FILE = "competence_rowavg_audit.csv"
+HIST_ALL_FILE = "competence_factor_histogram.png"
+HIST_AVG_FILE = "competence_rowavg_histogram.png"
+
+
+# =============================================================================
+# SCREEN OUTPUT
+# =============================================================================
+logging.basicConfig(level=logging.INFO, format="%(message)s")
+log = logging.getLogger(__name__)
+
+
+def print_stats(title: str, data: pd.Series) -> None:
+    """Prints mean, median, std, and range for a numeric Series."""
+    log.info(f"\n=== {title} ===")
+    log.info(f"N       = {len(data)}")
+    log.info(f"Mean    = {data.mean():.3f}")
+    log.info(f"Median  = {data.median():.3f}")
+    log.info(f"Std     = {data.std():.3f}")
+    log.info(f"Min–Max = {data.min():.3f} – {data.max():.3f}")
+
+
+# =============================================================================
+# HISTOGRAM OUTPUT
+# =============================================================================
+def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
+    """Create and save a histogram centered on 0–1 competence values."""
+    bin_width = 0.1
+    centers = np.arange(0, 1.0 + bin_width, bin_width)
+    edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
+    edges = np.clip(edges, 0, 1.05)
+
+    plt.figure(figsize=(7, 4))
+    plt.hist(data, bins=edges, edgecolor="black", align="mid")
+    plt.xlim(-0.05, 1.05)
+    plt.xticks(np.arange(0, 1.01, 0.1))
+    plt.xlabel(xlabel)
+    plt.ylabel("Count")
+    plt.title(title)
+    plt.tight_layout()
+    plt.savefig(filename, dpi=150)
+    plt.close()
+    log.info(f"Saved histogram → {filename}")
+
+
+# =============================================================================
+# PARSING FUNCTION
+# =============================================================================
+def parse_competence(text):
+    comp_regex = re.compile(
+        r"(?ix)"  # case-insensitive, verbose
+        r"\bcomp\s*[:=]\s*"  # 'comp:' or 'comp='
+        r"(1(?:[.,]\d*)?|0*[.,]?\d+)"  # numeric formats like 1, 1.0, 1., .5, 0.75
+        r"\b"
+    )
+
+    if not isinstance(text, str):
+        return None
+    match_all = list(comp_regex.finditer(text))
+
+    if not match_all:
+        return None
+    match_last = match_all[-1]
+    num_value = float(match_last.group(1).replace(",", "."))
+    return num_value, match_last.group(0)
+
+
+# =============================================================================
+# MAIN ENTRY POINT
+# =============================================================================
+def main(paths):
+    recs = []
+    all_avg_rows = []
+    for p in paths:
+        try:
+            xl = pd.ExcelFile(p)
+        except Exception as e:
+            log.info(f"! open failed {p}: {e}")
+            continue
+        for sh in xl.sheet_names:
+            try:
+                df = xl.parse(sh, dtype=str)
+            except Exception:
+                continue
+
+            # --- identify all "Review" columns once per sheet ---
+            review_cols = [c for c in df.columns if "review" in str(c).lower()]
+
+            # --- extract individual competence values ---
+            for col in review_cols:
+                for idx, raw in df[col].dropna().items():
+                    r = parse_competence(raw)
+                    if not r:
+                        continue
+                    comp_value, token = r
+                    recs.append(
+                        {
+                            "file": Path(p).name,
+                            "sheet": sh,
+                            "column": str(col),
+                            "row_index": idx if isinstance(idx, int) else None,
+                            "raw_text": raw,
+                            "token": token,
+                            "comp": comp_value,
+                        }
+                    )
+            if review_cols:
+                # drop completely empty rows across all Review columns
+                df_reviews = df[review_cols].dropna(how="all")
+
+                comp_per_row = []
+                for idx, row in df_reviews.iterrows():
+                    vals = []
+                    for cell in row.dropna().astype(str):
+                        r = parse_competence(cell)
+                        if r is None:
+                            continue
+                        # Handle both (op,val,token) tuples and plain numerics
+                        if isinstance(r, (tuple, list)):
+                            # get numeric element
+                            val = None
+                            for x in r:
+                                if isinstance(x, (int, float)):
+                                    val = float(x)
+                                    break
+                            if val is not None:
+                                vals.append(val)
+                        elif isinstance(r, (int, float)):
+                            vals.append(float(r))
+                    if vals:  # only if we actually found competence numbers
+                        comp_per_row.append(
+                            {
+                                "file": Path(p).name,
+                                "sheet": sh,
+                                "row_index": idx,
+                                "n_reviews": len(vals),
+                                "avg_competence": sum(vals) / len(vals),
+                            }
+                        )
+
+                # only append once per sheet
+                if comp_per_row:
+                    all_avg_rows.append(pd.DataFrame(comp_per_row))
+
+    # Individual reviews
+    if not recs:
+        log.info("No competence factors found.")
+        return
+    out = pd.DataFrame(recs)
+    out.to_csv(AUDIT_ALL_FILE, index=False)
+    s_all = pd.Series(out["comp"], dtype=float)
+
+    print_stats("Competence Factors (Individual Reviews)", s_all)
+    save_histogram(
+        s_all,
+        HIST_ALL_FILE,
+        xlabel="Competence factor (individual reviews)",
+        title="Distribution of Competence Factors (Individual Reviews)",
+    )
+
+    # Per-row averages
+    if not all_avg_rows:
+        log.info("No review averages created.")
+        return
+    avg_df = pd.concat(all_avg_rows, ignore_index=True)
+    avg_df.to_csv(AUDIT_AVG_FILE, index=False)
+    s_avg = avg_df["avg_competence"]
+
+    print_stats("Competence Factors (Per-Row Averages)", s_avg)
+    save_histogram(
+        s_avg,
+        HIST_AVG_FILE,
+        xlabel="Average competence per proposal",
+        title="Distribution of Average Competence (Per Proposal)",
+    )
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
+        sys.exit(1)
+    main(sys.argv[1:])