From 7ff26cb00b2e6a3021fc1015dc816a695371e7d2 Mon Sep 17 00:00:00 2001 From: menzel Date: Wed, 29 Oct 2025 11:22:58 +0100 Subject: [PATCH] Initial public release of competence factor extractor --- extract_competence_factor.py | 232 +++++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 extract_competence_factor.py diff --git a/extract_competence_factor.py b/extract_competence_factor.py new file mode 100644 index 0000000..665732c --- /dev/null +++ b/extract_competence_factor.py @@ -0,0 +1,232 @@ +""" +extract_competence_factor.py +---------------------------- + +Purpose: + Extract and analyze reviewer competence factors from Excel sheets. + +Outputs: + - competence_factor_audit.csv (per-review entries) + - competence_rowavg_audit.csv (per-row averages) + - competence_factor_histogram.png + - competence_rowavg_histogram.png + +Behavior: + - Reads all sheets from the specified Excel files. + - Detects 'comp:' values in any column labeled 'Review'. + - Computes both per-review statistics and per-row averages. + - Produces CSVs and histograms without altering the input data. +""" + +#!/usr/bin/env python3 +__version__ = "1.0" +__author__ = "Andreas Menzel" +__last_modified__ = "2025-10-29" + +# ============================================================================= +# IMPORTS +# ============================================================================= +import sys, re, math +from pathlib import Path +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import logging + +# suppressing warnings that I will not act upon anyways +import warnings + +warnings.filterwarnings( + "ignore", + message="Workbook contains no default style", + category=UserWarning, + module="openpyxl", +) + + +# ============================================================================= +# FILE NAMES etc. 
# =============================================================================
AUDIT_ALL_FILE = "competence_factor_audit.csv"
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"
HIST_ALL_FILE = "competence_factor_histogram.png"
HIST_AVG_FILE = "competence_rowavg_histogram.png"


# =============================================================================
# SCREEN OUTPUT
# =============================================================================
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)


def print_stats(title: str, data: pd.Series) -> None:
    """Log count, mean, median, standard deviation, and range of *data*.

    Args:
        title: Heading shown above the statistics.
        data: Numeric Series to summarize.
    """
    log.info(f"\n=== {title} ===")
    log.info(f"N = {len(data)}")
    log.info(f"Mean = {data.mean():.3f}")
    log.info(f"Median = {data.median():.3f}")
    log.info(f"Std = {data.std():.3f}")
    log.info(f"Min–Max = {data.min():.3f} – {data.max():.3f}")


# =============================================================================
# HISTOGRAM OUTPUT
# =============================================================================
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
    """Plot *data* as a histogram and save it to *filename*.

    Bins are 0.1 wide and centered on 0.0, 0.1, ..., 1.0. The lowest edge
    (-0.05) is clipped to 0, so the first bin only spans 0 to 0.05.

    Args:
        data: Values to plot.
        filename: Path of the image file to write.
        xlabel: Label for the x axis.
        title: Plot title.
    """
    bin_width = 0.1
    centers = np.arange(0, 1.0 + bin_width, bin_width)
    # One edge half a bin below the first center, then one half a bin above
    # every center.
    edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
    edges = np.clip(edges, 0, 1.05)

    plt.figure(figsize=(7, 4))
    plt.hist(data, bins=edges, edgecolor="black", align="mid")
    plt.xlim(-0.05, 1.05)
    plt.xticks(np.arange(0, 1.01, 0.1))
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    plt.close()
    log.info(f"Saved histogram → {filename}")


# =============================================================================
# PARSING FUNCTION
# =============================================================================
# Compiled once at import time; parse_competence() is called for every cell.
_COMP_REGEX = re.compile(
    r"(?ix)"  # case-insensitive, verbose
    r"\bcomp\s*[:=]\s*"  # 'comp:' or 'comp='
    r"(1(?:[.,]\d*)?|0*[.,]?\d+)"  # numeric formats like 1, 1.0, .5, 0.75
    r"\b"  # the number must end at a word boundary
)


def parse_competence(text: object) -> "tuple[float, str] | None":
    """Extract the last ``comp:`` / ``comp=`` value from a cell.

    A decimal comma is accepted and converted to a decimal point.

    Args:
        text: Cell content. Anything that is not a ``str`` yields ``None``.

    Returns:
        ``(value, token)`` where ``value`` is the parsed float and ``token``
        is the full matched text (e.g. ``"comp: 0.75"``), or ``None`` when
        the text contains no competence token. If several tokens are
        present, the last one is used.
    """
    if not isinstance(text, str):
        return None

    last_match = None
    for last_match in _COMP_REGEX.finditer(text):
        pass  # keep only the final match
    if last_match is None:
        return None

    value = float(last_match.group(1).replace(",", "."))
    return value, last_match.group(0)


# =============================================================================
# PER-SHEET HELPERS
# =============================================================================
def _review_columns(df: pd.DataFrame) -> list:
    """Return the columns whose label contains 'review' (case-insensitive)."""
    return [c for c in df.columns if "review" in str(c).lower()]


def _review_records(df: pd.DataFrame, review_cols: list, file_name: str, sheet) -> list:
    """Build one audit record per review cell that holds a competence value."""
    records = []
    for col in review_cols:
        for idx, raw in df[col].dropna().items():
            parsed = parse_competence(raw)
            if parsed is None:
                continue
            comp_value, token = parsed
            records.append(
                {
                    "file": file_name,
                    "sheet": sheet,
                    "column": str(col),
                    "row_index": idx if isinstance(idx, int) else None,
                    "raw_text": raw,
                    "token": token,
                    "comp": comp_value,
                }
            )
    return records


def _row_averages(df: pd.DataFrame, review_cols: list, file_name: str, sheet) -> list:
    """Build one record per row averaging the competence values found in it."""
    rows = []
    # Rows that are empty across all review columns cannot contribute.
    for idx, row in df[review_cols].dropna(how="all").iterrows():
        vals = [
            parsed[0]
            for parsed in map(parse_competence, row.dropna().astype(str))
            if parsed is not None
        ]
        if vals:  # only rows where at least one competence value was found
            rows.append(
                {
                    "file": file_name,
                    "sheet": sheet,
                    "row_index": idx,
                    "n_reviews": len(vals),
                    "avg_competence": sum(vals) / len(vals),
                }
            )
    return rows


# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main(paths):
    """Extract competence factors from Excel files and write audits and plots.

    Every sheet of every file in *paths* is read as text. Competence values
    are collected from all columns whose label contains 'review'. Files that
    cannot be opened and sheets that cannot be parsed are logged and skipped.

    Outputs (written to the current working directory):
        - ``AUDIT_ALL_FILE`` / ``HIST_ALL_FILE``: one entry per review cell.
        - ``AUDIT_AVG_FILE`` / ``HIST_AVG_FILE``: one entry per row, averaged
          over the review cells of that row.

    Nothing is written when no competence value is found at all.

    Args:
        paths: Iterable of Excel file paths.
    """
    recs = []
    all_avg_rows = []

    for p in paths:
        try:
            xl = pd.ExcelFile(p)
        except Exception as e:  # best effort: skip unreadable files
            log.warning(f"! open failed {p}: {e}")
            continue

        file_name = Path(p).name
        # The context manager closes the workbook handle once all sheets
        # have been processed.
        with xl:
            for sh in xl.sheet_names:
                try:
                    df = xl.parse(sh, dtype=str)
                except Exception as e:  # best effort: skip unreadable sheets
                    log.warning(f"! parse failed {p} [{sh}]: {e}")
                    continue

                review_cols = _review_columns(df)
                if not review_cols:
                    continue
                recs.extend(_review_records(df, review_cols, file_name, sh))
                all_avg_rows.extend(_row_averages(df, review_cols, file_name, sh))

    # Individual reviews
    if not recs:
        log.info("No competence factors found.")
        return
    out = pd.DataFrame(recs)
    out.to_csv(AUDIT_ALL_FILE, index=False)
    s_all = pd.Series(out["comp"], dtype=float)

    print_stats("Competence Factors (Individual Reviews)", s_all)
    save_histogram(
        s_all,
        HIST_ALL_FILE,
        xlabel="Competence factor (individual reviews)",
        title="Distribution of Competence Factors (Individual Reviews)",
    )

    # Per-row averages
    if not all_avg_rows:
        log.info("No review averages created.")
        return
    avg_df = pd.DataFrame(all_avg_rows)
    avg_df.to_csv(AUDIT_AVG_FILE, index=False)
    s_avg = avg_df["avg_competence"]

    print_stats("Competence Factors (Per-Row Averages)", s_avg)
    save_histogram(
        s_avg,
        HIST_AVG_FILE,
        xlabel="Average competence per proposal",
        title="Distribution of Average Competence (Per Proposal)",
    )


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
        sys.exit(1)
    main(sys.argv[1:])