""" extract_competence_factor.py ---------------------------- Purpose: Extract and analyze reviewer competence factors from Excel sheets. Outputs: - competence_factor_audit.csv (per-review entries) - competence_rowavg_audit.csv (per-row averages) - competence_factor_histogram.png - competence_rowavg_histogram.png Behavior: - Reads all sheets from the specified Excel files. - Detects 'comp:' values in any column labeled 'Review'. - Computes both per-review statistics and per-row averages. - Produces CSVs and histograms without altering the input data. """ #!/usr/bin/env python3 __version__ = "1.0" __author__ = "Andreas Menzel" __last_modified__ = "2025-10-29" # ============================================================================= # IMPORTS # ============================================================================= import sys, re, math from pathlib import Path import pandas as pd import numpy as np import matplotlib.pyplot as plt import logging # suppressing warnings that I will not act upon anyways import warnings warnings.filterwarnings( "ignore", message="Workbook contains no default style", category=UserWarning, module="openpyxl", ) # ============================================================================= # FILE NAMES etc. # ============================================================================= AUDIT_ALL_FILE = "competence_factor_audit.csv" AUDIT_AVG_FILE = "competence_rowavg_audit.csv" HIST_ALL_FILE = "competence_factor_histogram.png" HIST_AVG_FILE = "competence_rowavg_histogram.png" # ============================================================================= # SCREEN OUTPUT # ============================================================================= logging.basicConfig(level=logging.INFO, format="%(message)s") log = logging.getLogger(__name__) def print_stats(title: str, data: pd.Series) -> None: """Prints mean, median, std, and range for a numeric Series.""" log.info(f"\n=== {title} ===") log.info(f"N = {len(data)}") log.info(f"Mean = {data.mean():.3f}") log.info(f"Median = {data.median():.3f}") log.info(f"Std = {data.std():.3f}") log.info(f"Min–Max = {data.min():.3f} – {data.max():.3f}") # ============================================================================= # HISTOGRAM OUTPUT # ============================================================================= def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None: """Create and save a histogram centered on 0–1 competence values.""" bin_width = 0.1 centers = np.arange(0, 1.0 + bin_width, bin_width) edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2)) edges = np.clip(edges, 0, 1.05) plt.figure(figsize=(7, 4)) plt.hist(data, bins=edges, edgecolor="black", align="mid") plt.xlim(-0.05, 1.05) plt.xticks(np.arange(0, 1.01, 0.1)) plt.xlabel(xlabel) plt.ylabel("Count") plt.title(title) plt.tight_layout() plt.savefig(filename, dpi=150) plt.close() log.info(f"Saved histogram → {filename}") # ============================================================================= # PARSING FUNCTION # ============================================================================= def parse_competence(text): comp_regex = re.compile( r"(?ix)" # case-insensitive, verbose r"\bcomp\s*[:=]\s*" # 'comp:' or 'comp=' r"(1(?:[.,]\d*)?|0*[.,]?\d+)" # numeric formats like 1, 1.0, 1., .5, 0.75 r"\b" ) if not isinstance(text, str): return None match_all = list(comp_regex.finditer(text)) if not match_all: return None match_last = match_all[-1] num_value = float(match_last.group(1).replace(",", ".")) return num_value, match_last.group(0) # ============================================================================= # MAIN ENTRY POINT # ============================================================================= def main(paths): recs = [] all_avg_rows = [] for p in paths: try: xl = pd.ExcelFile(p) except Exception as e: log.info(f"! open failed {p}: {e}") continue for sh in xl.sheet_names: try: df = xl.parse(sh, dtype=str) except Exception: continue # --- identify all "Review" columns once per sheet --- review_cols = [c for c in df.columns if "review" in str(c).lower()] # --- extract individual competence values --- for col in review_cols: for idx, raw in df[col].dropna().items(): r = parse_competence(raw) if not r: continue comp_value, token = r recs.append( { "file": Path(p).name, "sheet": sh, "column": str(col), "row_index": idx if isinstance(idx, int) else None, "raw_text": raw, "token": token, "comp": comp_value, } ) if review_cols: # drop completely empty rows across all Review columns df_reviews = df[review_cols].dropna(how="all") comp_per_row = [] for idx, row in df_reviews.iterrows(): vals = [] for cell in row.dropna().astype(str): r = parse_competence(cell) if r is None: continue # Handle both (op,val,token) tuples and plain numerics if isinstance(r, (tuple, list)): # get numeric element val = None for x in r: if isinstance(x, (int, float)): val = float(x) break if val is not None: vals.append(val) elif isinstance(r, (int, float)): vals.append(float(r)) if vals: # only if we actually found competence numbers comp_per_row.append( { "file": Path(p).name, "sheet": sh, "row_index": idx, "n_reviews": len(vals), "avg_competence": sum(vals) / len(vals), } ) # only append once per sheet if comp_per_row: all_avg_rows.append(pd.DataFrame(comp_per_row)) # Individual reviews if not recs: log.info("No competence factors found.") return out = pd.DataFrame(recs) out.to_csv(AUDIT_ALL_FILE, index=False) s_all = pd.Series(out["comp"], dtype=float) print_stats("Competence Factors (Individual Reviews)", s_all) save_histogram( s_all, HIST_ALL_FILE, xlabel="Competence factor (individual reviews)", title="Distribution of Competence Factors (Individual Reviews)", ) # Per-row averages if not all_avg_rows: log.info("No review averages created.") return avg_df = pd.concat(all_avg_rows, ignore_index=True) avg_df.to_csv(AUDIT_AVG_FILE, index=False) s_avg = avg_df["avg_competence"] print_stats("Competence Factors (Per-Row Averages)", s_avg) save_histogram( s_avg, HIST_AVG_FILE, xlabel="Average competence per proposal", title="Distribution of Average Competence (Per Proposal)", ) if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...") sys.exit(1) main(sys.argv[1:])