#!/usr/bin/env python3
"""
|
||
extract_competence_factor.py
|
||
----------------------------
|
||
|
||
Purpose:
|
||
Extract and analyze reviewer competence factors from Excel sheets.
|
||
|
||
Outputs:
|
||
- competence_factor_audit.csv (per-review entries)
|
||
- competence_rowavg_audit.csv (per-row averages)
|
||
- competence_factor_histogram.png
|
||
- competence_rowavg_histogram.png
|
||
|
||
Behavior:
|
||
- Reads all sheets from the specified Excel files.
|
||
- Detects 'comp:' values in any column labeled 'Review'.
|
||
- Computes both per-review statistics and per-row averages.
|
||
- Produces CSVs and histograms without altering the input data.
|
||
"""
|
||
|
||
#!/usr/bin/env python3
|
||
__version__ = "1.0"
__author__ = "Andreas Menzel"
__last_modified__ = "2025-10-29"

# =============================================================================
# IMPORTS
# =============================================================================
import logging
import math
import re
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Suppress a harmless openpyxl warning that we will not act upon anyway.
warnings.filterwarnings(
    "ignore",
    message="Workbook contains no default style",
    category=UserWarning,
    module="openpyxl",
)


# =============================================================================
# FILE NAMES etc.
# =============================================================================
AUDIT_ALL_FILE = "competence_factor_audit.csv"    # per-review entries
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"    # per-row averages
HIST_ALL_FILE = "competence_factor_histogram.png"
HIST_AVG_FILE = "competence_rowavg_histogram.png"


# =============================================================================
# SCREEN OUTPUT
# =============================================================================
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)
def print_stats(title: str, data: pd.Series) -> None:
    """Log count, mean, median, std, and min–max range for a numeric Series."""
    summary = (
        f"\n=== {title} ===",
        f"N = {len(data)}",
        f"Mean = {data.mean():.3f}",
        f"Median = {data.median():.3f}",
        f"Std = {data.std():.3f}",
        f"Min–Max = {data.min():.3f} – {data.max():.3f}",
    )
    for line in summary:
        log.info(line)
# =============================================================================
# HISTOGRAM OUTPUT
# =============================================================================
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
    """Create and save a histogram centered on 0–1 competence values.

    Bins are 0.1 wide and centered on 0.0, 0.1, ..., 1.0 so that common
    round values (e.g. 0.5) fall in the middle of a bar, not on an edge.

    Parameters
    ----------
    data : pd.Series
        Numeric values, expected to lie in [0, 1].
    filename : str
        Output path of the PNG written to disk (dpi=150).
    xlabel, title : str
        Axis label and figure title.
    """
    bin_width = 0.1
    centers = np.arange(0, 1.0 + bin_width, bin_width)
    # Edges straddle each center; clip so edges stay within the plotted range.
    edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
    edges = np.clip(edges, 0, 1.05)

    plt.figure(figsize=(7, 4))
    plt.hist(data, bins=edges, edgecolor="black", align="mid")
    plt.xlim(-0.05, 1.05)
    plt.xticks(np.arange(0, 1.01, 0.1))
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    plt.close()
    # BUG FIX: the message previously omitted the output path entirely
    # (the f-string had no placeholder); report where the file went.
    log.info(f"Saved histogram → {filename}")
# =============================================================================
# PARSING FUNCTION
# =============================================================================
# Compiled once at import time: parse_competence() runs once per spreadsheet
# cell, so recompiling the pattern on every call was wasted work.
_COMP_REGEX = re.compile(
    r"(?ix)"                        # case-insensitive, verbose
    r"\bcomp\s*[:=]\s*"             # 'comp:' or 'comp='
    r"(1(?:[.,]\d*)?|0*[.,]?\d+)"   # numeric formats like 1, 1.0, 1., .5, 0.75
    r"\b"
)


def parse_competence(text):
    """Extract the last 'comp:<number>' token from a cell of review text.

    Parameters
    ----------
    text : Any
        Cell content; only str instances are scanned.

    Returns
    -------
    tuple[float, str] | None
        (numeric value, matched token) for the LAST match in the string,
        or None when text is not a str or contains no match.
    """
    if not isinstance(text, str):
        return None
    matches = list(_COMP_REGEX.finditer(text))
    if not matches:
        return None
    last = matches[-1]
    # Accept decimal comma as well as decimal point (e.g. '0,75' -> 0.75).
    return float(last.group(1).replace(",", ".")), last.group(0)
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main(paths):
    """Scan Excel files for competence factors; write audit CSVs + histograms.

    Parameters
    ----------
    paths : list[str]
        Excel file paths (anything pandas.ExcelFile can open).

    Side effects
    ------------
    Writes AUDIT_ALL_FILE / AUDIT_AVG_FILE and HIST_ALL_FILE / HIST_AVG_FILE
    into the current working directory. Input files are never modified.
    Returns None in all cases.
    """
    recs = []          # one dict per individual 'comp:' hit
    all_avg_rows = []  # one DataFrame of per-row averages per sheet

    for p in paths:
        try:
            xl = pd.ExcelFile(p)
        except Exception as e:
            # Best-effort: skip unreadable files, keep processing the rest.
            log.info(f"! open failed {p}: {e}")
            continue
        for sh in xl.sheet_names:
            try:
                df = xl.parse(sh, dtype=str)
            except Exception:
                continue

            # --- identify all "Review" columns once per sheet ---
            review_cols = [c for c in df.columns if "review" in str(c).lower()]

            # --- extract individual competence values ---
            for col in review_cols:
                for idx, raw in df[col].dropna().items():
                    r = parse_competence(raw)
                    if r is None:
                        continue
                    comp_value, token = r
                    recs.append(
                        {
                            "file": Path(p).name,
                            "sheet": sh,
                            "column": str(col),
                            # Default RangeIndex yields plain ints; any other
                            # index label (e.g. a string) is recorded as None.
                            "row_index": idx if isinstance(idx, int) else None,
                            "raw_text": raw,
                            "token": token,
                            "comp": comp_value,
                        }
                    )

            # --- per-row averages across all Review columns ---
            if review_cols:
                # drop completely empty rows across all Review columns
                df_reviews = df[review_cols].dropna(how="all")

                comp_per_row = []
                for idx, row in df_reviews.iterrows():
                    # parse_competence returns (value, token) or None — the
                    # previous defensive scan for "other tuple shapes" was
                    # dead code and has been removed.
                    vals = []
                    for cell in row.dropna().astype(str):
                        r = parse_competence(cell)
                        if r is None:
                            continue
                        vals.append(r[0])
                    if vals:  # only if we actually found competence numbers
                        comp_per_row.append(
                            {
                                "file": Path(p).name,
                                "sheet": sh,
                                "row_index": idx,
                                "n_reviews": len(vals),
                                "avg_competence": sum(vals) / len(vals),
                            }
                        )

                # only append once per sheet
                if comp_per_row:
                    all_avg_rows.append(pd.DataFrame(comp_per_row))

    # --- individual reviews ---
    if not recs:
        log.info("No competence factors found.")
        return
    out = pd.DataFrame(recs)
    out.to_csv(AUDIT_ALL_FILE, index=False)
    s_all = pd.Series(out["comp"], dtype=float)

    print_stats("Competence Factors (Individual Reviews)", s_all)
    save_histogram(
        s_all,
        HIST_ALL_FILE,
        xlabel="Competence factor (individual reviews)",
        title="Distribution of Competence Factors (Individual Reviews)",
    )

    # --- per-row averages ---
    if not all_avg_rows:
        log.info("No review averages created.")
        return
    avg_df = pd.concat(all_avg_rows, ignore_index=True)
    avg_df.to_csv(AUDIT_AVG_FILE, index=False)
    s_avg = avg_df["avg_competence"]

    print_stats("Competence Factors (Per-Row Averages)", s_avg)
    save_histogram(
        s_avg,
        HIST_AVG_FILE,
        xlabel="Average competence per proposal",
        title="Distribution of Average Competence (Per Proposal)",
    )
if __name__ == "__main__":
    # Require at least one Excel file argument on the command line.
    file_args = sys.argv[1:]
    if not file_args:
        print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
        sys.exit(1)
    main(file_args)