Files
public-tools/extract_competence_factor.py

233 lines
8.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
extract_competence_factor.py
----------------------------
Purpose:
Extract and analyze reviewer competence factors from Excel sheets.
Outputs:
- competence_factor_audit.csv (per-review entries)
- competence_rowavg_audit.csv (per-row averages)
- competence_factor_histogram.png
- competence_rowavg_histogram.png
Behavior:
- Reads all sheets from the specified Excel files.
- Detects 'comp:' values in any column labeled 'Review'.
- Computes both per-review statistics and per-row averages.
- Produces CSVs and histograms without altering the input data.
"""
#!/usr/bin/env python3
__version__ = "1.0"
__author__ = "Andreas Menzel"
__last_modified__ = "2025-10-29"
# =============================================================================
# IMPORTS
# =============================================================================
import sys, re, math
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
# suppressing warnings that I will not act upon anyways
import warnings
warnings.filterwarnings(
"ignore",
message="Workbook contains no default style",
category=UserWarning,
module="openpyxl",
)
# =============================================================================
# FILE NAMES etc.
# =============================================================================
AUDIT_ALL_FILE = "competence_factor_audit.csv"
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"
HIST_ALL_FILE = "competence_factor_histogram.png"
HIST_AVG_FILE = "competence_rowavg_histogram.png"
# =============================================================================
# SCREEN OUTPUT
# =============================================================================
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)
def print_stats(title: str, data: pd.Series) -> None:
"""Prints mean, median, std, and range for a numeric Series."""
log.info(f"\n=== {title} ===")
log.info(f"N = {len(data)}")
log.info(f"Mean = {data.mean():.3f}")
log.info(f"Median = {data.median():.3f}")
log.info(f"Std = {data.std():.3f}")
log.info(f"MinMax = {data.min():.3f} {data.max():.3f}")
# =============================================================================
# HISTOGRAM OUTPUT
# =============================================================================
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
"""Create and save a histogram centered on 01 competence values."""
bin_width = 0.1
centers = np.arange(0, 1.0 + bin_width, bin_width)
edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
edges = np.clip(edges, 0, 1.05)
plt.figure(figsize=(7, 4))
plt.hist(data, bins=edges, edgecolor="black", align="mid")
plt.xlim(-0.05, 1.05)
plt.xticks(np.arange(0, 1.01, 0.1))
plt.xlabel(xlabel)
plt.ylabel("Count")
plt.title(title)
plt.tight_layout()
plt.savefig(filename, dpi=150)
plt.close()
log.info(f"Saved histogram → {filename}")
# =============================================================================
# PARSING FUNCTION
# =============================================================================
def parse_competence(text):
comp_regex = re.compile(
r"(?ix)" # case-insensitive, verbose
r"\bcomp\s*[:=]\s*" # 'comp:' or 'comp='
r"(1(?:[.,]\d*)?|0*[.,]?\d+)" # numeric formats like 1, 1.0, 1., .5, 0.75
r"\b"
)
if not isinstance(text, str):
return None
match_all = list(comp_regex.finditer(text))
if not match_all:
return None
match_last = match_all[-1]
num_value = float(match_last.group(1).replace(",", "."))
return num_value, match_last.group(0)
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main(paths):
recs = []
all_avg_rows = []
for p in paths:
try:
xl = pd.ExcelFile(p)
except Exception as e:
log.info(f"! open failed {p}: {e}")
continue
for sh in xl.sheet_names:
try:
df = xl.parse(sh, dtype=str)
except Exception:
continue
# --- identify all "Review" columns once per sheet ---
review_cols = [c for c in df.columns if "review" in str(c).lower()]
# --- extract individual competence values ---
for col in review_cols:
for idx, raw in df[col].dropna().items():
r = parse_competence(raw)
if not r:
continue
comp_value, token = r
recs.append(
{
"file": Path(p).name,
"sheet": sh,
"column": str(col),
"row_index": idx if isinstance(idx, int) else None,
"raw_text": raw,
"token": token,
"comp": comp_value,
}
)
if review_cols:
# drop completely empty rows across all Review columns
df_reviews = df[review_cols].dropna(how="all")
comp_per_row = []
for idx, row in df_reviews.iterrows():
vals = []
for cell in row.dropna().astype(str):
r = parse_competence(cell)
if r is None:
continue
# Handle both (op,val,token) tuples and plain numerics
if isinstance(r, (tuple, list)):
# get numeric element
val = None
for x in r:
if isinstance(x, (int, float)):
val = float(x)
break
if val is not None:
vals.append(val)
elif isinstance(r, (int, float)):
vals.append(float(r))
if vals: # only if we actually found competence numbers
comp_per_row.append(
{
"file": Path(p).name,
"sheet": sh,
"row_index": idx,
"n_reviews": len(vals),
"avg_competence": sum(vals) / len(vals),
}
)
# only append once per sheet
if comp_per_row:
all_avg_rows.append(pd.DataFrame(comp_per_row))
# Individual reviews
if not recs:
log.info("No competence factors found.")
return
out = pd.DataFrame(recs)
out.to_csv(AUDIT_ALL_FILE, index=False)
s_all = pd.Series(out["comp"], dtype=float)
print_stats("Competence Factors (Individual Reviews)", s_all)
save_histogram(
s_all,
HIST_ALL_FILE,
xlabel="Competence factor (individual reviews)",
title="Distribution of Competence Factors (Individual Reviews)",
)
# Per-row averages
if not all_avg_rows:
log.info("No review averages created.")
return
avg_df = pd.concat(all_avg_rows, ignore_index=True)
avg_df.to_csv(AUDIT_AVG_FILE, index=False)
s_avg = avg_df["avg_competence"]
print_stats("Competence Factors (Per-Row Averages)", s_avg)
save_histogram(
s_avg,
HIST_AVG_FILE,
xlabel="Average competence per proposal",
title="Distribution of Average Competence (Per Proposal)",
)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
sys.exit(1)
main(sys.argv[1:])