Initial public release of competence factor extractor

This commit is contained in:
2025-10-29 11:22:58 +01:00
commit 7ff26cb00b
+232
View File
@@ -0,0 +1,232 @@
"""
extract_competence_factor.py
----------------------------
Purpose:
Extract and analyze reviewer competence factors from Excel sheets.
Outputs:
- competence_factor_audit.csv (per-review entries)
- competence_rowavg_audit.csv (per-row averages)
- competence_factor_histogram.png
- competence_rowavg_histogram.png
Behavior:
- Reads all sheets from the specified Excel files.
- Detects 'comp:' values in any column labeled 'Review'.
- Computes both per-review statistics and per-row averages.
- Produces CSVs and histograms without altering the input data.
"""
#!/usr/bin/env python3
__version__ = "1.0"
__author__ = "Andreas Menzel"
__last_modified__ = "2025-10-29"
# =============================================================================
# IMPORTS
# =============================================================================
import sys, re, math
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
# suppressing warnings that I will not act upon anyways
import warnings
warnings.filterwarnings(
"ignore",
message="Workbook contains no default style",
category=UserWarning,
module="openpyxl",
)
# =============================================================================
# FILE NAMES etc.
# =============================================================================
AUDIT_ALL_FILE = "competence_factor_audit.csv"
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"
HIST_ALL_FILE = "competence_factor_histogram.png"
HIST_AVG_FILE = "competence_rowavg_histogram.png"
# =============================================================================
# SCREEN OUTPUT
# =============================================================================
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)
def print_stats(title: str, data: pd.Series) -> None:
"""Prints mean, median, std, and range for a numeric Series."""
log.info(f"\n=== {title} ===")
log.info(f"N = {len(data)}")
log.info(f"Mean = {data.mean():.3f}")
log.info(f"Median = {data.median():.3f}")
log.info(f"Std = {data.std():.3f}")
log.info(f"MinMax = {data.min():.3f} {data.max():.3f}")
# =============================================================================
# HISTOGRAM OUTPUT
# =============================================================================
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
"""Create and save a histogram centered on 01 competence values."""
bin_width = 0.1
centers = np.arange(0, 1.0 + bin_width, bin_width)
edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
edges = np.clip(edges, 0, 1.05)
plt.figure(figsize=(7, 4))
plt.hist(data, bins=edges, edgecolor="black", align="mid")
plt.xlim(-0.05, 1.05)
plt.xticks(np.arange(0, 1.01, 0.1))
plt.xlabel(xlabel)
plt.ylabel("Count")
plt.title(title)
plt.tight_layout()
plt.savefig(filename, dpi=150)
plt.close()
log.info(f"Saved histogram → {filename}")
# =============================================================================
# PARSING FUNCTION
# =============================================================================
def parse_competence(text):
comp_regex = re.compile(
r"(?ix)" # case-insensitive, verbose
r"\bcomp\s*[:=]\s*" # 'comp:' or 'comp='
r"(1(?:[.,]\d*)?|0*[.,]?\d+)" # numeric formats like 1, 1.0, 1., .5, 0.75
r"\b"
)
if not isinstance(text, str):
return None
match_all = list(comp_regex.finditer(text))
if not match_all:
return None
match_last = match_all[-1]
num_value = float(match_last.group(1).replace(",", "."))
return num_value, match_last.group(0)
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main(paths):
recs = []
all_avg_rows = []
for p in paths:
try:
xl = pd.ExcelFile(p)
except Exception as e:
log.info(f"! open failed {p}: {e}")
continue
for sh in xl.sheet_names:
try:
df = xl.parse(sh, dtype=str)
except Exception:
continue
# --- identify all "Review" columns once per sheet ---
review_cols = [c for c in df.columns if "review" in str(c).lower()]
# --- extract individual competence values ---
for col in review_cols:
for idx, raw in df[col].dropna().items():
r = parse_competence(raw)
if not r:
continue
comp_value, token = r
recs.append(
{
"file": Path(p).name,
"sheet": sh,
"column": str(col),
"row_index": idx if isinstance(idx, int) else None,
"raw_text": raw,
"token": token,
"comp": comp_value,
}
)
if review_cols:
# drop completely empty rows across all Review columns
df_reviews = df[review_cols].dropna(how="all")
comp_per_row = []
for idx, row in df_reviews.iterrows():
vals = []
for cell in row.dropna().astype(str):
r = parse_competence(cell)
if r is None:
continue
# Handle both (op,val,token) tuples and plain numerics
if isinstance(r, (tuple, list)):
# get numeric element
val = None
for x in r:
if isinstance(x, (int, float)):
val = float(x)
break
if val is not None:
vals.append(val)
elif isinstance(r, (int, float)):
vals.append(float(r))
if vals: # only if we actually found competence numbers
comp_per_row.append(
{
"file": Path(p).name,
"sheet": sh,
"row_index": idx,
"n_reviews": len(vals),
"avg_competence": sum(vals) / len(vals),
}
)
# only append once per sheet
if comp_per_row:
all_avg_rows.append(pd.DataFrame(comp_per_row))
# Individual reviews
if not recs:
log.info("No competence factors found.")
return
out = pd.DataFrame(recs)
out.to_csv(AUDIT_ALL_FILE, index=False)
s_all = pd.Series(out["comp"], dtype=float)
print_stats("Competence Factors (Individual Reviews)", s_all)
save_histogram(
s_all,
HIST_ALL_FILE,
xlabel="Competence factor (individual reviews)",
title="Distribution of Competence Factors (Individual Reviews)",
)
# Per-row averages
if not all_avg_rows:
log.info("No review averages created.")
return
avg_df = pd.concat(all_avg_rows, ignore_index=True)
avg_df.to_csv(AUDIT_AVG_FILE, index=False)
s_avg = avg_df["avg_competence"]
print_stats("Competence Factors (Per-Row Averages)", s_avg)
save_histogram(
s_avg,
HIST_AVG_FILE,
xlabel="Average competence per proposal",
title="Distribution of Average Competence (Per Proposal)",
)
if __name__ == "__main__":
if len(sys.argv) < 2:
print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
sys.exit(1)
main(sys.argv[1:])