#!/usr/bin/env python3
"""
|
||
extract_competence_factor.py
|
||
----------------------------
|
||
|
||
Purpose:
|
||
Extract and analyze reviewer competence factors from Excel sheets.
|
||
|
||
Outputs:
|
||
- competence_factor_audit.csv (per-review entries)
|
||
- competence_rowavg_audit.csv (per-row averages)
|
||
- competence_factor_histogram.png
|
||
- competence_rowavg_histogram.png
|
||
|
||
Behavior:
|
||
- Reads all sheets from the specified Excel files.
|
||
- Detects 'comp:' values in any column labeled 'Review'.
|
||
- Computes both per-review statistics and per-row averages.
|
||
- Produces CSVs and histograms without altering the input data.
|
||
"""
|
||
|
||
#!/usr/bin/env python3
|
||
__version__ = "1.0"
__author__ = "Andreas Menzel"
__last_modified__ = "2025-10-29"

# =============================================================================
# IMPORTS
# =============================================================================
import logging
import math
import re
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Suppress a harmless openpyxl warning that we will not act upon anyway.
warnings.filterwarnings(
    "ignore",
    message="Workbook contains no default style",
    category=UserWarning,
    module="openpyxl",
)


# =============================================================================
# FILE NAMES etc.
# =============================================================================
AUDIT_ALL_FILE = "competence_factor_audit.csv"    # per-review entries
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"    # per-row averages
HIST_ALL_FILE = "competence_factor_histogram.png"
HIST_AVG_FILE = "competence_rowavg_histogram.png"


# =============================================================================
# SCREEN OUTPUT
# =============================================================================
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)
def print_stats(title: str, data: pd.Series) -> None:
    """Log count, mean, median, std, and min–max range for a numeric Series."""
    summary = (
        f"\n=== {title} ===",
        f"N = {len(data)}",
        f"Mean = {data.mean():.3f}",
        f"Median = {data.median():.3f}",
        f"Std = {data.std():.3f}",
        f"Min–Max = {data.min():.3f} – {data.max():.3f}",
    )
    for line in summary:
        log.info(line)
# =============================================================================
# HISTOGRAM OUTPUT
# =============================================================================
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
    """Create and save a histogram centered on 0–1 competence values.

    Bins are 0.1 wide and centered on 0.0, 0.1, ..., 1.0 so that common
    round values (e.g. 0.5) fall in the middle of a bar, not on an edge.

    Parameters
    ----------
    data : pd.Series
        Numeric values, expected to lie in [0, 1].
    filename : str
        Output path of the PNG written to disk (dpi=150).
    xlabel, title : str
        Axis label and figure title.
    """
    bin_width = 0.1
    centers = np.arange(0, 1.0 + bin_width, bin_width)
    # Edges straddle each center; clip so edges stay within the plotted range.
    edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
    edges = np.clip(edges, 0, 1.05)

    plt.figure(figsize=(7, 4))
    plt.hist(data, bins=edges, edgecolor="black", align="mid")
    plt.xlim(-0.05, 1.05)
    plt.xticks(np.arange(0, 1.01, 0.1))
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(filename, dpi=150)
    plt.close()
    # BUG FIX: the message previously omitted the output path entirely
    # (the f-string had no placeholder); report where the file went.
    log.info(f"Saved histogram → {filename}")
# =============================================================================
# PARSING FUNCTION
# =============================================================================
# Compiled once at import time: parse_competence() runs once per spreadsheet
# cell, so recompiling the pattern on every call was wasted work.
_COMP_REGEX = re.compile(
    r"(?ix)"                        # case-insensitive, verbose
    r"\bcomp\s*[:=]\s*"             # 'comp:' or 'comp='
    r"(1(?:[.,]\d*)?|0*[.,]?\d+)"   # numeric formats like 1, 1.0, 1., .5, 0.75
    r"\b"
)


def parse_competence(text):
    """Extract the last 'comp:<number>' token from a cell of review text.

    Parameters
    ----------
    text : Any
        Cell content; only str instances are scanned.

    Returns
    -------
    tuple[float, str] | None
        (numeric value, matched token) for the LAST match in the string,
        or None when text is not a str or contains no match.
    """
    if not isinstance(text, str):
        return None
    matches = list(_COMP_REGEX.finditer(text))
    if not matches:
        return None
    last = matches[-1]
    # Accept decimal comma as well as decimal point (e.g. '0,75' -> 0.75).
    return float(last.group(1).replace(",", ".")), last.group(0)
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main(paths):
    """Scan Excel files for competence factors; write audit CSVs + histograms.

    Parameters
    ----------
    paths : list[str]
        Excel file paths (anything pandas.ExcelFile can open).

    Side effects
    ------------
    Writes AUDIT_ALL_FILE / AUDIT_AVG_FILE and HIST_ALL_FILE / HIST_AVG_FILE
    into the current working directory. Input files are never modified.
    Returns None in all cases.
    """
    recs = []          # one dict per individual 'comp:' hit
    all_avg_rows = []  # one DataFrame of per-row averages per sheet

    for p in paths:
        try:
            xl = pd.ExcelFile(p)
        except Exception as e:
            # Best-effort: skip unreadable files, keep processing the rest.
            log.info(f"! open failed {p}: {e}")
            continue
        for sh in xl.sheet_names:
            try:
                df = xl.parse(sh, dtype=str)
            except Exception:
                continue

            # --- identify all "Review" columns once per sheet ---
            review_cols = [c for c in df.columns if "review" in str(c).lower()]

            # --- extract individual competence values ---
            for col in review_cols:
                for idx, raw in df[col].dropna().items():
                    r = parse_competence(raw)
                    if r is None:
                        continue
                    comp_value, token = r
                    recs.append(
                        {
                            "file": Path(p).name,
                            "sheet": sh,
                            "column": str(col),
                            # Default RangeIndex yields plain ints; any other
                            # index label (e.g. a string) is recorded as None.
                            "row_index": idx if isinstance(idx, int) else None,
                            "raw_text": raw,
                            "token": token,
                            "comp": comp_value,
                        }
                    )

            # --- per-row averages across all Review columns ---
            if review_cols:
                # drop completely empty rows across all Review columns
                df_reviews = df[review_cols].dropna(how="all")

                comp_per_row = []
                for idx, row in df_reviews.iterrows():
                    # parse_competence returns (value, token) or None — the
                    # previous defensive scan for "other tuple shapes" was
                    # dead code and has been removed.
                    vals = []
                    for cell in row.dropna().astype(str):
                        r = parse_competence(cell)
                        if r is None:
                            continue
                        vals.append(r[0])
                    if vals:  # only if we actually found competence numbers
                        comp_per_row.append(
                            {
                                "file": Path(p).name,
                                "sheet": sh,
                                "row_index": idx,
                                "n_reviews": len(vals),
                                "avg_competence": sum(vals) / len(vals),
                            }
                        )

                # only append once per sheet
                if comp_per_row:
                    all_avg_rows.append(pd.DataFrame(comp_per_row))

    # --- individual reviews ---
    if not recs:
        log.info("No competence factors found.")
        return
    out = pd.DataFrame(recs)
    out.to_csv(AUDIT_ALL_FILE, index=False)
    s_all = pd.Series(out["comp"], dtype=float)

    print_stats("Competence Factors (Individual Reviews)", s_all)
    save_histogram(
        s_all,
        HIST_ALL_FILE,
        xlabel="Competence factor (individual reviews)",
        title="Distribution of Competence Factors (Individual Reviews)",
    )

    # --- per-row averages ---
    if not all_avg_rows:
        log.info("No review averages created.")
        return
    avg_df = pd.concat(all_avg_rows, ignore_index=True)
    avg_df.to_csv(AUDIT_AVG_FILE, index=False)
    s_avg = avg_df["avg_competence"]

    print_stats("Competence Factors (Per-Row Averages)", s_avg)
    save_histogram(
        s_avg,
        HIST_AVG_FILE,
        xlabel="Average competence per proposal",
        title="Distribution of Average Competence (Per Proposal)",
    )
if __name__ == "__main__":
    # Require at least one Excel file argument on the command line.
    file_args = sys.argv[1:]
    if not file_args:
        print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
        sys.exit(1)
    main(file_args)