Initial public release of competence factor extractor
This commit is contained in:
@@ -0,0 +1,232 @@
|
||||
"""
|
||||
extract_competence_factor.py
|
||||
----------------------------
|
||||
|
||||
Purpose:
|
||||
Extract and analyze reviewer competence factors from Excel sheets.
|
||||
|
||||
Outputs:
|
||||
- competence_factor_audit.csv (per-review entries)
|
||||
- competence_rowavg_audit.csv (per-row averages)
|
||||
- competence_factor_histogram.png
|
||||
- competence_rowavg_histogram.png
|
||||
|
||||
Behavior:
|
||||
- Reads all sheets from the specified Excel files.
|
||||
- Detects 'comp:' values in any column labeled 'Review'.
|
||||
- Computes both per-review statistics and per-row averages.
|
||||
- Produces CSVs and histograms without altering the input data.
|
||||
"""
|
||||
|
||||
#!/usr/bin/env python3
|
||||
__version__ = "1.0"
|
||||
__author__ = "Andreas Menzel"
|
||||
__last_modified__ = "2025-10-29"
|
||||
|
||||
# =============================================================================
|
||||
# IMPORTS
|
||||
# =============================================================================
|
||||
import sys, re, math
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import logging
|
||||
|
||||
# suppressing warnings that I will not act upon anyways
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message="Workbook contains no default style",
|
||||
category=UserWarning,
|
||||
module="openpyxl",
|
||||
)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# FILE NAMES etc.
|
||||
# =============================================================================
|
||||
AUDIT_ALL_FILE = "competence_factor_audit.csv"
|
||||
AUDIT_AVG_FILE = "competence_rowavg_audit.csv"
|
||||
HIST_ALL_FILE = "competence_factor_histogram.png"
|
||||
HIST_AVG_FILE = "competence_rowavg_histogram.png"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SCREEN OUTPUT
|
||||
# =============================================================================
|
||||
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def print_stats(title: str, data: pd.Series) -> None:
|
||||
"""Prints mean, median, std, and range for a numeric Series."""
|
||||
log.info(f"\n=== {title} ===")
|
||||
log.info(f"N = {len(data)}")
|
||||
log.info(f"Mean = {data.mean():.3f}")
|
||||
log.info(f"Median = {data.median():.3f}")
|
||||
log.info(f"Std = {data.std():.3f}")
|
||||
log.info(f"Min–Max = {data.min():.3f} – {data.max():.3f}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# HISTOGRAM OUTPUT
|
||||
# =============================================================================
|
||||
def save_histogram(data: pd.Series, filename: str, xlabel: str, title: str) -> None:
|
||||
"""Create and save a histogram centered on 0–1 competence values."""
|
||||
bin_width = 0.1
|
||||
centers = np.arange(0, 1.0 + bin_width, bin_width)
|
||||
edges = np.concatenate(([centers[0] - bin_width / 2], centers + bin_width / 2))
|
||||
edges = np.clip(edges, 0, 1.05)
|
||||
|
||||
plt.figure(figsize=(7, 4))
|
||||
plt.hist(data, bins=edges, edgecolor="black", align="mid")
|
||||
plt.xlim(-0.05, 1.05)
|
||||
plt.xticks(np.arange(0, 1.01, 0.1))
|
||||
plt.xlabel(xlabel)
|
||||
plt.ylabel("Count")
|
||||
plt.title(title)
|
||||
plt.tight_layout()
|
||||
plt.savefig(filename, dpi=150)
|
||||
plt.close()
|
||||
log.info(f"Saved histogram → {filename}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# PARSING FUNCTION
|
||||
# =============================================================================
|
||||
def parse_competence(text):
|
||||
comp_regex = re.compile(
|
||||
r"(?ix)" # case-insensitive, verbose
|
||||
r"\bcomp\s*[:=]\s*" # 'comp:' or 'comp='
|
||||
r"(1(?:[.,]\d*)?|0*[.,]?\d+)" # numeric formats like 1, 1.0, 1., .5, 0.75
|
||||
r"\b"
|
||||
)
|
||||
|
||||
if not isinstance(text, str):
|
||||
return None
|
||||
match_all = list(comp_regex.finditer(text))
|
||||
|
||||
if not match_all:
|
||||
return None
|
||||
match_last = match_all[-1]
|
||||
num_value = float(match_last.group(1).replace(",", "."))
|
||||
return num_value, match_last.group(0)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# MAIN ENTRY POINT
|
||||
# =============================================================================
|
||||
def main(paths):
|
||||
recs = []
|
||||
all_avg_rows = []
|
||||
for p in paths:
|
||||
try:
|
||||
xl = pd.ExcelFile(p)
|
||||
except Exception as e:
|
||||
log.info(f"! open failed {p}: {e}")
|
||||
continue
|
||||
for sh in xl.sheet_names:
|
||||
try:
|
||||
df = xl.parse(sh, dtype=str)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# --- identify all "Review" columns once per sheet ---
|
||||
review_cols = [c for c in df.columns if "review" in str(c).lower()]
|
||||
|
||||
# --- extract individual competence values ---
|
||||
for col in review_cols:
|
||||
for idx, raw in df[col].dropna().items():
|
||||
r = parse_competence(raw)
|
||||
if not r:
|
||||
continue
|
||||
comp_value, token = r
|
||||
recs.append(
|
||||
{
|
||||
"file": Path(p).name,
|
||||
"sheet": sh,
|
||||
"column": str(col),
|
||||
"row_index": idx if isinstance(idx, int) else None,
|
||||
"raw_text": raw,
|
||||
"token": token,
|
||||
"comp": comp_value,
|
||||
}
|
||||
)
|
||||
if review_cols:
|
||||
# drop completely empty rows across all Review columns
|
||||
df_reviews = df[review_cols].dropna(how="all")
|
||||
|
||||
comp_per_row = []
|
||||
for idx, row in df_reviews.iterrows():
|
||||
vals = []
|
||||
for cell in row.dropna().astype(str):
|
||||
r = parse_competence(cell)
|
||||
if r is None:
|
||||
continue
|
||||
# Handle both (op,val,token) tuples and plain numerics
|
||||
if isinstance(r, (tuple, list)):
|
||||
# get numeric element
|
||||
val = None
|
||||
for x in r:
|
||||
if isinstance(x, (int, float)):
|
||||
val = float(x)
|
||||
break
|
||||
if val is not None:
|
||||
vals.append(val)
|
||||
elif isinstance(r, (int, float)):
|
||||
vals.append(float(r))
|
||||
if vals: # only if we actually found competence numbers
|
||||
comp_per_row.append(
|
||||
{
|
||||
"file": Path(p).name,
|
||||
"sheet": sh,
|
||||
"row_index": idx,
|
||||
"n_reviews": len(vals),
|
||||
"avg_competence": sum(vals) / len(vals),
|
||||
}
|
||||
)
|
||||
|
||||
# only append once per sheet
|
||||
if comp_per_row:
|
||||
all_avg_rows.append(pd.DataFrame(comp_per_row))
|
||||
|
||||
# Individual reviews
|
||||
if not recs:
|
||||
log.info("No competence factors found.")
|
||||
return
|
||||
out = pd.DataFrame(recs)
|
||||
out.to_csv(AUDIT_ALL_FILE, index=False)
|
||||
s_all = pd.Series(out["comp"], dtype=float)
|
||||
|
||||
print_stats("Competence Factors (Individual Reviews)", s_all)
|
||||
save_histogram(
|
||||
s_all,
|
||||
HIST_ALL_FILE,
|
||||
xlabel="Competence factor (individual reviews)",
|
||||
title="Distribution of Competence Factors (Individual Reviews)",
|
||||
)
|
||||
|
||||
# Per-row averages
|
||||
if not all_avg_rows:
|
||||
log.info("No review averages created.")
|
||||
return
|
||||
avg_df = pd.concat(all_avg_rows, ignore_index=True)
|
||||
avg_df.to_csv(AUDIT_AVG_FILE, index=False)
|
||||
s_avg = avg_df["avg_competence"]
|
||||
|
||||
print_stats("Competence Factors (Per-Row Averages)", s_avg)
|
||||
save_histogram(
|
||||
s_avg,
|
||||
HIST_AVG_FILE,
|
||||
xlabel="Average competence per proposal",
|
||||
title="Distribution of Average Competence (Per Proposal)",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python extract_competence_factor.py file1.xls file2.xlsx ...")
|
||||
sys.exit(1)
|
||||
main(sys.argv[1:])
|
||||
Reference in New Issue
Block a user