From 123362f4dbfef5188d733d147e81c7c1ec73f2c7 Mon Sep 17 00:00:00 2001
From: Sven Augustin <sven.augustin@psi.ch>
Date: Sat, 27 Mar 2021 18:53:21 +0100
Subject: [PATCH] added author collection/conversion tools

---
 collect_authors.py | 74 ++++++++++++++++++++++++++++++++++++++++++++++
 convert_authors.py | 73 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 147 insertions(+)
 create mode 100755 collect_authors.py
 create mode 100755 convert_authors.py

diff --git a/collect_authors.py b/collect_authors.py
new file mode 100755
index 0000000..9363cab
--- /dev/null
+++ b/collect_authors.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+DEFAULT_SEP = " : "
+
+
+import argparse
+
+parser = argparse.ArgumentParser(description="Collect authors from an elog dump ...")
+# Command-line interface: every option is optional and falls back to a default.
+parser.add_argument("-d", "--dump", default="dump", help="Folder containing the elog dump")
+parser.add_argument("-o", "--output", default="authors", help="Output file name")
+parser.add_argument("-s", "--separator", default=DEFAULT_SEP, help=f"Key-value separator in the output (default: \"{DEFAULT_SEP}\")")
+parser.add_argument("-p", "--print", action="store_true", help="Print authors")
+# NOTE: the arguments are parsed at module level, i.e. also when this file is merely imported.
+clargs = parser.parse_args()
+
+
+
+from pathlib import Path
+import json
+
+
+def collect(folder):
+    """Return the sorted, unique "Author" values found in the elog dump.
+
+    Every file matching msg*.json directly inside folder is read via
+    json_load(), visiting the files in sorted path order. A JSON object
+    that lacks the "Author" key raises KeyError.
+    """
+    message_files = sorted(Path(folder).glob("msg*.json"))
+    # A set comprehension de-duplicates authors shared by several messages.
+    unique = {json_load(path)["Author"] for path in message_files}
+    return sorted(unique)
+
+
+def check(authors, print_all=False):
+    """Print each author if print_all is set, and warn about empty or unstripped names."""
+    for a in authors:
+        if print_all:
+            print(a)
+        if a != a.strip():
+            # Quote the name so that the otherwise invisible leading/trailing spaces show up.
+            print(f"Warning: Author \"{a}\" has strippable spaces.")
+
+        if a == "":
+            print("Warning: Author is the empty string.")
+
+
+def save(authors, output, sep):
+    """Write one line per author to output, each made of the author followed by sep."""
+    text_save(["{}{}".format(name, sep) for name in authors], output)
+
+
+def json_load(fname):
+    """Return the deserialised content of the JSON file fname."""
+    return json.loads(Path(fname).read_text())
+
+def text_save(data, fname):
+    """Write the strings in data to the file fname, one per line."""
+    with open(fname, mode="w") as out:
+        # Each entry gets its own trailing newline, including the last one.
+        out.writelines(entry + "\n" for entry in data)
+
+
+
+
+# Script entry point: gather the authors, report suspicious ones, write the list.
+if __name__ == "__main__":
+    authors = collect(folder=clargs.dump)
+    check(authors, print_all=clargs.print)
+    save(authors, output=clargs.output, sep=clargs.separator)
+
+
+
diff --git a/convert_authors.py b/convert_authors.py
new file mode 100755
index 0000000..cd7d196
--- /dev/null
+++ b/convert_authors.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+DEFAULT_SEP = " : "
+
+
+import argparse
+
+parser = argparse.ArgumentParser(description="Convert collected and mapped authors to json ...")
+# The separator is applied when reading the input file, so its help text refers to the input.
+parser.add_argument("-i", "--input", default="authors", help="Input file name")
+parser.add_argument("-o", "--output", default="authors.json", help="Output file name")
+parser.add_argument("-s", "--separator", default=DEFAULT_SEP, help=f"Key-value separator in the input (default: \"{DEFAULT_SEP}\")")
+parser.add_argument("-d", "--default", help="Default user")
+parser.add_argument("-p", "--print", action="store_true", help="Print authors")
+# NOTE: parsing happens at module level, i.e. also when this file is merely imported.
+clargs = parser.parse_args()
+
+
+
+from pathlib import Path
+import json
+
+
+def author_load(fname, sep, default):
+    """Map old to new author names read from the "<old><sep><new>" lines of fname; a blank <new> means default."""
+    res = {}
+    for line in text_load(fname):
+        old, found, new = line.rpartition(sep)  # split at the last sep, as an old name may contain sep itself
+        if not found:
+            raise ValueError(f"Missing separator \"{sep}\" in line \"{line}\" of {fname}")
+        if not new.strip():  # whitespace may remain where a trailing comment was removed
+            print(f"Warning: will use default ({default}) for author \"{old}\".")
+        res[old] = new.strip() or default
+    return res
+
+def text_load(fname):
+    """Return the lines of the text file fname, minus comments and empty lines.
+
+    Everything from the first "#" on is cut off, then trailing newlines are
+    removed; lines that are empty after that are skipped. Other whitespace
+    is kept as it is.
+    """
+    with open(fname, mode="r") as f:
+        cleaned = (raw.partition("#")[0].rstrip("\n") for raw in f)
+        return [text for text in cleaned if text]
+
+def print_dict(d):  # print one "key -> value" line per item of d
+    width = maxstrlen(d.keys())  # keys are right-aligned to the longest one
+    for key, value in d.items():
+        print(key.rjust(width), "->", value)
+
+def maxstrlen(seq):  # longest strlen() among the items of seq; 0 if seq is empty
+    return max(map(strlen, seq), default=0)  # default: max() raises ValueError on an empty iterable
+
+def strlen(val):  # number of characters in the string form of val
+    return len(str(val))
+
+def json_dump(data, fname):  # serialise data as JSON (sorted keys, 4-space indent) into fname
+    with open(fname, mode="w") as stream:
+        json.dump(data, stream, indent=4, sort_keys=True)
+
+
+
+
+# Script entry point: parse the mapping, optionally show it, then store it as JSON.
+if __name__ == "__main__":
+    author_map = author_load(fname=clargs.input, sep=clargs.separator, default=clargs.default)
+    if clargs.print:
+        print_dict(author_map)
+    json_dump(author_map, fname=clargs.output)
+
+
+