commit 0ccbaf2b264ca194a30b829016aa5bc5239c6a18 Author: Sven Augustin Date: Fri Mar 26 01:18:40 2021 +0100 first prototype diff --git a/elogdump.py b/elogdump.py new file mode 100755 index 0000000..08aeecc --- /dev/null +++ b/elogdump.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +import argparse + +parser = argparse.ArgumentParser( + description="Dump an elog ...", + formatter_class=argparse.ArgumentDefaultsHelpFormatter +) + +parser.add_argument("url", help="elog URL, e.g., https://elog-gfa.psi.ch/SwissFEL+test/") +parser.add_argument("-o", "--output", default="dump", help="Output folder") +parser.add_argument("-a", "--attachments", default="attachments", + help="Attachments sub-folder relative to the output folder" +) + +clargs = parser.parse_args() + + + +import builtins +import functools +import json +import os +import shutil + +from tqdm import tqdm +import elog +import urllib3 + + +# Certs are not valid... +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +http = urllib3.PoolManager(cert_reqs="CERT_NONE") + + + +class ELogScraper: + + def __init__(self, url, output_folder=".", attachment_subfolder="attachments"): + self.url = url = url if url.endswith("/") else url + "/" #TODO: only needed for attachments bug! + + self.output_folder = output_folder + mkdirs(output_folder) + + self.attachment_folder = attachment_folder = os.path.join(output_folder, attachment_subfolder) + self.fd = FileDownloader(attachment_folder) + + self.lb = elog.open(url) + self.elog_read = retry(self.lb.read) + + mids = self.lb.get_message_ids() + self.mids = sorted(mids) + self.nmsgs = nmsgs = len(mids) + self.counter_width = len(str(nmsgs)) + + + def dump(self): + print() + print(f"Dumping {self.nmsgs} messages from {self.url}") + print(f"- Messages to: {self.output_folder}") + print(f"- Attachments to: {self.attachment_folder}") + print() + try: + builtins.print = tqdm.write # otherwise print() breaks tqdm + for msg in tqdm(self.get_entries(), total=self.nmsgs): + pass + except KeyboardInterrupt: + raise SystemExit(f"\nDump not finished! You might want to delete the \"{output}\" folder.") + finally: + builtins.print = print + + + def get_entries(self): + for i in self.mids: + yield self.get_entry(i) + + + def get_entry(self, index): + message, attributes, attachments = self.elog_read(index) + attributes = sanitize_attributes(index, attributes) + attachments = sanitize_attachments(attachments, self.url) + fns = self.fd.get(attachments) + entry = build_entry(index, message, attributes, fns) + + counter = str(index).zfill(self.counter_width) + fname = f"msg{counter}.json" + fname = os.path.join(self.output_folder, fname) + + json_dump(entry, fname) + return entry + + + +def retry(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + n = 1 + while True: + try: + res = func(*args, **kwargs) + except Exception as e: + print_func = func.__name__ + all_args = [str(a) for a in args] + ["{k}={v}" for k, v in kwargs.items()] + print_args = ", ".join(all_args) + print(f"retry #{n}: {print_func}({print_args}), failed due to:\n{e}") + n += 1 + else: + return res + return wrapper + +def sanitize_attributes(i, attributes): + mid = attributes.pop("$@MID@$") + mid = int(mid) + assert i == mid + attributes["MID"] = i + return attributes + +def sanitize_attachments(attachments, url): + if attachments == [url]: #TODO: WTF?! + attachments = [] + return attachments + +def build_entry(i, message, attributes, attachments): + entry = {} + entry.update(attributes) + entry["attachments"] = attachments + entry["message"] = message + return entry + +def json_dump(data, fname): + with open(fname, "w") as f: + json.dump(data, f, sort_keys=True, indent=4) + + + +class FileDownloader: + + def __init__(self, folder="."): + self.folder = folder + mkdirs(folder) + + def get(self, urls): + return [self.get_file(u) for u in urls] + + def get_file(self, url): + fname = extract_filename(url) + full_fname = os.path.join(self.folder, fname) +# print(f"{url} -> {fname}") + download(url, full_fname) + return fname + + + +def extract_filename(url): + parsed_url = urllib3.util.parse_url(url) + path = parsed_url.path + fname = os.path.basename(path) + return fname + +def download(url, fname): + with http.request("GET", url, preload_content=False) as resp: + with open(fname, "wb") as f: + shutil.copyfileobj(resp, f) + resp.release_conn() + +def mkdirs(folder): + os.makedirs(folder, exist_ok=True) + + + + + +if __name__ == "__main__": + url = clargs.url + output = clargs.output + attachments = clargs.attachments + els = ELogScraper(url, output_folder=output, attachment_subfolder=attachments) + els.dump() + + +