first prototype

This commit is contained in:
2021-03-26 01:18:40 +01:00
commit 0ccbaf2b26

181
elogdump.py Executable file
View File

@@ -0,0 +1,181 @@
#!/usr/bin/env python
import argparse
# Command-line interface. The arguments are parsed right here, ahead of the
# remaining imports further down in the file.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="Dump an elog ...",
)

# Positional: address of the logbook to dump.
parser.add_argument(
    "url",
    help="elog URL, e.g., https://elog-gfa.psi.ch/SwissFEL+test/",
)

# Optional: where the message files go.
parser.add_argument(
    "-o", "--output",
    help="Output folder",
    default="dump",
)

# Optional: where downloaded attachments go (inside the output folder).
parser.add_argument(
    "-a", "--attachments",
    help="Attachments sub-folder relative to the output folder",
    default="attachments",
)

clargs = parser.parse_args()
import builtins
import functools
import json
import os
import shutil
from tqdm import tqdm
import elog
import urllib3
# Shared connection pool used by download().
# NOTE: certificate verification is switched off (cert_reqs="CERT_NONE")
# because the certs are not valid...
http = urllib3.PoolManager(cert_reqs="CERT_NONE")
# ...and the InsecureRequestWarning that urllib3 would otherwise emit for
# unverified HTTPS requests is silenced.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class ELogScraper:
    """Dump the messages of one elog logbook to JSON files plus attachments.

    On construction the output folders are created, the logbook at ``url`` is
    opened and the list of message IDs is fetched.  ``dump()`` then writes one
    ``msg<ID>.json`` file per message into ``output_folder`` and downloads the
    attachments of each message into ``attachment_subfolder`` (relative to
    ``output_folder``).
    """

    def __init__(self, url, output_folder=".", attachment_subfolder="attachments"):
        """Open the logbook and prepare the output folders.

        Args:
            url: Logbook URL; a trailing "/" is appended when missing.
            output_folder: Folder receiving the per-message JSON files.
            attachment_subfolder: Attachment folder, relative to output_folder.
        """
        self.url = url = url if url.endswith("/") else url + "/" #TODO: only needed for attachments bug!

        self.output_folder = output_folder
        mkdirs(output_folder)

        self.attachment_folder = attachment_folder = os.path.join(output_folder, attachment_subfolder)
        self.fd = FileDownloader(attachment_folder)

        self.lb = elog.open(url)
        # Reading a message is retried until it succeeds (see retry()).
        self.elog_read = retry(self.lb.read)

        mids = self.lb.get_message_ids()
        self.mids = sorted(mids)
        self.nmsgs = len(mids)
        # File names are built from message IDs, so the padding width has to
        # follow the largest ID, not merely the number of messages.
        self.counter_width = self._counter_width(self.mids)

    @staticmethod
    def _counter_width(mids):
        """Return the zero-padding width that fits every ID in *mids*.

        Never narrower than the width of ``len(mids)``; an empty list gives 1.
        """
        largest = max(len(mids), max(mids, default=0))
        return len(str(largest))

    def dump(self):
        """Fetch and write every message, showing a progress bar.

        Raises:
            SystemExit: if interrupted with Ctrl-C; the message names the
                output folder that may hold an incomplete dump.
        """
        print()
        print(f"Dumping {self.nmsgs} messages from {self.url}")
        print(f"- Messages to:    {self.output_folder}")
        print(f"- Attachments to: {self.attachment_folder}")
        print()

        # Remember the real print *before* patching: once builtins.print is
        # replaced, the bare name `print` resolves to the replacement, so it
        # can no longer be used to restore the original.
        original_print = builtins.print
        try:
            builtins.print = tqdm.write # otherwise print() breaks tqdm
            # The generator does all the work; the loop only drives it.
            for _ in tqdm(self.get_entries(), total=self.nmsgs):
                pass
        except KeyboardInterrupt:
            raise SystemExit(f"\nDump not finished! You might want to delete the \"{self.output_folder}\" folder.")
        finally:
            builtins.print = original_print

    def get_entries(self):
        """Yield the dumped entry of every message, in ascending ID order."""
        for i in self.mids:
            yield self.get_entry(i)

    def get_entry(self, index):
        """Read message *index*, download its attachments and write its JSON file.

        Returns:
            The entry dict that was written to ``msg<index>.json``.
        """
        message, attributes, attachments = self.elog_read(index)
        attributes = sanitize_attributes(index, attributes)
        attachments = sanitize_attachments(attachments, self.url)
        # Only the local file names are stored in the entry, not the URLs.
        fns = self.fd.get(attachments)
        entry = build_entry(index, message, attributes, fns)

        counter = str(index).zfill(self.counter_width)
        fname = f"msg{counter}.json"
        fname = os.path.join(self.output_folder, fname)
        json_dump(entry, fname)
        return entry
def retry(func):
    """Decorate *func* so that a failing call is repeated until it succeeds.

    Every exception derived from ``Exception`` is reported via ``print`` and
    the call is made again with the same arguments.  There is no attempt limit
    and no delay between attempts.  Exceptions that do not derive from
    ``Exception`` (e.g. ``KeyboardInterrupt``) propagate to the caller.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature that returns the first successful
        result of ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        attempt = 1
        while True:
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Not every callable carries a __name__ (e.g. functools.partial).
                print_func = getattr(func, "__name__", repr(func))
                all_args = [str(a) for a in args] + [f"{k}={v}" for k, v in kwargs.items()]
                print_args = ", ".join(all_args)
                print(f"retry #{attempt}: {print_func}({print_args}), failed due to:\n{e}")
                attempt += 1

    return wrapper
def sanitize_attributes(i, attributes):
    """Replace the raw "$@MID@$" attribute by an integer "MID" entry.

    The dict is modified in place and also returned.

    Args:
        i: The message ID that was requested.
        attributes: Attribute dict of the message; must contain "$@MID@$".

    Returns:
        The same dict, without "$@MID@$" and with ``attributes["MID"] == i``.

    Raises:
        KeyError: if "$@MID@$" is missing.
        AssertionError: if the ID stored in the attributes differs from *i*.
    """
    mid = int(attributes.pop("$@MID@$"))
    # Raised explicitly (rather than via `assert`) so the consistency check
    # is not stripped when Python runs with -O.
    if i != mid:
        raise AssertionError(f"requested message {i} but got attributes of message {mid}")
    attributes["MID"] = i
    return attributes
def sanitize_attachments(attachments, url):
    """Return *attachments*, or an empty list if it is exactly ``[url]``.

    A list whose only element equals *url* is treated as "no attachments";
    any other value is handed back unchanged.
    """
    #TODO: WTF?!
    return [] if attachments == [url] else attachments
def build_entry(i, message, attributes, attachments):
    """Combine attributes, attachments and message text into one new dict.

    The result starts as a copy of *attributes*; the keys "attachments" and
    "message" are then set (overriding same-named attributes).  *i* is
    accepted but not used.
    """
    record = dict(attributes)
    record.update(attachments=attachments, message=message)
    return record
def json_dump(data, fname):
    """Write *data* as JSON to the file *fname*, replacing existing content.

    Keys are sorted and the output is indented by four spaces.
    """
    with open(fname, mode="w") as handle:
        json.dump(data, handle, indent=4, sort_keys=True)
class FileDownloader:
    """Download files by URL into one target folder."""

    def __init__(self, folder="."):
        """Create *folder* if needed and remember it as the download target."""
        mkdirs(folder)
        self.folder = folder

    def get(self, urls):
        """Download every URL in *urls*; return the local file names in order."""
        return list(map(self.get_file, urls))

    def get_file(self, url):
        """Download *url* into the target folder.

        Returns:
            The bare file name (without the folder) the download was saved as.
        """
        name = extract_filename(url)
        download(url, os.path.join(self.folder, name))
        return name
def extract_filename(url):
    """Return the last component of the path part of *url*."""
    return os.path.basename(urllib3.util.parse_url(url).path)
def download(url, fname):
    """GET *url* through the shared pool and stream the body into *fname*.

    The response is requested with ``preload_content=False`` and copied to
    the file with ``shutil.copyfileobj``.
    """
    response = http.request("GET", url, preload_content=False)
    with response:
        with open(fname, "wb") as target:
            shutil.copyfileobj(response, target)
        # Hand the connection back to the pool once the body is consumed.
        response.release_conn()
def mkdirs(folder):
    """Create *folder* including missing parents; an existing folder is fine."""
    os.makedirs(name=folder, exist_ok=True)
if __name__ == "__main__":
    # Script entry point: take the parsed command-line values (these names
    # are bound at module level) and run a full dump.
    url, output, attachments = clargs.url, clargs.output, clargs.attachments

    els = ELogScraper(
        url,
        attachment_subfolder=attachments,
        output_folder=output,
    )
    els.dump()