first prototype
This commit is contained in:
181
elogdump.py
Executable file
181
elogdump.py
Executable file
@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Command-line interface: one positional elog URL plus two optional folder
# names. Defaults are shown in --help via ArgumentDefaultsHelpFormatter.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="Dump an elog ...",
)

parser.add_argument(
    "url",
    help="elog URL, e.g., https://elog-gfa.psi.ch/SwissFEL+test/",
)
parser.add_argument(
    "-o",
    "--output",
    help="Output folder",
    default="dump",
)
parser.add_argument(
    "-a",
    "--attachments",
    help="Attachments sub-folder relative to the output folder",
    default="attachments",
)

# Parsed at module level, i.e. as soon as this file is executed or imported.
clargs = parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
import builtins
|
||||||
|
import functools
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
import elog
|
||||||
|
import urllib3
|
||||||
|
|
||||||
|
|
||||||
|
# Certs are not valid...
# ...so certificate verification is switched off for the shared pool below,
# and the InsecureRequestWarning urllib3 would emit is silenced.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

# Module-level connection pool used by download().
http = urllib3.PoolManager(cert_reqs="CERT_NONE")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ELogScraper:
    """Dump all messages of an elog logbook, plus their attachments, to disk.

    Each message is written as ``msg<ID>.json`` into ``output_folder``;
    attachments are downloaded into ``attachment_subfolder`` below it.

    Attributes:
        url: Logbook URL, always ending in "/".
        output_folder: Folder receiving the JSON files.
        attachment_folder: Folder receiving the attachment files.
        fd: FileDownloader writing into ``attachment_folder``.
        lb: Logbook handle returned by ``elog.open``.
        elog_read: ``lb.read`` wrapped in ``retry``.
        mids: Sorted message ids of the logbook.
        nmsgs: Number of messages.
        counter_width: Zero-padding width used for ids in file names.
    """

    def __init__(self, url, output_folder=".", attachment_subfolder="attachments"):
        """Open the logbook at *url* and create the output folders."""
        # TODO: only needed for attachments bug!
        self.url = url = url if url.endswith("/") else url + "/"

        self.output_folder = output_folder
        mkdirs(output_folder)

        self.attachment_folder = attachment_folder = os.path.join(output_folder, attachment_subfolder)
        self.fd = FileDownloader(attachment_folder)

        self.lb = elog.open(url)
        self.elog_read = retry(self.lb.read)

        mids = self.lb.get_message_ids()
        self.mids = sorted(mids)
        self.nmsgs = len(mids)

        # File names are built from message ids, not from a running counter,
        # so the padding width has to fit the largest id. Ids may exceed the
        # number of messages, in which case padding by count would produce
        # names of mixed width.
        largest_mid = self.mids[-1] if self.mids else 0
        self.counter_width = len(str(largest_mid))

    def dump(self):
        """Fetch and store every message, showing a tqdm progress bar.

        Raises:
            SystemExit: If interrupted with Ctrl-C; the message names the
                (then incomplete) output folder.
        """
        print()
        print(f"Dumping {self.nmsgs} messages from {self.url}")
        print(f"- Messages to: {self.output_folder}")
        print(f"- Attachments to: {self.attachment_folder}")
        print()

        # Remember the real print *before* patching; looking up `print` after
        # the patch would just yield the patched function again.
        original_print = builtins.print
        try:
            builtins.print = tqdm.write  # otherwise print() breaks tqdm
            # get_entries() does the work as a side effect of iteration.
            for _ in tqdm(self.get_entries(), total=self.nmsgs):
                pass
        except KeyboardInterrupt:
            raise SystemExit(f'\nDump not finished! You might want to delete the "{self.output_folder}" folder.')
        finally:
            builtins.print = original_print

    def get_entries(self):
        """Yield the stored entry of every message, in ascending id order."""
        for i in self.mids:
            yield self.get_entry(i)

    def get_entry(self, index):
        """Read message *index*, download its attachments and write its JSON.

        Returns:
            The entry dict that was written to disk.
        """
        message, attributes, attachments = self.elog_read(index)
        attributes = sanitize_attributes(index, attributes)
        attachments = sanitize_attachments(attachments, self.url)
        fns = self.fd.get(attachments)
        entry = build_entry(index, message, attributes, fns)

        counter = str(index).zfill(self.counter_width)
        fname = f"msg{counter}.json"
        fname = os.path.join(self.output_folder, fname)

        json_dump(entry, fname)
        return entry
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def retry(func):
    """Wrap *func* so that it is called again until it stops raising.

    Every failed attempt is reported via ``print`` together with the attempt
    number, the call signature and the exception text. There is no upper
    limit on the number of attempts and no delay between them.

    Args:
        func: Callable to wrap.

    Returns:
        A wrapper with the same call signature that returns the first
        successful result of ``func(*args, **kwargs)``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        n = 1
        while True:
            try:
                res = func(*args, **kwargs)
            except Exception as e:
                # Not every callable has __name__ (e.g. functools.partial).
                print_func = getattr(func, "__name__", repr(func))
                all_args = [str(a) for a in args] + [f"{k}={v}" for k, v in kwargs.items()]
                print_args = ", ".join(all_args)
                print(f"retry #{n}: {print_func}({print_args}), failed due to:\n{e}")
                n += 1
            else:
                return res
    return wrapper
|
||||||
|
|
||||||
|
def sanitize_attributes(i, attributes):
    """Replace the raw "$@MID@$" attribute by an integer "MID" entry.

    The dict is modified in place and also returned. The id stored under
    "$@MID@$" must equal *i*; this is checked with an assert.
    """
    reported_mid = int(attributes.pop("$@MID@$"))
    assert i == reported_mid
    attributes["MID"] = i
    return attributes
|
||||||
|
|
||||||
|
def sanitize_attachments(attachments, url):
    """Return *attachments*, or an empty list if it is exactly ``[url]``.

    TODO: WTF?! -- a list holding only the logbook URL itself is treated as
    "no attachments".
    """
    return [] if attachments == [url] else attachments
|
||||||
|
|
||||||
|
def build_entry(i, message, attributes, attachments):
    """Combine attributes, attachments and message text into one new dict.

    The attributes are copied first; "attachments" and "message" are then
    set on the copy (overriding attributes of the same name). *i* is
    accepted but not used.
    """
    entry = dict(attributes)
    entry["attachments"] = attachments
    entry["message"] = message
    return entry
|
||||||
|
|
||||||
|
def json_dump(data, fname):
    """Write *data* as JSON to *fname*, keys sorted, indented by 4 spaces."""
    with open(fname, mode="w") as out_file:
        json.dump(data, out_file, indent=4, sort_keys=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class FileDownloader:
    """Download files from URLs into one target folder."""

    def __init__(self, folder="."):
        """Remember *folder* and make sure it exists."""
        self.folder = folder
        mkdirs(folder)

    def get(self, urls):
        """Download every URL in *urls*; return the list of file names."""
        return list(map(self.get_file, urls))

    def get_file(self, url):
        """Download *url* into the folder.

        Returns:
            The file name relative to the folder (not the full path).
        """
        name = extract_filename(url)
        target = os.path.join(self.folder, name)
        download(url, target)
        return name
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def extract_filename(url):
    """Return the last component of the path part of *url*."""
    url_path = urllib3.util.parse_url(url).path
    return os.path.basename(url_path)
|
||||||
|
|
||||||
|
def download(url, fname):
    """GET *url* through the module-level pool and write the body to *fname*.

    The response is requested with preload_content=False and copied to the
    file with shutil.copyfileobj.
    """
    # NOTE(review): the HTTP status is not inspected here -- confirm whether
    # error responses should be written to disk like regular content.
    with http.request("GET", url, preload_content=False) as response:
        with open(fname, mode="wb") as target:
            shutil.copyfileobj(response, target)
        # Hand the connection back to the pool once the file is closed.
        response.release_conn()
|
||||||
|
|
||||||
|
def mkdirs(folder):
    """Create *folder* including missing parents; do nothing if it exists."""
    os.makedirs(name=folder, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Bind the parsed command-line values to module-level names, then run
    # the dump.
    url, output, attachments = clargs.url, clargs.output, clargs.attachments

    els = ELogScraper(
        url,
        output_folder=output,
        attachment_subfolder=attachments,
    )
    els.dump()
|
||||||
|
|
||||||
|
|
||||||
|
|
Reference in New Issue
Block a user