crystfel_tools/reduction_tools/cat_lst.py

#!/usr/bin/python

# author J.Beale

"""
# aim
script to append lst files from different run locations
so you give the script individual run numbers and/or a range of run numbers

# usage
python cat_lst.py -r <run_range> e.g. 45,50 - for runs 45-50, optional argument
                  <a series of individual run numbers> e.g., 45 47 50 - for these specific runs
                  #### note #### both of these can be used together - but you can't specify two lists
                  -e endstation - "alvra" or "cristallina"
                  -p pgroup
                  -l label - i.e. 'light', 'dark' or 'both'
                  -o output file name

# output
a concatenated list file of all the requested runs
"""

import argparse
import pandas as pd
import glob
import os
import numpy as np

def concatenate_files( input_file_lst, output ):
    """
    Write the contents of every file in input_file_lst, in order, into one file.

    input_file_lst - iterable of paths of the lst files to be joined
    output         - output file name without extension; ".lst" is appended

    The output file is created (or overwritten) even if input_file_lst is empty.
    Returns None.
    """

    output_file = "{0}.lst".format( output )

    # create output file
    with open( output_file, "w" ) as out_fh:

        # loop through input list - read and write to output file
        for lst_file_pwd in input_file_lst:

            with open( lst_file_pwd, "r" ) as lst_file:
                contents = lst_file.read()

            out_fh.write( contents )

            # a file without a trailing newline would otherwise have its last
            # entry fused with the first entry of the next file
            if contents and not contents.endswith( "\n" ):
                out_fh.write( "\n" )

def make_pwd( run_no, endstation, pgroup ):
    """
    Build the glob pattern for the data folder of a single run.

    run_no     - run number as a string (it is joined with "+", so it must be a str)
    endstation - endstation name, inserted after "/sf/"
    pgroup     - pgroup name, inserted after "/data/"

    Returns a string of the form /sf/<endstation>/data/<pgroup>/raw/run<run_no>*/data
    """

    # raw data folder of this endstation and pgroup
    raw_dir = "/sf/{0}/data/{1}/raw/".format( endstation, pgroup )

    # the wildcard stands for whatever follows the run number in the folder name
    run_pattern = "run" + run_no + "*/data"

    return raw_dir + run_pattern

def find_lst( lst_dir, label ):
    """
    Recursively collect the lst files below lst_dir whose file name carries label.

    lst_dir - directory to search (walked with os.walk)
    label   - text that must appear in the file name, e.g. "light" or "dark";
              "both" selects every file ending in ".lst"

    Returns a DataFrame with a single column "lst_pwd" holding the full path of
    each match. The column is present even when nothing is found.
    """

    # if label = both, i.e. both lights and darks, set label to lst - so it's always found
    if label == "both":
        label = "lst"

    # collect the paths first and build the df once, rather than concatenating per file
    lst_pwds = []

    # search for lst with appropriate labels
    for path, _dirs, files in os.walk( lst_dir ):
        for name in files:
            # the label test was missing, so every .lst file was returned
            # regardless of the requested label
            if name.endswith( ".lst" ) and label in name:
                lst_pwds.append( os.path.join( path, name ) )

    # return df of lst from this directory
    return pd.DataFrame( { "lst_pwd" : lst_pwds } )

def generate_lst_df( run_lst, endstation, label, pgroup ):
    """
    Collect the lst files of all requested runs into one DataFrame.

    run_lst    - iterable of run numbers; each is converted to a string and
                 left-padded with zeros to four characters
    endstation - endstation name passed on to make_pwd
    label      - lst label passed on to find_lst
    pgroup     - pgroup name passed on to make_pwd

    Returns a DataFrame with a "lst_pwd" column and a fresh 0..n-1 index. The
    column is present even when no lst file is found. Runs whose data directory
    cannot be found are reported and skipped.
    """

    # lst dfs of the individual runs - concatenated once at the end
    run_dfs = []

    for run_no in run_lst:

        # add zeros to left hand of number
        run_no = str( run_no ).zfill( 4 )

        # get approximate dir pwd and find the matching directories
        lst_app_dir = make_pwd( run_no, endstation, pgroup )
        lst_dirs = glob.glob( lst_app_dir )

        # indexing an empty glob result used to abort with an IndexError
        if not lst_dirs:
            print( "warning: no directory matching {0} - skipping run {1}".format( lst_app_dir, run_no ) )
            continue

        # find lsts in lst directory depending on label
        # (only the first directory matching the pattern is searched)
        lst_dir_df = find_lst( lst_dirs[0], label )

        # runs without any lst file contribute nothing
        if not lst_dir_df.empty:
            run_dfs.append( lst_dir_df )

    # keep the lst_pwd column so callers can rely on it when nothing was found
    if not run_dfs:
        return pd.DataFrame( columns=[ "lst_pwd" ] )

    return pd.concat( run_dfs, ignore_index=True )

def main( run_lst, endstation, label, pgroup, output_file ):
    """
    Find the lst files of the requested runs and join them into <output_file>.lst.

    run_lst     - run numbers to include
    endstation  - endstation the data were collected at
    label       - lst label used to select the files
    pgroup      - pgroup the data belong to
    output_file - output file name without the .lst extension
    """

    # paths of every lst file belonging to the requested runs
    lst_paths = generate_lst_df( run_lst, endstation, label, pgroup ).lst_pwd

    # write them one after another into the output file
    concatenate_files( lst_paths, output_file )

def range_of_runs(arg):
    """
    Parse the -r/--range argument into its two integer limits.

    arg - string of the form "<first>,<last>", e.g. "45,50"

    Returns [first, last] as a list of two ints.
    Raises ValueError if a value is not an integer or if the number of values
    is not exactly two. argparse reports a ValueError raised by a type
    function as an invalid argument value.
    """
    limits = [ int( value ) for value in arg.split( ',' ) ]

    # a single value used to crash later on limits[1]; extra values were silently ignored
    if len( limits ) != 2:
        raise ValueError(
            "run range must be two comma-separated run numbers, e.g. 45,50 - got '{0}'".format( arg )
        )

    return limits

if __name__ == "__main__":
    # command line interface - see the usage notes at the top of the file
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-r",
        "--range",
        help="list files in a range of run number to concatentate",
        type=range_of_runs
    )
    parser.add_argument(
        "runs",
        help="type in indivdual run numbers for list to be concantenated",
        type=str,
        nargs='*',
        default=[]
        )
    parser.add_argument(
        "-e",
        "--endstation",
        help="which endstation did you collect these data from, e.g., alvra or cristallina",
        type=str,
        default="cristallina"
        )
    # required: without a pgroup the data path contains "None" and can never match
    parser.add_argument(
        "-p",
        "--pgroup",
        help="pgroup the data are collected in",
        type=str,
        required=True
        )
    parser.add_argument(
        "-l",
        "--label",
        help="the label of the lst file, i.e. 'light', 'dark' or 'both'",
        type=str,
        required=True
        )
    parser.add_argument(
        "-o",
        "--output",
        help="name of output file",
        type=str,
        )
    args = parser.parse_args()

    # make continuous list from input range limits (both limits inclusive)
    # named range_runs so the builtin range is not shadowed
    range_runs = []
    if args.range is not None:
        limits = args.range
        # anything other than two limits cannot be expanded into a range
        if len( limits ) != 2:
            parser.error( "-r/--range expects two comma-separated run numbers, e.g. 45,50" )
        # convert to strings so they can be joined with the individually listed runs
        range_runs = [ str( run ) for run in range( limits[0], limits[1] + 1 ) ]

    # concat range and run lists
    run_lst = range_runs + args.runs

    # nothing to do without runs - report it as a usage error instead of a traceback
    if not run_lst:
        parser.error( "no runs requested - give a range with -r and/or individual run numbers" )

    print( "appending {0} lst files from runs {1}".format( args.label, run_lst ) )
    # run main
    main( run_lst, args.endstation, args.label, args.pgroup, args.output )
    print( "done" )