#!/usr/bin/env python3

# authors T.Mason and J.Beale

"""
# aim
to refine the detector distance using crystfel
- naming covention = #.###/#.###.stream

# usage
python detector-distance-refinement.py -l <path to lst file generated by daq> 
                                       -g <path to geom file>
                                       -d central clen to refine around
                                       -c cell_file
                                       -s sample size
                                       -e endstation - alvra or cristallina

# other variables
-f = fine only = only perform fine scan

# output
plot files of the analysis and a suggest for the clen
"""

# modules
import pandas as pd
import subprocess
import os, errno
import regex as re
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import argparse

def h5_sample( lst, sample ):

    # create sample of images from run
    # read h5.lst - note - removes // from image column
    cols = [ "h5", "image" ]
    sample_df = pd.read_csv( lst, sep="\s//", engine="python", names=cols )

    # take defined sample
    sample_df = sample_df.sample( sample )

    # sort list
    sample_df = sample_df.sort_index()

    # re-add // to image columm
    sample_df[ "image" ] = "//" + sample_df.image.astype(str)

    # write sample to file
    sample_file = "h5_{0}_sample.lst".format( sample )
    sample_df.to_csv( sample_file, sep=" ", index=False, header=False )

    # return sample file name
    return sample_file

def geom_amend( lab6_geom_file, clen ):

    # read lab6 geom
    lab6_geom = open( lab6_geom_file, "r" )

    # use regex to find clen and replace with new
    # clen example => clen = 0.1217
    clen_geom = re.sub( "clen = 0\.\d+", "clen = {0}".format( clen ), lab6_geom.read() )

    # close lab6 geom file
    lab6_geom.close()

    # write new clen_geom to file
    clen_geom_file = "{0}.geom".format( clen )
    geom = open( clen_geom_file, "w" )
    geom.write( clen_geom )
    geom.close()

    # return clen_geom file name
    return clen_geom_file

def write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold ):

    # crystfel file name
    cryst_run_file = "{0}_run.sh".format( clen )

    # write file
    run_sh = open( cryst_run_file, "w" )
    run_sh.write( "#!/bin/sh\n\n" )
    run_sh.write( "module purge\n" )
    run_sh.write( "module load crystfel/0.10.2\n" )
    run_sh.write( "indexamajig -i {0} \\\n".format( sample_h5_file ) )
    run_sh.write( "  --output={0}.stream \\\n".format( clen ) )
    run_sh.write( "  --geometry={0}\\\n".format( clen_geom_file ) )
    run_sh.write( "  --pdb={0} \\\n".format( cell_file ) )
    run_sh.write( "  --indexing=mosflm --peaks=peakfinder8 \\\n" )
    run_sh.write( "  --threshold={0} --min-snr=5 --int-radius=3,5,9 \\\n".format( threshold ) )
    run_sh.write( "  -j 32 --no-multi --no-retry --max-res=3000 --min-pix-count=2 --min-res=85\n\n" )
    run_sh.close()

    # make file executable
    subprocess.call( [ "chmod", "+x", "{0}".format( cryst_run_file ) ] )

    # return crystfel file name
    return cryst_run_file

def make_sample( lst, sample ):
    
    # make sample list
    print("making {0} sample of images".format(sample))
    sample_h5 = h5_sample(lst, sample)
    sample_h5_file = "{0}/{1}".format(cwd, sample_h5)
    print("done")
    
    return sample_h5_file
    
def make_process_dir(proc_dir):
        # make process directory
        try:
            os.makedirs( proc_dir )
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
                
def make_step_range(centre_clen, step_size, steps):
    # make list of clen steps above and below the central clen
    print( "make clen array around {0}".format( centre_clen ) )
    step_range = step_size*steps
    bottom_step = centre_clen-step_range/2
    top_step = bottom_step+step_range
    step_range = np.arange( bottom_step, top_step, step_size )
    step_range = step_range.round( 6 ) # important - otherwise np gives your .99999999 instead of 1 somethimes
    print( "done" )
    
    return step_range

def submit_job( job_file ):

    # submit the job
    submit_cmd = [ "sbatch", "-p", "day", "--cpus-per-task=32", "--", job_file ]
    job_output = subprocess.check_output(submit_cmd)

    # scrub job id from - example Submitted batch job 742403
    pattern = r"Submitted batch job (\d+)"
    job_id = re.search( pattern, job_output.decode().strip() ).group(1)

    return int(job_id)

def wait_for_jobs( job_ids, total_jobs ):

    with tqdm(total=total_jobs, desc="Jobs Completed", unit="job") as pbar:
        while job_ids:
            completed_jobs = set()
            for job_id in job_ids:
                status_cmd = ["squeue", "-h", "-j", str(job_id)]
                status = subprocess.check_output(status_cmd)
                if not status:
                    completed_jobs.add(job_id)
                    pbar.update(1)
            job_ids.difference_update(completed_jobs)
            time.sleep(30)
        
def scrub_clen( stream_pwd ):

    # get clen from stream name
    # example - /sf/cristallina/data/p20590/work/process/jhb/detector_refinement/coarse_scan/0.115/0.115.stream
    # scrub clen and return - else nan
    try:
        pattern = r"0\.\d+/(0\.\d+)\.stream"
        re_search = re.search( pattern, stream_pwd )
        clen = re_search.group( 1 )
        if AttributeError:
            return float( clen )
    except AttributeError:
        return 1

def find_streams( top_dir ):

    # create df for streams
    stream_df = pd.DataFrame()

    # search for all files that end with .stream

    for path, dirs, files in os.walk( top_dir ):
        for name in files:
            if name.endswith( ".stream" ):

                # get stream pwd
                stream_pwd = os.path.join( path, name )

                # scrub clen from stream
                clen = scrub_clen( stream_pwd )

                # put clen and stream pwd into df
                data = [ {  "stream_pwd" : stream_pwd,
                            "clen" : clen
                         } ]
                stream_df_1 = pd.DataFrame( data )
                stream_df = pd.concat( ( stream_df, stream_df_1 ) )
    
    # sort df based on clen
    stream_df = stream_df.sort_values( by="clen" )

    # reset df index
    stream_df = stream_df.reset_index( drop=True )

    # return df of streams and clens
    return stream_df

def scrub_us( stream ):

    # get uc values from stream file
    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
    # scrub clen and return - else nan
    try:
        pattern = r"Cell\sparameters\s(\d\.\d+)\s(\d\.\d+)\s(\d\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
        cells = re.findall( pattern, stream )
        if AttributeError:
            return cells
    except AttributeError:
        return np.nan
    
def scrub_helper( top_dir ):

    # find stream files from process directory
    print( "finding stream files" )
    stream_df = find_streams( top_dir )
    print( "done" )

    # making results df for unit cell and index no.
    stats_df = pd.DataFrame()

    # loop through stream files and collect unit_cell information
    print( "looping through stream files to collect unit cell, indexed information" )
    for index, row in stream_df.iterrows():

        stream_pwd, clen = row[ "stream_pwd" ], row[ "clen" ]

        # open stream file
        stream = open( stream_pwd, "r" ).read()

        # scrub unit cell information
        cells = scrub_us( stream )

        # put cells in df
        cols = [ "a", "b", "c", "alpha", "beta", "gamma" ]
        cells_df = pd.DataFrame( cells, columns=cols )
        cells_df = cells_df.astype( float )

        # calc stats
        indexed = len( cells_df )
        std_a = cells_df.a.std()
        std_b = cells_df.b.std()
        std_c = cells_df.c.std()
        std_alpha = cells_df.alpha.std()
        std_beta = cells_df.beta.std()
        std_gamma = cells_df.gamma.std()

        # put stats in results df
        stats = [ { "clen" : clen,
                    "indexed" : indexed,
                    "std_a" : std_a,
                    "std_b" : std_b,
                    "std_c" : std_c,
                    "std_alpha" : std_alpha,
                    "std_beta" : std_beta,
                    "std_gamma" : std_gamma,
                } ]
        stats_df_1 = pd.DataFrame( stats )
        stats_df = pd.concat( ( stats_df, stats_df_1 ) )

    # reset index
    stats_df = stats_df.reset_index( drop=True )
    print( "done" )

    return stats_df

def find_clen_values(stats_df):

    def find_min_clen(col_name):
        min_val = stats_df[col_name].min()
        min_row = stats_df[stats_df[col_name] == min_val]
        min_clen = min_row['clen'].values[0]
        return min_val, min_clen

    min_alpha_val, min_alpha_clen = find_min_clen('std_alpha')
    min_beta_val, min_beta_clen = find_min_clen('std_beta')
    min_gamma_val, min_gamma_clen = find_min_clen('std_gamma')
    min_c_val, min_c_clen = find_min_clen('std_c')
    
    print("The value of clen for the minimum alpha value of {} is {}".format(min_alpha_val, min_alpha_clen))
    print("The value of clen for the minimum beta value of {} is {}".format(min_beta_val, min_beta_clen))
    print("The value of clen for the minimum gamma value of {} is {}".format(min_gamma_val, min_gamma_clen))
    print("The value of clen for the minimum c value of {} is {}".format(min_c_val, min_c_clen))
    
    return min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val

def plot_indexed_std( stats_df, ax1, ax2 ):

    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)

    # label color
    color = "tab:blue"
    ax2.set_ylabel("a,b,c st.deviation", color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # std_a plot
    color = "lightsteelblue"
    ax2.plot(stats_df.clen, stats_df.std_a, color=color)

    # std_b plot
    color = "cornflowerblue"
    ax2.plot(stats_df.clen, stats_df.std_b, color=color)

    # std_c plot
    color = "royalblue"
    ax2.plot(stats_df.clen, stats_df.std_c, color=color)

def plot_indexed_std_alpha_beta_gamma( stats_df, ax1, ax2 ):

    # indexed images plot
    color = "tab:red"
    ax1.set_xlabel("clen")
    ax1.set_ylabel("indexed", color=color)
    ax1.plot(stats_df.clen, stats_df.indexed, color=color)
    ax1.tick_params(axis="y", labelcolor=color)

    # label color
    color = "tab:green"
    ax2.set_ylabel("alpha, beta, gamma st.deviation", color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    # std_alpha plot
    color = "limegreen"
    ax2.plot(stats_df.clen, stats_df.std_alpha, color=color)

    # std_beta plot
    color = "darkgreen"
    ax2.plot(stats_df.clen, stats_df.std_beta, color=color)

    # std_gamma plot
    color = "green"
    ax2.plot(stats_df.clen, stats_df.std_gamma, color=color)
        
def scan( cwd, lst, sample, lab6_geom_file, centre_clen, cell_file, threshold, step_size ):
    
    # define coarse or fine scan
    if step_size == "coarse":
        steps = 20
        step_size = 0.0005 # m
        scan_name = "coarse"
    if step_size == "fine":
        steps = 50
        step_size = 0.00005 # m
        scan_name = "fine"

    #make sample list
    sample_h5_file = make_sample(lst, sample)
    
    # make list of clen steps above and below the central clen
    step_range = make_step_range(centre_clen, step_size, steps)
    
    # submitted job set and job_list
    submitted_job_ids = set()
    job_list = []

    # make directorys for results
    print( "begin CrystFEL anaylsis of different clens" )

    # loop to cycle through clen steps
    for clen in step_range:

        # define process directory
        proc_dir = "{0}/{1}/{2}".format( cwd, scan_name, clen )

        # make process directory
        make_process_dir(proc_dir)
        
        # move to process directory
        os.chdir( proc_dir )

        # make geom file
        clen_geom_file = geom_amend( lab6_geom_file, clen )

        # make crystfel run file
        cryst_run_file = write_crystfel_run( clen, sample_h5_file, clen_geom_file, cell_file, threshold )  

        # run crystfel file
        job_list.append( cryst_run_file )
        job_id = submit_job( cryst_run_file )
        submitted_job_ids.add( job_id )

        # move back to cwd
        os.chdir( cwd )

    #wait for jobs to complete
    wait_for_jobs(submitted_job_ids, len(job_list))
    print("slurm processing done")

def scrub_scan( scan_top_dir, scan ):

    stats_df = scrub_helper(scan_top_dir)

    #print clen for minimum alpha, beta, and gamma values
    min_alpha_clen, min_beta_clen, min_gamma_clen, min_c_clen, min_alpha_val, min_beta_val, min_gamma_val, min_c_val = find_clen_values(stats_df)

    # print suggested clen
    suggested_clen = (min_alpha_clen + min_beta_clen + min_gamma_clen )/3
    suggested_clen = round(suggested_clen, 4)
    print ("The suggested clen = {0}".format(suggested_clen))
    
    # plot results
    fig, (ax1, ax3) = plt.subplots(1, 2)
    ax2 = ax1.twinx()
    ax4 = ax3.twinx()

    plot_indexed_std(stats_df, ax1, ax2)
    plot_indexed_std_alpha_beta_gamma(stats_df, ax3, ax4)

    fig.tight_layout()
    plt.savefig("{0}.png".format(scan))

    return suggested_clen

def main( cwd, lst, sample, geom, centre_clen, cell_file, fine_only, threshold ):

    # if statement to do coarse if fine_only==False
    if fine_only == False:
        top_dir_coarse = "{0}/coarse".format( cwd )

        scan( cwd, lst, sample, geom, centre_clen, cell_file, threshold, step_size="coarse" )

        suggested_clen = scrub_scan( top_dir_coarse, scan="coarse" )
    else:
        suggested_clen = centre_clen

    top_dir_fine = "{0}/fine".format( cwd )

    scan( cwd, lst, sample, geom, suggested_clen, cell_file, threshold, step_size="fine" )

    scrub_scan( top_dir_fine, scan="fine" )
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-l",
        "--lst",
        help="path to crystfel list file containing enough patterns for detector distance refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-g",
        "--geom",
        help="path to geom file to be used in the refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-d",
        "--central_distance",
        help="intial clen to use for refinement - usually from detector shift refinement",
        type=float,
        required=True
    )
    parser.add_argument(
        "-c",
        "--cell_file",
        help="path to cell file of the crystals used in the refinement",
        type=os.path.abspath,
        required=True
    )
    parser.add_argument(
        "-s",
        "--sample",
        help="sample size to use in the refinement",
        type=int,
        default=500
    )
    parser.add_argument(
        "-f",
        "--fine_only",
        help="True = only do fine scan",
        type=bool,
        default=False
    )
    parser.add_argument(
        "-e",
        "--endstation",
        help="which endstation did you collect these data from, e.g., alvra or cristallina",
        type=str,
        default="cristallina"
    )
    args = parser.parse_args()
    # set threshold based on endstation
    if args.endstation == "alvra":
        threshold = 3000
    elif args.endstation == "cristallina":
        threshold = 10
    else:
        print( "you must say which beamline you collected the data on, alvra or cristallina, to set the threshold value correctly for crystfel" )
        exit()
    # run main
    cwd = os.getcwd()
    print( "top working directory = {0}".format( cwd ) )
    main( cwd, args.lst, args.sample, args.geom, args.central_distance, args.cell_file, args.fine_only, threshold )