#!/usr/bin/env python3

# author J.Beale

"""
# aim
selects crystals based on their unit cell values

# usage
python stream_random.py -s <path to stream>
                        -o output file names
                        -c tolerance of cell axes to accept
                        -a tolerance of angles to accept
                        -i ideal central lengths to accept

# output
.stream file containing images that fall within requested range of value
"""

# modules
import re
import argparse
import pandas as pd
import numpy as np
import os

def extract_chunks( input_file ):

    # setup
    chunk_df = pd.DataFrame()
    image_no = []
    chunks = []
    hits = []
    collect_lines = False
    # Open the input file for reading
    with open(input_file, 'r') as f:
        for line in f:

            # Check for the start condition
            if line.startswith('----- Begin chunk -----'):
                hit = False
                collect_lines = True
                chunk_lines = []
            if collect_lines:
                chunk_lines.append(line)
            
            # find image_no
            if line.startswith( "Event:" ):
                image_search = re.findall( r"Event: //(\d+)", line )
                image = int(image_search[0])
                image_no.append( image )
            
            # is there a hit in chunk
            if line.startswith( "Cell parameters" ):
                hit = True

            if line.startswith('----- End chunk -----'):
                collect_lines = False  # Stop collecting lines
                chunks.append( chunk_lines )
                hits.append( hit )
    
    chunk_df[ "chunks" ] = chunks
    chunk_df[ "image_no" ] = image_no
    chunk_df[ "hit" ] = hits

    return chunk_df

def scrub_cells( line ):

    # get uc values from stream file
    # example - Cell parameters 7.71784 7.78870 3.75250 nm, 90.19135 90.77553 90.19243 deg
    pattern = r"Cell\sparameters\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\snm,\s(\d+\.\d+)\s(\d+\.\d+)\s(\d+\.\d+)\sdeg"
    a = re.search( pattern, line ).group(1)
    b = re.search( pattern, line ).group(2)
    c = re.search( pattern, line ).group(3)
    alpha = re.search( pattern, line ).group(4)
    beta = re.search( pattern, line ).group(5)
    gamma = re.search( pattern, line ).group(6)

    data = [ { "a" : float( a )*10,
               "b" : float( b )*10,
               "c" : float( c )*10,
               "alpha" : float( alpha ),
               "beta" : float( beta ),
               "gamma" : float( gamma )
                } ]
    cell_df = pd.DataFrame( data )

    return cell_df

def extract_xtals( chunk ):

    # setup
    xtals = []
    cells_df = pd.DataFrame()
    collect_crystal_lines = False
    # Open the input file for reading
    for line in chunk:

        # Check for the xtals start condition
        if line.startswith('--- Begin crystal'):
            collect_crystal_lines = True
            xtal_lines = []
        if collect_crystal_lines:
            xtal_lines.append(line)
        if line.startswith('--- End crystal\n'):
            collect_crystal_lines = False  # Stop collecting lines
            xtals.append( xtal_lines )
        if line.startswith( "Cell" ):
            cell_df = scrub_cells( line )
            cells_df = pd.concat( ( cells_df, cell_df ) )

    # reset index of cells_df
    cells_df = cells_df.reset_index( drop=True )

    return xtals, cells_df

def extract_header( chunk ):

    # setup
    header = []
    collect_header_lines = False
    # Open the input file for reading
    for line in chunk:

        # Check for the xtals start condition
        if line.startswith('----- Begin chunk -----'):
            collect_header_lines = True
            header_lines = []
        if collect_header_lines:
            header_lines.append(line)
        if line.startswith('End of peak list'):
            collect_header_lines = False  # Stop collecting lines
            header.append( header_lines )

    return header

def get_header( header, input_file ):

    if header == "geom":
        start_keyword = "----- Begin geometry file -----"
        end_keyword = "----- End geometry file -----"
    if header == "cell":
        start_keyword = "----- Begin unit cell -----"
        end_keyword = "----- End unit cell -----"

    # setup
    collect_lines = False
    headers = []

    # Open the input file for reading
    with open(input_file, 'r') as f:
        for line in f:
            # Check for the start condition
            if line.strip() == start_keyword:
                collect_lines = True
                headers_lines = []
            # Collect lines between start and end conditions
            if collect_lines:
                headers_lines.append(line)
            # Check for the end condition
            if line.strip() == end_keyword:
                collect_lines = False  # Stop collecting lines
                headers.append(headers_lines)
    
    return headers[0]

def write_to_file( geom, cell, chunk_header, crystals, output_file ):

    # Write sections with matching cell parameters to the output file
    with open(output_file, 'w') as out_file:
        out_file.write('CrystFEL stream format 2.3\n')
        out_file.write('Generated by CrystFEL 0.10.2\n')
        out_file.writelines(geom)
        out_file.writelines(cell)
        for crystal, header in zip( crystals, chunk_header ):
            out_file.writelines( header )
            out_file.writelines( crystal )
            out_file.writelines( "----- End chunk -----\n" )

def is_within_range( a, b, c, alpha, beta, gamma, cell_tolerance, angle_tolerance, ideal_centre ):
    
    cell = [ a, b, c, alpha, beta, gamma ]
    returner = False
    within_range = []    

    # Function to check if cell parameters are within the specified range
    for i in range(3):
        if cell[i] < ideal_centre[i]+cell_tolerance and cell[i] > ideal_centre[i]-cell_tolerance:
            within_range.append(True)
        else:
            within_range.append(False)
            
    for i in range(3, 6):
        if cell[i] < ideal_centre[i]+angle_tolerance and cell[i] > ideal_centre[i]-angle_tolerance:
            within_range.append(True)
        else:
            within_range.append(False)

    if all(within_range):
        returner = True

    return returner

def sort_xtals( chunk_df ):

    # extract xtals
    xtal_df = pd.DataFrame()
    counter = 0
    for index, row in chunk_df.iterrows():

        chunk, hit, image_no = row[ "chunks" ], row[ "hit" ], row[ "image_no" ]

        if hit:
        
            # find xtals and header
            header = extract_header( chunk )
            xtals, cells_df = extract_xtals( chunk )

            # make header same length as xtals
            header = header*len(xtals)

            # concat results
            xtal_df_1 = pd.DataFrame()
            xtal_df_1[ "header" ] = header
            xtal_df_1[ "xtals" ] = xtals
            xtal_df_1[ "image_no" ] = image_no
            xtal_df_1 = pd.concat( ( xtal_df_1, cells_df ), axis=1 )
            xtal_df = pd.concat( ( xtal_df, xtal_df_1 ) )

            # add count and print every 1000s
            counter = counter + len(xtals)
            if counter % 1000 == 0:
                print( counter, end='\r' )
    print( "done" )

    # sort by image no and reindex
    xtal_df = xtal_df.sort_values( by=[ "image_no" ] )
    xtal_df = xtal_df.reset_index( drop=True )

    return xtal_df

def main( input_file, output, cell_tolerance, angle_tolerance, ideal_centre ):

    # get geom and cell file headers
    print( "getting header info from .stream file" )
    geom = get_header( "geom", input_file )
    cell = get_header( "cell", input_file )
    print( "done" )
    
    # extract chunks
    print( "finding chucks" )
    chunk_df = extract_chunks( input_file )
    # display no. of chunks
    print( "found {0} chunks".format( len(chunk_df) ) )
    # remove rows without xtals
    chunk_df = chunk_df.loc[chunk_df.hit, :]
    print( "found {0} hits (not including multiples)".format( len(chunk_df) ) )
    print( "done" )

    print( "sorting xtals from chunks" )
    xtal_df = sort_xtals( chunk_df )
    print( "done" )

    print( "finding cells that are within tolerance" )
    # select cells that are within tolerance
    xtal_df[ "select" ] = xtal_df.apply(lambda x: is_within_range( x[ "a" ], x[ "b" ], x[ "c" ], 
                                                                   x[ "alpha" ], x[ "beta" ], x[ "gamma" ], 
                                                                   cell_tolerance, 
                                                                   angle_tolerance, 
                                                                   ideal_centre ), axis=1
                                        )
    select_df = xtal_df[ xtal_df[ "select" ] == True ]
    print( "done" )

    print( "writing {0} to output file".format( len( select_df ) ) )
    crystals = select_df.xtals.to_list()
    chunk_header = select_df.header.to_list()
    output_file = "{0}.stream".format( output )
    write_to_file( geom, cell, chunk_header, crystals, output_file )
    print( "done" )

def list_of_floats(arg):
    return list(map(float, arg.split(',')))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--stream",
        help="input stream file",
        required=True,
        type=os.path.abspath
    )
    parser.add_argument(
        "-o",
        "--output",
        help="output stream file name. '.stream will be added'",
        type=str,
        default="sample"
    )
    parser.add_argument(
        "-c",
        "--cell_tolerance",
        help="tolerance for cell lengths. 0.2 is default.",
        type=float,
        default=0.2
    )
    parser.add_argument(
        "-a",
        "--angle_tolerance",
        help="tolerance for cell angles. 0.25 is default.",
        type=float,
        default=0.25
    )
    parser.add_argument(
        "-i",
        "--ideal_centre",
        help="ideal lengths to be selected around. List of floats - e.g. 78.5, 78.5, 38.45, 90, 90, 90",
        type=list_of_floats,
        required=True
    )
    args = parser.parse_args()

    # run main
    main( args.stream, args.output, args.cell_tolerance, args.angle_tolerance, args.ideal_centre )