import argparse
import re

def extract_data_from_chunks( input_file, output_file, name ):
    """Write one "<image filename> <event> <name>" line per indexed chunk.

    The input file is read in full and split into chunks, where a chunk is
    the text from a ``----- Begin chunk -----`` marker up to the nearest
    following ``----- End chunk -----`` marker. A chunk contributes a line
    to the output only when all of the following hold:

    * it contains an ``indexed_by = <value>`` entry whose value, with
      surrounding whitespace stripped, is not ``none``;
    * it contains an ``Image filename: <path>`` entry ending in ``.h5``;
    * it contains an ``Event: //<digits>`` entry.

    Only the first occurrence of each entry inside a chunk is considered.
    Progress messages are printed to stdout along the way.

    Args:
        input_file: Path of the text file to scan.
        output_file: Path of the list file to create (overwritten if present).
        name: Label appended as the last column of every output line.
    """
    # DOTALL lets the lazy ".*?" run across line breaks inside a chunk.
    chunk_pattern = re.compile(r'----- Begin chunk -----.*?----- End chunk -----', re.DOTALL)
    indexed_by_pattern = re.compile(r'indexed_by = (.+)')
    filename_pattern = re.compile(r'Image filename: (.+\.h5)')
    event_pattern = re.compile(r'Event: (//\d+)')

    print( "reading input file" )
    with open(input_file, 'r') as source:
        text = source.read()
    print( "done" )

    print( "finding chunks" )
    chunks = chunk_pattern.findall(text)
    print( "done. {0} found".format( len( chunks ) ) )

    print( "cycle through chunks" )
    with open(output_file, 'w') as sink:
        for block in chunks:
            indexer = indexed_by_pattern.search(block)
            if indexer is None:
                continue  # chunk carries no indexing information at all

            if indexer.group(1).strip() == 'none':
                continue  # indexing was attempted but did not succeed

            filename_hit = filename_pattern.search(block)
            event_hit = event_pattern.search(block)
            if filename_hit is None or event_hit is None:
                continue  # both fields are needed to build an output line

            image_filename = filename_hit.group(1)
            event_number = event_hit.group(1)
            sink.write(f"{image_filename} {event_number} {name}\n")
    print( "done" )

if __name__ == "__main__":
    # Command-line entry point: three mandatory options (input path, output
    # path, dataset name) are parsed and handed to extract_data_from_chunks.
    parser = argparse.ArgumentParser(description="Extract data from chunks in a text file.")

    # All options share the same shape (short flag, long flag, help text,
    # required), so they are registered from a table in display order.
    option_table = (
        ("-i", "--input", "Input file path"),
        ("-o", "--output", "Output file path"),
        ("-n", "--name", "name of dataset"),
    )
    for short_flag, long_flag, help_text in option_table:
        parser.add_argument(short_flag, long_flag, help=help_text, required=True)

    args = parser.parse_args()

    extract_data_from_chunks( args.input, args.output, args.name )