import argparse
import re

# Patterns are compiled once at import time so the per-chunk loop does not
# have to look them up again for every chunk.
_CHUNK_RE = re.compile(
    r'----- Begin chunk -----.*?----- End chunk -----', re.DOTALL
)
_INDEXED_BY_RE = re.compile(r'indexed_by = (.+)')
_IMAGE_FILENAME_RE = re.compile(r'Image filename: (.+\.h5)')
_EVENT_RE = re.compile(r'Event: (//\d+)')


def _extract_entry(chunk: str):
    """Return ``(image_filename, event)`` for an indexed chunk, else ``None``.

    A chunk yields an entry only when all of the following hold:

    * it has an ``indexed_by = ...`` line whose stripped value is not
      ``none``;
    * it has an ``Image filename: ....h5`` line;
    * it has an ``Event: //<digits>`` line.
    """
    indexed_by = _INDEXED_BY_RE.search(chunk)
    if indexed_by is None or indexed_by.group(1).strip() == 'none':
        return None

    image_filename = _IMAGE_FILENAME_RE.search(chunk)
    event = _EVENT_RE.search(chunk)
    if image_filename is None or event is None:
        return None

    return image_filename.group(1), event.group(1)


def extract_data_from_chunks(input_file: str, output_file: str, name: str) -> None:
    """Write one ``<image filename> <event> <name>`` line per indexed chunk.

    The input file is read in full and split into chunks delimited by
    ``----- Begin chunk -----`` / ``----- End chunk -----`` markers. Text
    outside those markers is ignored. Progress messages are printed to
    stdout.

    Args:
        input_file: Path of the text file to scan.
        output_file: Path of the file to create (or overwrite) with the
            extracted lines.
        name: Label appended as the last field of every output line.
    """
    print("reading input file")
    with open(input_file, 'r') as file:
        data = file.read()
    print("done")

    print("finding chunks")
    # The chunk count is reported before processing, so the matches are
    # materialised into a list rather than iterated lazily.
    chunks = _CHUNK_RE.findall(data)
    print(f"done. {len(chunks)} found")

    print("cycle through chunks")
    with open(output_file, 'w') as out_file:
        for chunk in chunks:
            entry = _extract_entry(chunk)
            if entry is None:
                continue
            image_filename, event_number = entry
            out_file.write(f"{image_filename} {event_number} {name}\n")
    print("done")


def main() -> None:
    """Parse the command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser(
        description="Extract data from chunks in a text file."
    )
    parser.add_argument("-i", "--input", help="Input file path", required=True)
    parser.add_argument("-o", "--output", help="Output file path", required=True)
    parser.add_argument("-n", "--name", help="name of dataset", required=True)
    args = parser.parse_args()
    extract_data_from_chunks(args.input, args.output, args.name)


if __name__ == "__main__":
    main()