# File: acsm-fairifier/workflows/utils.py
# (302 lines, 12 KiB, Python)
import os
import sys
import inspect

# Resolve this file's absolute location. In environments without a
# __file__ attribute (interactive shells, notebooks) fall back to the
# current working directory so downstream path logic still works.
try:
    thisFilePath = os.path.abspath(__file__)
    print(thisFilePath)
except NameError:
    print("[Notice] The __file__ attribute is unavailable in this environment (e.g., Jupyter or IDLE).")
    print("When using a terminal, make sure the working directory is set to the script's location to prevent path issues (for the DIMA submodule)")
    thisFilePath = os.getcwd()

# The project root sits two directory levels above this module; put it
# on sys.path (front of the list) so project-local packages resolve.
projectPath = os.path.normpath(os.path.join(thisFilePath, "..", ".."))
if projectPath not in sys.path:
    sys.path.insert(0, projectPath)

import yaml
from collections import OrderedDict
def generate_command(base_command='python', inputs=None, outputs=None, parameters=None):
    """
    Build a shell command string with $-placeholders for a workflow step.

    Each (name, spec) pair contributes a '$name' placeholder unless its
    spec is a dict flagged 'implicit'. Placeholders appear in the order:
    inputs, parameters, outputs. Pairs whose spec is not a dict are
    skipped entirely.

    Args:
        base_command (str): Executable to invoke (default 'python').
        inputs (list, optional): (name, spec) pairs of step inputs.
        outputs (list, optional): (name, spec) pairs of step outputs.
        parameters (list, optional): (name, spec) pairs of parameters.

    Returns:
        str: Command line, e.g. 'python $input_file $param $output_file'.
    """
    inputs = inputs or []
    outputs = outputs or []
    # Bug fix: parameters was previously left as None, so the default
    # call raised TypeError on 'inputs + parameters + outputs'.
    parameters = parameters or []
    placeholders = [
        f"${name}"
        for name, value in inputs + parameters + outputs
        if isinstance(value, dict) and not value.get('implicit', False)
    ]
    return f"{base_command} {' '.join(placeholders)}"
class RenkuWorkflowBuilder:
    """
    Build and manage a Renku workflow definition (YAML-based).

    Steps can be added, merged, serialized, and reloaded. Steps are kept
    in insertion order; adding a step whose name already exists with
    different content stores it under a content-hashed alias instead of
    overwriting the existing definition.
    """

    def __init__(self, name):
        """
        Initialize a workflow builder.

        Args:
            name (str): Workflow name (also used as the YAML file stem).
        """
        self.name = name
        # Workflow files live under <project root>/workflows; forward
        # slashes keep stored paths portable across platforms.
        self.directory_path = os.path.join(projectPath, 'workflows').replace(os.sep, '/')
        self.steps = OrderedDict()

    @staticmethod
    def _hash_content(step_def):
        """
        Compute a stable MD5 fingerprint of a step definition.

        Used for collision detection / step versioning only (not for
        security).

        Args:
            step_def (dict): Step definition to fingerprint.

        Returns:
            str: 32-character hexadecimal digest.
        """
        import json, hashlib
        # sort_keys makes the digest independent of dict insertion order;
        # default=str keeps non-JSON-serializable values from raising.
        content_str = json.dumps(step_def, sort_keys=True, default=str)
        return hashlib.md5(content_str.encode()).hexdigest()

    @staticmethod
    def _normalize_paths(items):
        """
        Normalize file paths inside a list of (key, value) pairs.

        Args:
            items (list): Sequence of (name, spec) pairs; a spec that is
                a dict may carry a 'path' entry.

        Returns:
            list: One {name: spec} dict per pair, with any 'path' value
            converted to forward slashes. Caller dicts are not mutated
            (previously the spec dict was modified in place).
        """
        if not items:
            return items
        normalized = []
        for key, value in items:
            if isinstance(value, dict) and 'path' in value:
                # Copy so the caller's spec dict is left untouched.
                value = {**value, 'path': value['path'].replace(os.sep, '/')}
            normalized.append({key: value})
        return normalized

    def create_workflow_file(self):
        """
        Create (or update) the workflow YAML file on disk and report
        whether the file is present afterwards.
        """
        self.save_to_file(self.directory_path)
        filepath = os.path.join(self.directory_path, f'{self.name}.yaml').replace(os.sep, '/')
        if os.path.exists(filepath):
            print(f'Workflow file created at : {filepath}')
        else:
            # Previously silent (TODO in original); surface the failure.
            print(f'[Warning] Workflow file was not created at : {filepath}')

    def add_step(self, step_name, base_command, inputs=None, outputs=None, parameters=None):
        """
        Add a step to the workflow and persist it to file.

        Normalizes input/output/parameter paths and avoids duplicates:
        an identical step is left as-is; a same-named step with different
        content is stored under a content-hashed alias.

        Args:
            step_name (str): Name of the step.
            base_command (str): Command executable (e.g. 'python').
            inputs (list, optional): (name, spec) pairs of step inputs.
            outputs (list, optional): (name, spec) pairs of step outputs.
            parameters (list, optional): (name, spec) pairs of parameters.
        """
        # None defaults replace the original mutable [] defaults, which
        # were shared across calls.
        inputs = inputs or []
        outputs = outputs or []
        parameters = parameters or []
        command = generate_command(base_command, inputs, outputs, parameters)
        step = {
            'command': command,
            'inputs': self._normalize_paths(inputs),
            'outputs': self._normalize_paths(outputs),
            'parameters': self._normalize_paths(parameters),
        }
        # Deduplicate or version the step.
        if step_name not in self.steps:
            self.steps[step_name] = step
        elif self.steps[step_name] != step:
            content_hash = self._hash_content(step)
            hashed_name = f"{step_name}_{content_hash[:8]}"
            print(f"[Added] Step '{step_name}' as '{hashed_name}'")
            self.steps[hashed_name] = step
        self.save_to_file(self.directory_path)

    def run_and_add_step(self, step, *args, **kwargs):
        """
        Run a step function, collect its provenance, and register it.

        The step's module filename (without extension) becomes the step
        name. The step callable must return a provenance dict with
        'inputs', 'outputs' and 'parameters' keys.

        Args:
            step (callable): Step implementation to execute.
            *args: Positional arguments forwarded to the step.
            **kwargs: Keyword arguments forwarded to the step.

        Returns:
            dict: The provenance returned by the step.

        Raises:
            ValueError: If the returned provenance is not a dict with
                the three required keys.
        """
        file_path = inspect.getfile(step)
        step_name = os.path.splitext(os.path.basename(file_path))[0]
        provenance = step(*args, **kwargs)
        # Validate provenance shape (TODO in original) before using it.
        required = ('inputs', 'outputs', 'parameters')
        if not isinstance(provenance, dict) or any(k not in provenance for k in required):
            raise ValueError(
                f"Step '{step_name}' returned invalid provenance; expected a dict with keys {required}."
            )
        self.add_step(step_name, "python",
                      provenance["inputs"],
                      provenance["outputs"],
                      provenance["parameters"])
        return provenance

    @staticmethod
    def parse_workflow(yaml_content: str):
        """
        Parse YAML content and return a populated RenkuWorkflowBuilder.

        Args:
            yaml_content (str): Serialized workflow (as written by to_yaml).

        Returns:
            RenkuWorkflowBuilder: Builder holding the parsed steps.
        """
        data = yaml.safe_load(yaml_content)
        builder = RenkuWorkflowBuilder(data['name'])
        for step_name, step_def in data['steps'].items():
            # Rebuild each segment as a fresh list of {name: spec} dicts;
            # an absent or null segment becomes an empty list ('or []'
            # replaces the original redundant trailing 'if ... else []').
            builder.steps[step_name] = {
                'command': step_def.get('command'),
                'inputs': [{k: v} for item in step_def.get('inputs') or [] for k, v in item.items()],
                'outputs': [{k: v} for item in step_def.get('outputs') or [] for k, v in item.items()],
                'parameters': [{k: v} for item in step_def.get('parameters') or [] for k, v in item.items()],
            }
        return builder

    @staticmethod
    def from_file(filepath):
        """
        Load a workflow from file, or return a new empty one if the
        file does not exist (name taken from the file stem).
        """
        if not os.path.exists(filepath):
            workflow_name = os.path.splitext(os.path.basename(filepath))[0]
            return RenkuWorkflowBuilder(name=workflow_name)
        with open(filepath, 'r') as f:
            return RenkuWorkflowBuilder.parse_workflow(f.read())

    def to_dict(self):
        """
        Return the workflow definition as a dict.

        Also normalizes 'path' entries in step inputs/outputs to forward
        slashes (in place, on the stored step specs).

        Raises:
            ValueError: If an input/output spec is not a dict.
        """
        for step_name, step_value in self.steps.items():
            for segment in ('inputs', 'outputs'):
                for item in step_value.get(segment, []):
                    for _, value in item.items():
                        if not isinstance(value, dict):
                            raise ValueError(
                                f"Invalid input. Step {step_name} must have {segment} as dict or str type."
                            )
                        if 'path' in value:
                            value['path'] = value['path'].replace(os.sep, '/')
        return {'name': self.name, 'steps': dict(self.steps)}

    def to_yaml(self):
        """
        Serialize the workflow definition to YAML, preserving step order.
        """
        return yaml.dump(self.to_dict(), sort_keys=False)

    def append_from(self, other, force=False):
        """
        Merge steps from another workflow into this one.

        New steps are copied over; a same-named step with different
        content is added under a content-hashed alias.

        Args:
            other (RenkuWorkflowBuilder): Workflow to merge from.
            force (bool): Currently unused; kept for interface stability.

        Raises:
            ValueError: If the two workflows have different names.
        """
        if other.name != self.name:
            raise ValueError(f"Cannot merge workflows with different names: {self.name} != {other.name}")
        # Snapshot so steps added during this merge are not re-examined.
        curr_steps = self.steps.copy()
        for step_name, step_def in other.steps.items():
            if step_name not in curr_steps:
                self.steps[step_name] = step_def
            elif self.steps[step_name] != step_def:
                content_hash = self._hash_content(step_def)
                hashed_name = f"{step_name}_{content_hash[:8]}"
                if hashed_name not in self.steps:
                    self.steps[hashed_name] = step_def
                    # Message now matches add_step's format (the original
                    # was missing the " as " separator).
                    print(f"[Added] Step '{step_name}' as '{hashed_name}'")

    def save_to_file(self, directory):
        """
        Save the workflow definition to '<directory>/<name>.yaml'.

        If a file for this workflow already exists, its steps are merged
        with the current ones before writing.

        Args:
            directory (str): Destination directory (created if missing).
        """
        os.makedirs(directory, exist_ok=True)
        filepath = os.path.join(directory, f"{self.name}.yaml")
        if os.path.exists(filepath):
            existing = RenkuWorkflowBuilder.from_file(filepath)
            if existing and existing.name == self.name:
                existing.append_from(self)
                with open(filepath, 'w') as f:
                    f.write(existing.to_yaml())
                return
        with open(filepath, 'w') as f:
            f.write(self.to_yaml())
import os
import re
import yaml
from graphviz import Digraph
from IPython.display import Image
def plot_workflow_graph(yaml_file_path,
                        output_dir=".",
                        output_name="workflow_graph",
                        output_format="png",
                        dpi=300,
                        show_parameters=False):
    """
    Render a workflow YAML file as a directed data-flow graph.

    Steps are drawn as blue boxes; input files as grey ellipses and
    output files as green ellipses, with edges following data flow.

    Args:
        yaml_file_path (str): Path to the workflow YAML file.
        output_dir (str): Directory the rendered graph is written to.
        output_name (str): Basename (without extension) of the output.
        output_format (str): Graphviz output format (e.g. 'png', 'svg').
        dpi (int): Render resolution; applied only for 'png'.
        show_parameters (bool): If True, prepend each step's parameter
            values to its node label.

    Returns:
        IPython.display.Image for raster formats ('png', 'jpg', 'jpeg',
        'gif'); otherwise the path of the rendered file (str).
    """

    def shorten_path(path, keep_start=1, keep_end=2):
        """
        Shortens a long path by keeping a few elements from the start and end.
        E.g. 'a/b/c/d/e/f.txt' -> 'a/.../e/f.txt'
        """
        parts = path.strip('/').split('/')
        if len(parts) <= (keep_start + keep_end):
            return path
        return '/'.join(parts[:keep_start]) + '/.../' + '/'.join(parts[-keep_end:])

    def split_path_label(path):
        """Break a path onto two label lines: directory part, then file name."""
        parts = path.split('/')
        if len(parts) >= 2:
            return f"{'/'.join(parts[:-1])}/\n{parts[-1]}"
        return path

    # Load YAML workflow file
    with open(yaml_file_path, 'r') as f:
        workflow_full = yaml.safe_load(f)

    dot = Digraph(format=output_format)
    dot.attr(rankdir='LR')  # horizontal (left-to-right) layout
    dot.node_attr.update(fontsize='48')
    # Set DPI only if format supports it (like png)
    if output_format.lower() == 'png':
        dot.attr(dpi=str(dpi))

    used_paths = set()
    for step_name, step in workflow_full['steps'].items():
        # Build the step node label, optionally prefixed with parameter
        # values. (The original fetched 'parameters' twice; the first,
        # unconditional fetch was dead code and has been removed.)
        param_lines = []
        if show_parameters:
            for param in step.get("parameters", []):
                for k, v in param.items():
                    param_lines.append(f"{k} = {v.get('value', '')}")
        param_label = "\n".join(param_lines)
        label = f"{param_label}\n{step_name}" if param_label else step_name
        dot.node(step_name, label=label, shape="box", style="filled", fillcolor="lightblue")

        # File nodes are created once (deduplicated via used_paths);
        # edges are added for every step that touches the file.
        for input_item in step.get('inputs', []):
            for _, val in input_item.items():
                if isinstance(val, dict) and 'path' in val:
                    path = shorten_path(val['path'])
                    if path not in used_paths:
                        dot.node(path, label=split_path_label(path), tooltip=path,
                                 shape="ellipse", style="filled", fillcolor="lightgrey")
                        used_paths.add(path)
                    dot.edge(path, step_name)

        for output_item in step.get('outputs', []):
            for _, val in output_item.items():
                if isinstance(val, dict) and 'path' in val:
                    path = shorten_path(val['path'])
                    if path not in used_paths:
                        dot.node(path, label=split_path_label(path), tooltip=path,
                                 shape="ellipse", style="filled", fillcolor="lightgreen")
                        used_paths.add(path)
                    dot.edge(step_name, path)

    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, output_name)
    dot.render(output_path)

    # For SVG or PDF, return the file path instead of an inline Image.
    if output_format.lower() in ['png', 'jpg', 'jpeg', 'gif']:
        return Image(output_path + f".{output_format}")
    else:
        print(f"Graph saved to: {output_path}.{output_format}")
        return output_path + f".{output_format}"