# ro-crate-interoperability-p…/0.2.x/lib/python/lib-ro-crate-schema/examples/full_example.py

#!/usr/bin/env python3
"""
Comprehensive RO-Crate Schema Library Demonstration

This example showcases the full capabilities of the RO-Crate schema library through
a complex scientific workflow involving OpenBIS data management, chemical synthesis, and object modification with round-trip persistence.

Features demonstrated:
- Complex nested object hierarchies (Project → Space → Collections/Equipment)
- Self-referential relationships (molecules containing other molecules)
- Mixed ontology namespaces (OpenBIS custom + schema.org)
- Dynamic experimental workflow simulation
- Large-scale RDF generation and serialization
- Round-trip fidelity with state modifications
- Real-world scientific data modeling

Run with: uv run python examples/full_example.py
"""

import json
from math import e
import sys
import csv
import tempfile
from pathlib import Path
from datetime import datetime
from tkinter import E
from typing import List, Optional, Dict, Any

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from pydantic import BaseModel
from lib_ro_crate_schema.crate.decorators import ro_crate_schema, Field
from lib_ro_crate_schema.crate.schema_facade import SchemaFacade


# Removed print_section function - using direct print statements instead


# ============================================================================
# MODEL DEFINITIONS
# ============================================================================

# Root of the example's OpenBIS hierarchy: a Project optionally points at one Space.
@ro_crate_schema(ontology="http://openbis.org/Project")
class Project(BaseModel):
    """OpenBIS research project"""
    # `code` carries only a human-readable comment; the remaining scalar fields
    # are mapped to schema.org properties through `ontology`.
    code: str = Field(comment="Unique project identifier")
    name: str = Field(ontology="https://schema.org/name")
    description: str = Field(ontology="https://schema.org/description")
    created_date: datetime = Field(ontology="https://schema.org/dateCreated")
    # Forward reference (string annotation): `Space` is declared below this class.
    space: Optional['Space'] = Field(default=None, ontology="http://openbis.org/hasSpace")


# Middle level of the OpenBIS hierarchy: a Space groups Collections.
@ro_crate_schema(ontology="http://openbis.org/Space")
class Space(BaseModel):
    """OpenBIS laboratory space"""
    name: str = Field(ontology="https://schema.org/name")
    description: str = Field(ontology="https://schema.org/description")
    created_date: datetime = Field(ontology="https://schema.org/dateCreated")
    # Forward reference: `Collection` is declared below this class.
    # NOTE(review): mutable `[]` default is handed to the project's `Field`
    # wrapper — presumably forwarded to pydantic, which copies defaults per
    # instance; confirm against the decorator module.
    collections: List['Collection'] = Field(default=[], ontology="http://openbis.org/hasCollection")

# Leaf container of the OpenBIS hierarchy; holds arbitrary entities.
@ro_crate_schema(ontology="http://openbis.org/Collection")
class Collection(BaseModel):
    """OpenBIS sample/data collection"""
    name: str = Field(ontology="https://schema.org/name")
    sample_type: str = Field(comment="Type of samples stored")
    storage_conditions: str = Field(comment="Storage requirements")
    created_date: datetime = Field(ontology="https://schema.org/dateCreated")
    # Untyped on purpose (`List[Any]`): in this example one collection holds
    # Molecule instances and another holds Equipment instances.
    contains: List[Any] = Field(default=[], comment="Entities contained in the collection")


# Instrument model; can reference another Equipment instance as its parent.
@ro_crate_schema(ontology="http://openbis.org/Equipment")
class Equipment(BaseModel):
    """Laboratory equipment with optional nesting"""
    name: str = Field(ontology="https://schema.org/name")
    model: str = Field(comment="Equipment model/version")
    serial_number: str = Field(ontology="https://schema.org/serialNumber")
    created_date: datetime = Field(ontology="https://schema.org/dateCreated")
    # Self-reference mapped to schema.org/isPartOf: this equipment is recorded
    # as being part of `parent_equipment`.
    parent_equipment: Optional['Equipment'] = Field(default=None, ontology="https://schema.org/isPartOf")
    # Free-form key/value settings; the example stores strings and ints here.
    configuration: Dict[str, Any] = Field(default={}, comment="Equipment configuration parameters")


# Chemical substance model; may list other Molecule instances as its parts.
@ro_crate_schema(ontology="https://schema.org/ChemicalSubstance")
class Molecule(BaseModel):
    """Chemical compound with SMILES notation"""
    name: str = Field(ontology="https://schema.org/name")
    smiles: str = Field(comment="SMILES notation for chemical structure")
    molecular_weight: float = Field(comment="Molecular weight in g/mol")
    # Self-referential list mapped to schema.org/hasPart.
    contains_molecules: List['Molecule'] = Field(default=[], ontology="https://schema.org/hasPart")
    cas_number: Optional[str] = Field(default=None, comment="CAS Registry Number")
    created_date: datetime = Field(ontology="https://schema.org/dateCreated")
    # Left as None at creation; the simulated experiment writes to this field.
    experimental_notes: Optional[str] = Field(default=None, comment="Lab notes or modifications")


# Researcher model; references an Organization and, optionally, other Persons.
@ro_crate_schema(ontology="https://schema.org/Person")
class Person(BaseModel):
    """Research author/scientist"""
    name: str = Field(ontology="https://schema.org/name")
    # The ORCID string is stored under the generic schema.org/identifier property.
    orcid: str = Field(ontology="https://schema.org/identifier")
    email: str = Field(ontology="https://schema.org/email")
    # Required field; forward reference because `Organization` is declared below.
    affiliation: 'Organization' = Field(ontology="https://schema.org/affiliation")
    # Self-referential list mapped to schema.org/colleague.
    colleagues: List['Person'] = Field(default=[], ontology="https://schema.org/colleague")


# Institution model; all three fields are required strings mapped to schema.org.
@ro_crate_schema(ontology="https://schema.org/Organization")
class Organization(BaseModel):
    """Research institution"""
    name: str = Field(ontology="https://schema.org/name")
    country: str = Field(ontology="https://schema.org/addressCountry")
    website: str = Field(ontology="https://schema.org/url")


# Article model that ties people, molecules, equipment and the publisher together.
# All referenced classes are already defined above, so plain (non-string)
# annotations are used here.
@ro_crate_schema(ontology="https://schema.org/ScholarlyArticle")
class Publication(BaseModel):
    """Scientific publication"""
    title: str = Field(ontology="https://schema.org/name")
    # The three list fields have no default and must be supplied by the caller.
    authors: List[Person] = Field(ontology="https://schema.org/author")
    molecules: List[Molecule] = Field(ontology="https://schema.org/mentions")
    equipment: List[Equipment] = Field(ontology="https://schema.org/instrument")
    organization: Organization = Field(ontology="https://schema.org/publisher")
    doi: str = Field(ontology="https://schema.org/identifier")
    publication_date: datetime = Field(ontology="https://schema.org/datePublished")


def create_initial_data():
    """Build the demo's object graph and return it keyed by instance id.

    Constructs one organization, two researchers, two pieces of equipment,
    five molecules, two collections, the Space/Project hierarchy and one
    publication that references them.

    Returns:
        Dict mapping instance ids (e.g. ``'benzene'``, ``'reactor'``) to the
        created model instances. The insertion order of the dict is the order
        in which callers iterating it will see the instances.
    """

    print("\n🎯 PHASE 1: INITIAL DATA CREATION")
    print("=" * 40)

    # --- Organization -------------------------------------------------------
    institute = Organization(
        name="Swiss Federal Laboratories for Materials Science and Technology (Empa)",
        country="Switzerland",
        website="https://www.empa.ch",
    )

    # --- People -------------------------------------------------------------
    # Both researchers are first created without colleagues ...
    sarah_unlinked = Person(
        name="Dr. Sarah Chen",
        orcid="0000-0002-1234-5678",
        email="sarah.chen@empa.ch",
        affiliation=institute,
        colleagues=[],
    )
    marcus_unlinked = Person(
        name="Prof. Marcus Weber",
        orcid="0000-0003-8765-4321",
        email="marcus.weber@empa.ch",
        affiliation=institute,
        colleagues=[],
    )

    # ... and then linked through copies. Note the asymmetry: the Sarah copy
    # references the *unlinked* Marcus, whereas the Marcus copy references the
    # already-linked Sarah copy.
    chen = sarah_unlinked.model_copy(update={"colleagues": [marcus_unlinked]})
    weber = marcus_unlinked.model_copy(update={"colleagues": [chen]})

    # --- Equipment (nested) -------------------------------------------------
    gc_ms = Equipment(
        name="Agilent 7890A GC-MS",
        model="7890A",
        serial_number="DE43151234",
        created_date=datetime(2023, 1, 15),
        configuration={
            "ionization_mode": "EI",
            "mass_range_min": 50,
            "mass_range_max": 500,
            "resolution": "unit_mass",
            "detector_voltage": 1200,
        },
    )
    flow_reactor = Equipment(
        name="FlowSyn Reactor",
        model="v2.1",
        serial_number="FSR-2024-001",
        created_date=datetime(2023, 2, 1),
        # `parent_equipment` is mapped to schema.org/isPartOf on the model, so
        # this records the reactor as part of the GC-MS.
        parent_equipment=gc_ms,
        configuration={
            "max_temperature_celsius": 250,
            "max_pressure_bar": 10,
            "flow_rate_ml_per_min": 5,
            "volume_ml": 50,
            "heating_method": "microwave",
        },
    )

    # --- Molecules ----------------------------------------------------------
    benzene = Molecule(
        name="Benzene",
        smiles="c1ccccc1",
        molecular_weight=78.11,
        cas_number="71-43-2",
        created_date=datetime(2024, 1, 10),
    )
    toluene = Molecule(
        name="Toluene",
        smiles="Cc1ccccc1",
        molecular_weight=92.14,
        cas_number="108-88-3",
        created_date=datetime(2024, 1, 12),
    )
    phenol = Molecule(
        name="Phenol",
        smiles="c1ccc(cc1)O",
        molecular_weight=94.11,
        cas_number="108-95-2",
        created_date=datetime(2024, 1, 15),
    )
    aniline = Molecule(
        name="Aniline",
        smiles="c1ccc(cc1)N",
        molecular_weight=93.13,
        cas_number="62-53-3",
        created_date=datetime(2024, 1, 18),
    )
    # A molecule whose parts are other Molecule instances (no CAS number given).
    polymer = Molecule(
        name="Benzene-Toluene Polymer",
        smiles="[*]c1ccccc1[*].[*]Cc1ccccc1[*]",  # Polymer SMILES
        molecular_weight=340.45,
        contains_molecules=[benzene, toluene],
        created_date=datetime(2024, 2, 1),
    )
    every_molecule = [benzene, toluene, phenol, aniline, polymer]

    # --- Collections --------------------------------------------------------
    library = Collection(
        name="Molecular Library",
        sample_type="Chemical compounds",
        storage_conditions="-20°C, inert atmosphere",
        created_date=datetime(2023, 3, 1),
        contains=[],
    )
    # The library starts empty and is filled in place afterwards.
    library.contains.extend(every_molecule)

    instruments = Collection(
        name="Laboratory Equipment",
        sample_type="Analytical instruments",
        storage_conditions="Room temperature, calibrated monthly",
        created_date=datetime(2023, 2, 15),
        contains=[flow_reactor, gc_ms],
    )

    # --- OpenBIS hierarchy --------------------------------------------------
    lab_space = Space(
        name="Advanced Materials Laboratory",
        description="State-of-the-art facility for nanomaterial synthesis and characterization",
        created_date=datetime(2023, 1, 1),
        collections=[library, instruments],
    )
    project = Project(
        code="NANO-2024",
        name="Nanocomposite Materials Research",
        description="Development of advanced nanocomposite materials for industrial applications",
        created_date=datetime(2024, 1, 1),
        space=lab_space,
    )

    # --- Publication referencing people, molecules, equipment, publisher ----
    article = Publication(
        title="Advanced Nanocomposite Materials: From Molecular Design to Industrial Applications",
        authors=[chen, weber],
        molecules=list(every_molecule),
        equipment=[flow_reactor, gc_ms],
        organization=institute,
        doi="10.1021/acs.nanolett.2024.12345",
        publication_date=datetime(2024, 6, 15),
    )

    # Keyword order below fixes the iteration order seen by callers.
    return dict(
        openbis_project=project,
        science_space=lab_space,
        molecules_collection=library,
        lab_equipment=instruments,
        reactor=flow_reactor,
        mass_spec=gc_ms,
        benzene=benzene,
        toluene=toluene,
        phenol=phenol,
        aniline=aniline,
        complex_polymer=polymer,
        sarah=chen,
        marcus=weber,
        empa=institute,
        publication=article,
    )


# Empty module-level placeholder. main() assigns its own local `MoleculeModel`
# (the model exported from the imported facade), which shadows this class there.
class MoleculeModel:  # Alias for sake of this example
    pass

# EquipmentModel = Equipment  # Alias for clarity

def experiment(reactant1, reactant2, catalyst, equipment) -> tuple[dict, Path]:
    """
    Simulate a chemical synthesis and describe its product.

    Builds the field values for a product molecule from the two reactants,
    writes experimental notes onto both reactants (they are mutated in
    place) and locates the observations CSV that sits next to this script.
    The CSV is only looked up, never read or written here.

    Args:
        reactant1: Primary reactant; needs ``name``, ``smiles`` and ``molecular_weight``
        reactant2: Secondary reactant; same attributes as ``reactant1``
        catalyst: Catalytic molecule; only its ``name`` is read
        equipment: Equipment used for the reaction; only its ``name`` is read

    Returns:
        Tuple of (dict of field values for the product molecule,
        expected path of the observations CSV — returned even if the file is absent)
    """

    print("\n🔹 EXPERIMENTAL SYNTHESIS")
    print(f"  Reactants: {reactant1.name} + {reactant2.name}")
    print(f"  Catalyst: {catalyst.name}")
    print(f"  Equipment: {equipment.name}")

    # One timestamp is shared by the product and the reactant note
    started_at = datetime.now()

    # Demo chemistry only: the product is the dot-joined SMILES of both
    # reactants and its weight is the plain sum of theirs.
    product_dict = dict(
        name=f"{reactant1.name}-{reactant2.name} Adduct",
        smiles=f"({reactant1.smiles}).({reactant2.smiles})",
        molecular_weight=reactant1.molecular_weight + reactant2.molecular_weight,
        contains_molecules=[reactant1, reactant2],  # the reactant objects themselves
        created_date=started_at.isoformat(),
        experimental_notes=f"Synthesized via {catalyst.name} catalysis using {equipment.name}",
    )

    # The sample observations file is expected in the same folder as this script
    csv_path = Path(__file__).with_name("experimental_observations.csv")
    if csv_path.exists():
        print(f"  📁  Found observations CSV file at: {csv_path}")
    else:
        print(f"  ⚠️  Warning: Observations CSV file not found at {csv_path}. Skipping file adding.")

    # Record what happened to each reactant directly on the passed-in objects
    consumed_at = started_at.strftime('%Y-%m-%d %H:%M')
    reactant1.experimental_notes = f"Consumed 0.5 mol in synthesis reaction at {consumed_at}"
    reactant2.experimental_notes = "Partially consumed, 0.3 mol remaining after reaction"

    print(f"  Product: {product_dict['name']}")
    print(f"  Product SMILES: {product_dict['smiles']}")

    return product_dict, csv_path


def analyze_rocrate_changes(initial_path: Path, final_path: Path):
    """Compare initial and final RO-Crate files and print the differences.

    Reads ``ro-crate-metadata.json`` from both crate directories and prints
    the number of ``@graph`` entities before and after, the signed change,
    and a per-``@type`` count comparison. Nothing is returned.

    Args:
        initial_path: Directory of the crate exported first.
        final_path: Directory of the crate exported afterwards.

    Raises:
        OSError: If a metadata file cannot be opened.
        json.JSONDecodeError: If a metadata file is not valid JSON.
    """

    print("\n🔹 RO-CRATE COMPARISON ANALYSIS")

    def load_graph(crate_dir):
        # Read explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; `Path(...)` also accepts plain strings.
        with open(Path(crate_dir) / "ro-crate-metadata.json", "r", encoding="utf-8") as f:
            # A document without "@graph" is treated as having no entities
            return json.load(f).get("@graph", [])

    def count_types(graph):
        # An entity's "@type" may be a single value or a list; every listed
        # type is counted once, and entities without "@type" go to "Unknown".
        counts = {}
        for entity in graph:
            entity_type = entity.get("@type", "Unknown")
            names = entity_type if isinstance(entity_type, list) else [entity_type]
            for type_name in names:
                counts[type_name] = counts.get(type_name, 0) + 1
        return counts

    initial_graph = load_graph(initial_path)
    final_graph = load_graph(final_path)

    initial_entities = len(initial_graph)
    final_entities = len(final_graph)

    print(f"  📊 Initial entities: {initial_entities}")
    print(f"  📊 Final entities: {final_entities}")
    # `:+d` renders the sign itself, so a shrinking crate prints "-3", not "+-3"
    print(f"  📈 Change: {final_entities - initial_entities:+d} entities")

    initial_types = count_types(initial_graph)
    final_types = count_types(final_graph)

    print("\n  📋 Entity type changes:")
    for entity_type in sorted(initial_types.keys() | final_types.keys()):
        initial_count = initial_types.get(entity_type, 0)
        final_count = final_types.get(entity_type, 0)
        if initial_count != final_count:
            print(f"    {entity_type}: {initial_count} → {final_count} ({final_count - initial_count:+d})")
        else:
            print(f"    {entity_type}: {initial_count} (unchanged)")


def _report_preserved_ids(singular: str, plural: str, original_ids: set, imported_ids: set) -> None:
    """Print whether the imported id set equals the original one, listing differences."""
    if original_ids == imported_ids:
        print(f"  ✅ All {len(original_ids)} {plural} preserved in import")
        return

    print(f"  ⚠️  {singular} mismatch: original={len(original_ids)}, imported={len(imported_ids)}")
    missing = original_ids - imported_ids
    if missing:
        print(f"     Missing: {missing}")
    extra = imported_ids - original_ids
    if extra:
        print(f"     Extra: {extra}")


def main():
    """Execute the complete workflow demonstration.

    Runs four phases: build the instances and export an initial crate,
    re-import that crate and run the simulated experiment on the imported
    molecules, export an updated crate (including the observations CSV when
    it exists), and finally print a comparison of the two crates.

    Returns:
        Dict with the three facades (``initial_facade``, ``imported_facade``,
        ``updated_facade``), the original ``instances``, the ``product``
        instance and the two crate directories (``initial_path``,
        ``final_path``) as ``Path`` objects.
    """

    print("🧪 COMPREHENSIVE RO-CRATE SCHEMA WORKFLOW DEMONSTRATION")
    print("=" * 80)
    print("This demo showcases complex scientific data modeling, experimental workflows,")
    print("and dynamic object modification with full round-trip persistence.")

    # ========================================================================
    # PHASE 1: INITIAL SETUP
    # ========================================================================

    print("\n🎯 Creating Initial Schema and Data")
    print("=" * 40)

    # Create all instances
    instances = create_initial_data()

    print(f"  ✅ Created {len(instances)} model instances")
    print("  📋 Instance types:")
    type_counts = {}
    for instance in instances.values():
        type_name = type(instance).__name__
        type_counts[type_name] = type_counts.get(type_name, 0) + 1

    for type_name, count in sorted(type_counts.items()):
        print(f"    - {type_name}: {count}")

    print("\n  🔄 Circular Relationship Test:")
    sarah_instance = instances['sarah']
    marcus_instance = instances['marcus']
    print(f"    - Sarah Chen has {len(sarah_instance.colleagues)} colleague(s): {[c.name for c in sarah_instance.colleagues]}")
    print(f"    - Marcus Weber has {len(marcus_instance.colleagues)} colleague(s): {[c.name for c in marcus_instance.colleagues]}")

    # Build schema facade
    facade = SchemaFacade()
    facade.add_all_registered_models()

    print(f"\n  📊 Schema: {len(facade.types)} types registered")

    # Add all instances
    for instance_id, instance in instances.items():
        facade.add_model_instance(instance, instance_id)

    print(f"  📦 Added {len(facade.metadata_entries)} metadata entries")

    # Generate RDF
    rdf_graph = facade.to_graph()
    print(f"  🕸️  Generated {len(rdf_graph)} RDF triples")

    # Export initial state
    print("\n🔹 Exporting Initial RO-Crate")
    output_dir = Path("output_crates")
    output_dir.mkdir(parents=True, exist_ok=True)
    initial_path = output_dir / "full_example_initial"
    facade.write(
        # passed as a string, exactly as before the switch to pathlib
        destination=str(initial_path),
        name="Complex Scientific Workflow - Initial State",
        description="Initial RO-Crate before experimental modifications",
        license="MIT"
    )
    print(f"  💾 Saved initial state: {initial_path}")

    # ========================================================================
    # PHASE 2: IMPORT AND EXPERIMENT
    # ========================================================================

    print("\n🎯 Importing RO-Crate and Running Experiment")
    print("=" * 40)

    # Import the RO-Crate we just exported
    print("\n🔹 Importing RO-Crate from exported files")
    print(f"  📁 Loading RO-Crate from: {initial_path}")

    imported_facade = SchemaFacade.from_ro_crate(initial_path)

    print("  ✅ Successfully imported RO-Crate!")
    print(f"  📊 Imported {len(imported_facade.types)} types")
    print(f"  📦 Imported {len(imported_facade.metadata_entries)} metadata entries")

    # Show what was imported
    print("\n  📋 Imported types:")
    for imported_type in imported_facade.types:
        props = len(imported_type.rdfs_property or [])
        restrictions = len(imported_type.get_restrictions())
        print(f"    - {imported_type.id}: {props} properties, {restrictions} restrictions")

    print("\n  📦 Imported metadata entries (first 5):")
    for entry in imported_facade.metadata_entries[:5]:
        print(f"    - {entry.id} (type: {entry.class_id})")

    # Model classes exported from the imported facade; these local names are
    # what the experiment inputs are materialized as.
    MoleculeModel = imported_facade.export_pydantic_model("Molecule")
    EquipmentModel = imported_facade.export_pydantic_model("Equipment")

    # The experiment needs molecules benzene, toluene, aniline and the reactor
    benzene = imported_facade.get_entry_as("benzene", MoleculeModel)
    toluene = imported_facade.get_entry_as("toluene", MoleculeModel)
    aniline = imported_facade.get_entry_as("aniline", MoleculeModel)
    reactor = imported_facade.get_entry_as("reactor", EquipmentModel)

    print(f"  ✅ Selected from imported data: {benzene.name}, {toluene.name}, {aniline.name}, {reactor.name}")

    # Run experiment
    product_dict, observations_csv = experiment(benzene, toluene, aniline, reactor)

    # The experiment wrote its notes onto the *imported* reactants. The
    # original instances are what gets re-exported and reported below, so the
    # notes are carried over; otherwise those modifications would be lost.
    for instance_id, reactant in (("benzene", benzene), ("toluene", toluene)):
        instances[instance_id].experimental_notes = reactant.experimental_notes

    # Create new product molecule instance
    product = MoleculeModel(**product_dict)

    print(f"  🧪 Experiment complete, product created: {product.name}")

    # ========================================================================
    # PHASE 3: UPDATE AND RE-EXPORT
    # ========================================================================

    print("\n🎯 Updating Schema with Experimental Results")
    print("=" * 40)

    # Create new facade with updated data
    updated_facade = SchemaFacade()
    updated_facade.add_all_registered_models()

    # Add all original instances (now with modifications)
    for instance_id, instance in instances.items():
        updated_facade.add_model_instance(instance, instance_id)

    # Add new product
    updated_facade.add_model_instance(product, "synthesis_product")

    print(f"  📊 Updated schema: {len(updated_facade.types)} types")
    print(f"  📦 Updated entries: {len(updated_facade.metadata_entries)} metadata entries")

    # Generate updated RDF
    updated_rdf_graph = updated_facade.to_graph()
    print(f"  🕸️  Updated RDF graph: {len(updated_rdf_graph)} triples")
    # `:+d` prints the sign itself, so a negative difference is not shown as "+-N"
    print(f"  📈 RDF growth: {len(updated_rdf_graph) - len(rdf_graph):+d} triples")

    # Export final state
    print("\n🔹 Exporting Final RO-Crate")
    # Attach the observations file only if it is really there: experiment()
    # has already announced that a missing file is skipped.
    if observations_csv.exists():
        updated_facade.add_file(
            file_path=observations_csv,
            name="Experimental Observations",
            description="Detailed measurements from chemical synthesis experiment including temperature, pressure, yields and purity data"
        )
    else:
        print(f"  ⚠️  Observations file not attached (missing): {observations_csv}")

    final_path = output_dir / "full_example_final"
    updated_facade.write(
        destination=str(final_path),
        name="Complex Scientific Workflow - Final State",
        description="Final RO-Crate after experimental synthesis with observation data",
        license="MIT"
    )
    print(f"  💾 Saved final state: {final_path}")

    # ========================================================================
    # PHASE 4: ANALYSIS
    # ========================================================================

    print("\n🎯 WORKFLOW ANALYSIS & RESULTS")
    print("=" * 40)

    # Compare facades (original vs imported)
    print("\n🔹 Import Fidelity Analysis")
    print(f"  📊 Original facade: {len(facade.types)} types, {len(facade.metadata_entries)} entries")
    print(f"  📊 Imported facade: {len(imported_facade.types)} types, {len(imported_facade.metadata_entries)} entries")

    # Check if all types were preserved
    _report_preserved_ids(
        "Type",
        "types",
        {t.id for t in facade.types},
        {t.id for t in imported_facade.types},
    )

    # Check if all metadata entries were preserved
    _report_preserved_ids(
        "Metadata entry",
        "metadata entries",
        {e.id for e in facade.metadata_entries},
        {e.id for e in imported_facade.metadata_entries},
    )

    # Compare files
    analyze_rocrate_changes(initial_path, final_path)

    # Show experimental modifications
    print("\n🔹 Experimental Modifications Detected")
    print(f"  🧪 New molecule created: {product.name}")
    print(f"     SMILES: {product.smiles}")
    print(f"     Notes: {product.experimental_notes}")

    print("\n  📝 Modified molecules:")
    modified_molecules = [instances['benzene'], instances['toluene']]
    for mol in modified_molecules:
        if mol.experimental_notes:
            print(f"     - {mol.name}: {mol.experimental_notes}")

    # Summary statistics
    print("\n🔹 Final Statistics")
    print(f"  📊 Original facade: {len(facade.types)} types, {len(facade.metadata_entries)} entries")
    print(f"  📊 Imported facade: {len(imported_facade.types)} types, {len(imported_facade.metadata_entries)} entries")
    print(f"  📊 Final facade: {len(updated_facade.types)} types, {len(updated_facade.metadata_entries)} entries")
    print(f"  🕸️  Final RDF triples: {len(updated_rdf_graph)}")
    print("  🔄 Round-trip cycles: 3 (export → import → experiment → export)")
    print("  ⚗️  Experiments performed: 1")
    print("  🆕 New entities created: 1")
    print("  ✏️  Entities modified: 2")

    print("\n" + "="*80)
    print("🎉 COMPREHENSIVE WORKFLOW WITH IMPORT DEMONSTRATION COMPLETE!")
    print("   📁 RO-Crates created:")
    print(f"      - Initial: {initial_path}")
    print(f"      - Final: {final_path}")
    print("="*80)

    return {
        'initial_facade': facade,
        'imported_facade': imported_facade,
        'updated_facade': updated_facade,
        'instances': instances,
        'product': product,
        'initial_path': initial_path,
        'final_path': final_path
    }


# Script entry point: run the demo only when executed directly. The summary
# dict returned by main() is kept in the module-level name `results`
# (presumably for inspection in an interactive session).
if __name__ == "__main__":
    results = main()