Case 02: James's Silent Performance Regression

Persona: James, ML Engineer

Situation

James upgrades a library and retrains. Accuracy drops from 0.91 to 0.87. He suspects a dependency change but cannot confirm it — the environment from the previous training was never recorded. His requirements.txt was committed three weeks ago and may not reflect what was actually installed.

Without Contexta

requirements.txt is a snapshot in time, not a training-time record.
Diffing two requirements.txt files requires knowing which commit to compare against.
If the file wasn't committed, the old environment is gone.
Guessing which package caused the regression means iterative manual testing.

With Contexta

env_diff = ctx.compare_environments(old_run_ref, new_run_ref)
# env_diff.python_version_changed → False (both runs used Python 3.11.0)
# env_diff.changed_packages → [torch: 2.0.0 → 2.1.0, numpy: 1.24.0 → 1.26.4]

audit = ctx.audit_reproducibility(old_run_ref)
# audit.python_version, audit.package_count, audit.reproducibility_status

EnvironmentSnapshot is recorded at run creation time — not as a file, but as a structured record linked to the run. compare_environments produces an exact diff of packages and Python version between any two runs.

Key APIs: EnvironmentSnapshot, compare_environments, audit_reproducibility

Complete Runnable Code

Run the seed script first, then the analysis script:

uv run examples/case_studies/case02_seed_performance_regression_data.py
uv run examples/case_studies/case02_analyze_performance_regression.py

case02_seed_performance_regression_data.py
"""Create performance-regression run records used by the regression case study."""

from __future__ import annotations

import tempfile
from pathlib import Path
from typing import Any

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
    EnvironmentSnapshot,
    MetricPayload,
    MetricRecord,
    Project,
    RecordEnvelope,
    Run,
    StageExecution,
)


PROJECT_NAME = "product-categorization"

# Module-wide sequence number backing the record ids handed out by _next_rid.
_REC_COUNTER = 0


def _next_rid() -> str:
    """Advance the module counter and return it as an ``r``-prefixed id.

    The counter is rendered zero-padded to five digits, so the first call
    yields ``r00001``, the second ``r00002``, and so on.
    """
    global _REC_COUNTER
    _REC_COUNTER = _REC_COUNTER + 1
    return "r" + format(_REC_COUNTER, "05d")


def _create_run(
    store: Any,
    record_store: Any,
    project_name: str,
    run_name: str,
    accuracy: float,
    precision: float,
    recall: float,
    started_at: str,
    ended_at: str,
) -> str:
    """Persist one completed run, its ``train`` stage, and three run-level metrics.

    Args:
        store: Metadata store; its ``runs`` and ``stages`` members receive the
            run and stage records.
        record_store: Store whose ``append`` receives one ``MetricRecord`` per
            metric.
        project_name: Project name used to build every ref.
        run_name: Run name used to build every ref.
        accuracy: Value stored under the ``accuracy`` metric key.
        precision: Value stored under the ``precision`` metric key.
        recall: Value stored under the ``recall`` metric key.
        started_at: Start timestamp applied to both the run and the stage.
        ended_at: End timestamp applied to the run and the stage, and used as
            the recorded/observed time of each metric.

    Returns:
        The run ref, ``run:<project_name>.<run_name>``.
    """
    qualified_name = f"{project_name}.{run_name}"
    run_ref = f"run:{qualified_name}"
    stage_ref = f"stage:{qualified_name}.train"

    run = Run(
        run_ref=run_ref,
        project_ref=f"project:{project_name}",
        name=run_name,
        status="completed",
        started_at=started_at,
        ended_at=ended_at,
    )
    store.runs.put_run(run)

    # The single "train" stage spans the same time window as the run itself.
    train_stage = StageExecution(
        stage_execution_ref=stage_ref,
        run_ref=run_ref,
        stage_name="train",
        status="completed",
        started_at=started_at,
        ended_at=ended_at,
        order_index=0,
    )
    store.stages.put_stage_execution(train_stage)

    metrics = {"accuracy": accuracy, "precision": precision, "recall": recall}
    for metric_key, metric_value in metrics.items():
        # Each metric record gets its own sequential id from _next_rid().
        envelope = RecordEnvelope(
            record_ref=f"record:{qualified_name}.{_next_rid()}",
            record_type="metric",
            recorded_at=ended_at,
            observed_at=ended_at,
            producer_ref="contexta.case02",
            run_ref=run_ref,
            stage_execution_ref=stage_ref,
            completeness_marker="complete",
            degradation_marker="none",
        )
        payload = MetricPayload(
            metric_key=metric_key,
            value=metric_value,
            value_type="float64",
            aggregation_scope="run",
        )
        record_store.append(MetricRecord(envelope=envelope, payload=payload))

    return run_ref


def _capture_environment(
    store: Any,
    project_name: str,
    run_name: str,
    run_ref: str,
    python_version: str,
    platform: str,
    packages: dict[str, str],
    captured_at: str,
) -> None:
    """Attach an environment snapshot to ``run_ref`` when the store supports it.

    If ``store`` has no ``environments`` attribute the call is a no-op.
    Otherwise an ``EnvironmentSnapshot`` with the given Python version,
    platform, and package versions (and no environment variables) is written
    through ``store.environments.put_environment_snapshot``.
    """
    if hasattr(store, "environments"):
        # environment_snapshot_ref must add exactly one component to run_ref
        snapshot = EnvironmentSnapshot(
            environment_snapshot_ref=f"environment:{project_name}.{run_name}.snapshot",
            run_ref=run_ref,
            captured_at=captured_at,
            python_version=python_version,
            platform=platform,
            packages=packages,
            environment_variables={},
        )
        store.environments.put_environment_snapshot(snapshot)


def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Seed a workspace with a "last-month" and a "this-month" run.

    Both runs get metrics and an environment snapshot. The second run differs
    from the first in its ``torch`` and ``numpy`` versions and has lower
    metrics.

    Args:
        workspace: Workspace directory to seed. When ``None``, a new temporary
            directory is created and ``.contexta`` inside it is used.

    Returns:
        A dict with the two run refs under ``last_month_run_id`` and
        ``this_month_run_id``.
    """
    if workspace is not None:
        workspace_path = Path(workspace)
    else:
        # No workspace given: seed into a fresh temporary directory.
        workspace_path = Path(tempfile.mkdtemp(prefix="contexta-case02-")) / ".contexta"

    workspace_config = WorkspaceConfig(root_path=workspace_path)
    ctx = Contexta(
        config=UnifiedConfig(project_name=PROJECT_NAME, workspace=workspace_config)
    )

    baseline_packages = {
        "torch": "2.0.0",
        "numpy": "1.24.0",
        "scikit-learn": "1.2.2",
        "transformers": "4.28.0",
        "pandas": "1.5.3",
    }
    upgraded_packages = {
        **baseline_packages,
        "torch": "2.1.0",   # upgraded - potential culprit
        "numpy": "1.26.4",  # also changed
    }

    store = ctx.metadata_store
    try:
        project = Project(
            project_ref=f"project:{PROJECT_NAME}",
            name=PROJECT_NAME,
            created_at="2025-02-01T00:00:00Z",
            description="E-commerce product categorization model",
        )
        store.projects.put_project(project)

        # Last month: stable environment, high accuracy
        last_month_ref = _create_run(
            store=store,
            record_store=ctx.record_store,
            project_name=PROJECT_NAME,
            run_name="last-month",
            accuracy=0.91,
            precision=0.893,
            recall=0.908,
            started_at="2025-02-15T08:00:00Z",
            ended_at="2025-02-15T10:30:00Z",
        )
        _capture_environment(
            store=store,
            project_name=PROJECT_NAME,
            run_name="last-month",
            run_ref=last_month_ref,
            python_version="3.11.0",
            platform="linux",
            packages=baseline_packages,
            captured_at="2025-02-15T08:01:00Z",
        )

        # This month: torch bumped, numpy changed, accuracy dropped
        this_month_ref = _create_run(
            store=store,
            record_store=ctx.record_store,
            project_name=PROJECT_NAME,
            run_name="this-month",
            accuracy=0.87,
            precision=0.851,
            recall=0.872,
            started_at="2025-03-15T08:00:00Z",
            ended_at="2025-03-15T10:45:00Z",
        )
        _capture_environment(
            store=store,
            project_name=PROJECT_NAME,
            run_name="this-month",
            run_ref=this_month_ref,
            python_version="3.11.0",
            platform="linux",
            packages=upgraded_packages,
            captured_at="2025-03-15T08:01:00Z",
        )

        seeded_refs = {
            "last_month_run_id": last_month_ref,
            "this_month_run_id": this_month_ref,
        }
        return seeded_refs
    finally:
        # Close the metadata store whether or not seeding succeeded.
        store.close()


def main() -> None:
    """Seed ``.contexta`` in the current directory and print a confirmation."""
    import contextlib
    import io

    # Anything written to stdout while seeding is captured and discarded, so
    # the script prints only the confirmation line below.
    discarded_output = io.StringIO()
    with contextlib.redirect_stdout(discarded_output):
        run_example(Path(".contexta"))

    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()

case02_analyze_performance_regression.py
"""Analyze previously recorded performance-regression runs."""

from pathlib import Path

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig


# Refs of the two runs written by the seed script.
PROJECT_NAME = "product-categorization"
LAST_MONTH = f"run:{PROJECT_NAME}.last-month"
THIS_MONTH = f"run:{PROJECT_NAME}.this-month"

# Open the workspace in the current directory that the seed script populated.
workspace = WorkspaceConfig(root_path=Path(".contexta"))
ctx = Contexta(config=UnifiedConfig(project_name=PROJECT_NAME, workspace=workspace))

store = ctx.metadata_store
try:
    # 1) Metric deltas per stage between the two runs.
    comparison = ctx.compare_runs(LAST_MONTH, THIS_MONTH)
    print("Metric comparison: last-month -> this-month")
    for stage_cmp in comparison.stage_comparisons:
        for metric in stage_cmp.metric_deltas:
            if metric.delta is None:
                # Skip metrics for which no delta is available.
                continue
            print(f"{stage_cmp.stage_name}/{metric.metric_key}: {metric.delta:+.4f}")

    # 2) Environment differences between the two runs.
    env_diff = ctx.compare_environments(LAST_MONTH, THIS_MONTH)
    print("\nEnvironment changes:")
    print(f"Python changed: {env_diff.python_version_changed}")
    for pkg_change in env_diff.changed_packages:
        print(f"{pkg_change.key}: {pkg_change.left_value} -> {pkg_change.right_value}")

    # 3) Reproducibility audit of the newer run.
    audit = ctx.audit_reproducibility(THIS_MONTH)
    print(f"\nReproducibility: {audit.reproducibility_status}")
    print(f"Packages logged: {audit.package_count}")
finally:
    # Close the metadata store even if one of the queries above fails.
    store.close()

Situation

Without Contexta

With Contexta

Complete Runnable Code

Situation

Without Contexta

With Contexta

Complete Runnable Code