Case 02: James's Silent Performance Regression
Persona: James, ML Engineer
Situation
James upgrades a library and retrains. Accuracy drops from 0.91 to 0.87. He suspects a
dependency change but cannot confirm it — the environment from the previous training was
never recorded. His requirements.txt was committed three weeks ago and may not reflect
what was actually installed.
Without Contexta
- requirements.txt is a snapshot in time, not a training-time record.
- Diffing two requirements.txt files requires knowing which commit to compare against.
- If the file wasn't committed, the old environment is gone.
- Guessing which package caused the regression means iterative manual testing.
With Contexta
env_diff = ctx.compare_environments(old_run_ref, new_run_ref)
# env_diff.python_version_changed → False (both seeded runs use Python 3.11.0)
# env_diff.changed_packages → [torch: 2.0.0 → 2.1.0, numpy: 1.24.0 → 1.26.4]
audit = ctx.audit_reproducibility(old_run_ref)
# audit.python_version, audit.package_count, audit.reproducibility_status
EnvironmentSnapshot is recorded at run creation time — not as a file, but as a
structured record linked to the run. compare_environments produces an exact diff of
packages and Python version between any two runs.
Key APIs: EnvironmentSnapshot, compare_environments, audit_reproducibility
Complete Runnable Code
Run the seed script first, then the analysis script:
uv run examples/case_studies/case02_seed_performance_regression_data.py
uv run examples/case_studies/case02_analyze_performance_regression.py
case02_seed_performance_regression_data.py
"""Create performance-regression run records used by the regression case study."""
from __future__ import annotations
import tempfile
from pathlib import Path
from typing import Any
from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
EnvironmentSnapshot,
MetricPayload,
MetricRecord,
Project,
RecordEnvelope,
Run,
StageExecution,
)
PROJECT_NAME = "product-categorization"

# Process-wide sequence number consumed by _next_rid(); the first id issued is r00001.
_REC_COUNTER = 0


def _next_rid() -> str:
    """Return the next record-id suffix.

    Each call advances the module-level ``_REC_COUNTER`` by one and returns
    ``"r"`` followed by the counter zero-padded to at least five digits.
    """
    global _REC_COUNTER
    _REC_COUNTER = _REC_COUNTER + 1
    return "r" + str(_REC_COUNTER).zfill(5)
def _create_run(
    store: Any,
    record_store: Any,
    project_name: str,
    run_name: str,
    accuracy: float,
    precision: float,
    recall: float,
    started_at: str,
    ended_at: str,
) -> str:
    """Persist one completed run, its ``train`` stage, and three run-level metrics.

    Args:
        store: Metadata store; its ``runs`` and ``stages`` repositories are written.
        record_store: Record store that receives one ``MetricRecord`` per metric.
        project_name: Project component used in every generated ref.
        run_name: Run component used in every generated ref.
        accuracy: Value stored under metric key ``accuracy``.
        precision: Value stored under metric key ``precision``.
        recall: Value stored under metric key ``recall``.
        started_at: Start timestamp applied to both the run and the stage.
        ended_at: End timestamp for the run and the stage; also used as the
            ``recorded_at``/``observed_at`` time of each metric record.

    Returns:
        The ``run:<project>.<run>`` ref of the run that was written.
    """
    qualified_name = f"{project_name}.{run_name}"
    run_ref = f"run:{qualified_name}"
    stage_ref = f"stage:{qualified_name}.train"

    run = Run(
        run_ref=run_ref,
        project_ref=f"project:{project_name}",
        name=run_name,
        status="completed",
        started_at=started_at,
        ended_at=ended_at,
    )
    store.runs.put_run(run)

    train_stage = StageExecution(
        stage_execution_ref=stage_ref,
        run_ref=run_ref,
        stage_name="train",
        status="completed",
        started_at=started_at,
        ended_at=ended_at,
        order_index=0,
    )
    store.stages.put_stage_execution(train_stage)

    # Insertion order fixes the order in which records (and their ids) are emitted.
    metrics = {"accuracy": accuracy, "precision": precision, "recall": recall}
    for metric_key, metric_value in metrics.items():
        envelope = RecordEnvelope(
            record_ref=f"record:{qualified_name}.{_next_rid()}",
            record_type="metric",
            recorded_at=ended_at,
            observed_at=ended_at,
            producer_ref="contexta.case02",
            run_ref=run_ref,
            stage_execution_ref=stage_ref,
            completeness_marker="complete",
            degradation_marker="none",
        )
        payload = MetricPayload(
            metric_key=metric_key,
            value=metric_value,
            value_type="float64",
            aggregation_scope="run",
        )
        record_store.append(MetricRecord(envelope=envelope, payload=payload))

    return run_ref
def _capture_environment(
    store: Any,
    project_name: str,
    run_name: str,
    run_ref: str,
    python_version: str,
    platform: str,
    packages: dict[str, str],
    captured_at: str,
) -> None:
    """Store an environment snapshot linked to the run.

    If ``store`` exposes no ``environments`` repository the snapshot is
    skipped (best effort), but a ``RuntimeWarning`` is emitted so the missing
    data does not go unnoticed when the environments are compared later.

    Args:
        store: Metadata store; its ``environments`` repository is written.
        project_name: Project component of the snapshot ref.
        run_name: Run component of the snapshot ref.
        run_ref: Ref of the run the snapshot belongs to.
        python_version: Interpreter version recorded in the snapshot.
        platform: Platform label recorded in the snapshot.
        packages: Mapping of package name to installed version.
        captured_at: Timestamp at which the environment was captured.
    """
    import warnings  # function-scope import keeps the file's import block untouched

    if not hasattr(store, "environments"):
        warnings.warn(
            "metadata store has no 'environments' repository; "
            f"environment snapshot for {run_ref} was not recorded",
            RuntimeWarning,
            stacklevel=2,
        )
        return

    # environment_snapshot_ref must add exactly one component to run_ref
    env_ref = f"environment:{project_name}.{run_name}.snapshot"
    snapshot = EnvironmentSnapshot(
        environment_snapshot_ref=env_ref,
        run_ref=run_ref,
        captured_at=captured_at,
        python_version=python_version,
        platform=platform,
        packages=packages,
        environment_variables={},
    )
    store.environments.put_environment_snapshot(snapshot)
def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Seed the project with a "last-month" and a "this-month" run.

    Each run gets its metrics and an environment snapshot. The two snapshots
    differ only in the ``torch`` and ``numpy`` versions.

    Args:
        workspace: Workspace directory to write into. When ``None``, a fresh
            temporary directory is created and ``.contexta`` inside it is used.

    Returns:
        A dict with the run refs under ``last_month_run_id`` and
        ``this_month_run_id``.
    """
    if workspace is not None:
        workspace_path = Path(workspace)
    else:
        workspace_path = Path(tempfile.mkdtemp(prefix="contexta-case02-")) / ".contexta"

    workspace_config = WorkspaceConfig(root_path=workspace_path)
    ctx = Contexta(
        config=UnifiedConfig(project_name=PROJECT_NAME, workspace=workspace_config)
    )
    store = ctx.metadata_store

    run_specs = (
        # Last month: stable environment, high accuracy
        {
            "run_name": "last-month",
            "metrics": {"accuracy": 0.91, "precision": 0.893, "recall": 0.908},
            "started_at": "2025-02-15T08:00:00Z",
            "ended_at": "2025-02-15T10:30:00Z",
            "captured_at": "2025-02-15T08:01:00Z",
            "packages": {
                "torch": "2.0.0",
                "numpy": "1.24.0",
                "scikit-learn": "1.2.2",
                "transformers": "4.28.0",
                "pandas": "1.5.3",
            },
        },
        # This month: torch bumped, numpy changed, accuracy dropped
        {
            "run_name": "this-month",
            "metrics": {"accuracy": 0.87, "precision": 0.851, "recall": 0.872},
            "started_at": "2025-03-15T08:00:00Z",
            "ended_at": "2025-03-15T10:45:00Z",
            "captured_at": "2025-03-15T08:01:00Z",
            "packages": {
                "torch": "2.1.0",  # upgraded - potential culprit
                "numpy": "1.26.4",  # also changed
                "scikit-learn": "1.2.2",
                "transformers": "4.28.0",
                "pandas": "1.5.3",
            },
        },
    )

    try:
        project = Project(
            project_ref=f"project:{PROJECT_NAME}",
            name=PROJECT_NAME,
            created_at="2025-02-01T00:00:00Z",
            description="E-commerce product categorization model",
        )
        store.projects.put_project(project)

        # For each spec: write the run first, then the snapshot that links to it.
        run_refs: dict[str, str] = {}
        for spec in run_specs:
            run_name = spec["run_name"]
            run_ref = _create_run(
                store,
                ctx.record_store,
                PROJECT_NAME,
                run_name,
                started_at=spec["started_at"],
                ended_at=spec["ended_at"],
                **spec["metrics"],
            )
            _capture_environment(
                store,
                PROJECT_NAME,
                run_name,
                run_ref,
                python_version="3.11.0",
                platform="linux",
                packages=spec["packages"],
                captured_at=spec["captured_at"],
            )
            run_refs[run_name] = run_ref

        return {
            "last_month_run_id": run_refs["last-month"],
            "this_month_run_id": run_refs["this-month"],
        }
    finally:
        store.close()
def main() -> None:
    """Seed the ``.contexta`` workspace in the current directory and confirm on stdout."""
    import io
    from contextlib import redirect_stdout

    # Anything run_example writes to stdout is captured and thrown away.
    discarded_output = io.StringIO()
    with redirect_stdout(discarded_output):
        run_example(Path(".contexta"))

    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()
case02_analyze_performance_regression.py
"""Analyze previously recorded performance-regression runs."""
from pathlib import Path
from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
PROJECT_NAME = "product-categorization"
LAST_MONTH = f"run:{PROJECT_NAME}.last-month"
THIS_MONTH = f"run:{PROJECT_NAME}.this-month"

# Open the .contexta workspace in the current directory.
workspace = WorkspaceConfig(root_path=Path(".contexta"))
ctx = Contexta(config=UnifiedConfig(project_name=PROJECT_NAME, workspace=workspace))
store = ctx.metadata_store

try:
    # 1) Per-stage metric deltas between the two runs.
    comparison = ctx.compare_runs(LAST_MONTH, THIS_MONTH)
    print("Metric comparison: last-month -> this-month")
    for stage in comparison.stage_comparisons:
        for delta in stage.metric_deltas:
            if delta.delta is None:
                # Deltas without a value are not printed.
                continue
            print(f"{stage.stage_name}/{delta.metric_key}: {delta.delta:+.4f}")

    # 2) Environment diff between the two runs.
    env_diff = ctx.compare_environments(LAST_MONTH, THIS_MONTH)
    print("\nEnvironment changes:")
    print(f"Python changed: {env_diff.python_version_changed}")
    for change in env_diff.changed_packages:
        print(f"{change.key}: {change.left_value} -> {change.right_value}")

    # 3) Reproducibility audit of the newer run.
    audit = ctx.audit_reproducibility(THIS_MONTH)
    print(f"\nReproducibility: {audit.reproducibility_status}")
    print(f"Packages logged: {audit.package_count}")
finally:
    store.close()