Skip to main content

Case 06: Elena's Compliance Audit Trail

Persona: Solutions Architect / Compliance

Situation

Elena's team delivers AI solutions to a regulated insurance client. The client's regulator audits the production model and asks five questions:

  1. What dataset version was used to train the production model?
  2. What were the training-time evaluation metrics (original numbers, not summaries)?
  3. What was the Python and library environment at training time?
  4. How does this model compare to the previous version?
  5. Who approved the deployment?

The team spent two days searching Git logs, personal Jupyter notebooks, Slack threads, and a shared drive. Some information was estimated. The auditor rejected it: "provide documented evidence."

Without Contexta

  • Dataset version: written in a notebook cell or a Slack message. Neither is auditable.
  • Eval metrics: printed to a script's stdout, which has since scrolled off or was lost when the terminal was closed.
  • Environment: requirements.txt "might be stale."
  • Model comparison: "we think it improved" — no structured diff.
  • Answers take 2 days and some are estimates. Regulators reject estimates.

With Contexta

# Q1: Dataset version
# Fetch the snapshot of the audited run, then pick the record keyed
# "training.dataset-registered".
snapshot = ctx.get_run_snapshot(curr_run_ref)
# next() is called without a default, so it raises StopIteration when no record has this key.
dataset_event = next(e for e in snapshot.records if e.key == "training.dataset-registered")

# Q2: Original evaluation metrics
# Keep only the snapshot records whose record_type is "metric".
eval_records = [o for o in snapshot.records if o.record_type == "metric"]

# Q3: Training environment
audit = ctx.audit_reproducibility(curr_run_ref)
# audit.python_version, audit.platform, audit.package_count

# Q4: Comparison with previous version
# Both calls take the previous run first, then the current run.
env_diff = ctx.compare_environments(prev_run_ref, curr_run_ref)
comp = ctx.compare_runs(prev_run_ref, curr_run_ref)

# Q5: Formal audit document
# NOTE(review): question 5 asks who approved the deployment; confirm that the
# snapshot report actually carries approval details.
report = ctx.build_snapshot_report(curr_run_ref)

All answers are backed by records written at training time — not reconstructed. The audit package assembles in under 5 seconds.

Key APIs: EnvironmentSnapshot, get_run_snapshot, audit_reproducibility, compare_environments, compare_runs, build_snapshot_report


Complete Runnable Code

Run the seed script first, then the analysis script:

uv run examples/case_studies/case06_seed_compliance_audit_data.py
uv run examples/case_studies/case06_analyze_compliance_audit.py
case06_seed_compliance_audit_data.py
"""Create compliance-audit records used by the audit case study."""

from __future__ import annotations

import tempfile
from pathlib import Path
from typing import Any

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
DeploymentExecution,
EnvironmentSnapshot,
MetricPayload,
MetricRecord,
Project,
RecordEnvelope,
Run,
StageExecution,
StructuredEventPayload,
StructuredEventRecord,
)

# Project name embedded in every reference string this script builds.
PROJECT_NAME = "loss-ratio-predictor"
# Module-wide record counter; _next_rid() increments it to mint record ids.
_rid = 0


def _next_rid() -> str:
    """Advance the module-wide record counter and return its new id.

    Ids are ``r`` followed by the counter zero-padded to at least four
    digits (``r0001``, ``r0002``, ...). The counter is the module global
    ``_rid``, which starts at zero each time the module is loaded.
    """
    global _rid
    _rid = _rid + 1
    return f"r{_rid:04d}"


def _put_metric(record_store: Any, run_name: str, stage_ref: str, key: str, val: float, ts: str) -> None:
    """Append one ``float64`` metric record for a stage of a run.

    Args:
        record_store: Store whose ``append`` receives the new record.
        run_name: Run name; combined with ``PROJECT_NAME`` to form the run
            reference and the record reference.
        stage_ref: Stage execution reference the metric is attached to.
        key: Metric key stored in the payload.
        val: Metric value stored in the payload.
        ts: Timestamp used for both ``recorded_at`` and ``observed_at``.
    """
    owning_run = f"run:{PROJECT_NAME}.{run_name}"

    # Building the envelope consumes the next sequential record id.
    envelope = RecordEnvelope(
        record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
        record_type="metric",
        recorded_at=ts,
        observed_at=ts,
        producer_ref="contexta.case06",
        run_ref=owning_run,
        stage_execution_ref=stage_ref,
        completeness_marker="complete",
        degradation_marker="none",
    )
    measurement = MetricPayload(
        metric_key=key,
        value=val,
        value_type="float64",
    )
    record_store.append(MetricRecord(envelope=envelope, payload=measurement))


def _make_run_with_env(
    store: Any,
    record_store: Any,
    run_name: str,
    dataset_version: str,
    metrics: dict[str, float],
    python_version: str,
    packages: dict[str, str],
    started_at: str,
    ended_at: str,
) -> str:
    """Seed one completed run with its stages, metrics, dataset event, and environment.

    Writes, in order: the run, a ``train`` and an ``evaluate`` stage
    execution, one metric record per entry of ``metrics`` (attached to the
    evaluate stage), a ``training.dataset-registered`` event carrying the
    dataset version, and an environment snapshot.

    Args:
        store: Metadata store; its ``runs``, ``stages`` and ``environments``
            attributes receive the run, stage, and snapshot writes.
        record_store: Record store that receives the metric and event records.
        run_name: Short run name used to derive every reference.
        dataset_version: Dataset identifier recorded in the event message
            and attributes.
        metrics: Metric key to value mapping, recorded at ``ended_at``.
        python_version: Interpreter version stored in the snapshot.
        packages: Package name to version mapping; fixed ``scikit-learn``
            and ``pandas`` entries are always layered on top of it.
        started_at: Run start timestamp. Its first ten characters are reused
            as the date of the 14:00 boundary between the two stages.
        ended_at: Run end timestamp.

    Returns:
        The run reference, ``run:<PROJECT_NAME>.<run_name>``.
    """
    run_ref = f"run:{PROJECT_NAME}.{run_name}"

    completed_run = Run(
        run_ref=run_ref,
        project_ref=f"project:{PROJECT_NAME}",
        name=run_name,
        status="completed",
        started_at=started_at,
        ended_at=ended_at,
    )
    store.runs.put_run(completed_run)

    # Training stage, then evaluate stage: training ends and evaluation
    # begins at 14:00 on the date taken from the run's start timestamp.
    handoff_at = f"{started_at[:10]}T14:00:00Z"
    train_ref = f"stage:{PROJECT_NAME}.{run_name}.train"
    eval_ref = f"stage:{PROJECT_NAME}.{run_name}.evaluate"
    stage_plan = (
        (train_ref, "train", started_at, handoff_at),
        (eval_ref, "evaluate", handoff_at, ended_at),
    )
    for position, (stage_ref, stage_name, begins, finishes) in enumerate(stage_plan):
        store.stages.put_stage_execution(
            StageExecution(
                stage_execution_ref=stage_ref,
                run_ref=run_ref,
                stage_name=stage_name,
                status="completed",
                started_at=begins,
                ended_at=finishes,
                order_index=position,
            )
        )

    # Metrics belong to the evaluate stage and are stamped with the run end time.
    for metric_key, metric_value in metrics.items():
        _put_metric(record_store, run_name, eval_ref, metric_key, metric_value, ended_at)

    # Record dataset version as a structured event (answers auditor question 1)
    dataset_envelope = RecordEnvelope(
        record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
        record_type="event",
        recorded_at=started_at,
        observed_at=started_at,
        producer_ref="contexta.case06",
        run_ref=run_ref,
        completeness_marker="complete",
        degradation_marker="none",
    )
    dataset_payload = StructuredEventPayload(
        event_key="training.dataset-registered",
        level="info",
        message=f"Training dataset version: {dataset_version}",
        attributes={"dataset_version": dataset_version},
        origin_marker="explicit_capture",
    )
    record_store.append(
        StructuredEventRecord(envelope=dataset_envelope, payload=dataset_payload)
    )

    # Environment snapshot (answers auditor question 3)
    pinned = {"scikit-learn": "1.3.0", "pandas": "2.0.3"}
    snapshot = EnvironmentSnapshot(
        environment_snapshot_ref=f"environment:{PROJECT_NAME}.{run_name}.snap",
        run_ref=run_ref,
        captured_at=started_at,
        python_version=python_version,
        platform="linux",
        packages={**packages, **pinned},
        environment_variables={},
    )
    store.environments.put_environment_snapshot(snapshot)

    return run_ref


def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Seed the project, two model runs, and their production deployments.

    The metadata store is closed before returning, whether or not seeding
    succeeded.

    Args:
        workspace: Workspace root directory. When ``None``, a ``.contexta``
            path inside a freshly created temporary directory is used.

    Returns:
        A dict with ``previous_run_id``, ``current_run_id`` and
        ``deployment_ids`` (the v1 and v2 deployment references, in that
        order).
    """
    project_ref = f"project:{PROJECT_NAME}"
    v1_deployment_ref = f"deployment:{PROJECT_NAME}.prod-v1"
    v2_deployment_ref = f"deployment:{PROJECT_NAME}.prod-v2"

    root = (
        Path(tempfile.mkdtemp(prefix="contexta-case06-")) / ".contexta"
        if workspace is None
        else workspace
    )
    settings = UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=Path(root)),
    )
    ctx = Contexta(config=settings)
    store = ctx.metadata_store

    try:
        project = Project(
            project_ref=project_ref,
            name=PROJECT_NAME,
            created_at="2025-01-01T00:00:00Z",
        )
        store.projects.put_project(project)

        # Previous model (v1) -- deployed March
        prev_run_ref = _make_run_with_env(
            store,
            ctx.record_store,
            run_name="model-v1",
            dataset_version="claims-2024q4",
            metrics={"auc": 0.871, "precision": 0.834, "recall": 0.819, "f1": 0.826},
            python_version="3.10.12",
            packages={"torch": "1.13.0", "xgboost": "1.7.6"},
            started_at="2025-03-01T09:00:00Z",
            ended_at="2025-03-01T16:00:00Z",
        )
        v1_deployment = DeploymentExecution(
            deployment_execution_ref=v1_deployment_ref,
            project_ref=project_ref,
            deployment_name="prod-v1",
            status="completed",
            started_at="2025-03-02T10:00:00Z",
            ended_at="2025-03-02T10:15:00Z",
            run_ref=prev_run_ref,
            order_index=0,
        )
        store.deployments.put_deployment_execution(v1_deployment)

        # Current production model (v2) -- deployed June, under audit
        curr_run_ref = _make_run_with_env(
            store,
            ctx.record_store,
            run_name="model-v2",
            dataset_version="claims-2025q1",
            metrics={"auc": 0.891, "precision": 0.852, "recall": 0.841, "f1": 0.846},
            python_version="3.11.5",
            packages={"torch": "2.0.1", "xgboost": "2.0.0"},
            started_at="2025-06-01T09:00:00Z",
            ended_at="2025-06-01T16:00:00Z",
        )
        v2_deployment = DeploymentExecution(
            deployment_execution_ref=v2_deployment_ref,
            project_ref=project_ref,
            deployment_name="prod-v2",
            status="completed",
            started_at="2025-06-02T10:00:00Z",
            ended_at="2025-06-02T10:15:00Z",
            run_ref=curr_run_ref,
            order_index=1,
        )
        store.deployments.put_deployment_execution(v2_deployment)

        return {
            "previous_run_id": prev_run_ref,
            "current_run_id": curr_run_ref,
            "deployment_ids": [v1_deployment_ref, v2_deployment_ref],
        }
    finally:
        store.close()


def main() -> None:
    """Seed ``.contexta`` in the current directory and print a confirmation.

    Anything ``run_example`` writes to stdout is captured in an in-memory
    buffer and discarded, so the only output is the final summary line.
    """
    import io
    from contextlib import redirect_stdout

    discarded = io.StringIO()
    with redirect_stdout(discarded):
        run_example(Path(".contexta"))

    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()
case06_analyze_compliance_audit.py
"""Assemble audit evidence from previously recorded model runs."""

from pathlib import Path

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig


PROJECT_NAME = "loss-ratio-predictor"
PREVIOUS_RUN = f"run:{PROJECT_NAME}.model-v1"
CURRENT_RUN = f"run:{PROJECT_NAME}.model-v2"

# Open the ".contexta" workspace in the current directory.
workspace = WorkspaceConfig(root_path=Path(".contexta"))
ctx = Contexta(config=UnifiedConfig(project_name=PROJECT_NAME, workspace=workspace))
store = ctx.metadata_store

try:
    # Dataset version and original metric values of the audited run.
    snapshot = ctx.get_run_snapshot(CURRENT_RUN)
    print(f"Audit target: {snapshot.run.name}")
    for record in snapshot.records:
        kind = record.record_type
        if kind == "event" and record.key == "training.dataset-registered":
            print(f"Dataset: {record.message}")
        elif kind == "metric":
            print(f"{record.key}: {record.value:.4f}")

    # Training environment of the audited run.
    audit = ctx.audit_reproducibility(CURRENT_RUN)
    print(f"\nEnvironment: python={audit.python_version}, packages={audit.package_count}")

    # Per-stage metric changes from the previous run to the current one;
    # deltas that are None are skipped.
    print("\nMetric deltas: model-v1 -> model-v2")
    comparison = ctx.compare_runs(PREVIOUS_RUN, CURRENT_RUN)
    for stage in comparison.stage_comparisons:
        for delta in stage.metric_deltas:
            if delta.delta is None:
                continue
            print(f"{stage.stage_name}/{delta.metric_key}: {delta.delta:+.4f}")

    # Snapshot report for the audited run.
    report = ctx.build_snapshot_report(CURRENT_RUN)
    print(f"\nReport: {report.title}")
finally:
    # Close the metadata store even if any lookup above failed.
    store.close()