Case 12: Tom's Delivery Quality Certificate
Persona: Tom, Forward Deployed Engineer
Situation
Tom delivers a trained model to FinanceBank Corp, whose procurement team requires a formal "Model Quality Certificate" before accepting any AI model. The certificate must document:
- Training data version used
- Evaluation metrics (exact numbers, not estimates)
- Training environment (Python version, key packages)
- Pass/fail result for each agreed quality threshold
- Overall PASS or FAIL decision
Without tooling, Tom assembles the certificate manually from training logs,
requirements.txt, and notebook outputs — 3 to 4 hours of work per delivery, plus a
review cycle whenever the numbers don't match. This delays delivery by 1 to 2 days per engagement.
Without Contexta
| Step | Manual process | Time |
|---|---|---|
| Metrics | Open training log, copy numbers into Word | 30 min |
| Environment | Check requirements.txt (may be stale), paste Python version | 20 min |
| Dataset version | Recall from memory or search Slack | 15 min |
| Threshold checks | Manual comparison in Word | 20 min |
| Client review cycle | "AUC number does not match" | +1 day |
With Contexta
# All evidence was recorded at training time; read it back rather than
# re-deriving it by hand.
snapshot = ctx.get_run_snapshot(run_ref)
audit = ctx.audit_reproducibility(run_ref)

# Threshold checks against the limits agreed with the client.
THRESHOLDS = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
for metric, threshold in THRESHOLDS.items():
    # Take the recorded value whose key matches the metric being checked.
    value = next(r.value for r in snapshot.records if r.key == metric)
    status = "PASS" if value >= threshold else "FAIL"
    print(f" {metric:<12} {value:.4f} (>= {threshold}) [{status}]")

# Formal document
report = ctx.build_snapshot_report(run_ref)
The certificate is assembled in under 12 seconds. Every number is backed by a recorded
MetricRecord rather than estimated from memory. Client review cycles shrink because the numbers
are reproducible.
Key APIs: EnvironmentSnapshot, StructuredEventRecord, get_run_snapshot, audit_reproducibility, build_snapshot_report
Complete Runnable Code
Run the seed script first, then the analysis script:
uv run examples/case_studies/case12_seed_certificate_data.py
uv run examples/case_studies/case12_analyze_certificate.py
"""Create delivery-certificate records used by the certificate case study."""
from __future__ import annotations
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any
from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
EnvironmentSnapshot,
MetricPayload,
MetricRecord,
Project,
RecordEnvelope,
Run,
StageExecution,
StructuredEventPayload,
StructuredEventRecord,
)
# Identity of the engagement this seed data belongs to.
PROJECT_NAME = "fraud-detection-financebank"
CLIENT_NAME = "FinanceBank Corp"
MODEL_NAME = "FraudShield v2.1"
DELIVERY_DATE = "2025-04-11"

# Quality thresholds agreed with the client, keyed by metric name.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
# Module-wide sequence number behind the record ids handed out below.
_REC_COUNTER = 0


def _next_rid() -> str:
    """Advance the module-wide counter and return it as an ``rNNNNN`` id.

    The number is zero-padded to at least five digits, so the first call
    after import yields ``"r00001"``.
    """
    global _REC_COUNTER
    _REC_COUNTER = _REC_COUNTER + 1
    return "r" + format(_REC_COUNTER, "05d")
def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Seed a workspace with the delivery run the certificate is built from.

    Writes one project, one completed run with a ``train`` and an
    ``evaluate`` stage, five evaluation metric records, one
    dataset-registration event and one environment snapshot.

    Args:
        workspace: Directory to use as the Contexta workspace root. When
            ``None``, a new temporary directory is created and the
            ``.contexta`` path inside it is used instead.

    Returns:
        A dict with the keys ``client``, ``model``, ``run_id`` and
        ``dataset_version`` describing what was seeded.
    """
    if workspace is not None:
        workspace_path = Path(workspace)
    else:
        workspace_path = Path(tempfile.mkdtemp(prefix="contexta-case12-")) / ".contexta"

    settings = UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=workspace_path),
    )
    ctx = Contexta(config=settings)
    store = ctx.metadata_store

    # References and timestamps shared by everything written below.
    project_ref = f"project:{PROJECT_NAME}"
    run_name = "fraud-model-v2-1"
    run_ref = f"run:{PROJECT_NAME}.{run_name}"
    started_at = "2025-04-08T09:00:00Z"
    handoff_at = "2025-04-08T13:00:00Z"  # train ends and evaluate starts here
    ended_at = "2025-04-08T15:00:00Z"

    def new_envelope(record_type: str, timestamp: str, **extra: Any) -> RecordEnvelope:
        """Build a record envelope for this run, drawing a fresh record id."""
        return RecordEnvelope(
            record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
            record_type=record_type,
            recorded_at=timestamp,
            observed_at=timestamp,
            producer_ref="contexta.case12",
            run_ref=run_ref,
            completeness_marker="complete",
            degradation_marker="none",
            **extra,
        )

    try:
        store.projects.put_project(
            Project(
                project_ref=project_ref,
                name=PROJECT_NAME,
                created_at="2025-03-01T00:00:00Z",
                description=f"Fraud detection model for {CLIENT_NAME}",
            )
        )
        store.runs.put_run(
            Run(
                run_ref=run_ref,
                project_ref=project_ref,
                name=run_name,
                status="completed",
                started_at=started_at,
                ended_at=ended_at,
            )
        )

        # Two back-to-back stages; the position in this tuple is the order index.
        stage_plan = (
            ("train", started_at, handoff_at),
            ("evaluate", handoff_at, ended_at),
        )
        stage_refs: dict[str, str] = {
            stage_name: f"stage:{PROJECT_NAME}.{run_name}.{stage_name}"
            for stage_name, _, _ in stage_plan
        }
        for position, (stage_name, begins, finishes) in enumerate(stage_plan):
            store.stages.put_stage_execution(
                StageExecution(
                    stage_execution_ref=stage_refs[stage_name],
                    run_ref=run_ref,
                    stage_name=stage_name,
                    status="completed",
                    started_at=begins,
                    ended_at=finishes,
                    order_index=position,
                )
            )

        # Evaluation metrics, attached to the evaluate stage and stamped with
        # the run's end time.
        eval_metrics: dict[str, float] = {
            "accuracy": 0.934,
            "auc": 0.961,
            "f1": 0.922,
            "precision": 0.917,
            "recall": 0.928,
        }
        record_store = ctx.record_store
        for metric_key, metric_value in eval_metrics.items():
            metric_record = MetricRecord(
                envelope=new_envelope(
                    "metric",
                    ended_at,
                    stage_execution_ref=stage_refs["evaluate"],
                ),
                payload=MetricPayload(
                    metric_key=metric_key,
                    value=metric_value,
                    value_type="float64",
                ),
            )
            record_store.append(metric_record)

        # Dataset version event (certificate item: training data version used).
        dataset_version = "fraud-transactions-2025q1-v3"
        dataset_event = StructuredEventRecord(
            envelope=new_envelope("event", started_at),
            payload=StructuredEventPayload(
                event_key="training.dataset-registered",
                level="info",
                message=f"Training dataset version: {dataset_version}",
                attributes={
                    "dataset_version": dataset_version,
                    "record_count": 2_400_000,
                    "date_range": "2024-01-01 to 2025-03-31",
                },
                origin_marker="explicit_capture",
            ),
        )
        record_store.append(dataset_event)

        # Environment snapshot (certificate item: training environment).
        env_packages = {
            "scikit-learn": "1.4.0",
            "xgboost": "2.0.3",
            "pandas": "2.2.1",
            "numpy": "1.26.4",
            "imbalanced-learn": "0.12.0",
            "shap": "0.45.0",
        }
        environment = EnvironmentSnapshot(
            environment_snapshot_ref=f"environment:{PROJECT_NAME}.{run_name}.snap",
            run_ref=run_ref,
            captured_at=started_at,
            python_version="3.11.8",
            platform="linux",
            packages=env_packages,
            environment_variables={},
        )
        store.environments.put_environment_snapshot(environment)

        return {
            "client": CLIENT_NAME,
            "model": MODEL_NAME,
            "run_id": run_ref,
            "dataset_version": dataset_version,
        }
    finally:
        # Close the metadata store whether seeding succeeded or raised.
        store.close()
def main() -> None:
    """Seed the ``.contexta`` workspace and print a one-line confirmation.

    Anything written to stdout while seeding is captured and discarded, so
    the confirmation is the only line this function prints itself.
    """
    import io
    from contextlib import redirect_stdout

    discarded_output = io.StringIO()
    with redirect_stdout(discarded_output):
        run_example(Path(".contexta"))
    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()
"""Create a quality-certificate summary from a recorded delivery run."""
from pathlib import Path
from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
# Project and run that the certificate is generated for.
PROJECT_NAME = "fraud-detection-financebank"
RUN_REF = "run:" + PROJECT_NAME + ".fraud-model-v2-1"
CLIENT_NAME = "FinanceBank Corp"

# Minimum acceptable value per metric; a metric passes when it is >= its limit.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
# Open the workspace that the seed script wrote to.
ctx = Contexta(
    config=UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=Path(".contexta")),
    )
)
store = ctx.metadata_store
try:
    snapshot = ctx.get_run_snapshot(RUN_REF)
    # Index the run's metric records by key, skipping records with no value.
    metrics = {
        record.key: float(record.value)
        for record in snapshot.records
        if record.record_type == "metric" and record.value is not None
    }
    audit = ctx.audit_reproducibility(RUN_REF)

    print(f"MODEL QUALITY CERTIFICATE - {CLIENT_NAME}")
    print(f"Run: {snapshot.run.name}")
    print(f"Python: {audit.python_version}; packages: {audit.package_count}")

    overall = True
    for key, threshold in QUALITY_THRESHOLDS.items():
        value = metrics.get(key)
        if value is None:
            # An agreed metric with no recorded value cannot be certified:
            # report it as a failed row instead of aborting with a KeyError
            # halfway through the certificate.
            overall = False
            print(f"{key:<10} {'n/a':>6} >= {threshold:.4f} [FAIL] (no recorded value)")
            continue
        passed = value >= threshold
        overall = overall and passed
        status = "PASS" if passed else "FAIL"
        print(f"{key:<10} {value:.4f} >= {threshold:.4f} [{status}]")

    report = ctx.build_snapshot_report(RUN_REF)
    print(f"Overall decision: {'PASS' if overall else 'FAIL'}")
    print(f"Report: {report.title}")
finally:
    # Close the metadata store even if building the certificate failed.
    store.close()