Skip to main content

Case 12: Tom's Delivery Quality Certificate

Persona: Tom, Forward Deployed Engineer

Situation

Tom delivers a trained model to FinanceBank Corp, whose procurement team requires a formal "Model Quality Certificate" before accepting any AI model. The certificate must document:

  1. Training data version used
  2. Evaluation metrics (exact numbers, not estimates)
  3. Training environment (Python version, key packages)
  4. Pass/fail result for each agreed quality threshold
  5. Overall PASS or FAIL decision

Without tooling, Tom assembles the certificate manually from training logs, requirements.txt, and notebook outputs — 3 to 4 hours of work per delivery, plus a review cycle when numbers don't match. This delays delivery by 1 to 2 days per engagement.

Without Contexta

| Step | Manual process | Time |
| --- | --- | --- |
| Metrics | Open training log, copy numbers into Word | 30 min |
| Environment | Check requirements.txt (may be stale), paste Python version | 20 min |
| Dataset version | Recall from memory or search Slack | 15 min |
| Threshold checks | Manual comparison in Word | 20 min |
| Client review cycle | "AUC number does not match" | +1 day |

With Contexta

# All evidence recorded at training time
# (`ctx` is a configured Contexta instance and `run_ref` the delivered run's
# reference; both are created in the complete scripts below.)
snapshot = ctx.get_run_snapshot(run_ref)
audit = ctx.audit_reproducibility(run_ref)

# Threshold checks: the minimum acceptable value for each agreed metric
THRESHOLDS = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
for metric, threshold in THRESHOLDS.items():
    # Take the first recorded value whose key matches the metric name.
    value = next(r.value for r in snapshot.records if r.key == metric)
    status = "PASS" if value >= threshold else "FAIL"
    print(f" {metric:<12} {value:.4f} (>= {threshold}) [{status}]")

# Formal document
report = ctx.build_snapshot_report(run_ref)

The certificate assembles in under 12 seconds. Every number is backed by a recorded MetricRecord — not estimated from memory. Client review cycles shrink because the numbers are reproducible.

Key APIs: EnvironmentSnapshot, StructuredEventRecord, get_run_snapshot, audit_reproducibility, build_snapshot_report


Complete Runnable Code

Run the seed script first, then the analysis script:

uv run examples/case_studies/case12_seed_certificate_data.py
uv run examples/case_studies/case12_analyze_certificate.py
case12_seed_certificate_data.py
"""Create delivery-certificate records used by the certificate case study."""

from __future__ import annotations

import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
EnvironmentSnapshot,
MetricPayload,
MetricRecord,
Project,
RecordEnvelope,
Run,
StageExecution,
StructuredEventPayload,
StructuredEventRecord,
)


# Project under which every seeded record is filed.
PROJECT_NAME = "fraud-detection-financebank"

# Minimum acceptable value per metric, as agreed with the client.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}

# Delivery details for this engagement.
CLIENT_NAME = "FinanceBank Corp"
MODEL_NAME = "FraudShield v2.1"
DELIVERY_DATE = "2025-04-11"

# Process-local sequence number backing _next_rid(); starts over at 0 on
# every import, so generated ids are only unique within a single process.
_REC_COUNTER = 0


def _next_rid() -> str:
    """Return the next record id suffix: ``r00001``, ``r00002``, ...

    Increments the module-level ``_REC_COUNTER`` and formats it as ``r``
    followed by the counter zero-padded to at least five digits.
    """
    global _REC_COUNTER
    _REC_COUNTER += 1
    return f"r{_REC_COUNTER:05d}"


def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Create one run with metrics, dataset, and environment records.

    Seeds a single completed run (``fraud-model-v2-1``) with a ``train`` and
    an ``evaluate`` stage, five evaluation metric records, one structured
    event naming the training dataset version, and one environment snapshot.

    Args:
        workspace: Directory to use as the Contexta workspace root. When
            ``None``, a fresh temporary directory is created with
            ``tempfile.mkdtemp`` and ``.contexta`` beneath it is used; that
            temporary directory is not removed by this function.

    Returns:
        A dict with the keys ``client``, ``model``, ``run_id`` and
        ``dataset_version`` describing what was seeded.
    """
    if workspace is None:
        root = Path(tempfile.mkdtemp(prefix="contexta-case12-"))
        workspace_path = root / ".contexta"
    else:
        workspace_path = Path(workspace)

    ctx = Contexta(
        config=UnifiedConfig(
            project_name=PROJECT_NAME,
            workspace=WorkspaceConfig(root_path=workspace_path),
        )
    )

    store = ctx.metadata_store
    try:
        store.projects.put_project(
            Project(
                project_ref=f"project:{PROJECT_NAME}",
                name=PROJECT_NAME,
                created_at="2025-03-01T00:00:00Z",
                description=f"Fraud detection model for {CLIENT_NAME}",
            )
        )

        run_name = "fraud-model-v2-1"
        run_ref = f"run:{PROJECT_NAME}.{run_name}"
        started_at = "2025-04-08T09:00:00Z"
        ended_at = "2025-04-08T15:00:00Z"

        store.runs.put_run(
            Run(
                run_ref=run_ref,
                project_ref=f"project:{PROJECT_NAME}",
                name=run_name,
                status="completed",
                started_at=started_at,
                ended_at=ended_at,
            )
        )

        train_ref = f"stage:{PROJECT_NAME}.{run_name}.train"
        eval_ref = f"stage:{PROJECT_NAME}.{run_name}.evaluate"

        # Two back-to-back stages: train ends at 13:00, evaluate starts then.
        store.stages.put_stage_execution(
            StageExecution(
                stage_execution_ref=train_ref,
                run_ref=run_ref,
                stage_name="train",
                status="completed",
                started_at=started_at,
                ended_at="2025-04-08T13:00:00Z",
                order_index=0,
            )
        )
        store.stages.put_stage_execution(
            StageExecution(
                stage_execution_ref=eval_ref,
                run_ref=run_ref,
                stage_name="evaluate",
                status="completed",
                started_at="2025-04-08T13:00:00Z",
                ended_at=ended_at,
                order_index=1,
            )
        )

        # Evaluation metrics
        eval_metrics: dict[str, float] = {
            "accuracy": 0.934,
            "auc": 0.961,
            "f1": 0.922,
            "precision": 0.917,
            "recall": 0.928,
        }
        record_store = ctx.record_store
        # Metrics are stamped with the run's end time and attached to the
        # evaluate stage.
        obs_ts = ended_at
        for key, val in eval_metrics.items():
            record_store.append(
                MetricRecord(
                    envelope=RecordEnvelope(
                        record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
                        record_type="metric",
                        recorded_at=obs_ts,
                        observed_at=obs_ts,
                        producer_ref="contexta.case12",
                        run_ref=run_ref,
                        stage_execution_ref=eval_ref,
                        completeness_marker="complete",
                        degradation_marker="none",
                    ),
                    payload=MetricPayload(
                        metric_key=key,
                        value=val,
                        value_type="float64",
                    ),
                )
            )

        # Dataset version event (answers client Q1)
        dataset_version = "fraud-transactions-2025q1-v3"
        record_store.append(
            StructuredEventRecord(
                envelope=RecordEnvelope(
                    record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
                    record_type="event",
                    recorded_at=started_at,
                    observed_at=started_at,
                    producer_ref="contexta.case12",
                    run_ref=run_ref,
                    completeness_marker="complete",
                    degradation_marker="none",
                ),
                payload=StructuredEventPayload(
                    event_key="training.dataset-registered",
                    level="info",
                    message=f"Training dataset version: {dataset_version}",
                    attributes={
                        "dataset_version": dataset_version,
                        "record_count": 2_400_000,
                        "date_range": "2024-01-01 to 2025-03-31",
                    },
                    origin_marker="explicit_capture",
                ),
            )
        )

        # Environment snapshot (answers client Q3)
        env_packages = {
            "scikit-learn": "1.4.0",
            "xgboost": "2.0.3",
            "pandas": "2.2.1",
            "numpy": "1.26.4",
            "imbalanced-learn": "0.12.0",
            "shap": "0.45.0",
        }
        store.environments.put_environment_snapshot(
            EnvironmentSnapshot(
                environment_snapshot_ref=f"environment:{PROJECT_NAME}.{run_name}.snap",
                run_ref=run_ref,
                captured_at=started_at,
                python_version="3.11.8",
                platform="linux",
                packages=env_packages,
                environment_variables={},
            )
        )

        return {
            "client": CLIENT_NAME,
            "model": MODEL_NAME,
            "run_id": run_ref,
            "dataset_version": dataset_version,
        }
    finally:
        # Close the metadata store whether seeding succeeded or raised.
        store.close()


def main() -> None:
    """Seed the case-study data into ``.contexta`` under the current directory.

    Runs ``run_example`` against the relative path ``.contexta`` and then
    prints a single confirmation line.
    """
    from contextlib import redirect_stdout
    import io

    # Discard anything written to stdout while seeding so the confirmation
    # below is the script's only output.
    with redirect_stdout(io.StringIO()):
        run_example(Path(".contexta"))

    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()
case12_analyze_certificate.py
"""Create a quality-certificate summary from a recorded delivery run."""

from pathlib import Path

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig

# Project and client the certificate is issued for.
PROJECT_NAME = "fraud-detection-financebank"
CLIENT_NAME = "FinanceBank Corp"

# Minimum acceptable value per metric; a metric passes when it is >= its entry.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}

# Reference of the delivered run to certify.
RUN_REF = f"run:{PROJECT_NAME}.fraud-model-v2-1"

# Open the workspace in ./.contexta (the path the seed script writes to).
ctx = Contexta(
    config=UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=Path(".contexta")),
    )
)

store = ctx.metadata_store
try:
    snapshot = ctx.get_run_snapshot(RUN_REF)
    # Metric key -> recorded value, skipping metric records without a value.
    # If a key occurs more than once, the last record seen wins.
    metrics = {
        record.key: float(record.value)
        for record in snapshot.records
        if record.record_type == "metric" and record.value is not None
    }
    audit = ctx.audit_reproducibility(RUN_REF)

    print(f"MODEL QUALITY CERTIFICATE - {CLIENT_NAME}")
    print(f"Run: {snapshot.run.name}")
    print(f"Python: {audit.python_version}; packages: {audit.package_count}")

    overall = True
    for key, threshold in QUALITY_THRESHOLDS.items():
        value = metrics.get(key)
        if value is None:
            # An agreed metric that was never recorded cannot be certified:
            # report it as a failed check instead of aborting with KeyError.
            overall = False
            print(f"{key:<10} MISSING >= {threshold:.4f} [FAIL]")
            continue
        passed = value >= threshold
        overall = overall and passed
        status = "PASS" if passed else "FAIL"
        print(f"{key:<10} {value:.4f} >= {threshold:.4f} [{status}]")

    report = ctx.build_snapshot_report(RUN_REF)
    print(f"Overall decision: {'PASS' if overall else 'FAIL'}")
    print(f"Report: {report.title}")
finally:
    # Close the metadata store even if the snapshot or report lookup raised.
    store.close()