Case 12: Tom's Delivery Quality Certificate

Persona: Tom, Forward Deployed Engineer

Situation

Tom delivers a trained model to FinanceBank Corp, whose procurement team requires a formal "Model Quality Certificate" before accepting any AI model. The certificate must document:

Training data version used
Evaluation metrics (exact numbers, not estimates)
Training environment (Python version, key packages)
Pass/fail result for each agreed quality threshold
Overall PASS or FAIL decision

Without tooling, Tom assembles the certificate manually from training logs, requirements.txt, and notebook outputs — 3 to 4 hours of work per delivery, plus a review cycle when numbers don't match. This delays delivery by 1 to 2 days per engagement.

Without Contexta

| Step | Manual process | Time |
| --- | --- | --- |
| Metrics | Open training log, copy numbers into Word | 30 min |
| Environment | Check `requirements.txt` (may be stale), paste Python version | 20 min |
| Dataset version | Recall from memory or search Slack | 15 min |
| Threshold checks | Manual comparison in Word | 20 min |
| Client review cycle | "AUC number does not match" | +1 day |

With Contexta

# All evidence recorded at training time
snapshot = ctx.get_run_snapshot(run_ref)
audit    = ctx.audit_reproducibility(run_ref)

# Index the recorded metric values once; non-metric records and metrics
# without a value are skipped.
metrics = {
    r.key: float(r.value)
    for r in snapshot.records
    if r.record_type == "metric" and r.value is not None
}

# Threshold checks
THRESHOLDS = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
for metric, threshold in THRESHOLDS.items():
    value = metrics.get(metric)
    if value is None:
        # A threshold with no recorded value cannot be certified, so it
        # is reported as a failure instead of raising.
        print(f"  {metric:<12} MISSING  (>= {threshold})  [FAIL]")
        continue
    status = "PASS" if value >= threshold else "FAIL"
    print(f"  {metric:<12} {value:.4f}  (>= {threshold})  [{status}]")

# Formal document
report = ctx.build_snapshot_report(run_ref)

The certificate is assembled in under 12 seconds. Every number is backed by a recorded MetricRecord rather than estimated from memory. Client review cycles shrink because the numbers are reproducible.

Key APIs: EnvironmentSnapshot, StructuredEventRecord, get_run_snapshot, audit_reproducibility, build_snapshot_report

Complete Runnable Code

Run the seed script first, then the analysis script:

uv run examples/case_studies/case12_seed_certificate_data.py
uv run examples/case_studies/case12_analyze_certificate.py

case12_seed_certificate_data.py
"""Create delivery-certificate records used by the certificate case study."""

from __future__ import annotations

import tempfile
from datetime import datetime
from pathlib import Path
from typing import Any

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig
from contexta.contract import (
    EnvironmentSnapshot,
    MetricPayload,
    MetricRecord,
    Project,
    RecordEnvelope,
    Run,
    StageExecution,
    StructuredEventPayload,
    StructuredEventRecord,
)


# Project name; it is embedded in every project, run, stage, record and
# environment reference built by run_example().
PROJECT_NAME = "fraud-detection-financebank"

# Quality thresholds agreed with the client
# NOTE(review): not referenced by the seeding code visible in this script;
# presumably kept here to mirror the analysis script - confirm.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy":  0.90,
    "auc":       0.93,
    "f1":        0.88,
    "precision": 0.87,
    "recall":    0.86,
}

# Client name; used in the project description and in run_example()'s result.
CLIENT_NAME = "FinanceBank Corp"
# Model name reported in run_example()'s result.
MODEL_NAME = "FraudShield v2.1"
# NOTE(review): not referenced by the code visible in this script - confirm
# whether it is still needed.
DELIVERY_DATE = "2025-04-11"

# Number of record ids handed out so far in this process.
_REC_COUNTER = 0


def _next_rid() -> str:
    """Return the next record id: ``r`` followed by a zero-padded counter.

    The counter is module-level state, so ids keep increasing across calls
    for the lifetime of the process (``r00001``, ``r00002``, ...).
    """
    global _REC_COUNTER
    sequence = _REC_COUNTER + 1
    _REC_COUNTER = sequence
    return f"r{sequence:05d}"


def run_example(workspace: Path | str | None = None) -> dict[str, Any]:
    """Seed one completed delivery run for the certificate case study.

    Writes, in this order: the project, one run, its ``train`` and
    ``evaluate`` stage executions, five evaluation metric records, one
    dataset-registration event, and one environment snapshot.

    Args:
        workspace: Workspace root to write into. When ``None``, a new
            temporary directory is created and its ``.contexta``
            sub-directory is used; this function does not remove it.

    Returns:
        A dict with the keys ``client``, ``model``, ``run_id`` and
        ``dataset_version`` describing what was seeded.
    """

    if workspace is not None:
        workspace_path = Path(workspace)
    else:
        scratch_dir = Path(tempfile.mkdtemp(prefix="contexta-case12-"))
        workspace_path = scratch_dir / ".contexta"

    config = UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=workspace_path),
    )
    ctx = Contexta(config=config)

    store = ctx.metadata_store
    try:
        project_ref = f"project:{PROJECT_NAME}"
        project = Project(
            project_ref=project_ref,
            name=PROJECT_NAME,
            created_at="2025-03-01T00:00:00Z",
            description=f"Fraud detection model for {CLIENT_NAME}",
        )
        store.projects.put_project(project)

        run_name = "fraud-model-v2-1"
        run_ref = f"run:{PROJECT_NAME}.{run_name}"
        started_at = "2025-04-08T09:00:00Z"
        ended_at = "2025-04-08T15:00:00Z"
        # Training ends and evaluation starts at the same instant.
        handover_at = "2025-04-08T13:00:00Z"

        run = Run(
            run_ref=run_ref,
            project_ref=project_ref,
            name=run_name,
            status="completed",
            started_at=started_at,
            ended_at=ended_at,
        )
        store.runs.put_run(run)

        train_ref = f"stage:{PROJECT_NAME}.{run_name}.train"
        eval_ref = f"stage:{PROJECT_NAME}.{run_name}.evaluate"

        # (stage ref, stage name, start, end); list position is the order index.
        stage_plan = (
            (train_ref, "train", started_at, handover_at),
            (eval_ref, "evaluate", handover_at, ended_at),
        )
        for position, (stage_ref, stage_name, stage_start, stage_end) in enumerate(stage_plan):
            store.stages.put_stage_execution(
                StageExecution(
                    stage_execution_ref=stage_ref,
                    run_ref=run_ref,
                    stage_name=stage_name,
                    status="completed",
                    started_at=stage_start,
                    ended_at=stage_end,
                    order_index=position,
                )
            )

        # Evaluation metrics, attached to the evaluate stage and timestamped
        # at the end of the run.
        eval_metrics: dict[str, float] = {
            "accuracy":  0.934,
            "auc":       0.961,
            "f1":        0.922,
            "precision": 0.917,
            "recall":    0.928,
        }
        record_store = ctx.record_store
        for metric_key, metric_value in eval_metrics.items():
            metric_envelope = RecordEnvelope(
                record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
                record_type="metric",
                recorded_at=ended_at,
                observed_at=ended_at,
                producer_ref="contexta.case12",
                run_ref=run_ref,
                stage_execution_ref=eval_ref,
                completeness_marker="complete",
                degradation_marker="none",
            )
            metric_payload = MetricPayload(
                metric_key=metric_key,
                value=metric_value,
                value_type="float64",
            )
            record_store.append(
                MetricRecord(envelope=metric_envelope, payload=metric_payload)
            )

        # Dataset version event (answers client Q1); recorded at run start
        # and not attached to a stage.
        dataset_version = "fraud-transactions-2025q1-v3"
        event_envelope = RecordEnvelope(
            record_ref=f"record:{PROJECT_NAME}.{run_name}.{_next_rid()}",
            record_type="event",
            recorded_at=started_at,
            observed_at=started_at,
            producer_ref="contexta.case12",
            run_ref=run_ref,
            completeness_marker="complete",
            degradation_marker="none",
        )
        event_payload = StructuredEventPayload(
            event_key="training.dataset-registered",
            level="info",
            message=f"Training dataset version: {dataset_version}",
            attributes={
                "dataset_version": dataset_version,
                "record_count": 2_400_000,
                "date_range": "2024-01-01 to 2025-03-31",
            },
            origin_marker="explicit_capture",
        )
        record_store.append(
            StructuredEventRecord(envelope=event_envelope, payload=event_payload)
        )

        # Environment snapshot (answers client Q3)
        env_packages = {
            "scikit-learn": "1.4.0",
            "xgboost": "2.0.3",
            "pandas": "2.2.1",
            "numpy": "1.26.4",
            "imbalanced-learn": "0.12.0",
            "shap": "0.45.0",
        }
        environment = EnvironmentSnapshot(
            environment_snapshot_ref=f"environment:{PROJECT_NAME}.{run_name}.snap",
            run_ref=run_ref,
            captured_at=started_at,
            python_version="3.11.8",
            platform="linux",
            packages=env_packages,
            environment_variables={},
        )
        store.environments.put_environment_snapshot(environment)

        summary = {
            "client": CLIENT_NAME,
            "model": MODEL_NAME,
            "run_id": run_ref,
            "dataset_version": dataset_version,
        }
        return summary
    finally:
        # Close the metadata store whether or not seeding succeeded.
        store.close()


def main() -> None:
    """Seed the local ``.contexta`` workspace and print a confirmation line.

    Anything ``run_example`` writes to stdout is captured and discarded, so
    the confirmation is the only output.
    """
    import io
    from contextlib import redirect_stdout

    discarded = io.StringIO()
    with redirect_stdout(discarded):
        run_example(Path(".contexta"))

    print(f"Seeded {PROJECT_NAME} data in .contexta.")


if __name__ == "__main__":
    main()

case12_analyze_certificate.py
"""Create a quality-certificate summary from a recorded delivery run."""

from pathlib import Path

from contexta import Contexta
from contexta.config import UnifiedConfig, WorkspaceConfig

# Project whose run is certified; RUN_REF below is derived from it.
PROJECT_NAME = "fraud-detection-financebank"
# Client name printed in the certificate header.
CLIENT_NAME = "FinanceBank Corp"
# Minimum acceptable value per metric; a metric passes when its recorded
# value is greater than or equal to its threshold.
QUALITY_THRESHOLDS: dict[str, float] = {
    "accuracy": 0.90,
    "auc": 0.93,
    "f1": 0.88,
    "precision": 0.87,
    "recall": 0.86,
}
# Reference of the delivery run to certify.
RUN_REF = f"run:{PROJECT_NAME}.fraud-model-v2-1"

# Open the workspace in the current directory (the one the seed script's
# main() writes to).
ctx = Contexta(
    config=UnifiedConfig(
        project_name=PROJECT_NAME,
        workspace=WorkspaceConfig(root_path=Path(".contexta")),
    )
)

store = ctx.metadata_store
try:
    snapshot = ctx.get_run_snapshot(RUN_REF)
    # Index recorded metric values by key; non-metric records and metrics
    # without a value are skipped.
    metrics = {
        record.key: float(record.value)
        for record in snapshot.records
        if record.record_type == "metric" and record.value is not None
    }
    audit = ctx.audit_reproducibility(RUN_REF)

    print(f"MODEL QUALITY CERTIFICATE - {CLIENT_NAME}")
    print(f"Run: {snapshot.run.name}")
    print(f"Python: {audit.python_version}; packages: {audit.package_count}")

    overall = True
    for key, threshold in QUALITY_THRESHOLDS.items():
        value = metrics.get(key)
        if value is None:
            # An agreed threshold with no recorded metric cannot be
            # certified: report it as FAIL and keep going, so the
            # certificate still ends with an overall decision instead of
            # a KeyError traceback.
            overall = False
            print(f"{key:<10} MISSING >= {threshold:.4f} [FAIL]")
            continue
        passed = value >= threshold
        overall = overall and passed
        status = "PASS" if passed else "FAIL"
        print(f"{key:<10} {value:.4f} >= {threshold:.4f} [{status}]")

    report = ctx.build_snapshot_report(RUN_REF)
    print(f"Overall decision: {'PASS' if overall else 'FAIL'}")
    print(f"Report: {report.title}")
finally:
    # Close the metadata store even if building the certificate failed.
    store.close()

Situation​

Without Contexta​

With Contexta​

Complete Runnable Code​

Situation

Without Contexta

With Contexta

Complete Runnable Code