Capture Evidence

Use this page when deciding what belongs inside a run. Observability evidence should be produced by a training, evaluation, or inference action that truly occurred.

What To Capture

Evidence	Why it matters
dataset or input reference	identifies the input behind measured results
stages and batches	locates training and evaluation work
computed metrics	supports comparisons and release gates
events and usage	records calls, choices, and irregularities
artifacts	connects observations to a trained output or evaluation asset

Executable Examples

Each tab is a self-contained program that writes observed results to a local .contexta/ workspace. Copy one tab into the file shown below, install its dependencies, and run it from the directory where you want the workspace to be created.

Example	Save as	Install	What the run records
Machine Learning	`capture_regression.py`	`uv add "contexta[sklearn]"`	dataset event, measured `r2` and `mae`, fitted model artifact
Deep Learning	`capture_cnn.py`	`uv add "contexta[sklearn,torch]"`	epoch loss, validation sample/accuracy, trained checkpoint, selection event
LLM	`capture_llm.py`	`uv add contexta`	local mock API response events, sample metrics, measured pass rate, selection event

Machine Learning
Deep Learning
LLM

"""Train a real regression model and capture its measured evidence."""

import pickle
from pathlib import Path

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from contexta import Contexta
from contexta.capture import LocalJsonlSink


# Load the dataset and hold out 20% of the rows for evaluation.
features, targets = load_diabetes(return_X_y=True)
split = train_test_split(features, targets, test_size=0.2, random_state=42)
train_x, test_x, train_y, test_y = split

model = LinearRegression()

# Open the local workspace and pick out the JSONL sink so the records file
# can be reported once the run has finished.
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "diabetes-regression"})
local_sink = next(
    candidate for candidate in ctx.sinks if isinstance(candidate, LocalJsonlSink)
)

with ctx.run("linear-regression", dataset_ref="dataset:sklearn.diabetes") as run:
    # Describe the input behind the measured results.
    row_count = len(features)
    feature_count = features.shape[1]
    run.event(
        "dataset.loaded",
        message="Loaded the scikit-learn diabetes dataset",
        attributes={"rows": row_count, "features": feature_count},
    )

    with run.stage("train"):
        model.fit(train_x, train_y)

    with run.stage("evaluate") as stage:
        # Both metrics are computed from predictions on the held-out split.
        predictions = model.predict(test_x)
        r2 = r2_score(test_y, predictions)
        mae = mean_absolute_error(test_y, predictions)
        stage.metric("r2", r2, unit="ratio")
        stage.metric("mae", mae)

    # Persist the fitted model inside the workspace and register it on the run.
    model_dir = workspace / "models"
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / "linear-regression.pkl"
    payload = pickle.dumps(model)
    model_path.write_bytes(payload)
    run.register_artifact("model", str(model_path), attributes={"format": "pickle"})

# Report the records file relative to the current working directory.
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())

print(f"Captured run: {run.ref}")
print(f"Measured r2: {r2:.3f}; mae: {mae:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Model artifact: {model_path.as_posix()}")

"""Train a tiny CNN and capture epoch, evaluation, and checkpoint evidence."""

from pathlib import Path

import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class TinyCNN(nn.Module):
    """Small convolutional classifier producing 10 logits per image.

    The stack is a 3x3 convolution (1 -> 8 channels, padding 1), ReLU,
    2x2 max pooling, flattening, and a linear layer. The linear layer's
    input width of 8 * 4 * 4 matches single-channel 8x8 inputs.
    """

    def __init__(self) -> None:
        super().__init__()
        # Modules are created in network order and stored under ``layers``
        # so parameter names keep the ``layers.<index>`` form.
        conv = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        activation = nn.ReLU()
        pool = nn.MaxPool2d(2)
        flatten = nn.Flatten()
        classifier = nn.Linear(8 * 4 * 4, 10)
        self.layers = nn.Sequential(conv, activation, pool, flatten, classifier)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Return the class logits for a batch of images."""
        logits = self.layers(features)
        return logits


# Seed torch before the model and loader are created so the run is repeatable.
torch.manual_seed(7)

# Single source of truth for the epoch count: it drives the training loop and
# is the value recorded on the checkpoint artifact, so the two cannot drift.
epochs = 2

digits = load_digits()
train_x, test_x, train_y, test_y = train_test_split(
    digits.images, digits.target, test_size=0.2, stratify=digits.target, random_state=7
)
# ``[:, None]`` inserts the channel axis expected by ``nn.Conv2d(1, ...)``;
# pixel values are scaled by 1/16 before conversion to float32 tensors.
train_data = TensorDataset(
    torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.long),
)
loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn"})
# Keep a handle on the local JSONL sink to report the records file later.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
model = TinyCNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

with ctx.run("tiny-cnn", dataset_ref="dataset:sklearn.digits") as run:
    with run.stage("train") as stage:
        # Explicit training mode: a no-op for this model, but it keeps the
        # example correct if dropout or batch norm layers are added.
        model.train()
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            for features, targets in loader:
                optimizer.zero_grad()
                loss = loss_fn(model(features), targets)
                loss.backward()
                optimizer.step()
                # Weight each batch by its size so the epoch value is a
                # per-example mean even when the last batch is smaller.
                total_loss += loss.item() * len(targets)
            with stage.batch(f"epoch-{epoch}") as batch:
                batch.metric("loss", total_loss / len(train_data))

    with run.stage("evaluate") as stage:
        # Switch to inference mode before measuring validation accuracy.
        model.eval()
        with torch.no_grad():
            logits = model(test_features)
            accuracy = (logits.argmax(dim=1) == test_targets).float().mean().item()
        stage.metric("accuracy", accuracy, unit="ratio")
        # Per-sample evidence: 1.0 when the first validation image was
        # classified correctly, otherwise 0.0.
        with stage.sample("first-validation-image") as sample:
            sample.metric(
                "prediction.correct",
                float(logits[0].argmax().item() == test_targets[0].item()),
                unit="ratio",
            )

    # Save the trained weights inside the workspace and register them on the run.
    checkpoint = workspace / "models" / "tiny-cnn.pt"
    checkpoint.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), checkpoint)
    run.register_artifact("checkpoint", str(checkpoint), attributes={"epochs": epochs})

# Record the selection of this run's checkpoint as a deployment event.
with ctx.deployment("tiny-cnn-candidate", run_ref=run.ref) as deployment:
    deployment.event("checkpoint.selected", message="Selected trained checkpoint for review")

# Report the records file relative to the current working directory.
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())

print(f"Captured run: {run.ref}")
print(f"Measured validation accuracy: {accuracy:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Checkpoint artifact: {checkpoint.as_posix()}")

"""Evaluate an OpenAI-shaped local mock API and capture response evidence."""

from pathlib import Path
from time import perf_counter
from types import SimpleNamespace

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class MockCompletions:
    """Deterministic local stand-in for a chat-completions endpoint."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Answer the last message and return an OpenAI-shaped response.

        Args:
            model: Model name; it is only echoed into the response ``id``.
            messages: Chat messages; only the ``content`` of the last one
                is inspected.

        Returns:
            A namespace with ``id``, ``choices[0].message.content`` and
            ``usage.completion_tokens`` (whitespace-separated word count
            of the answer).
        """
        prompt = messages[-1]["content"]
        mentions_workspace = "workspace" in prompt.lower()
        answer = (
            "Contexta stores local evidence in a .contexta workspace."
            if mentions_workspace
            else "I cannot answer from the provided context."
        )
        reply = SimpleNamespace(content=answer)
        choice = SimpleNamespace(message=reply)
        token_count = len(answer.split())
        return SimpleNamespace(
            id=f"chatgpt-mock-{model}",
            choices=[choice],
            usage=SimpleNamespace(completion_tokens=token_count),
        )


class MockOpenAI:
    """OpenAI-shaped client exposing ``chat.completions.create``."""

    def __init__(self) -> None:
        # Build a throwaway ``Chat`` class whose ``completions`` attribute is
        # the mock endpoint, then attach one instance of it as ``self.chat``.
        chat_cls = type("Chat", (), {"completions": MockCompletions()})
        self.chat = chat_cls()


# Each case is (sample name, question sent to the API, substring expected in the answer).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-eval"})
# Keep a handle on the local JSONL sink to report the records file later.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
client = MockOpenAI()
passed = 0

with ctx.run("mock-chat-evaluation", dataset_ref="dataset:local.prompt-cases") as run:
    with run.stage("evaluate") as stage:
        for name, question, expected in cases:
            started = perf_counter()
            response = client.chat.completions.create(
                model="gpt-4.1-mini-mock",
                messages=[{"role": "user", "content": question}],
            )
            # Stop the clock as soon as the call returns so the recorded
            # latency covers only the API call, not answer checking or
            # evidence capture.
            latency_ms = (perf_counter() - started) * 1000
            answer = response.choices[0].message.content
            correct = expected in answer
            passed += int(correct)
            with stage.sample(name) as sample:
                sample.metric("correct", float(correct), unit="ratio")
                sample.metric("latency.ms", latency_ms, unit="ms")
                sample.metric("completion.tokens", response.usage.completion_tokens)
                sample.event("response.received", message=answer)
        # Stage-level result: fraction of cases whose answer contained the
        # expected substring.
        pass_rate = passed / len(cases)
        stage.metric("pass.rate", pass_rate, unit="ratio")

# Record the selection of this run's prompt flow as a deployment event.
with ctx.deployment("mock-chat-prompt", run_ref=run.ref) as deployment:
    deployment.event("prompt.selected", message="Selected observed prompt flow for staging")

# Report the records file relative to the current working directory.
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())

print(f"Captured run: {run.ref}")
print(f"Measured prompt-case pass rate: {pass_rate:.2f}")
print(f"Records: {records_path.as_posix()}")

Run the copied program:

uv run capture_regression.py

Use capture_cnn.py or capture_llm.py instead when you copied another tab. All three programs print the captured run and leave record evidence here:

.contexta/
  cache/capture/record.jsonl

The ML and Deep Learning programs also print an artifact path under .contexta/models/. The LLM program records request-by-request evaluation evidence only; it does not invent an output file merely to demonstrate an artifact.

When you inspect record.jsonl, look for the measured metric together with its run_ref and stage_execution_ref. That association is the important part of capture: the result is stored with the execution that produced it, instead of appearing as an unexplained number.

Pitfalls

Do not present a preselected accuracy or loss value as though a model generated it.
Do not write a placeholder checkpoint and describe it as a trained artifact.
For provider-shaped integrations, a local mock API is appropriate when the API interaction is real and deterministic while network access and paid model behavior are intentionally excluded.
Keep dataset references, computation stages, metrics, and resulting artifacts in the same observable run.

What To Capture

Executable Examples

Pitfalls

Related Pages

What To Capture

Executable Examples

Pitfalls

Related Pages