Capture Evidence
Use this page to decide what belongs inside a run. Observability evidence should be produced by a training, evaluation, or inference action that actually ran.
What To Capture
| Evidence | Why it matters |
|---|---|
| dataset or input reference | identifies the input behind measured results |
| stages and batches | locates training and evaluation work |
| computed metrics | supports comparisons and release gates |
| events and usage | records calls, choices, and irregularities |
| artifacts | connects observations to a trained output or evaluation asset |
Executable Examples
Each tab is a self-contained program that writes observed results to a local
.contexta/ workspace. Copy one tab into the file named in the "Save as" column
below, install its dependencies, and run it from the directory where you want
the workspace to be created.
| Example | Save as | Install | What the run records |
|---|---|---|---|
| Machine Learning | capture_regression.py | uv add "contexta[sklearn]" | dataset event, measured r2 and mae, fitted model artifact |
| Deep Learning | capture_cnn.py | uv add "contexta[sklearn,torch]" | epoch loss, validation accuracy, per-sample prediction metric, trained checkpoint, selection event |
| LLM | capture_llm.py | uv add contexta | local mock API response events, sample metrics, measured pass rate, selection event |
- Machine Learning
- Deep Learning
- LLM
"""Train a real regression model and capture its measured evidence."""
import pickle
from pathlib import Path
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from contexta import Contexta
from contexta.capture import LocalJsonlSink
# Load the diabetes data and hold out 20% of it for the measured evaluation.
features, targets = load_diabetes(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Open the local workspace and pick the first LocalJsonlSink among its sinks;
# it is used at the end to report where the records were written.
workspace = Path(".contexta")
ctx = Contexta(
    workspace=str(workspace),
    config={"project_name": "diabetes-regression"},
)
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))

model = LinearRegression()
with ctx.run("linear-regression", dataset_ref="dataset:sklearn.diabetes") as run:
    # Record what was loaded before any work is done on it.
    dataset_shape = {"rows": len(features), "features": features.shape[1]}
    run.event(
        "dataset.loaded",
        message="Loaded the scikit-learn diabetes dataset",
        attributes=dataset_shape,
    )

    with run.stage("train"):
        model.fit(train_x, train_y)

    with run.stage("evaluate") as stage:
        # Both metrics are computed from predictions on the held-out split.
        predictions = model.predict(test_x)
        r2 = r2_score(test_y, predictions)
        mae = mean_absolute_error(test_y, predictions)
        stage.metric("r2", r2, unit="ratio")
        stage.metric("mae", mae)

    # Persist the fitted model and attach it to the same run as an artifact.
    model_path = workspace / "models" / "linear-regression.pkl"
    model_path.parent.mkdir(parents=True, exist_ok=True)
    serialized_model = pickle.dumps(model)
    model_path.write_bytes(serialized_model)
    run.register_artifact("model", str(model_path), attributes={"format": "pickle"})

# Summarise what was captured and where it can be inspected.
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured r2: {r2:.3f}; mae: {mae:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Model artifact: {model_path.as_posix()}")
"""Train a tiny CNN and capture epoch, evaluation, and checkpoint evidence."""
from pathlib import Path
import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class TinyCNN(nn.Module):
    """Small convolutional classifier: one conv/pool stage and a linear head.

    The linear head expects 8 channels of 4x4 features, which is what the
    padded 3x3 convolution followed by 2x2 max pooling yields for
    single-channel 8x8 inputs. It produces 10 output scores per input.
    """

    def __init__(self) -> None:
        super().__init__()
        # Modules are created in the same order they run, so parameter
        # initialisation consumes the random generator in a fixed sequence.
        feature_extractor = [
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        classifier_head = [
            nn.Flatten(),
            nn.Linear(8 * 4 * 4, 10),
        ]
        self.layers = nn.Sequential(*feature_extractor, *classifier_head)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run *features* through the layer stack and return the raw scores."""
        scores = self.layers(features)
        return scores
# Single source of truth for the number of training epochs: it drives the
# training loop and is also what gets recorded on the checkpoint artifact,
# so the two can never disagree.
EPOCHS = 2

torch.manual_seed(7)

# Load the digits images and hold out a stratified 20% for evaluation.
digits = load_digits()
train_x, test_x, train_y, test_y = train_test_split(
    digits.images, digits.target, test_size=0.2, stratify=digits.target, random_state=7
)

# Add a channel axis and divide by 16.0 (presumably the maximum pixel value
# of this dataset -- confirm) before handing the images to the network.
train_data = TensorDataset(
    torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.long),
)
loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn"})
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))

model = TinyCNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

with ctx.run("tiny-cnn", dataset_ref="dataset:sklearn.digits") as run:
    with run.stage("train") as stage:
        model.train()
        for epoch in range(1, EPOCHS + 1):
            # Open the batch scope before the epoch's work starts so the
            # recorded loss is stored with the execution that produced it.
            with stage.batch(f"epoch-{epoch}") as batch:
                total_loss = 0.0
                for features, targets in loader:
                    optimizer.zero_grad()
                    loss = loss_fn(model(features), targets)
                    loss.backward()
                    optimizer.step()
                    # Weight by mini-batch size so the epoch value is a true
                    # per-example mean even when the last mini-batch is short.
                    total_loss += loss.item() * len(targets)
                batch.metric("loss", total_loss / len(train_data))

    with run.stage("evaluate") as stage:
        # Switch to inference mode before measuring; this keeps the example
        # correct if dropout or normalisation layers are ever added.
        model.eval()
        with torch.no_grad():
            logits = model(test_features)
        accuracy = (logits.argmax(dim=1) == test_targets).float().mean().item()
        stage.metric("accuracy", accuracy, unit="ratio")
        with stage.sample("first-validation-image") as sample:
            sample.metric(
                "prediction.correct",
                float(logits[0].argmax().item() == test_targets[0].item()),
                unit="ratio",
            )

    # Save the trained weights and attach them to the run that produced them.
    checkpoint = workspace / "models" / "tiny-cnn.pt"
    checkpoint.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), checkpoint)
    run.register_artifact("checkpoint", str(checkpoint), attributes={"epochs": EPOCHS})

# NOTE(review): nesting was reconstructed from a flattened source; confirm
# that the deployment scope is meant to open after the run block closes.
with ctx.deployment("tiny-cnn-candidate", run_ref=run.ref) as deployment:
    deployment.event("checkpoint.selected", message="Selected trained checkpoint for review")

records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured validation accuracy: {accuracy:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Checkpoint artifact: {checkpoint.as_posix()}")
"""Evaluate an OpenAI-shaped local mock API and capture response evidence."""
from pathlib import Path
from time import perf_counter
from types import SimpleNamespace
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class MockCompletions:
    """Local stand-in for a chat-completions endpoint with canned answers."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Return an OpenAI-shaped response for the last entry in *messages*.

        The reply depends only on whether the last message's content mentions
        "workspace" (case-insensitive). The response exposes ``id``,
        ``choices[0].message.content`` and ``usage.completion_tokens``, where
        the token count is the number of whitespace-separated words in the
        reply.
        """
        prompt = messages[-1]["content"]
        mentions_workspace = "workspace" in prompt.lower()
        reply = (
            "Contexta stores local evidence in a .contexta workspace."
            if mentions_workspace
            else "I cannot answer from the provided context."
        )
        choice = SimpleNamespace(message=SimpleNamespace(content=reply))
        usage = SimpleNamespace(completion_tokens=len(reply.split()))
        return SimpleNamespace(id=f"chatgpt-mock-{model}", choices=[choice], usage=usage)
class MockOpenAI:
    """Client-shaped wrapper that exposes ``chat.completions.create``."""

    def __init__(self) -> None:
        # Build a throwaway "Chat" class whose only member is the mock
        # completions endpoint, then keep one instance of it as ``chat``.
        chat_members = {"completions": MockCompletions()}
        chat_type = type("Chat", (), chat_members)
        self.chat = chat_type()
# Each case is (sample name, question sent to the API, substring the answer
# must contain to count as correct).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-eval"})
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
client = MockOpenAI()
passed = 0

with ctx.run("mock-chat-evaluation", dataset_ref="dataset:local.prompt-cases") as run:
    with run.stage("evaluate") as stage:
        for name, question, expected in cases:
            started = perf_counter()
            response = client.chat.completions.create(
                model="gpt-4.1-mini-mock",
                messages=[{"role": "user", "content": question}],
            )
            # Stop the clock as soon as the call returns so the latency metric
            # covers only the API call, not answer checking or the time spent
            # opening the sample scope and recording other metrics.
            latency_ms = (perf_counter() - started) * 1000

            answer = response.choices[0].message.content
            correct = expected in answer
            passed += int(correct)

            with stage.sample(name) as sample:
                sample.metric("correct", float(correct), unit="ratio")
                sample.metric("latency.ms", latency_ms, unit="ms")
                sample.metric("completion.tokens", response.usage.completion_tokens)
                sample.event("response.received", message=answer)

        # The pass rate is derived from the observed per-case results.
        pass_rate = passed / len(cases)
        stage.metric("pass.rate", pass_rate, unit="ratio")

# NOTE(review): nesting was reconstructed from a flattened source; confirm
# that the deployment scope is meant to open after the run block closes.
with ctx.deployment("mock-chat-prompt", run_ref=run.ref) as deployment:
    deployment.event("prompt.selected", message="Selected observed prompt flow for staging")

records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured prompt-case pass rate: {pass_rate:.2f}")
print(f"Records: {records_path.as_posix()}")
Run the copied program:
uv run capture_regression.py
Use capture_cnn.py or capture_llm.py instead if you copied a different tab.
All three programs print the captured run and leave record evidence here:
.contexta/
cache/capture/record.jsonl
The ML and Deep Learning programs also print an artifact path under
.contexta/models/. The LLM program records request-by-request evaluation
evidence only; it does not invent an output file merely to demonstrate an
artifact.
When you inspect record.jsonl, look for the measured metric together with
its run_ref and stage_execution_ref. That association is the important
part of capture: the result is stored with the execution that produced it,
instead of appearing as an unexplained number.
Pitfalls
- Do not present a preselected accuracy or loss value as though a model generated it.
- Do not write a placeholder checkpoint and describe it as a trained artifact.
- For provider-shaped integrations, a local mock API is appropriate when the API interaction itself is real and deterministic, and only network access and paid model behavior are intentionally left out.
- Keep dataset references, computation stages, metrics, and resulting artifacts in the same observable run.