Contexta Style Guide
This page captures maintainers' recommended practices for instrumenting ML systems.
Priority A: Essential
Observe Real Work
Use metrics obtained from the operation being observed. Workflow examples should perform their operation before recording its results.
- Machine Learning
- Deep Learning
- LLM
"""Train a real regression model and capture its measured evidence."""
import pickle
from pathlib import Path
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from contexta import Contexta
from contexta.capture import LocalJsonlSink
# Load the dataset and hold out 20% of the rows for evaluation.
features, targets = load_diabetes(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Point Contexta at a local ".contexta" directory and find its JSONL sink so
# the records file can be reported once the run is finished.
workspace = Path(".contexta")
ctx = Contexta(
    workspace=str(workspace),
    config={"project_name": "diabetes-regression"},
)
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))

model = LinearRegression()

with ctx.run("linear-regression", dataset_ref="dataset:sklearn.diabetes") as run:
    dataset_shape = {"rows": len(features), "features": features.shape[1]}
    run.event(
        "dataset.loaded",
        message="Loaded the scikit-learn diabetes dataset",
        attributes=dataset_shape,
    )

    with run.stage("train"):
        model.fit(train_x, train_y)

    with run.stage("evaluate") as stage:
        # Both metrics are computed from predictions on the held-out split
        # before anything is recorded.
        predicted = model.predict(test_x)
        r2 = r2_score(test_y, predicted)
        mae = mean_absolute_error(test_y, predicted)
        stage.metric("r2", r2, unit="ratio")
        stage.metric("mae", mae)

    # Persist the fitted model and register the file as the run's artifact.
    model_path = workspace / "models" / "linear-regression.pkl"
    model_path.parent.mkdir(parents=True, exist_ok=True)
    serialized_model = pickle.dumps(model)
    model_path.write_bytes(serialized_model)
    run.register_artifact(
        "model",
        str(model_path),
        attributes={"format": "pickle"},
    )

# Report the records file relative to the current directory.
record_file = local_sink.file_path_for("RECORD")
records_path = record_file.relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured r2: {r2:.3f}; mae: {mae:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Model artifact: {model_path.as_posix()}")
"""Train a tiny CNN and capture epoch, evaluation, and checkpoint evidence."""
from pathlib import Path
import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class TinyCNN(nn.Module):
    """Small image classifier: one convolution, one pooling step, one linear head.

    The linear head takes ``8 * 4 * 4`` inputs, which corresponds to
    single-channel 8x8 images after the padded convolution (8 output
    channels) and the 2x2 max-pool. It produces 10 output scores.
    """

    def __init__(self) -> None:
        super().__init__()
        # Parameterised layers are created in the same order they appear in
        # the stack (convolution first, then the linear head).
        conv = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        head = nn.Linear(8 * 4 * 4, 10)
        self.layers = nn.Sequential(
            conv,
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            head,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run ``features`` through the layer stack and return its output."""
        scores = self.layers(features)
        return scores
# Seed torch so weight initialisation and batch shuffling repeat between runs.
torch.manual_seed(7)

# Single source of truth for the number of training passes: it drives the
# training loop and is the value recorded on the checkpoint artifact.
epochs = 2

digits = load_digits()
train_x, test_x, train_y, test_y = train_test_split(
    digits.images, digits.target, test_size=0.2, stratify=digits.target, random_state=7
)

# Insert a channel axis for Conv2d and divide pixel values by 16.0
# (presumably the dataset's maximum intensity).
train_data = TensorDataset(
    torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.long),
)
loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn"})
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))

model = TinyCNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

with ctx.run("tiny-cnn", dataset_ref="dataset:sklearn.digits") as run:
    with run.stage("train") as stage:
        # Explicit training mode: a no-op for this model, but it keeps the
        # example correct if dropout or batch-norm layers are added later.
        model.train()
        for epoch in range(1, epochs + 1):
            total_loss = 0.0
            for features, targets in loader:
                optimizer.zero_grad()
                loss = loss_fn(model(features), targets)
                loss.backward()
                optimizer.step()
                # Weight each batch's mean loss by its size so the epoch
                # average below is a true per-example mean.
                total_loss += loss.item() * len(targets)
            with stage.batch(f"epoch-{epoch}") as batch:
                batch.metric("loss", total_loss / len(train_data))

    with run.stage("evaluate") as stage:
        # Switch to inference mode and skip gradient tracking for evaluation.
        model.eval()
        with torch.no_grad():
            logits = model(test_features)
        accuracy = (logits.argmax(dim=1) == test_targets).float().mean().item()
        stage.metric("accuracy", accuracy, unit="ratio")
        with stage.sample("first-validation-image") as sample:
            sample.metric(
                "prediction.correct",
                float(logits[0].argmax().item() == test_targets[0].item()),
                unit="ratio",
            )

    # Save the trained weights and register the file as the run's artifact.
    checkpoint = workspace / "models" / "tiny-cnn.pt"
    checkpoint.parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), checkpoint)
    run.register_artifact("checkpoint", str(checkpoint), attributes={"epochs": epochs})

with ctx.deployment("tiny-cnn-candidate", run_ref=run.ref) as deployment:
    deployment.event("checkpoint.selected", message="Selected trained checkpoint for review")

records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured validation accuracy: {accuracy:.3f}")
print(f"Records: {records_path.as_posix()}")
print(f"Checkpoint artifact: {checkpoint.as_posix()}")
"""Evaluate an OpenAI-shaped local mock API and capture response evidence."""
from pathlib import Path
from time import perf_counter
from types import SimpleNamespace
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class MockCompletions:
    """Local stand-in for a chat-completions endpoint that returns canned replies."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Build a response object for the last message in ``messages``.

        Args:
            model: Model name; echoed into the response ``id``.
            messages: Chat messages; only the ``"content"`` of the last one is read.

        Returns:
            A namespace with ``id``, ``choices[0].message.content`` and
            ``usage.completion_tokens`` (the whitespace-separated word count
            of the reply).
        """
        prompt = messages[-1]["content"]
        # Case-insensitive keyword match decides which fixed reply is used.
        reply = (
            "Contexta stores local evidence in a .contexta workspace."
            if "workspace" in prompt.lower()
            else "I cannot answer from the provided context."
        )
        choice = SimpleNamespace(message=SimpleNamespace(content=reply))
        usage = SimpleNamespace(completion_tokens=len(reply.split()))
        return SimpleNamespace(
            id=f"chatgpt-mock-{model}",
            choices=[choice],
            usage=usage,
        )
class MockOpenAI:
    """Mock client exposing the ``chat.completions.create`` call path."""

    def __init__(self) -> None:
        # Create a one-off "Chat" class whose only attribute is the
        # completions mock, then attach an instance of it as ``self.chat``.
        chat_type = type("Chat", (), {"completions": MockCompletions()})
        self.chat = chat_type()
# Each case is (sample name, question sent to the client, substring expected in the answer).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-eval"})
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
client = MockOpenAI()
passed = 0

with ctx.run("mock-chat-evaluation", dataset_ref="dataset:local.prompt-cases") as run:
    with run.stage("evaluate") as stage:
        for name, question, expected in cases:
            started = perf_counter()
            response = client.chat.completions.create(
                model="gpt-4.1-mini-mock",
                messages=[{"role": "user", "content": question}],
            )
            # Stop the clock as soon as the call returns so the latency
            # metric covers only the request, not answer scoring or the
            # setup of the sample scope below.
            latency_ms = (perf_counter() - started) * 1000

            answer = response.choices[0].message.content
            correct = expected in answer
            passed += int(correct)

            with stage.sample(name) as sample:
                sample.metric("correct", float(correct), unit="ratio")
                sample.metric("latency.ms", latency_ms, unit="ms")
                sample.metric("completion.tokens", response.usage.completion_tokens)
                sample.event("response.received", message=answer)

        # Stage-level summary computed from the per-case results above.
        pass_rate = passed / len(cases)
        stage.metric("pass.rate", pass_rate, unit="ratio")

with ctx.deployment("mock-chat-prompt", run_ref=run.ref) as deployment:
    deployment.event("prompt.selected", message="Selected observed prompt flow for staging")

records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Captured run: {run.ref}")
print(f"Measured prompt-case pass rate: {pass_rate:.2f}")
print(f"Records: {records_path.as_posix()}")
Avoid hardcoded "good" metrics or placeholder model artifacts in workflow examples. They may illustrate the API syntax, but they misrepresent observability, which is about recording what actually happened.
Capture Enough Context For Review
Every important run should include the input or dataset reference, meaningful stage names, measured metrics, and any artifact a reviewer would inspect or promote.
Keep Workspaces Disposable In Examples
Run copied examples in a practice directory so their local .contexta/
workspace does not mix with a reader's real project history. Tests and
maintainer-only runners may use temporary directories.
Priority B: Strongly Recommended
Prefer The Facade First
Start with Contexta for capture, query, comparison, diagnostics, lineage, and
reports. Move to direct stores only for storage internals or advanced recovery.
Keep External Cost Optional
Keep external credentials, network behavior, and billing out of introductory examples unless the page explicitly teaches those concerns.
Print A Reviewable Result
Examples should print at least one reviewable result, such as a run ref, measured score, artifact path, report path, diagnostic summary, or workspace location.
Priority C: Recommended
Share Executable Source
Display checked example files in docs rather than maintaining slightly different inline copies across pages and locales.
Localize Prose, Not Code
Korean documentation should display the same runnable source as English unless a localized output is part of what the page teaches.