Compare Runs
Compare runs when a decision depends on measured differences. A comparison example should execute the candidates before selecting one; otherwise it only demonstrates dictionary sorting.
Executable Examples
Each tab executes candidate work and records measured evidence before any comparison or selection is made.
- Machine Learning
- Deep Learning
- LLM
"""Train two real SVM candidates and compare their captured evaluation results."""
import pickle
from pathlib import Path
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from contexta import Contexta
from contexta.capture import LocalJsonlSink
# Load Iris and hold out a stratified 30% split; the fixed random_state keeps
# both candidates on exactly the same train/test rows.
features, targets = load_iris(return_X_y=True)
split = train_test_split(
    features, targets, test_size=0.3, stratify=targets, random_state=7
)
train_x, test_x, train_y, test_y = split

# The two SVM kernels being compared, keyed by run name.
candidates = {
    "linear-svm": SVC(kernel="linear"),
    "rbf-svm": SVC(kernel="rbf", gamma="scale"),
}

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "iris-svm"})
# Pick the first configured sink that is a LocalJsonlSink; its file paths are
# printed at the end of the script.
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))

scores = {}
run_refs = {}
model_dir = workspace / "models"

for name, estimator in candidates.items():
    with ctx.run(name, dataset_ref="dataset:sklearn.iris") as run:
        # Stage 1: fit a scaler + SVM pipeline on the training split.
        with run.stage("train"):
            model = make_pipeline(StandardScaler(), estimator)
            model.fit(train_x, train_y)

        # Stage 2: score the fitted pipeline on the held-out split and record
        # the metrics against this run.
        with run.stage("evaluate") as stage:
            predictions = model.predict(test_x)
            accuracy = accuracy_score(test_y, predictions)
            macro_f1 = f1_score(test_y, predictions, average="macro")
            with stage.batch("holdout-split") as batch:
                batch.metric("accuracy", accuracy, unit="ratio")
                batch.metric("macro.f1", macro_f1, unit="ratio")
                with batch.sample("first-prediction") as sample:
                    first_is_correct = predictions[0] == test_y[0]
                    sample.metric("correct", float(first_is_correct), unit="ratio")

        # Persist the fitted pipeline and attach it to the run as an artifact.
        model_path = model_dir / f"{name}.pkl"
        model_dir.mkdir(parents=True, exist_ok=True)
        model_path.write_bytes(pickle.dumps(model))
        run.register_artifact("model", str(model_path), attributes={"candidate": name})

        scores[name] = accuracy
        run_refs[name] = run.ref

# Select on measured accuracy; delta is reported as rbf minus linear.
best_name = max(scores, key=scores.get)
delta = scores["rbf-svm"] - scores["linear-svm"]

current_dir = Path.cwd()
records_path = local_sink.file_path_for("RECORD").relative_to(current_dir)
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(current_dir)

print(f"Compared runs: {run_refs['linear-svm']} vs {run_refs['rbf-svm']}")
print(f"Accuracy: {scores['linear-svm']:.3f} -> {scores['rbf-svm']:.3f}")
print(f"Delta: {delta:+.3f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")
print(f"Artifacts: {artifacts_path.as_posix()}")
"""Train two tiny CNN configurations and compare their measured accuracy."""
from pathlib import Path
import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class TinyCNN(nn.Module):
    """Single-convolution classifier producing 10 logits per input image.

    The linear head expects 8 * 4 * 4 features, i.e. 8 channels over a 4x4
    map, which is what a 1-channel 8x8 input yields after the size-preserving
    3x3 convolution and the 2x max-pool.
    """

    def __init__(self) -> None:
        super().__init__()
        # Parameterised layers are created in network order (conv, then head).
        conv = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        head = nn.Linear(8 * 4 * 4, 10)
        self.layers = nn.Sequential(
            conv,
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            head,
        )

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run ``features`` through the layer stack and return the logits."""
        logits = self.layers(features)
        return logits
# One seed shared by the data split and by every candidate, so the learning
# rate is the only thing that differs between the two runs.
seed = 7
torch.manual_seed(seed)

digits = load_digits()
train_x, test_x, train_y, test_y = train_test_split(
    digits.images, digits.target, test_size=0.2, stratify=digits.target, random_state=seed
)

# `[:, None]` inserts the channel axis expected by Conv2d; dividing by 16.0
# rescales the pixel values before they are turned into float tensors.
train_data = TensorDataset(
    torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.long),
)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)
loss_fn = nn.CrossEntropyLoss()

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn-compare"})
# Pick the configured LocalJsonlSink; its file paths are printed at the end.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))

scores = {}
run_refs = {}
for name, learning_rate in {"cnn-fast": 0.01, "cnn-steady": 0.003}.items():
    # Re-seed before building each candidate. Without this the second model
    # would start from different random weights and see a different shuffle
    # order, so the reported delta would mix learning-rate effects with
    # initialisation noise.
    torch.manual_seed(seed)
    model = TinyCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loader = DataLoader(train_data, batch_size=64, shuffle=True)

    with ctx.run(name, dataset_ref="dataset:sklearn.digits") as run:
        with run.stage("train") as stage:
            model.train()
            for epoch in range(1, 3):
                total_loss = 0.0
                for features, targets in loader:
                    optimizer.zero_grad()
                    loss = loss_fn(model(features), targets)
                    loss.backward()
                    optimizer.step()
                    # Weight by batch size so the short final batch does not
                    # skew the per-sample mean.
                    total_loss += loss.item() * len(targets)
                with stage.batch(f"epoch-{epoch}") as batch:
                    batch.metric("loss", total_loss / len(train_data))

        with run.stage("evaluate") as stage:
            # Switch to inference mode explicitly so evaluation stays correct
            # if mode-dependent layers are ever added to the model.
            model.eval()
            with torch.no_grad():
                predictions = model(test_features).argmax(dim=1)
            accuracy = (predictions == test_targets).float().mean().item()
            stage.metric("accuracy", accuracy, unit="ratio")
            scores[name] = accuracy

        # Save the trained weights and attach them to the run as an artifact.
        checkpoint = workspace / "models" / f"{name}.pt"
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), checkpoint)
        run.register_artifact("checkpoint", str(checkpoint), attributes={"candidate": name})
        run_refs[name] = run.ref

# Select on measured accuracy; delta is reported as steady minus fast.
best_name = max(scores, key=scores.get)
delta = scores["cnn-steady"] - scores["cnn-fast"]
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(Path.cwd())
print(f"Compared runs: {run_refs['cnn-fast']} vs {run_refs['cnn-steady']}")
print(f"Validation accuracy: {scores['cnn-fast']:.3f} -> {scores['cnn-steady']:.3f}")
print(f"Delta: {delta:+.3f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")
print(f"Artifacts: {artifacts_path.as_posix()}")
"""Compare two prompt strategies through an OpenAI-shaped local mock API."""
from pathlib import Path
from time import perf_counter
from types import SimpleNamespace
from contexta import Contexta
from contexta.capture import LocalJsonlSink
class MockCompletions:
    """Local stand-in for a chat-completions endpoint that returns canned answers."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Return a canned completion chosen from the message contents.

        The first message is read as the instruction and the last as the
        question. ``model`` is accepted for call-shape compatibility but is
        not used. The result exposes ``choices[0].message.content`` and
        ``usage.completion_tokens`` (a whitespace word count of the answer).
        """
        system_text = messages[0]["content"]
        user_text = messages[-1]["content"]

        # The question is checked first, so a workspace question is answered
        # the same way under every instruction.
        if "workspace" in user_text.lower():
            reply = "Contexta stores local evidence in a .contexta workspace."
        elif "refuse unsupported" in system_text.lower():
            reply = "I cannot answer from the provided context."
        else:
            reply = "A GPU was probably used."

        message = SimpleNamespace(content=reply)
        token_usage = SimpleNamespace(completion_tokens=len(reply.split()))
        return SimpleNamespace(
            choices=[SimpleNamespace(message=message)],
            usage=token_usage,
        )
class MockOpenAI:
    """Minimal client object exposing ``chat.completions`` backed by the local mock."""

    def __init__(self) -> None:
        # Build a throwaway "Chat" class whose ``completions`` attribute holds
        # the mock endpoint, then attach one instance of it as ``chat``.
        chat_type = type("Chat", (), {"completions": MockCompletions()})
        self.chat = chat_type()
# Each case is (sample name, user question, substring the answer must contain).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]
# The two system-prompt strategies being compared, keyed by run name.
prompts = {
    "helpful-only": "Answer the user's question.",
    "grounded": "Answer from known context and refuse unsupported questions.",
}

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-compare"})
# Pick the configured LocalJsonlSink; its record path is printed at the end.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
client = MockOpenAI()

scores = {}
run_refs = {}
for name, instruction in prompts.items():
    passed = 0
    with ctx.run(name, dataset_ref="dataset:local.prompt-cases") as run:
        with run.stage("evaluate") as stage:
            for case_name, question, expected in cases:
                started = perf_counter()
                response = client.chat.completions.create(
                    model="gpt-4.1-mini-mock",
                    messages=[
                        {"role": "system", "content": instruction},
                        {"role": "user", "content": question},
                    ],
                )
                # Stop the clock as soon as the call returns, so the recorded
                # latency covers only the request and not answer scoring or
                # metric bookkeeping.
                latency_ms = (perf_counter() - started) * 1000

                answer = response.choices[0].message.content
                correct = expected in answer
                passed += int(correct)
                with stage.sample(case_name) as sample:
                    sample.metric("correct", float(correct), unit="ratio")
                    sample.metric("latency.ms", latency_ms, unit="ms")
                    sample.metric("completion.tokens", response.usage.completion_tokens)

            pass_rate = passed / len(cases)
            stage.metric("pass.rate", pass_rate, unit="ratio")
        scores[name] = pass_rate
        run_refs[name] = run.ref

# Select on measured pass rate; delta is reported as grounded minus helpful-only.
best_name = max(scores, key=scores.get)
delta = scores["grounded"] - scores["helpful-only"]
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
print(f"Compared runs: {run_refs['helpful-only']} vs {run_refs['grounded']}")
print(f"Pass rate: {scores['helpful-only']:.2f} -> {scores['grounded']:.2f}")
print(f"Delta: {delta:+.2f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")
Copy one tab into a local file and run it in the directory where .contexta/
should be created:
| Example | Save as | Install |
|---|---|---|
| Machine Learning | compare_svm.py | uv add "contexta[sklearn]" |
| Deep Learning | compare_cnn.py | uv add "contexta[sklearn,torch]" |
| LLM | compare_llm.py | uv add contexta |
Each program prints two measured candidate results, their delta, and the
selected run. Inspect .contexta/cache/capture/record.jsonl to see each value
connected to the run and stage that produced it. The ML and Deep
Learning examples also register the fitted candidate artifacts.
What To Compare
| Domain | Useful measured comparison key |
|---|---|
| Machine Learning | validation accuracy, F1, MAE, calibration |
| Deep Learning | validation loss, validation accuracy, checkpoint size, latency |
| LLM | evaluation pass rate, faithfulness, latency, token usage |