Compare Runs

Compare runs when a decision depends on measured differences. A comparison example should execute the candidates before selecting one; otherwise it only demonstrates dictionary sorting.

Executable Examples

Each tab executes candidate work and records measured evidence before any comparison or selection is made.

Machine Learning
Deep Learning
LLM

"""Train two real SVM candidates and compare their captured evaluation results."""

import pickle
from pathlib import Path

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from contexta import Contexta
from contexta.capture import LocalJsonlSink


# Hold out a stratified 30% of iris so both kernels are scored on the same rows.
iris_x, iris_y = load_iris(return_X_y=True)
split = train_test_split(iris_x, iris_y, test_size=0.3, stratify=iris_y, random_state=7)
train_x, test_x, train_y, test_y = split

# Candidate name -> unfitted estimator; dict insertion order is the run order.
kernels = {
    "linear-svm": SVC(kernel="linear"),
    "rbf-svm": SVC(kernel="rbf", gamma="scale"),
}

workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "iris-svm"})
# First LocalJsonlSink among the context's sinks; used below to report file paths.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
scores = {}
run_refs = {}

for name, svc in kernels.items():
    with ctx.run(name, dataset_ref="dataset:sklearn.iris") as run:
        with run.stage("train"):
            pipeline = make_pipeline(StandardScaler(), svc)
            pipeline.fit(train_x, train_y)

        with run.stage("evaluate") as stage:
            predicted = pipeline.predict(test_x)
            accuracy = accuracy_score(test_y, predicted)
            macro_f1 = f1_score(test_y, predicted, average="macro")
            with stage.batch("holdout-split") as batch:
                batch.metric("accuracy", accuracy, unit="ratio")
                batch.metric("macro.f1", macro_f1, unit="ratio")
                with batch.sample("first-prediction") as sample:
                    first_hit = float(predicted[0] == test_y[0])
                    sample.metric("correct", first_hit, unit="ratio")

        # Persist the fitted pipeline and register it while the run is still open.
        model_path = workspace / "models" / f"{name}.pkl"
        model_path.parent.mkdir(parents=True, exist_ok=True)
        payload = pickle.dumps(pipeline)
        model_path.write_bytes(payload)
        run.register_artifact("model", str(model_path), attributes={"candidate": name})
    scores[name] = accuracy
    run_refs[name] = run.ref

# Selection happens only after both candidates have been executed and measured.
best_name = max(scores, key=lambda candidate: scores[candidate])
delta = scores["rbf-svm"] - scores["linear-svm"]
cwd = Path.cwd()
records_path = local_sink.file_path_for("RECORD").relative_to(cwd)
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(cwd)

summary = [
    f"Compared runs: {run_refs['linear-svm']} vs {run_refs['rbf-svm']}",
    f"Accuracy: {scores['linear-svm']:.3f} -> {scores['rbf-svm']:.3f}",
    f"Delta: {delta:+.3f}",
    f"Selected run: {run_refs[best_name]}",
    f"Records: {records_path.as_posix()}",
    f"Artifacts: {artifacts_path.as_posix()}",
]
print("\n".join(summary))

"""Train two tiny CNN configurations and compare their measured accuracy."""

from pathlib import Path

import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class TinyCNN(nn.Module):
    """Single-convolution classifier that emits 10 logits per input.

    The linear head takes ``8 * 4 * 4`` flattened features, i.e. 8 channels
    of 4x4 after the 2x2 max-pool, which corresponds to 1-channel 8x8 inputs.
    """

    def __init__(self) -> None:
        super().__init__()
        # Parameterised layers are created in network order (conv, then head).
        conv = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        head = nn.Linear(8 * 4 * 4, 10)
        self.layers = nn.Sequential(conv, nn.ReLU(), nn.MaxPool2d(2), nn.Flatten(), head)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Run ``features`` through the layer stack and return the logits."""
        logits = self.layers(features)
        return logits


# One seed shared by every candidate so the learning rate is the only variable.
seed = 7
torch.manual_seed(seed)
digits = load_digits()
train_x, test_x, train_y, test_y = train_test_split(
    digits.images, digits.target, test_size=0.2, stratify=digits.target, random_state=7
)
# `[:, None]` inserts the single channel axis Conv2d expects; pixels are scaled by 1/16.
train_data = TensorDataset(
    torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32),
    torch.tensor(train_y, dtype=torch.long),
)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)
loss_fn = nn.CrossEntropyLoss()
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn-compare"})
# First LocalJsonlSink among the context's sinks; used below to report file paths.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
scores = {}
run_refs = {}

for name, learning_rate in {"cnn-fast": 0.01, "cnn-steady": 0.003}.items():
    # Reseed per candidate: without this the second model is initialised and
    # shuffled from whatever RNG state the first run left behind, so the
    # reported delta would mix seed noise with the learning-rate effect.
    torch.manual_seed(seed)
    model = TinyCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loader = DataLoader(train_data, batch_size=64, shuffle=True)
    with ctx.run(name, dataset_ref="dataset:sklearn.digits") as run:
        with run.stage("train") as stage:
            for epoch in range(1, 3):
                total_loss = 0.0
                for features, targets in loader:
                    optimizer.zero_grad()
                    loss = loss_fn(model(features), targets)
                    loss.backward()
                    optimizer.step()
                    # Weight the batch-mean loss by batch size so the epoch
                    # figure below is a per-sample average.
                    total_loss += loss.item() * len(targets)
                with stage.batch(f"epoch-{epoch}") as batch:
                    batch.metric("loss", total_loss / len(train_data))

        with run.stage("evaluate") as stage:
            # Inference mode for scoring; a no-op for the current layers but
            # keeps the measurement correct if dropout/batch-norm is added.
            model.eval()
            with torch.no_grad():
                predictions = model(test_features).argmax(dim=1)
            accuracy = (predictions == test_targets).float().mean().item()
            stage.metric("accuracy", accuracy, unit="ratio")
            scores[name] = accuracy

        # Save the weights and register the checkpoint while the run is open.
        checkpoint = workspace / "models" / f"{name}.pt"
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), checkpoint)
        run.register_artifact("checkpoint", str(checkpoint), attributes={"candidate": name})
    run_refs[name] = run.ref

# Selection happens only after both candidates have been trained and measured.
best_name = max(scores, key=scores.get)
delta = scores["cnn-steady"] - scores["cnn-fast"]
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(Path.cwd())

print(f"Compared runs: {run_refs['cnn-fast']} vs {run_refs['cnn-steady']}")
print(f"Validation accuracy: {scores['cnn-fast']:.3f} -> {scores['cnn-steady']:.3f}")
print(f"Delta: {delta:+.3f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")
print(f"Artifacts: {artifacts_path.as_posix()}")

"""Compare two prompt strategies through an OpenAI-shaped local mock API."""

from pathlib import Path
from time import perf_counter
from types import SimpleNamespace

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class MockCompletions:
    """Offline stand-in for an OpenAI-style ``chat.completions`` endpoint."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Return a canned, response-shaped reply chosen from the message text.

        The first message's content is treated as the instruction and the last
        message's content as the question. ``model`` is accepted for call-shape
        compatibility and is not used. The result exposes
        ``choices[0].message.content`` and ``usage.completion_tokens`` (the
        whitespace-separated word count of the reply).
        """
        system_text = messages[0]["content"]
        user_text = messages[-1]["content"]

        if "workspace" in user_text.lower():
            reply = "Contexta stores local evidence in a .contexta workspace."
        elif "refuse unsupported" in system_text.lower():
            reply = "I cannot answer from the provided context."
        else:
            reply = "A GPU was probably used."

        message = SimpleNamespace(content=reply)
        choice = SimpleNamespace(message=message)
        usage = SimpleNamespace(completion_tokens=len(reply.split()))
        return SimpleNamespace(choices=[choice], usage=usage)


class MockOpenAI:
    """Minimal client object exposing ``chat.completions`` for the example."""

    def __init__(self) -> None:
        # Build a throwaway "Chat" class whose class attribute is the mock endpoint,
        # then attach one instance of it as ``self.chat``.
        namespace = {"completions": MockCompletions()}
        chat_type = type("Chat", (), namespace)
        self.chat = chat_type()


# Each case is (sample name, user question, substring the answer must contain).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]
# Candidate name -> system instruction; dict insertion order is the run order.
prompts = {
    "helpful-only": "Answer the user's question.",
    "grounded": "Answer from known context and refuse unsupported questions.",
}
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-compare"})
# First LocalJsonlSink among the context's sinks; used below to report the file path.
local_sink = next(sink for sink in ctx.sinks if isinstance(sink, LocalJsonlSink))
client = MockOpenAI()
scores = {}
run_refs = {}

for name, instruction in prompts.items():
    passed = 0
    with ctx.run(name, dataset_ref="dataset:local.prompt-cases") as run:
        with run.stage("evaluate") as stage:
            for case_name, question, expected in cases:
                started = perf_counter()
                response = client.chat.completions.create(
                    model="gpt-4.1-mini-mock",
                    messages=[
                        {"role": "system", "content": instruction},
                        {"role": "user", "content": question},
                    ],
                )
                # Stop the clock right after the call so the recorded latency
                # excludes scoring and metric-capture overhead.
                latency_ms = (perf_counter() - started) * 1000
                answer = response.choices[0].message.content
                correct = expected in answer
                passed += int(correct)
                with stage.sample(case_name) as sample:
                    sample.metric("correct", float(correct), unit="ratio")
                    sample.metric("latency.ms", latency_ms, unit="ms")
                    sample.metric("completion.tokens", response.usage.completion_tokens)
            pass_rate = passed / len(cases)
            stage.metric("pass.rate", pass_rate, unit="ratio")
            scores[name] = pass_rate
    run_refs[name] = run.ref

# Selection happens only after both prompt strategies have been evaluated.
best_name = max(scores, key=scores.get)
delta = scores["grounded"] - scores["helpful-only"]
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())

print(f"Compared runs: {run_refs['helpful-only']} vs {run_refs['grounded']}")
print(f"Pass rate: {scores['helpful-only']:.2f} -> {scores['grounded']:.2f}")
print(f"Delta: {delta:+.2f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")

Copy one tab into a local file and run it in the directory where .contexta/ should be created:

Example	Save as	Install
Machine Learning	`compare_svm.py`	`uv add "contexta[sklearn]"`
Deep Learning	`compare_cnn.py`	`uv add "contexta[sklearn,torch]"`
LLM	`compare_llm.py`	`uv add contexta`

Each program prints two measured candidate results, their delta, and the selected run. Inspect `.contexta/cache/capture/record.jsonl` to see each recorded value linked to the run and stage that produced it. The Machine Learning and Deep Learning examples also register the fitted candidate artifacts.

What To Compare

Domain	Useful measured comparison key
Machine Learning	validation accuracy, F1, MAE, calibration
Deep Learning	validation loss, validation accuracy, checkpoint size, latency
LLM	evaluation pass rate, faithfulness, latency, token usage

Executable Examples

What To Compare

Related Pages

Executable Examples

What To Compare

Related Pages