Skip to main content

Compare Runs

Compare runs when a decision depends on measured differences. A comparison example should execute the candidates before selecting one; otherwise it only demonstrates dictionary sorting.

Executable Examples

Each tab executes candidate work and records measured evidence before any comparison or selection is made.

"""Train two real SVM candidates and compare their captured evaluation results."""

import pickle
from pathlib import Path

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from contexta import Contexta
from contexta.capture import LocalJsonlSink


# Filesystem root for everything this example writes (capture files, models).
workspace = Path(".contexta")

# Per-candidate results filled in by the run loop: holdout accuracy and run ref.
scores = dict()
run_refs = dict()

# Load Iris and hold out a stratified 30% split with a fixed seed so both
# candidates are trained and evaluated on exactly the same rows.
features, targets = load_iris(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(
    features,
    targets,
    test_size=0.3,
    stratify=targets,
    random_state=7,
)

# Candidate estimators, keyed by the name each run is recorded under.
candidates = {
    "linear-svm": SVC(kernel="linear"),
    "rbf-svm": SVC(kernel="rbf", gamma="scale"),
}

ctx = Contexta(workspace=str(workspace), config={"project_name": "iris-svm"})

# Take the first configured sink that is a LocalJsonlSink; raises StopIteration
# when the context has none.
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))

# Execute every candidate as its own tracked run so each metric and artifact is
# recorded against the run and stage that produced it.
for name, estimator in candidates.items():
    with ctx.run(name, dataset_ref="dataset:sklearn.iris") as run:
        # Stage 1: fit a scaler + SVM pipeline on the training split.
        with run.stage("train"):
            model = make_pipeline(StandardScaler(), estimator)
            model.fit(train_x, train_y)

        # Stage 2: score the fitted pipeline on the holdout split.
        with run.stage("evaluate") as stage:
            predictions = model.predict(test_x)
            accuracy = accuracy_score(test_y, predictions)
            macro_f1 = f1_score(test_y, predictions, average="macro")
            with stage.batch("holdout-split") as batch:
                batch.metric("accuracy", accuracy, unit="ratio")
                batch.metric("macro.f1", macro_f1, unit="ratio")
                # Sample-level metric: 1.0 if the first holdout prediction
                # matches its label, otherwise 0.0.
                with batch.sample("first-prediction") as sample:
                    sample.metric(
                        "correct",
                        float(predictions[0] == test_y[0]),
                        unit="ratio",
                    )

        # Persist the fitted pipeline and attach it to the run.
        # NOTE(review): kept inside the run context on the assumption that
        # artifacts must be registered while the run is still open — confirm.
        model_path = workspace / "models" / f"{name}.pkl"
        model_path.parent.mkdir(parents=True, exist_ok=True)
        model_path.write_bytes(pickle.dumps(model))
        run.register_artifact("model", str(model_path), attributes={"candidate": name})

        # Keep what the comparison step needs: the measured accuracy and the
        # reference of the run that produced it.
        scores[name] = accuracy
        run_refs[name] = run.ref

# Select the candidate with the highest holdout accuracy; on a tie max() keeps
# the one that was scored first.
best_name = max(scores, key=lambda candidate: scores[candidate])

# Signed accuracy change going from the linear kernel to the RBF kernel.
delta = scores["rbf-svm"] - scores["linear-svm"]

# Express the capture file locations relative to the current directory.
current_dir = Path.cwd()
records_path = local_sink.file_path_for("RECORD").relative_to(current_dir)
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(current_dir)

print(f"Compared runs: {run_refs['linear-svm']} vs {run_refs['rbf-svm']}")
print(f"Accuracy: {scores['linear-svm']:.3f} -> {scores['rbf-svm']:.3f}")
print(f"Delta: {delta:+.3f}")
print(f"Selected run: {run_refs[best_name]}")
print(f"Records: {records_path.as_posix()}")
print(f"Artifacts: {artifacts_path.as_posix()}")

Copy one tab into a local file and run it in the directory where .contexta/ should be created:

| Example | Save as | Install |
| --- | --- | --- |
| Machine Learning | compare_svm.py | uv add "contexta[sklearn]" |
| Deep Learning | compare_cnn.py | uv add "contexta[sklearn,torch]" |
| LLM | compare_llm.py | uv add contexta |

Each program prints two measured candidate results, their delta, and the selected run. Inspect .contexta/cache/capture/record.jsonl to see each value connected to the run and evaluation stage that produced it. The ML and Deep Learning examples also register the fitted candidate artifacts.

What To Compare

| Domain | Useful measured comparison key |
| --- | --- |
| Machine Learning | validation accuracy, F1, MAE, calibration |
| Deep Learning | validation loss, validation accuracy, checkpoint size, latency |
| LLM | evaluation pass rate, faithfulness, latency, token usage |