Query And Diagnostics

Query and diagnostics become useful only once the captured evidence reflects actual work. The programs below generate measured evidence from trained candidates or an API-like evaluation call; inspect the workspaces they write rather than starting from prewritten metric values.

Evidence-Producing Programs

Machine Learning
Deep Learning
LLM

"""Train two real SVM candidates and compare their captured evaluation results."""

import pickle
from pathlib import Path

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from contexta import Contexta
from contexta.capture import LocalJsonlSink


# Load the iris data and hold out a stratified 30% split for evaluation.
iris_x, iris_y = load_iris(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(
    iris_x,
    iris_y,
    test_size=0.3,
    stratify=iris_y,
    random_state=7,
)

# The SVM candidates to compare, keyed by the name each run is recorded under.
candidates = {}
candidates["linear-svm"] = SVC(kernel="linear")
candidates["rbf-svm"] = SVC(kernel="rbf", gamma="scale")

# Open the capture workspace and pick out its local JSONL sink so the
# script can report where the evidence files live.
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "iris-svm"})
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))

# Per-candidate accuracy and run reference, filled in by the loop below.
scores, run_refs = {}, {}

# Train, evaluate and persist every candidate inside its own captured run.
for name, estimator in candidates.items():
    with ctx.run(name, dataset_ref="dataset:sklearn.iris") as run:
        with run.stage("train"):
            # Standardize the features ahead of the SVM.
            model = make_pipeline(StandardScaler(), estimator)
            model.fit(train_x, train_y)

        with run.stage("evaluate") as stage:
            predicted = model.predict(test_x)
            accuracy = accuracy_score(test_y, predicted)
            holdout_metrics = {
                "accuracy": accuracy,
                "macro.f1": f1_score(test_y, predicted, average="macro"),
            }
            with stage.batch("holdout-split") as batch:
                for metric_name, metric_value in holdout_metrics.items():
                    batch.metric(metric_name, metric_value, unit="ratio")
                with batch.sample("first-prediction") as sample:
                    # 1.0 when the first held-out prediction matches its label.
                    first_hit = predicted[0] == test_y[0]
                    sample.metric("correct", float(first_hit), unit="ratio")

        # Pickle the fitted pipeline under the workspace and register it
        # as an artifact of this run.
        model_path = workspace / "models" / f"{name}.pkl"
        model_path.parent.mkdir(parents=True, exist_ok=True)
        with model_path.open("wb") as handle:
            pickle.dump(model, handle)
        run.register_artifact("model", str(model_path), attributes={"candidate": name})

    # Recorded once the run context has exited.
    scores[name] = accuracy
    run_refs[name] = run.ref

# Pick the candidate with the highest accuracy; the delta is rbf minus linear.
best_name = max(scores, key=lambda candidate: scores[candidate])
linear_score = scores["linear-svm"]
rbf_score = scores["rbf-svm"]
delta = rbf_score - linear_score

# Express the capture file locations relative to the current directory.
cwd = Path.cwd()
records_path = local_sink.file_path_for("RECORD").relative_to(cwd)
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(cwd)

summary_lines = (
    f"Compared runs: {run_refs['linear-svm']} vs {run_refs['rbf-svm']}",
    f"Accuracy: {linear_score:.3f} -> {rbf_score:.3f}",
    f"Delta: {delta:+.3f}",
    f"Selected run: {run_refs[best_name]}",
    f"Records: {records_path.as_posix()}",
    f"Artifacts: {artifacts_path.as_posix()}",
)
for summary_line in summary_lines:
    print(summary_line)

"""Train two tiny CNN configurations and compare their measured accuracy."""

from pathlib import Path

import torch
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class TinyCNN(nn.Module):
    """Small convolutional classifier with a single conv layer and 10 outputs.

    The linear head takes ``8 * 4 * 4`` inputs, which matches a
    ``(batch, 1, 8, 8)`` input after the padded 3x3 convolution and the
    2x2 max-pool.
    """

    def __init__(self) -> None:
        super().__init__()
        # Feature extractor: one padded convolution, activation, then pooling.
        feature_extractor = [
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        ]
        # Classifier head: flatten the pooled maps and project to 10 logits.
        classifier_head = [nn.Flatten(), nn.Linear(8 * 4 * 4, 10)]
        self.layers = nn.Sequential(*feature_extractor, *classifier_head)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """Return the output of the layer stack for ``features``."""
        logits = self.layers(features)
        return logits


# Fix the torch RNG seed before any model or data loader is created.
torch.manual_seed(7)

# Stratified split of the sklearn digits images, holding out 20%.
digits = load_digits()
images, labels = digits.images, digits.target
train_x, test_x, train_y, test_y = train_test_split(
    images,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=7,
)

# Insert a channel axis and divide by 16.0 while converting to float32 tensors.
train_inputs = torch.tensor(train_x[:, None] / 16.0, dtype=torch.float32)
train_labels = torch.tensor(train_y, dtype=torch.long)
train_data = TensorDataset(train_inputs, train_labels)
test_features = torch.tensor(test_x[:, None] / 16.0, dtype=torch.float32)
test_targets = torch.tensor(test_y, dtype=torch.long)

loss_fn = nn.CrossEntropyLoss()

# Open the capture workspace and pick out its local JSONL sink so the
# script can report where the evidence files live.
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "digits-cnn-compare"})
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))

# Per-configuration accuracy and run reference, filled in by the loop below.
scores, run_refs = {}, {}

# Train one fresh TinyCNN per learning rate, each inside its own captured run.
learning_rates = {"cnn-fast": 0.01, "cnn-steady": 0.003}
for name, learning_rate in learning_rates.items():
    model = TinyCNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loader = DataLoader(train_data, batch_size=64, shuffle=True)
    with ctx.run(name, dataset_ref="dataset:sklearn.digits") as run:
        with run.stage("train") as stage:
            for epoch in (1, 2):
                # Sum of per-batch mean losses weighted by batch size, so
                # dividing by the dataset size gives the epoch mean.
                weighted_loss = 0.0
                for batch_features, batch_targets in loader:
                    optimizer.zero_grad()
                    loss = loss_fn(model(batch_features), batch_targets)
                    loss.backward()
                    optimizer.step()
                    weighted_loss += loss.item() * len(batch_targets)
                with stage.batch(f"epoch-{epoch}") as batch:
                    batch.metric("loss", weighted_loss / len(train_data))

        with run.stage("evaluate") as stage:
            # No gradients are needed to score the held-out split.
            with torch.no_grad():
                logits = model(test_features)
                predictions = logits.argmax(dim=1)
            hits = predictions == test_targets
            accuracy = hits.float().mean().item()
            stage.metric("accuracy", accuracy, unit="ratio")
            scores[name] = accuracy

        # Save the trained weights under the workspace and register them
        # as an artifact of this run.
        checkpoint = workspace / "models" / f"{name}.pt"
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), checkpoint)
        run.register_artifact("checkpoint", str(checkpoint), attributes={"candidate": name})

    # Recorded once the run context has exited.
    run_refs[name] = run.ref

# Pick the configuration with the highest accuracy; the delta is steady minus fast.
best_name = max(scores, key=lambda candidate: scores[candidate])
fast_score = scores["cnn-fast"]
steady_score = scores["cnn-steady"]
delta = steady_score - fast_score

# Express the capture file locations relative to the current directory.
cwd = Path.cwd()
records_path = local_sink.file_path_for("RECORD").relative_to(cwd)
artifacts_path = local_sink.file_path_for("ARTIFACT").relative_to(cwd)

summary_lines = (
    f"Compared runs: {run_refs['cnn-fast']} vs {run_refs['cnn-steady']}",
    f"Validation accuracy: {fast_score:.3f} -> {steady_score:.3f}",
    f"Delta: {delta:+.3f}",
    f"Selected run: {run_refs[best_name]}",
    f"Records: {records_path.as_posix()}",
    f"Artifacts: {artifacts_path.as_posix()}",
)
for summary_line in summary_lines:
    print(summary_line)

"""Compare two prompt strategies through an OpenAI-shaped local mock API."""

from pathlib import Path
from time import perf_counter
from types import SimpleNamespace

from contexta import Contexta
from contexta.capture import LocalJsonlSink


class MockCompletions:
    """Canned stand-in for a chat-completions endpoint."""

    def create(self, *, model: str, messages: list[dict[str, str]]) -> SimpleNamespace:
        """Return a fixed answer chosen from the first and last messages.

        The first message's content is read as the instruction and the last
        message's content as the question. ``model`` is accepted but not used.
        The result exposes ``choices[0].message.content`` and
        ``usage.completion_tokens`` (the whitespace-separated word count of
        the answer).
        """
        instruction_text = messages[0]["content"].lower()
        question_text = messages[-1]["content"].lower()

        asks_about_workspace = "workspace" in question_text
        told_to_refuse = "refuse unsupported" in instruction_text

        # The question check wins over the refusal instruction.
        if asks_about_workspace:
            reply = "Contexta stores local evidence in a .contexta workspace."
        elif told_to_refuse:
            reply = "I cannot answer from the provided context."
        else:
            reply = "A GPU was probably used."

        choice = SimpleNamespace(message=SimpleNamespace(content=reply))
        usage = SimpleNamespace(completion_tokens=len(reply.split()))
        return SimpleNamespace(choices=[choice], usage=usage)


class MockOpenAI:
    """Client stand-in exposing ``chat.completions`` backed by MockCompletions."""

    def __init__(self) -> None:
        # Build a throwaway ``Chat`` class whose ``completions`` attribute is
        # the mock endpoint, then expose one instance of it as ``chat``.
        chat_type = type("Chat", (), {})
        chat_type.completions = MockCompletions()
        self.chat = chat_type()


# Each case is (sample name, user question, substring expected in the answer).
cases = [
    ("workspace-question", "Where is the workspace?", ".contexta"),
    ("unsupported-question", "Which GPU was used?", "cannot answer"),
]

# System instructions to compare, keyed by the name each run is recorded under.
prompts = {}
prompts["helpful-only"] = "Answer the user's question."
prompts["grounded"] = "Answer from known context and refuse unsupported questions."

# Open the capture workspace and pick out its local JSONL sink so the
# script can report where the evidence file lives.
workspace = Path(".contexta")
ctx = Contexta(workspace=str(workspace), config={"project_name": "mock-openai-compare"})
local_sink = next(filter(lambda sink: isinstance(sink, LocalJsonlSink), ctx.sinks))
client = MockOpenAI()

# Per-prompt pass rate and run reference, filled in by the loop below.
scores, run_refs = {}, {}

# Evaluate every prompt strategy against the same cases, one captured run each.
for name, instruction in prompts.items():
    passed = 0
    with ctx.run(name, dataset_ref="dataset:local.prompt-cases") as run:
        with run.stage("evaluate") as stage:
            for case_name, question, expected in cases:
                started = perf_counter()
                response = client.chat.completions.create(
                    model="gpt-4.1-mini-mock",
                    messages=[
                        {"role": "system", "content": instruction},
                        {"role": "user", "content": question},
                    ],
                )
                # Stop the clock as soon as the call returns so the latency
                # metric covers only the API call, not the scoring and
                # metric-capture work that follows.
                latency_ms = (perf_counter() - started) * 1000

                answer = response.choices[0].message.content
                # A case passes when the expected substring appears in the answer.
                correct = expected in answer
                passed += int(correct)
                with stage.sample(case_name) as sample:
                    sample.metric("correct", float(correct), unit="ratio")
                    sample.metric("latency.ms", latency_ms, unit="ms")
                    sample.metric("completion.tokens", response.usage.completion_tokens)

            # Fraction of cases this prompt answered as expected.
            pass_rate = passed / len(cases)
            stage.metric("pass.rate", pass_rate, unit="ratio")
            scores[name] = pass_rate

    # Recorded once the run context has exited.
    run_refs[name] = run.ref

# Pick the prompt with the highest pass rate; the delta is grounded minus helpful-only.
best_name = max(scores, key=lambda candidate: scores[candidate])
helpful_score = scores["helpful-only"]
grounded_score = scores["grounded"]
delta = grounded_score - helpful_score

# Express the record file location relative to the current directory.
records_path = local_sink.file_path_for("RECORD").relative_to(Path.cwd())

summary_lines = (
    f"Compared runs: {run_refs['helpful-only']} vs {run_refs['grounded']}",
    f"Pass rate: {helpful_score:.2f} -> {grounded_score:.2f}",
    f"Delta: {delta:+.2f}",
    f"Selected run: {run_refs[best_name]}",
    f"Records: {records_path.as_posix()}",
)
for summary_line in summary_lines:
    print(summary_line)

Canonical Query APIs

Use the facade after evidence has been written to a canonical workspace:

ctx.list_runs(...)
ctx.get_run_snapshot(...)
ctx.compare_runs(...)
ctx.select_best_run(...)
ctx.diagnose_run(...)
ctx.traverse_lineage(...)
ctx.build_snapshot_report(...)

For lightweight runtime-capture examples, inspect workspace / "cache" / "capture" / "record.jsonl" to see the measured events and metrics emitted by the program.

Evidence-Producing Programs

Canonical Query APIs

Related Pages

Evidence-Producing Programs

Canonical Query APIs

Related Pages