Harness Engineering for AI Agents · Verification & Failure Modes

Building the Verification Loop

Colab Notebook · ~60 min
Google Colab Notebook
Building the Verification Loop
Python · ~60 min
Open in Colab
Lab Objectives
1. Implement the 5-dimension evaluation rubric as a function that parses evaluator JSON and returns a weighted score
2. Implement the harness-side count check that verifies enumerated output meets its count constraint without model involvement
3. Build the full evaluate → revise → verify loop with configurable max_rounds and pass_threshold, logging per-round scores
4. Run the loop on a real synthesis output and observe how the revision prompt drives score improvement across rounds
5. Implement the failure pattern detector that identifies the failure class from Wiggum issue strings
6. Analyze a set of 20 synthetic run records to compute the composite score and identify the weakest rubric dimension

Setup

conda activate ollama-pi
# Requires: Ollama with Qwen3-Coder:30b (recommended evaluator) or any other 7B+ model
# If running without Ollama, the lab provides a mock evaluator

Exercise 1: The Evaluation Rubric

Implement compute_weighted_score() from raw dimension scores.

# Rubric weights per dimension. They sum to 1.0, so a weighted score stays on
# the same scale as the raw dimension scores.
DIMENSION_WEIGHTS = {
    "relevance": 0.20,
    "completeness": 0.25,
    "depth": 0.30,
    "specificity": 0.15,
    "structure": 0.10,
}

def compute_weighted_score(dimensions: dict) -> float:
    """Compute weighted Wiggum score from dimension dict.

    Exercise stub: the body is left for the reader, so as written this
    returns None.

    Args:
        dimensions: Maps rubric dimension names (the keys of
            DIMENSION_WEIGHTS) to raw scores.

    Returns:
        Intended result: the sum over all dimensions of
        ``score * DIMENSION_WEIGHTS[name]``.
    """
    # YOUR CODE: apply DIMENSION_WEIGHTS to each dimension
    # YOUR CODE: return sum of weighted scores
    pass

# Unit tests:
dims_high = {"relevance": 9, "completeness": 9, "depth": 9, "specificity": 9, "structure": 9}
dims_weak = {"relevance": 9, "completeness": 8, "depth": 5, "specificity": 5, "structure": 9}

# All nines with weights summing to 1.0 must give exactly 9.0.
assert abs(compute_weighted_score(dims_high) - 9.0) < 0.01
# 9*0.20 + 8*0.25 + 5*0.30 + 5*0.15 + 9*0.10
#   = 1.80 + 2.00 + 1.50 + 0.75 + 0.90 = 6.95
assert abs(compute_weighted_score(dims_weak) - 6.95) < 0.01
print(f"High score: {compute_weighted_score(dims_high):.2f}")
print(f"Weak score (depth/specificity drag): {compute_weighted_score(dims_weak):.2f}")

Exercise 2: Harness-Side Count Check

Implement a Python count check for enumerated output — no model involved.

import re

def count_h2_sections(markdown: str) -> int:
    """Count ## level headings as top-level items.

    Exercise stub: the body is left for the reader, so as written this
    returns None.

    Args:
        markdown: The Markdown text to inspect.

    Returns:
        Intended result: the number of "## " headings. The lab's sample
        document has one "# " title and five "## " sections and is expected
        to count as 5, so the title line must not be included.
    """
    # YOUR CODE
    pass

def count_check(output: str, expected_count: int) -> tuple[bool, int]:
    """Check the number of ## sections in *output* against *expected_count*.

    Returns:
        A ``(passed, actual_count)`` pair, where ``passed`` is True only when
        the counted sections equal ``expected_count`` exactly.
    """
    found = count_h2_sections(output)
    is_match = found == expected_count
    return is_match, found

# Tests:
output_correct = """# Top 5 Techniques

## 1. RAG\nContent...

## 2. Context Compression\nContent...

## 3. Few-Shot Prompting\nContent...

## 4. Chain-of-Thought\nContent...

## 5. Tool Use\nContent..."""

# Remove the final section so only four "## " headings remain.
output_wrong = output_correct.replace("## 5. Tool Use\nContent...", "")

# (document, should the check pass?, how many sections should be counted?)
for sample, should_pass, want_count in (
    (output_correct, True, 5),
    (output_wrong, False, 4),
):
    passed, actual = count_check(sample, 5)
    assert passed == should_pass and actual == want_count
print("Count check tests passed")

Extend: Write a count_check_with_retry() that calls a mock producer to generate a revision if the count is wrong, then re-checks:

def mock_producer(prompt: str) -> str:
    """Return mock output containing the section count named in *prompt*.

    Looks for the phrase "exactly <N>" in the prompt and emits N numbered
    "## " sections under an "# Output" title. A prompt without that phrase
    is returned unchanged.
    """
    found = re.search(r'exactly (\d+)', prompt)
    if found is None:
        # Passthrough: nothing to generate without a requested count.
        return prompt

    wanted = int(found.group(1))
    blocks = []
    for number in range(1, wanted + 1):
        blocks.append(f"## {number}. Item {number}\nContent...")
    return "# Output\n\n" + "\n\n".join(blocks)

Exercise 3: The Evaluation Call

Implement evaluate() using an Ollama evaluator model (or the mock below).

import json
# Evaluator model name; evaluate() uses it as the default for its
# evaluator_model parameter.
EVALUATOR_MODEL = "Qwen3-Coder:30b"  # or your largest available model

# Mock evaluator for offline development:
def mock_evaluate(output: str, task: str) -> dict:
    """Return a canned evaluation of *output* for offline development.

    The check is a case-insensitive phrase match: "here you would" lowers
    depth to 6, and "your tool" or "specific implementation" lowers
    specificity to 5. Both otherwise score 8, and the remaining dimensions
    are fixed. *task* is accepted but not consulted.

    Returns:
        A dict with "dimensions" (name -> score) and "issues" (one message
        per lowered dimension).
    """
    text = output.lower()
    has_placeholder = "here you would" in text
    is_generic = "your tool" in text or "specific implementation" in text

    # Simulate the common failure mode: depth and specificity drag the score.
    scores = {
        "relevance": 9,
        "completeness": 8,
        "depth": 6 if has_placeholder else 8,
        "specificity": 5 if is_generic else 8,
        "structure": 9,
    }

    problems = []
    if has_placeholder:
        problems.append("Output contains placeholder implementations — add concrete examples")
    if is_generic:
        problems.append("Generic tool references used — name specific tools and versions")
    return {"dimensions": scores, "issues": problems}

def evaluate(output: str, task: str, task_type: str,
             evaluator_model: str = EVALUATOR_MODEL,
             use_mock: bool = False) -> dict:
    """Evaluate *output* and attach its weighted score.

    Args:
        output: The text to evaluate.
        task: The task description the output should satisfy.
        task_type: Task category label; not consulted by the mock path.
        evaluator_model: Model name for the real evaluator path.
        use_mock: When True, use mock_evaluate() instead of a model call.

    Returns:
        The evaluation dict (with a "dimensions" entry) plus a
        "weighted_score" key computed by compute_weighted_score().

    Raises:
        NotImplementedError: If use_mock is False and the model-backed
            branch has not been filled in yet.
    """
    if use_mock:
        result = mock_evaluate(output, task)
    else:
        # YOUR CODE: build EVAL_PROMPT, call ollama.chat, parse JSON response
        # The branch must assign `result`; until then fail with a clear
        # message rather than an UnboundLocalError on the line below.
        raise NotImplementedError(
            "Model-backed evaluation is not implemented yet; "
            "pass use_mock=True or complete this branch."
        )

    result["weighted_score"] = compute_weighted_score(result["dimensions"])
    return result

Exercise 4: The Full Wiggum Loop

Assemble the complete evaluate → revise → verify loop.

# Defaults for wiggum_loop(): max_rounds and threshold, respectively.
WIGGUM_MAX_ROUNDS = 3
WIGGUM_PASS_THRESHOLD = 8.0

def build_revision_prompt(task: str, output: str, issues: list[str]) -> str:
    """Assemble the prompt asking a producer to revise *output*.

    The prompt lists each issue as a "- " bullet, then the task, then the
    current output, and ends with the instruction to return only the revised
    Markdown.
    """
    bullet_block = "\n".join(f"- {entry}" for entry in issues)
    pieces = (
        "Revise this output to address these specific issues:",
        "",
        bullet_block,
        "",
        f"Task: {task}",
        "",
        "Current output:",
        f"{output}",
        "",
        "Produce the complete revised output. Do not mention the issues — just fix them.",
        "Output ONLY Markdown starting with # heading.",
    )
    return "\n".join(pieces)

def wiggum_loop(task: str, output: str, task_type: str,
                producer_fn, evaluator_fn,
                max_rounds: int = WIGGUM_MAX_ROUNDS,
                threshold: float = WIGGUM_PASS_THRESHOLD):
    """Run the evaluate → revise → verify loop (exercise stub).

    producer_fn(prompt) -> str; evaluator_fn(output, task, task_type) -> dict

    The body is left for the reader, so as written this returns None.

    Args:
        task: The task description the output should satisfy.
        output: The initial output to evaluate.
        task_type: Task category label passed through to evaluator_fn.
        producer_fn: Callable that turns a revision prompt into new output.
        evaluator_fn: Callable that scores an output.
        max_rounds: Upper bound on evaluation rounds.
        threshold: Score at which the output counts as passing.

    Returns:
        Intended result: ``(final_output, scores_by_round, verdict)`` with
        verdict 'PASS' or 'FAIL'. The demo code that follows this function
        reads ``'round'`` and ``'score'`` from each scores_by_round entry.
    """
    scores_by_round = []

    # YOUR CODE: implement the evaluate → revise → verify loop
    # - Log score and issue count per round
    # - Return (final_output, scores_by_round, verdict) where verdict is 'PASS' or 'FAIL'
    pass

# Test with mock functions:
# Both sections contain "Here you would", the phrase mock_evaluate() treats as
# a placeholder and scores with depth 6.
test_output = """
# Context Engineering Techniques

## 1. RAG
Here you would implement your retrieval pipeline using your preferred tool.

## 2. Context Compression
Here you would apply your specific compression method.
"""

def mock_producer_fn(prompt):
    """Simulate a producer that replaces placeholders with concrete content.

    When *prompt* has the revision-prompt layout (a "Current output:" line
    followed by the draft and then the "Produce the complete revised output."
    instruction), only the draft is revised and returned. That keeps the
    prompt's instructions, issue list, and task out of the "revised output".
    Any other input is treated as the draft itself.

    Args:
        prompt: A revision prompt, or plain draft text.

    Returns:
        The draft with "Here you would" and "your preferred tool" replaced
        by concrete wording.
    """
    draft = prompt
    _, has_start, after_start = prompt.partition("Current output:\n")
    if has_start:
        # Cut at the last closing instruction so the draft text is kept whole.
        body, has_end, _ = after_start.rpartition(
            "\n\nProduce the complete revised output."
        )
        draft = body if has_end else after_start

    # Simulate improvement: replace placeholders with concrete content
    return draft.replace(
        "Here you would", "Implement this with ChromaDB:"
    ).replace("your preferred tool", "chromadb==0.4.x with all-MiniLM-L6-v2 embeddings")

# Drive the loop end to end with the mock producer and the mock evaluator.
result_output, rounds, verdict = wiggum_loop(
    task="Explain the top 2 context engineering techniques",
    output=test_output,
    task_type="enumerated",
    producer_fn=mock_producer_fn,
    evaluator_fn=lambda text, goal, kind: evaluate(text, goal, kind, use_mock=True),
)

print(f"\nVerdict: {verdict}")
print(f"Rounds: {len(rounds)}")
for entry in rounds:
    round_no = entry['round']
    score = entry['score']
    print(f"  Round {round_no}: score={score:.2f}")

Exercise 5: Failure Pattern Detector

Implement a classifier that maps Wiggum issue strings to failure classes.

# Failure class -> regexes tried against the lower-cased issue text.
# Classes are checked in the order written here; the first class with a
# matching regex wins.
FAILURE_PATTERNS = {
    "placeholder": [
        r"lacks.*implementation",
        r"no implementation.*note",
        r"too generic",
        r"placeholder",
        r"here you would",
    ],
    "count_violation": [
        r"expected.*section",
        r"section count",
        r"missing section",
        r"only \d+ section",
    ],
    "encoding_corruption": [
        r"incomplete.*backtick",
        r"missing.*closing",
        r"encoding artifact",
    ],
    "task_drift": [
        r"off.topic",
        r"not.*address.*task",
        r"irrelevant",
    ],
}


def classify_failure(issue: str) -> str:
    """Map an issue string to its failure class.

    The issue is lower-cased and searched against each class's regexes in
    FAILURE_PATTERNS order.

    Returns:
        The first class name with a matching regex, or 'other' when nothing
        matches.
    """
    import re

    text = issue.lower()
    for label, regexes in FAILURE_PATTERNS.items():
        for regex in regexes:
            if re.search(regex, text):
                return label
    return "other"

# Test:
test_issues = [
    "Section 2 lacks a concrete implementation note",
    "Expected 5 sections, got 4",
    "Code block is incomplete (missing closing backticks)",
    "Content is engaging and well-structured",
]
# Print one row per issue: the class padded to 20 columns, then the text.
for issue in test_issues:
    label = classify_failure(issue)
    print(f"{label:20} | {issue}")

Exercise 6: Composite Score Analysis

Generate 20 synthetic run records, then compute the session's composite score and the per-dimension averages.

import json
import random

# Generate 20 synthetic run records
# Fixed seed so every run of this cell produces the same records.
random.seed(42)
# Each record draws from `random` in the order the keys are written below, so
# reordering the keys would change the generated values.
SYNTHETIC_RUNS = [
    {
        "run_id": f"run_{i:03d}",
        "task_type": random.choice(["enumerated", "best_practices"]),
        "wiggum_scores": {
            # Only a first-round ("r1") score set is generated per run.
            # randint bounds are inclusive on both ends.
            "r1": {
                "relevance":    random.randint(7, 10),
                "completeness": random.randint(6, 10),
                "depth":        random.randint(5, 9),
                "specificity":  random.randint(4, 9),
                "structure":    random.randint(7, 10)
            }
        },
        # Choice pool holds 8 "PASS" and 2 "FAIL" entries.
        "criteria_check": random.choice(["PASS"] * 8 + ["FAIL"] * 2),
        # Choice pool holds 7 "PASS" and 3 "FAIL" entries.
        "final": random.choice(["PASS"] * 7 + ["FAIL"] * 3)
    }
    for i in range(20)
]

# YOUR CODE: compute composite score for the session
# composite = 0.7 * mean_wiggum_r1 + 0.3 * criteria_rate * 10

# YOUR CODE: compute mean score per dimension across all runs

# YOUR CODE: identify the weakest dimension (lowest mean score)

# Expected output format:
# Composite score: X.XX
# Dimension averages:
#   relevance:    8.3
#   completeness: 7.8
#   depth:        6.9
#   specificity:  6.7  ← weakest
#   structure:    8.5