Building the Verification Loop
Setup
conda activate ollama-pi
# Requires: Ollama with Qwen3-Coder:30b (recommended evaluator) or, at minimum, any 7B+ model
# If running without Ollama, the lab provides a mock evaluator
Exercise 1: The Evaluation Rubric
Implement compute_weighted_score() from raw dimension scores.
# Weight applied to each rubric dimension when computing the weighted Wiggum
# score. The five weights sum to 1.0, so the weighted score stays on the same
# scale as the raw dimension scores.
DIMENSION_WEIGHTS = {
    "relevance": 0.20,
    "completeness": 0.25,
    "depth": 0.30,
    "specificity": 0.15,
    "structure": 0.10,
}
def compute_weighted_score(dimensions: dict) -> float:
    """Compute weighted Wiggum score from dimension dict.

    Exercise stub: as written this returns None until the two steps below
    are filled in.

    Args:
        dimensions: Raw score per rubric dimension, keyed by dimension name.
            The unit tests that follow pass the same five keys that
            DIMENSION_WEIGHTS defines.

    Returns:
        The sum over dimensions of DIMENSION_WEIGHTS[name] * dimensions[name].
    """
    # YOUR CODE: apply DIMENSION_WEIGHTS to each dimension
    # YOUR CODE: return sum of weighted scores
    pass
# Unit tests:
dims_high = {"relevance": 9, "completeness": 9, "depth": 9, "specificity": 9, "structure": 9}
dims_weak = {"relevance": 9, "completeness": 8, "depth": 5, "specificity": 5, "structure": 9}
# All dimensions at 9 and weights summing to 1.0 -> 9.0.
assert abs(compute_weighted_score(dims_high) - 9.0) < 0.01
# 9*0.20 + 8*0.25 + 5*0.30 + 5*0.15 + 9*0.10
#   = 1.80 + 2.00 + 1.50 + 0.75 + 0.90 = 6.95
assert abs(compute_weighted_score(dims_weak) - 6.95) < 0.01
print(f"High score: {compute_weighted_score(dims_high):.2f}")
print(f"Weak score (depth/specificity drag): {compute_weighted_score(dims_weak):.2f}")
Exercise 2: Harness-Side Count Check
Implement a Python count check for enumerated output — no model involved.
import re
def count_h2_sections(markdown: str) -> int:
    """Count ## level headings as top-level items.

    Exercise stub: returns None until implemented. In the tests that follow,
    a document with one '# ' title line and five '## ' headings is expected
    to count as 5, so the single-# title line must not be counted.
    """
    # YOUR CODE
    pass
def count_check(output: str, expected_count: int) -> tuple[bool, int]:
    """Compare the number of ## sections in *output* with *expected_count*.

    Returns:
        (passed, actual_count), where passed is True only when the count
        reported by count_h2_sections equals expected_count.
    """
    found = count_h2_sections(output)
    is_match = found == expected_count
    return is_match, found
# Tests:
output_correct = """# Top 5 Techniques
## 1. RAG\nContent...
## 2. Context Compression\nContent...
## 3. Few-Shot Prompting\nContent...
## 4. Chain-of-Thought\nContent...
## 5. Tool Use\nContent..."""
# Remove the fifth section to simulate a producer that under-delivers.
output_wrong = output_correct.replace("## 5. Tool Use\nContent...", "")

# Five ## headings present: the check passes and reports 5.
passed, actual = count_check(output_correct, 5)
assert passed
assert actual == 5

# One heading removed: the check fails and reports 4.
passed, actual = count_check(output_wrong, 5)
assert not passed
assert actual == 4

print("Count check tests passed")
Extend: Write a count_check_with_retry() that calls a mock producer to generate a revision if the count is wrong, then re-checks:
def mock_producer(prompt: str) -> str:
    """Mock producer that emits exactly as many ## sections as requested.

    Searches the prompt for the phrase "exactly <N>" (case-sensitive). When
    found, returns a Markdown document with a '# Output' title followed by N
    numbered '##' sections; otherwise returns the prompt unchanged.
    """
    found = re.search(r'exactly (\d+)', prompt)
    if found is None:
        return prompt  # passthrough if no count found
    requested = int(found.group(1))
    blocks = [f"## {num}. Item {num}\nContent..." for num in range(1, requested + 1)]
    return "# Output\n\n" + "\n\n".join(blocks)
Exercise 3: The Evaluation Call
Implement evaluate() using an Ollama evaluator model (or the mock below).
import json
# Model tag used as the default `evaluator_model` argument of evaluate().
# NOTE(review): confirm the tag matches the name shown by `ollama list` on
# your machine.
EVALUATOR_MODEL = "Qwen3-Coder:30b" # or your largest available model
# Mock evaluator for offline development:
def mock_evaluate(output: str, task: str) -> dict:
    """Return a canned rubric evaluation driven by simple phrase checks.

    Simulates a common failure mode in which depth and specificity drag the
    score down while the other dimensions stay fixed.

    Args:
        output: Text to score; phrase matching is case-insensitive.
        task: Accepted for signature compatibility; not used by the mock.

    Returns:
        A dict with "dimensions" (the five rubric scores) and "issues" (one
        message per dimension that was marked down).
    """
    text = output.lower()
    has_placeholder = "here you would" in text
    is_generic = "your tool" in text or "specific implementation" in text

    scores = {
        "relevance": 9,
        "completeness": 8,
        "depth": 6 if has_placeholder else 8,
        "specificity": 5 if is_generic else 8,
        "structure": 9,
    }

    # Each marked-down dimension (score below 7) contributes one issue.
    problems = []
    if has_placeholder:
        problems.append("Output contains placeholder implementations — add concrete examples")
    if is_generic:
        problems.append("Generic tool references used — name specific tools and versions")
    return {"dimensions": scores, "issues": problems}
def evaluate(output: str, task: str, task_type: str,
             evaluator_model: str = EVALUATOR_MODEL,
             use_mock: bool = False) -> dict:
    """Evaluate *output* for *task* and attach the weighted score.

    Args:
        output: The text to evaluate.
        task: The task description the output is meant to satisfy.
        task_type: Not referenced by the code shown here; presumably intended
            for the real evaluator prompt (confirm when implementing).
        evaluator_model: Model name for the real evaluator; unused on the
            mock path.
        use_mock: When True, score with mock_evaluate instead of a model.

    Returns:
        The evaluation dict, with a "weighted_score" entry added that is
        computed from its "dimensions" entry.
    """
    if use_mock:
        result = mock_evaluate(output, task)
    else:
        # YOUR CODE: build EVAL_PROMPT, call ollama.chat, parse JSON response
        # Until this branch assigns `result`, calling with use_mock=False
        # fails on the statement below because `result` is unbound.
        pass
    # Store the weighted aggregate alongside the raw dimension scores.
    result["weighted_score"] = compute_weighted_score(result["dimensions"])
    return result
Exercise 4: The Full Wiggum Loop
Assemble the complete evaluate → revise → verify loop.
# Default for wiggum_loop's `max_rounds` parameter.
WIGGUM_MAX_ROUNDS = 3
# Default for wiggum_loop's `threshold` parameter.
WIGGUM_PASS_THRESHOLD = 8.0
def build_revision_prompt(task: str, output: str, issues: list[str]) -> str:
    """Build the prompt asking the producer to revise *output*.

    The prompt lists each issue as a '- ' bullet, restates the task, embeds
    the current output, and ends with the instruction to return only the
    complete revised Markdown.
    """
    bullets = "\n".join(f"- {entry}" for entry in issues)
    template = (
        "Revise this output to address these specific issues:\n"
        "{issue_list}\n"
        "Task: {task}\n"
        "Current output:\n"
        "{output}\n"
        "Produce the complete revised output. Do not mention the issues — just fix them.\n"
        "Output ONLY Markdown starting with # heading."
    )
    return template.format(issue_list=bullets, task=task, output=output)
def wiggum_loop(task: str, output: str, task_type: str,
                producer_fn, evaluator_fn,
                max_rounds: int = WIGGUM_MAX_ROUNDS,
                threshold: float = WIGGUM_PASS_THRESHOLD):
    """producer_fn(prompt) -> str; evaluator_fn(output, task, task_type) -> dict

    Exercise stub: returns None until the loop is implemented.

    The driver code that follows this function unpacks the return value as
    (final_output, scores_by_round, verdict) and reads r['round'] and
    r['score'] from every entry of scores_by_round, so each entry should be a
    dict carrying at least those two keys.
    """
    scores_by_round = []
    # YOUR CODE: implement the evaluate → revise → verify loop
    # - Log score and issue count per round
    # - Return (final_output, scores_by_round, verdict) where verdict is 'PASS' or 'FAIL'
    pass
# Test with mock functions:
# The draft deliberately uses the "Here you would" phrasing: mock_evaluate
# lowers the depth score to 6 and reports an issue whenever that phrase
# appears in the text.
test_output = """
# Context Engineering Techniques
## 1. RAG
Here you would implement your retrieval pipeline using your preferred tool.
## 2. Context Compression
Here you would apply your specific compression method.
"""
def mock_producer_fn(prompt):
    """Simulate a revision by swapping placeholder phrases for concrete text.

    The whole prompt is returned with the substitutions applied; the mock
    does not extract only the output section from it. Matching is
    case-sensitive.
    """
    swaps = (
        ("Here you would", "Implement this with ChromaDB:"),
        ("your preferred tool", "chromadb==0.4.x with all-MiniLM-L6-v2 embeddings"),
    )
    revised = prompt
    for placeholder, concrete in swaps:
        revised = revised.replace(placeholder, concrete)
    return revised
# Run the loop end to end with the mock producer and the mock evaluator.
result_output, rounds, verdict = wiggum_loop(
    task="Explain the top 2 context engineering techniques",
    output=test_output,
    task_type="enumerated",
    producer_fn=mock_producer_fn,
    # Adapt evaluate() to the three-argument evaluator_fn signature, forcing
    # the offline mock path.
    evaluator_fn=lambda text, goal, kind: evaluate(text, goal, kind, use_mock=True),
)

print(f"\nVerdict: {verdict}")
print(f"Rounds: {len(rounds)}")
for round_info in rounds:
    print(f" Round {round_info['round']}: score={round_info['score']:.2f}")
Exercise 5: Failure Pattern Detector
Implement a classifier that maps Wiggum issue strings to failure classes.
# Regex patterns per failure class, matched against lowercased issue text.
# Classes are tried in insertion order, so an issue that fits more than one
# class is assigned to the first one listed here.
FAILURE_PATTERNS = {
    "placeholder": [
        r"lacks.*implementation",
        r"no implementation.*note",
        r"too generic",
        r"placeholder",
        r"here you would",
    ],
    "count_violation": [
        r"expected.*section",
        r"section count",
        r"missing section",
        r"only \d+ section",
    ],
    "encoding_corruption": [
        r"incomplete.*backtick",
        r"missing.*closing",
        r"encoding artifact",
    ],
    "task_drift": [
        r"off.topic",
        r"not.*address.*task",
        r"irrelevant",
    ],
}
def classify_failure(issue: str) -> str:
    """Return the failure class name, or 'other' if no match.

    The issue text is lowercased and searched against each class's patterns
    in FAILURE_PATTERNS; the first class with any matching pattern wins.
    """
    import re

    text = issue.lower()
    for label, regexes in FAILURE_PATTERNS.items():
        for regex in regexes:
            if re.search(regex, text):
                return label
    return "other"
# Test:
test_issues = [
    "Section 2 lacks a concrete implementation note",        # -> placeholder
    "Expected 5 sections, got 4",                            # -> count_violation
    "Code block is incomplete (missing closing backticks)",  # -> encoding_corruption
    "Content is engaging and well-structured",               # -> other
]
# Print one "class | issue" row per sample, class padded to 20 columns.
for issue in test_issues:
    label = classify_failure(issue)
    print(f"{label:20} | {issue}")
Exercise 6: Composite Score Analysis
Generate 20 synthetic run records, then compute the session's composite score and the per-dimension averages.
import json
import random
# Generate 20 synthetic run records
random.seed(42)


def _make_synthetic_run(index):
    """Build one synthetic run record for run number *index*.

    The random draws are made in a fixed order (task type, the five r1
    dimension scores, criteria check, final verdict) so that the seeded
    sequence yields the same records on every run.
    """
    task_type = random.choice(["enumerated", "best_practices"])
    r1_scores = {
        "relevance": random.randint(7, 10),
        "completeness": random.randint(6, 10),
        "depth": random.randint(5, 9),
        "specificity": random.randint(4, 9),
        "structure": random.randint(7, 10),
    }
    criteria_check = random.choice(["PASS"] * 8 + ["FAIL"] * 2)
    final = random.choice(["PASS"] * 7 + ["FAIL"] * 3)
    return {
        "run_id": f"run_{index:03d}",
        "task_type": task_type,
        "wiggum_scores": {"r1": r1_scores},
        "criteria_check": criteria_check,
        "final": final,
    }


SYNTHETIC_RUNS = [_make_synthetic_run(i) for i in range(20)]
# YOUR CODE: compute composite score for the session
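# composite = 0.7 * mean_wiggum_r1 + 0.3 * criteria_rate * 10
# (mean_wiggum_r1: mean weighted r1 score across all runs;
#  criteria_rate: fraction of runs, 0–1, whose criteria_check is "PASS")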
# YOUR CODE: compute mean score per dimension across all runs
# YOUR CODE: identify the weakest dimension (lowest mean score)
# Expected output format:
# Composite score: X.XX
# Dimension averages:
# relevance: 8.3
# completeness: 7.8
# depth: 6.9
# specificity: 6.7 ← weakest
# structure: 8.5