Designing the Strata
class StratifiedSampler:
    """
    Samples from each stratum of processed items to validate system quality.

    Every configured stratum that has at least one matching item gets
    representation in the daily review sample, not just the common cases.

    Notes:
        * Strata are not mutually exclusive: one item can match several
          filters (e.g. low confidence *and* handwritten) and may therefore
          be sampled under more than one stratum.
        * The per-stratum minimum can push the total number of sampled items
          above ``daily_review_budget`` (with the defaults the per-stratum
          targets add up to 52 for a budget of 50).
    """

    def __init__(self, daily_review_budget: int = 50, min_per_stratum: int = 5):
        """
        Args:
            daily_review_budget: Number of items reviewers can handle per day;
                each stratum targets ``budget_pct`` of this number.
            min_per_stratum: Floor on the per-stratum sample target, applied
                even when the stratum's share of the budget is smaller.

        Raises:
            ValueError: If ``min_per_stratum`` is negative.
        """
        if min_per_stratum < 0:
            raise ValueError("min_per_stratum must be non-negative")
        self.budget = daily_review_budget
        self.min_per_stratum = min_per_stratum
        # Strata and the share of the review budget each one receives.
        # The shares sum to 1.0.
        self.strata_config = {
            # Confidence-based strata
            "confidence_high": {"budget_pct": 0.10, "review_reason": "calibration check"},
            "confidence_medium": {"budget_pct": 0.20, "review_reason": "spot verification"},
            "confidence_low": {"budget_pct": 0.40, "review_reason": "accuracy audit"},
            # Document-type strata
            "international_docs": {"budget_pct": 0.15, "review_reason": "currency/format issues"},
            "handwritten": {"budget_pct": 0.10, "review_reason": "OCR accuracy"},
            # Amount-range strata
            "high_value_gt1000": {"budget_pct": 0.05, "review_reason": "financial risk"},
        }

    def daily_sample(self, processed_items: list) -> dict:
        """
        Select a stratified random sample from today's processed items.

        Args:
            processed_items: Item dicts produced by the processing pipeline.

        Returns:
            A dict keyed by stratum name. Strata with no matching items are
            omitted. Each value holds the sampled ``items``, the
            ``stratum_size`` (matching items before sampling), the
            ``sample_size`` and the ``review_reason`` from the configuration.
        """
        # Imported locally so the class works even when the surrounding
        # module does not import ``random`` itself.
        import random

        stratified = {}
        for stratum_name, config in self.strata_config.items():
            stratum_items = self._filter_stratum(processed_items, stratum_name)
            if not stratum_items:
                continue

            # Budget share for this stratum, but never below the floor.
            n_to_sample = max(
                self.min_per_stratum,
                int(self.budget * config["budget_pct"]),
            )
            # random.sample raises if asked for more items than exist,
            # so cap the request at the stratum size.
            sample = random.sample(
                stratum_items,
                min(n_to_sample, len(stratum_items)),
            )
            stratified[stratum_name] = {
                "items": sample,
                "stratum_size": len(stratum_items),
                "sample_size": len(sample),
                "review_reason": config["review_reason"],
            }
        return stratified

    def _filter_stratum(self, items: list, stratum: str) -> list:
        """
        Return the items belonging to ``stratum``.

        Malformed items (missing/``None`` confidence, non-numeric amount) are
        treated as not matching instead of aborting the whole sampling run.
        An unknown stratum name yields an empty list.
        """
        level = self._confidence_level
        filters = {
            "confidence_high": lambda i: level(i) == "high",
            "confidence_medium": lambda i: level(i) == "medium",
            "confidence_low": lambda i: level(i) == "low",
            # A missing currency is not treated as international.
            "international_docs": lambda i: i.get("currency") not in ("USD", None),
            "handwritten": lambda i: i.get("document_type") == "handwritten",
            "high_value_gt1000": self._is_high_value,
        }
        matches = filters.get(stratum)
        if matches is None:
            return []
        return [item for item in items if matches(item)]

    @staticmethod
    def _confidence_level(item):
        """Return ``item["confidence"]["overall"]``, or None when unavailable."""
        try:
            return item["confidence"]["overall"]
        except (KeyError, TypeError):
            # KeyError: key missing; TypeError: "confidence" is None/non-mapping.
            return None

    @staticmethod
    def _is_high_value(item):
        """Return True when the item's ``total_amount`` exceeds 1000."""
        try:
            return item.get("total_amount", 0) > 1000
        except TypeError:
            # Amount present but None / not comparable to a number.
            return False
Using Sample Results
def process_sample_review_outcomes(sample_outcomes: list[dict]) -> dict:
    """
    Convert reviewer verdicts into calibration recommendations.

    Args:
        sample_outcomes: One dict per reviewed item, each with a ``"stratum"``
            name and a truthy/falsy ``"human_verified_correct"`` verdict.

    Returns:
        A dict with up to two keys:

        * ``"confidence_routing"``: ``"treat_high_as_medium"`` when a
          high-confidence stratum is below 90% accuracy, otherwise
          ``"some_low_can_be_spot_check"`` when a low-confidence stratum is
          above 80% accuracy.
        * ``"international_handling"``: ``"add_currency_normalization"`` when
          an international stratum is below 70% accuracy.

        Empty input yields an empty dict.

    Raises:
        KeyError: If an outcome lacks ``"stratum"`` or
            ``"human_verified_correct"``.
    """
    # Group verdicts by stratum.
    by_stratum = {}
    for outcome in sample_outcomes:
        by_stratum.setdefault(outcome["stratum"], []).append(
            outcome["human_verified_correct"]
        )

    high_underperforms = False
    low_outperforms = False
    recommendations = {}
    for stratum, results in by_stratum.items():
        # Every group holds at least one verdict, so the division is safe.
        accuracy = sum(results) / len(results)
        if "confidence_high" in stratum and accuracy < 0.90:
            high_underperforms = True
        if "confidence_low" in stratum and accuracy > 0.80:
            low_outperforms = True
        if "international_docs" in stratum and accuracy < 0.70:
            recommendations["international_handling"] = "add_currency_normalization"

    # Both findings share one key. Resolve the conflict explicitly so the
    # result does not depend on the order of the input: the conservative
    # recommendation (more review) takes precedence over the relaxing one.
    if high_underperforms:
        recommendations["confidence_routing"] = "treat_high_as_medium"
    elif low_outperforms:
        recommendations["confidence_routing"] = "some_low_can_be_spot_check"
    return recommendations
Key Takeaways
- Stratified, not simple random, sampling — ensure every category is represented
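- Minimum 30-50 reviewed items per stratum for statistical significance — accumulate samples across several days when the daily budget covers only a few per stratum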
- Allocate review budget by risk — low-confidence gets more sampling
- Feed results to calibration — sample outcomes improve routing thresholds
- Track over time — trends in accuracy identify emerging issues