Recovery Pattern 1: Retry with Backoff
async def with_retry(tool_fn, max_retries=3, base_delay=1):
    """Await ``tool_fn()`` and retry with exponential backoff on transient errors.

    Args:
        tool_fn: Zero-argument callable returning an awaitable. It is called
            afresh on every attempt.
        max_retries: Total number of attempts (the first call counts).
            Must be at least 1.
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after each further failed attempt.

    Returns:
        The result of the first ``await tool_fn()`` that succeeds.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        ToolError: Immediately if the error's ``category`` is not
            ``"transient"``; otherwise the last transient error once every
            attempt has failed.
    """
    if max_retries < 1:
        # Without this guard the loop body never runs and the function would
        # fall through to raising ``None``, which surfaces as a TypeError.
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return await tool_fn()
        except ToolError as e:
            if e.category != "transient":
                raise  # Don't retry non-transient errors
            if attempt == max_retries - 1:
                raise  # Attempts exhausted: surface the last transient error
            # base_delay * 1, 2, 4, ... -- no sleep follows the final attempt,
            # so the defaults wait 1s and then 2s.
            delay = base_delay * (2 ** attempt)
            await asyncio.sleep(delay)
Recovery Pattern 2: Circuit Breaker
class CircuitBreaker:
    """Stop calling a tool after repeated transient failures.

    States:
        "closed":    calls pass through normally.
        "open":      calls are rejected with ``CircuitOpenError`` until
                     ``reset_timeout`` seconds have passed since the last
                     counted failure.
        "half-open": the next call is let through as a probe; success closes
                     the circuit, a transient failure re-opens it.

    Only ``ToolError`` instances whose ``category`` is ``"transient"`` count
    towards the threshold. The counter tracks *consecutive* failures: any
    successful call resets it, so isolated errors spread over a long period
    do not eventually trip the breaker.
    """

    def __init__(self, failure_threshold=5, reset_timeout=60):
        """Create a closed breaker.

        Args:
            failure_threshold: Number of consecutive transient failures that
                opens the circuit.
            reset_timeout: Seconds the circuit stays open before a probe call
                is allowed.
        """
        self.failures = 0
        self.threshold = failure_threshold
        self.reset_timeout = reset_timeout
        self.last_failure_time = None  # time.time() of the last counted failure
        self.state = "closed"  # closed=normal, open=blocking, half-open=probing

    async def call(self, tool_fn):
        """Await ``tool_fn()`` through the breaker.

        Args:
            tool_fn: Zero-argument callable returning an awaitable.

        Returns:
            The result of ``await tool_fn()``.

        Raises:
            CircuitOpenError: If the circuit is open and ``reset_timeout`` has
                not yet elapsed; ``tool_fn`` is not called in that case.
            ToolError: Re-raised unchanged from ``tool_fn`` after the failure
                has been recorded.
        """
        if self.state == "open":
            if time.time() - self.last_failure_time > self.reset_timeout:
                self.state = "half-open"  # Try once
            else:
                raise CircuitOpenError("Tool circuit breaker is open — service unavailable")

        try:
            result = await tool_fn()
        except ToolError as e:
            if e.category == "transient":
                self.failures += 1
                self.last_failure_time = time.time()
                # A failed probe re-opens at once; otherwise open only when
                # the consecutive-failure threshold is reached.
                if self.state == "half-open" or self.failures >= self.threshold:
                    self.state = "open"
            # Non-transient errors are not counted and leave the state as is.
            raise

        # Any success proves the tool is reachable: close the circuit and
        # clear the streak so earlier failures are not held against it.
        self.state = "closed"
        self.failures = 0
        return result
Recovery Pattern 3: Fallback
async def search_with_fallback(query: str) -> dict:
    """Search the knowledge base, falling back to web search on transient failure.

    Args:
        query: Search string passed unchanged to both search backends.

    Returns:
        A dict with ``"source"``, ``"result"`` and ``"quality"`` keys. When
        the web-search fallback produced the result, a ``"note"`` key
        explains the reduced quality.

    Raises:
        ToolError: The original knowledge-base error, either because it was
            not transient (no fallback is attempted) or because the fallback
            failed as well.
    """
    import logging  # local import: the file's import block is not visible here

    try:
        result = await search_knowledge_base(query)
        return {"source": "knowledge_base", "result": result, "quality": "high"}
    except ToolError as e:
        if e.category == "transient":
            # Try fallback for transient failures
            try:
                result = await web_search(query)
                return {
                    "source": "web_search",
                    "result": result,
                    "quality": "medium",
                    "note": "Knowledge base unavailable — results from web search may be less accurate",
                }
            except Exception:
                # Best effort: the caller still receives the original
                # knowledge-base error below, but the fallback failure is
                # recorded instead of vanishing silently.
                logging.getLogger(__name__).warning(
                    "Web search fallback failed; re-raising the knowledge base error",
                    exc_info=True,
                )
        # For non-transient errors, don't use fallback — the problem needs fixing
        raise
Recovery Pattern 4: Graceful Degradation
async def get_customer_full_profile(customer_id: str) -> dict:
    """Assemble a customer's profile from several sources, tolerating failures.

    Each component is fetched one after another. A component whose fetch
    raises ``ToolError`` is stored as ``None`` and recorded; any other
    exception propagates to the caller.

    Args:
        customer_id: Identifier passed to every component fetcher.

    Returns:
        A dict containing ``"customer_id"`` plus one key per component. If
        any component failed, it also contains ``"_incomplete"``,
        ``"_missing_components"`` and ``"_note"``.
    """
    fetchers = {
        "profile": get_customer_profile,
        "order_history": get_order_history,
        "payment_history": get_payment_history,
        "support_history": get_support_history,
    }

    profile = {"customer_id": customer_id}
    unavailable = []

    # Each source is awaited on its own so one failure cannot mask the others.
    for name, fetch in fetchers.items():
        try:
            value = await fetch(customer_id)
        except ToolError as err:
            value = None
            unavailable.append({"component": name, "reason": str(err)})
        profile[name] = value

    if not unavailable:
        return profile

    # Label the partial result so callers cannot mistake it for a full one.
    profile.update(
        {
            "_incomplete": True,
            "_missing_components": unavailable,
            "_note": f"Profile is incomplete — {len(unavailable)} components unavailable",
        }
    )
    return profile
Key Takeaways
- Retry with backoff for transient errors — exponential delays
- Circuit breaker prevents cascading failures under sustained outage
- Fallback provides an alternative path, for transient failures only
- Graceful degradation returns partial data, clearly labeled as incomplete
- Never use fallback for validation or permission errors — they have a different root cause that an alternative source cannot fix