Retry With Exponential Backoff
import asyncio
import time
from typing import Any, Awaitable, Callable, TypeVar
T = TypeVar('T')


async def retry_transient(
    fn: Callable[[], Awaitable[T]],
    max_retries: int = 3,
    base_delay: float = 1.0
) -> T:
    """Await ``fn()``, retrying transient failures with exponential backoff.

    Only for transient errors. Never for permission/validation/business errors.

    Args:
        fn: Zero-argument callable that returns a fresh awaitable on each call.
        max_retries: Total number of attempts, including the first one.
            Must be at least 1.
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after every further failure (1s, 2s, 4s, ... by default).

    Returns:
        Whatever the first successful ``await fn()`` produces.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        TimeoutError, ServiceUnavailableError, RateLimitError: The error from
            the final attempt, once every attempt has failed.
        Exception: Any non-transient error raised by ``fn`` propagates
            immediately without a retry.
    """
    if max_retries < 1:
        # Without this guard the loop never runs and there is no error to
        # re-raise, which used to surface as an unrelated TypeError.
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            return await fn()
        except (
            TimeoutError,
            # Before Python 3.11 asyncio.TimeoutError is a distinct class, so
            # it must be listed for asyncio.wait_for timeouts to be retried.
            asyncio.TimeoutError,
            ServiceUnavailableError,
            RateLimitError,
        ):
            if attempt == final_attempt:
                raise  # Out of attempts: surface the last transient error.
            await asyncio.sleep(base_delay * (2 ** attempt))  # 1s, 2s, 4s
        # Anything else (permission, validation, business-rule, ...) is not
        # caught here and therefore propagates on the first occurrence.

    # The loop always returns or raises because max_retries >= 1.
    raise AssertionError("unreachable")
Circuit Breaker
class CircuitBreaker:
    """Stop calling a failing dependency until it has had time to recover.

    States:
        closed:    calls pass through; consecutive failures are counted.
        open:      calls are rejected with CircuitOpenError until
                   ``reset_timeout`` seconds have passed since the last failure.
        half-open: the next call is let through as a probe; success closes the
                   circuit, failure opens it again.
    """

    def __init__(self, failure_threshold: int = 5, reset_timeout: float = 60.0):
        """Create a closed breaker.

        Args:
            failure_threshold: Consecutive failures that open the circuit.
            reset_timeout: Seconds the circuit stays open before a probe call
                is allowed.
        """
        self.failures = 0  # consecutive failures since the last success
        self.threshold = failure_threshold
        self.reset_timeout = reset_timeout
        self.last_failure_time = None  # time.time() of the latest failure
        self.state = "closed"  # closed=normal, open=blocking, half-open=probing

    async def call(self, fn: Callable[[], Awaitable[Any]]) -> Any:
        """Await ``fn()`` through the breaker.

        Args:
            fn: Zero-argument callable returning an awaitable.

        Returns:
            The result of ``await fn()``.

        Raises:
            CircuitOpenError: If the circuit is open and ``reset_timeout`` has
                not elapsed yet; ``fn`` is not called in that case.
            Exception: Whatever ``fn`` raises is recorded as a failure and
                re-raised unchanged.
        """
        if self.state == "open":
            elapsed = time.time() - self.last_failure_time
            if elapsed > self.reset_timeout:
                self.state = "half-open"
            else:
                raise CircuitOpenError(
                    f"Circuit open — service unavailable. "
                    f"Retry in {self.reset_timeout - elapsed:.0f}s."
                )

        try:
            result = await fn()
        except Exception:
            self.failures += 1
            self.last_failure_time = time.time()
            # A failed probe reopens the circuit at once; otherwise open only
            # after the configured number of consecutive failures.
            if self.state == "half-open" or self.failures >= self.threshold:
                self.state = "open"
            raise

        # Any success proves the dependency is healthy: forget earlier
        # failures so that isolated errors do not accumulate indefinitely.
        self.failures = 0
        self.state = "closed"
        return result
Graceful Degradation
async def get_customer_profile_with_degradation(customer_id: str) -> dict:
    """Assemble a customer profile from whichever components can be fetched.

    The components "profile", "order_history" and "payment_data" are fetched
    one after another. A component whose fetch raises is stored as ``None``
    and recorded under ``"_missing"`` together with the error text; in that
    case ``"_incomplete"`` is set to ``True`` and a human-readable ``"_note"``
    names the missing components.

    Args:
        customer_id: Identifier handed to each component fetcher.

    Returns:
        A dict holding ``"customer_id"``, the bookkeeping keys
        ``"_incomplete"`` and ``"_missing"``, one entry per component, and
        ``"_note"`` only when at least one component failed.
    """
    profile = {"customer_id": customer_id, "_incomplete": False, "_missing": []}
    fetchers = (
        ("profile", lambda: get_profile(customer_id)),
        ("order_history", lambda: get_orders(customer_id)),
        ("payment_data", lambda: get_payments(customer_id)),
    )

    for name, fetch in fetchers:
        try:
            value = await fetch()
        except Exception as exc:
            # Best effort by design: one failing source must not sink the rest.
            profile[name] = None
            profile["_incomplete"] = True
            profile["_missing"].append({"component": name, "error": str(exc)})
        else:
            profile[name] = value

    if not profile["_incomplete"]:
        return profile

    missing_names = ", ".join(entry["component"] for entry in profile["_missing"])
    profile["_note"] = (
        f"Profile is incomplete. Missing: "
        f"{missing_names}. "
        f"Decisions based on available data only."
    )
    return profile
Recovery Decision Matrix
def choose_recovery(error_category: str, attempt: int) -> str:
    """Pick a recovery strategy from the error category and attempt number.

    Args:
        error_category: One of "transient", "permission", "validation",
            "business" or "unknown". Any other value yields "fail".
        attempt: Zero-based attempt number. Values beyond the last configured
            step reuse that last step; values with no configured step
            (e.g. negative numbers) yield "fail".

    Returns:
        The name of the strategy to apply.
    """
    strategies = {
        "transient": {
            0: "retry",
            1: "retry_with_backoff",
            2: "fallback",
            3: "graceful_degrade"
        },
        "permission": {0: "fail_with_message"},  # Never retry
        "validation": {0: "fail_with_message"},  # Fix the input
        "business": {0: "alternative_action"},   # Different workflow
        "unknown": {0: "retry", 1: "escalate_to_human"},
    }
    # The fallback table must use an int key like the others: a str key makes
    # min(attempt, max(keys)) compare int with str and raise TypeError.
    category_strategies = strategies.get(error_category, {0: "fail"})
    last_step = max(category_strategies)
    return category_strategies.get(min(attempt, last_step), "fail")
Key Takeaways
- Retry for transient — exponential backoff, cap at 3 attempts
- Circuit breaker — prevents cascading failure under sustained outage
- Fallback — switch to an alternative source for transient failures only, and label the result as reduced quality
- Graceful degradation — partial results with clear incomplete labels
- Never retry permission/validation/business-rule errors — fix the root cause instead