Structured Error Propagation
# Subagent always returns structured result — success OR failure
async def run_research_subagent(task: dict) -> dict:
    """Run one research task and report the outcome as a structured dict.

    Failures raised by ``execute_research`` are converted into a result
    dict instead of propagating, so the coordinator always receives a
    record it can classify.

    Args:
        task: Task description. The ``"id"`` and ``"description"`` entries
            are echoed back in the result; a missing entry is reported as
            ``None`` rather than raising while the result is being built.

    Returns:
        On success: ``status="success"`` plus ``task_id``,
        ``task_description``, ``result`` and ``completion_time`` (ISO 8601,
        timezone-aware UTC, so the string carries a ``+00:00`` offset).
        On failure: ``status="failed"`` plus ``task_id``,
        ``task_description``, ``error_category`` (``"transient"``,
        ``"permission"`` or ``"unknown"``), ``error_message`` and
        ``isRetryable``. Transient failures add ``retry_after_seconds``;
        permission failures add ``required_access``.
    """
    # Local import keeps the block self-contained; ``datetime.utcnow()`` is
    # deprecated and produces a naive timestamp.
    from datetime import datetime, timezone

    # Only the call that can fail lives in the ``try``. ``except Exception``
    # deliberately leaves BaseException subclasses (on Python 3.8+ that
    # includes asyncio.CancelledError) to propagate.
    try:
        result = await execute_research(task)
    except TransientError as e:
        return _failure_result(
            task, "transient", e, retryable=True, retry_after_seconds=5
        )
    except PermissionError as e:
        # The builtin PermissionError has no ``required_permissions``
        # attribute; fall back to None instead of raising AttributeError
        # from inside the handler.
        return _failure_result(
            task,
            "permission",
            e,
            retryable=False,
            required_access=getattr(e, "required_permissions", None),
        )
    except Exception as e:
        return _failure_result(task, "unknown", e, retryable=False)

    return {
        "status": "success",
        "task_id": task.get("id"),
        "task_description": task.get("description"),
        "result": result,
        "completion_time": datetime.now(timezone.utc).isoformat(),
    }


def _failure_result(
    task: dict,
    category: str,
    error: BaseException,
    *,
    retryable: bool,
    **extra,
) -> dict:
    """Build the failure payload shared by every error category.

    ``extra`` carries category-specific fields, appended after the common
    ones. ``dict.get`` is used so a task missing ``"id"`` or
    ``"description"`` cannot raise while an error is being reported.
    """
    return {
        "status": "failed",
        "task_id": task.get("id"),
        "task_description": task.get("description"),
        "error_category": category,
        "error_message": str(error),
        "isRetryable": retryable,
        **extra,
    }
Coordinator Recovery Decisions
# Coordinator receives all results including failures
async def handle_research_results(results: list[dict]) -> dict:
    """Decide how to proceed once every research subagent has reported.

    Args:
        results: One result dict per subagent. A record counts as
            successful only when its ``"status"`` is ``"success"``; any
            other status, or a missing one, is treated as a failure so
            that no record is silently dropped.

    Returns:
        When nothing failed, whatever ``synthesize_results`` returns for
        the successful ``"result"`` values. Otherwise a dict with
        ``status="partial"``, a human-readable ``context`` listing the
        recovery options, and the ``successful``, ``retryable`` and
        ``permanent`` record lists so the caller can act on the chosen
        option without parsing ``context``.
    """
    successful = [r for r in results if r.get("status") == "success"]
    # Everything that is not an explicit success is a failure; the original
    # ``== "failed"`` filter let unrecognised statuses vanish and the batch
    # then looked fully successful.
    failed = [r for r in results if r.get("status") != "success"]

    if not failed:
        # All succeeded — proceed to synthesis
        return await synthesize_results([r["result"] for r in successful])

    # Classify failures; a record without ``isRetryable`` counts as permanent.
    retryable = [r for r in failed if r.get("isRetryable")]
    permanent = [r for r in failed if not r.get("isRetryable")]

    recovery_context = f"""
Research phase completed with partial failures.
SUCCESSFUL ({len(successful)} tasks):
{format_results(successful)}
RETRYABLE FAILURES ({len(retryable)} tasks):
{format_failures(retryable)}
PERMANENT FAILURES ({len(permanent)} tasks):
{format_failures(permanent)}
Options:
1. Retry the retryable failures and synthesize when they complete
2. Synthesize from successful results only (note: {len(failed)} tasks failed)
3. Report failure to user and stop
Which approach best serves the user's original request?
"""
    return {
        "status": "partial",
        "context": recovery_context,
        "successful": successful,
        # Additive keys: structured access to what is missing.
        "retryable": retryable,
        "permanent": permanent,
    }
Key Takeaways
- Always return structured results — success or failure, never silent
- Include error category and isRetryable in failure responses
- The coordinator decides how to recover; subagents only report
- Partial success must be labeled — coordinator needs to know what’s missing
- Never swallow exceptions silently — doing so makes debugging impossible