AI SDK
Self-Healing
Automatic error recovery with machine learning
Self-Healing Agents
Automatically recover from errors using ML-based strategies that learn from failures.
16 tests passing • Automatic retry, fallback, rephrase, and learning strategies
Basic Usage
healer := sdk.NewSelfHealingAgent(
agent,
sdk.SelfHealingConfig{
MaxRetries: 3,
AutoRecover: true,
EnableLearning: true,
},
logger,
metrics,
)
response, err := healer.Process(ctx, "Your prompt")
// Automatically retries with smart strategies on failure
Built-in Strategies
Retry Strategy
Retries the same request with exponential backoff:
config := sdk.SelfHealingConfig{
MaxRetries: 3,
AutoRecover: true,
InitialDelay: 1 * time.Second,
}
healer := sdk.NewSelfHealingAgent(agent, config, logger, metrics)
Fallback Model Strategy
Switches to a different model on failure:
strategy := &sdk.FallbackModelStrategy{
FallbackModel: "gpt-3.5-turbo",
}
healer.RegisterStrategy("fallback", strategy)
Rephrase Strategy
Rephrases the prompt and retries:
strategy := &sdk.RephraseStrategy{
RephrasePrompt: "Rephrase this more clearly: ",
}
healer.RegisterStrategy("rephrase", strategy)
Tool Use Strategy
Uses additional tools to gather context:
strategy := &sdk.ToolUseStrategy{
ToolName: "search_docs",
}
healer.RegisterStrategy("tool_use", strategy)
Human Intervention Strategy
Requests human help when automated recovery fails:
strategy := &sdk.HumanInterventionStrategy{
NotificationChannel: notifyChan,
}
healer.RegisterStrategy("human", strategy)
Custom Recovery Strategy
// CustomStrategy is an example of a user-defined recovery strategy.
type CustomStrategy struct{}

// Name returns the identifier of this strategy, "custom".
func (s *CustomStrategy) Name() string { return "custom" }

// Recover attempts to recover from originalError by running the agent again
// on input with extra context prepended. It returns whatever agent.Execute
// returns.
func (s *CustomStrategy) Recover(
	ctx context.Context,
	agent *sdk.Agent,
	input string,
	originalError error,
) (*sdk.Result, error) {
	// Your custom recovery logic goes here.
	// Example: prefix the prompt with context and run it again.
	return agent.Execute(ctx, fmt.Sprintf("Context: %s\n\n%s", getContext(), input))
}

// ShouldApply reports whether this strategy should be used for err. Here it
// applies only to errors matching sdk.ErrInvalidResponse.
func (s *CustomStrategy) ShouldApply(err error) bool {
	// When should this strategy be used?
	return errors.Is(err, sdk.ErrInvalidResponse)
}
// Register
healer.RegisterStrategy("custom", &CustomStrategy{})
Learning from Failures
The self-healing agent learns which strategies work best:
healer := sdk.NewSelfHealingAgent(agent,
sdk.SelfHealingConfig{
EnableLearning: true, // Enable ML-based learning
MaxRetries: 3,
},
logger, metrics,
)
// After multiple recoveries, the agent learns:
// - Which errors are most common
// - Which strategies work best
// - When to switch strategies automatically
stats := healer.GetRecoveryStats()
fmt.Printf("Total recoveries: %d\n", stats.TotalAttempts)
fmt.Printf("Success rate: %.2f%%\n", stats.SuccessRate * 100)
fmt.Printf("Most effective strategy: %s\n", stats.MostEffectiveStrategy)
Recovery Statistics
stats := healer.GetRecoveryStats()
fmt.Printf("Attempts: %d\n", stats.TotalAttempts)
fmt.Printf("Successes: %d\n", stats.SuccessfulRecoveries)
fmt.Printf("Success Rate: %.2f%%\n", stats.SuccessRate * 100)
fmt.Printf("Average Retries: %.1f\n", stats.AverageRetries)
// Strategy effectiveness
for strategy, count := range stats.StrategyUsage {
	// Convert both operands explicitly so the division is done in floating
	// point whatever numeric type the success tally has, and report 0% for
	// a strategy with no recorded uses instead of dividing by zero.
	successRate := 0.0
	if count > 0 {
		successRate = float64(stats.StrategySuccess[strategy]) / float64(count) * 100
	}
	fmt.Printf("%s: %.1f%% success (%d uses)\n",
		strategy, successRate, count)
}
Error Patterns
View learned error patterns:
patterns := healer.GetErrorPatterns()
for _, pattern := range patterns {
fmt.Printf("Error: %s\n", pattern.ErrorType)
fmt.Printf("Count: %d\n", pattern.Count)
fmt.Printf("Best Strategy: %s (%.1f%% success)\n",
pattern.BestStrategy, pattern.SuccessRate * 100)
}
Real-World Example
// createRobustAgent builds the "robust" gpt-4 base agent, wraps it in a
// self-healing agent, registers the fallback, rephrase, and tool strategies
// on the wrapper, and returns the wrapper.
func createRobustAgent() *sdk.SelfHealingAgent {
	// Recovery settings for the self-healing wrapper.
	cfg := sdk.SelfHealingConfig{
		MaxRetries:     5,
		AutoRecover:    true,
		EnableLearning: true,
		InitialDelay:   1 * time.Second,
	}

	// Base agent.
	base := sdk.NewAgent("robust", llm, logger, metrics, &sdk.AgentOptions{Model: "gpt-4"})

	// Self-healing wrapper around the base agent.
	robust := sdk.NewSelfHealingAgent(base, cfg, logger, metrics)

	// Register strategies.
	robust.RegisterStrategy("fallback", &sdk.FallbackModelStrategy{FallbackModel: "gpt-3.5-turbo"})
	robust.RegisterStrategy("rephrase", &sdk.RephraseStrategy{RephrasePrompt: "Rephrase for clarity: "})
	robust.RegisterStrategy("tool", &sdk.ToolUseStrategy{ToolName: "search_context"})

	return robust
}
// Use
healer := createRobustAgent()
response, err := healer.Process(ctx, "Complex query")
if err != nil {
// All strategies failed
log.Printf("Recovery failed: %v", err)
// Check what was tried
stats := healer.GetRecoveryStats()
log.Printf("Attempted %d recoveries", stats.TotalAttempts)
}
Configuration Options
// SelfHealingConfig holds the settings passed to sdk.NewSelfHealingAgent.
type SelfHealingConfig struct {
	// MaxRetries is the maximum number of recovery attempts.
	MaxRetries int
	// AutoRecover enables automatic recovery.
	AutoRecover bool
	// EnableLearning enables ML-based learning.
	EnableLearning bool
	// InitialDelay is the initial delay before a retry.
	InitialDelay time.Duration
	// MaxDelay is the maximum delay between retries.
	MaxDelay time.Duration
	// BackoffMultiplier is the backoff multiplier.
	// NOTE(review): presumably applied to the delay between successive
	// retries; confirm against the implementation.
	BackoffMultiplier float64
	// MaxErrorHistory is the maximum error history to keep.
	MaxErrorHistory int
	// StrategyThreshold is the strategy selection threshold: the confidence
	// needed to auto-switch strategies.
	StrategyThreshold float64
}
Best Practices
Start Conservative
// Start with simple retry
config := sdk.SelfHealingConfig{
MaxRetries: 3,
AutoRecover: true,
EnableLearning: false, // Disable learning initially
}
Monitor and Tune
// Regularly check stats. The goroutine stops the ticker and returns once ctx
// is cancelled, so the monitor can be shut down instead of running forever.
ticker := time.NewTicker(1 * time.Hour)
go func() {
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			stats := healer.GetRecoveryStats()
			metrics.Gauge("self_healing.success_rate").Set(stats.SuccessRate)
			if stats.SuccessRate < 0.5 {
				logger.Warn("low self-healing success rate", "rate", stats.SuccessRate)
			}
		}
	}
}()
Enable Learning Gradually
// After baseline is established
if stats.TotalAttempts > 100 && stats.SuccessRate > 0.7 {
healer.EnableLearning()
logger.Info("enabled self-healing learning")
}
Debugging
// Enable detailed logging
healer.SetVerbose(true)
response, err := healer.Process(ctx, input)
// Check error history
for _, errEntry := range healer.GetErrorHistory() {
log.Printf("Error: %v", errEntry.Error)
log.Printf("Strategy used: %s", errEntry.StrategyUsed)
log.Printf("Recovered: %v", errEntry.Recovered)
log.Printf("Time: %s", errEntry.Timestamp)
}
Next Steps
- Agents - Base agent features
- Resilience - Circuit breakers and rate limiting
- Examples - Self-healing examples
How is this guide?
Last updated on