AI SDK

Self-Healing

Automatic error recovery with machine learning

Self-Healing Agents

Automatically recover from errors using ML-based strategies that learn from failures.

16 tests passing • Automatic retry, fallback, rephrase, and learning strategies

Basic Usage

healer := sdk.NewSelfHealingAgent(
    agent,
    sdk.SelfHealingConfig{
        MaxRetries:      3,
        AutoRecover:     true,
        EnableLearning:  true,
    },
    logger,
    metrics,
)

response, err := healer.Process(ctx, "Your prompt")
// Automatically retries with smart strategies on failure

Built-in Strategies

Retry Strategy

Retries the same request with exponential backoff:

config := sdk.SelfHealingConfig{
    MaxRetries:   3,
    AutoRecover:  true,
    InitialDelay: 1 * time.Second,
}

healer := sdk.NewSelfHealingAgent(agent, config, logger, metrics)

Fallback Model Strategy

Switches to a different model on failure:

strategy := &sdk.FallbackModelStrategy{
    FallbackModel: "gpt-3.5-turbo",
}

healer.RegisterStrategy("fallback", strategy)

Rephrase Strategy

Rephrases the prompt and retries:

strategy := &sdk.RephraseStrategy{
    RephrasePrompt: "Rephrase this more clearly: ",
}

healer.RegisterStrategy("rephrase", strategy)

Tool Use Strategy

Uses additional tools to gather context:

strategy := &sdk.ToolUseStrategy{
    ToolName: "search_docs",
}

healer.RegisterStrategy("tool_use", strategy)

Human Intervention Strategy

Requests human help when automated recovery fails:

strategy := &sdk.HumanInterventionStrategy{
    NotificationChannel: notifyChan,
}

healer.RegisterStrategy("human", strategy)

Custom Recovery Strategy

// CustomStrategy is an example of a user-defined recovery strategy.
type CustomStrategy struct{}

// Name returns the identifier of this strategy.
func (c *CustomStrategy) Name() string { return "custom" }

// ShouldApply reports whether this strategy should be used for err.
// This example applies only when err is, or wraps, sdk.ErrInvalidResponse.
func (c *CustomStrategy) ShouldApply(err error) bool {
    matches := errors.Is(err, sdk.ErrInvalidResponse)
    return matches
}

// Recover holds your custom recovery logic.
//
// This example prepends extra context to the original input and executes
// the agent again with the enhanced prompt.
func (c *CustomStrategy) Recover(
    ctx context.Context,
    agent *sdk.Agent,
    input string,
    originalError error,
) (*sdk.Result, error) {
    return agent.Execute(ctx, fmt.Sprintf("Context: %s\n\n%s", getContext(), input))
}

// Register the strategy with the healer under the name "custom".
healer.RegisterStrategy("custom", &CustomStrategy{})

Learning from Failures

The self-healing agent learns which strategies work best:

// Create a healer with learning enabled so it can track which strategies work.
healer := sdk.NewSelfHealingAgent(agent,
    sdk.SelfHealingConfig{
        EnableLearning: true, // Enable ML-based learning
        MaxRetries:     3,
    },
    logger, metrics,
)

// After multiple recoveries, the agent learns:
// - Which errors are most common
// - Which strategies work best
// - When to switch strategies automatically

stats := healer.GetRecoveryStats()
// TotalAttempts is reported separately from SuccessfulRecoveries, so label it
// as attempts rather than recoveries.
fmt.Printf("Total recovery attempts: %d\n", stats.TotalAttempts)
fmt.Printf("Success rate: %.2f%%\n", stats.SuccessRate*100)
fmt.Printf("Most effective strategy: %s\n", stats.MostEffectiveStrategy)

Recovery Statistics

// Print the overall recovery counters.
stats := healer.GetRecoveryStats()

fmt.Printf("Attempts: %d\n", stats.TotalAttempts)
fmt.Printf("Successes: %d\n", stats.SuccessfulRecoveries)
fmt.Printf("Success Rate: %.2f%%\n", stats.SuccessRate*100)
fmt.Printf("Average Retries: %.1f\n", stats.AverageRetries)

// Strategy effectiveness
for name, uses := range stats.StrategyUsage {
    if uses == 0 {
        // Skip unused strategies to avoid a 0/0 division printing NaN.
        continue
    }
    // Convert both operands explicitly: Go does not mix integer and
    // floating-point operands in one arithmetic expression.
    successRate := float64(stats.StrategySuccess[name]) / float64(uses) * 100
    fmt.Printf("%s: %.1f%% success (%d uses)\n",
        name, successRate, uses)
}

Error Patterns

View learned error patterns:

patterns := healer.GetErrorPatterns()

for _, pattern := range patterns {
    fmt.Printf("Error: %s\n", pattern.ErrorType)
    fmt.Printf("Count: %d\n", pattern.Count)
    fmt.Printf("Best Strategy: %s (%.1f%% success)\n",
        pattern.BestStrategy, pattern.SuccessRate * 100)
}

Real-World Example

// createRobustAgent builds a "gpt-4" base agent, wraps it in a self-healing
// agent, and registers the fallback, rephrase, and tool strategies on it.
func createRobustAgent() *sdk.SelfHealingAgent {
    // Base agent
    opts := &sdk.AgentOptions{Model: "gpt-4"}
    base := sdk.NewAgent("robust", llm, logger, metrics, opts)

    // Self-healing wrapper
    cfg := sdk.SelfHealingConfig{
        MaxRetries:     5,
        AutoRecover:    true,
        EnableLearning: true,
        InitialDelay:   time.Second,
    }
    healer := sdk.NewSelfHealingAgent(base, cfg, logger, metrics)

    // Register strategies
    fallback := &sdk.FallbackModelStrategy{FallbackModel: "gpt-3.5-turbo"}
    healer.RegisterStrategy("fallback", fallback)

    rephrase := &sdk.RephraseStrategy{RephrasePrompt: "Rephrase for clarity: "}
    healer.RegisterStrategy("rephrase", rephrase)

    toolUse := &sdk.ToolUseStrategy{ToolName: "search_context"}
    healer.RegisterStrategy("tool", toolUse)

    return healer
}

// Use the agent built by createRobustAgent.
healer := createRobustAgent()

response, err := healer.Process(ctx, "Complex query")
if err != nil {
    // Reaching this branch means all strategies failed.
    log.Printf("Recovery failed: %v", err)

    // Report how many recoveries were tried.
    attempts := healer.GetRecoveryStats().TotalAttempts
    log.Printf("Attempted %d recoveries", attempts)
}

Configuration Options

// SelfHealingConfig holds the settings passed to sdk.NewSelfHealingAgent.
type SelfHealingConfig struct {
    // MaxRetries is the maximum number of recovery attempts.
    MaxRetries int
    
    // AutoRecover enables automatic recovery.
    AutoRecover bool
    
    // EnableLearning enables ML-based learning.
    EnableLearning bool
    
    // InitialDelay is the initial delay before a retry.
    InitialDelay time.Duration
    
    // MaxDelay is the maximum delay between retries.
    MaxDelay time.Duration
    
    // BackoffMultiplier is the backoff multiplier.
    // NOTE(review): presumably the factor applied to the delay between
    // successive retries — confirm against the implementation.
    BackoffMultiplier float64
    
    // MaxErrorHistory is the maximum error history to keep.
    MaxErrorHistory int
    
    // StrategyThreshold is the strategy selection threshold
    // (confidence needed to auto-switch).
    StrategyThreshold float64
}

Best Practices

Start Conservative

// Start with simple retry: automatic recovery with a small retry budget and
// learning turned off.
config := sdk.SelfHealingConfig{
    MaxRetries:     3,
    AutoRecover:    true,
    EnableLearning: false,  // Disable learning initially
}

Monitor and Tune

// Regularly check stats: once an hour, publish the recovery success rate and
// warn when it drops below 50%.
ticker := time.NewTicker(1 * time.Hour)
go func() {
    // Release the ticker's resources when the monitor exits.
    defer ticker.Stop()

    for {
        select {
        case <-ctx.Done():
            // Stop monitoring once the context is cancelled so the
            // goroutine does not outlive its owner.
            return
        case <-ticker.C:
            stats := healer.GetRecoveryStats()
            metrics.Gauge("self_healing.success_rate").Set(stats.SuccessRate)

            if stats.SuccessRate < 0.5 {
                logger.Warn("low self-healing success rate", "rate", stats.SuccessRate)
            }
        }
    }
}()

Enable Learning Gradually

// After baseline is established
if stats.TotalAttempts > 100 && stats.SuccessRate > 0.7 {
    healer.EnableLearning()
    logger.Info("enabled self-healing learning")
}

Debugging

// Enable detailed logging
healer.SetVerbose(true)

response, err := healer.Process(ctx, input)
if err != nil {
    // Do not drop the error silently; log it before inspecting the history.
    log.Printf("Recovery failed: %v", err)
}

// Check error history: log the error, the strategy used, whether it
// recovered, and the timestamp of every recorded entry.
for _, entry := range healer.GetErrorHistory() {
    log.Printf("Error: %v", entry.Error)
    log.Printf("Strategy used: %s", entry.StrategyUsed)
    log.Printf("Recovered: %v", entry.Recovered)
    log.Printf("Time: %s", entry.Timestamp)
}

Next Steps

How is this guide?

Last updated on