AI SDK
Multi-Modal
Process images, audio, and video with AI models
Multi-Modal Processing
Process images, audio, and video with vision and multi-modal AI models.
Supported Models: GPT-4 Vision, Claude 3, Gemini Pro Vision
Image Processing
From File
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./photo.jpg").
WithText("What's in this image?").
WithModel("gpt-4-vision").
Execute()
fmt.Println(result.Text)
From URL
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageURL("https://example.com/image.jpg").
WithText("Describe this image in detail").
Execute()
From Base64 Data
imageData := base64.StdEncoding.EncodeToString(imageBytes)
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImage(imageData, "image/jpeg").
WithText("Analyze this image").
Execute()
Multiple Images
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./before.jpg").
WithImageFile("./after.jpg").
WithText("Compare these two images. What changed?").
Execute()
fmt.Println(result.Text)
// Output: "The second image shows additional lighting and a new chair..."
Vision Tasks
Object Detection
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./scene.jpg").
WithText("List all objects visible in this image with their locations").
Execute()
// Output: "I can see: 1) A red car in the center, 2) A tree on the left..."
OCR (Text Extraction)
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./document.jpg").
WithText("Extract all text from this image").
Execute()
fmt.Println(result.Text)
// Output: "Invoice #12345\nDate: 2024-01-01\nTotal: $99.99..."
Image Comparison
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./product_a.jpg").
WithImageFile("./product_b.jpg").
WithText("Compare these products and highlight differences").
Execute()
Chart/Graph Analysis
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./sales_chart.png").
WithText("Analyze this chart and provide key insights").
Execute()
// Output: "The chart shows a 40% increase in Q4 sales..."
Audio Processing
Transcription
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithAudioFile("./meeting.mp3").
WithText("Transcribe this audio").
Execute()
fmt.Println(result.Text)
With Timestamps
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithAudioFile("./podcast.mp3").
WithText("Transcribe with timestamps every 30 seconds").
Execute()
// Output: "[00:00] Welcome to the show...\n[00:30] Today we'll discuss..."
Audio Analysis
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithAudioFile("./customer_call.wav").
WithText("Analyze sentiment and key topics in this customer call").
Execute()
Video Processing
Video Analysis
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithVideoFile("./demo.mp4").
WithText("Summarize what happens in this video").
Execute()
Action Detection
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithVideoFile("./security_footage.mp4").
WithText("Identify any suspicious activities in this video").
Execute()
Advanced Usage
System Instructions
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithSystemMessage("You are an expert medical image analyst").
WithImageFile("./xray.jpg").
WithText("Analyze this X-ray and identify any anomalies").
Execute()
Multiple Content Types
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithText("Context: Product review video").
WithVideoFile("./review.mp4").
WithText("Extract the main points").
WithImageFile("./product_specs.jpg").
WithText("Compare the video claims with these specs").
Execute()
With Parameters
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./creative.jpg").
WithText("Create a detailed description").
WithModel("gpt-4-vision").
WithTemperature(0.8). // More creative
WithMaxTokens(1000). // Longer output
Execute()
With Callbacks
result, err := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./large_image.jpg").
WithText("Analyze thoroughly").
OnStart(func() {
fmt.Println("Processing image...")
}).
OnComplete(func(result *sdk.Result) {
fmt.Printf("Analysis complete! Tokens: %d\n", result.Usage.TotalTokens)
}).
OnError(func(err error) {
fmt.Printf("Error: %v\n", err)
}).
Execute()
Structured Output from Images
Extract structured data from images:
type Invoice struct {
Number string `json:"number"`
Date string `json:"date"`
Total float64 `json:"total"`
Items []Item `json:"items"`
Vendor string `json:"vendor"`
}
type Item struct {
Description string `json:"description"`
Quantity int `json:"quantity"`
Price float64 `json:"price"`
}
// First, get text from image
textResult, _ := sdk.NewMultiModalBuilder(ctx, llmManager, logger, metrics).
WithImageFile("./invoice.jpg").
WithText("Extract all invoice data").
Execute()
// Then, parse into struct
invoice, _ := sdk.NewGenerateObjectBuilder[Invoice](
ctx, llmManager, logger, metrics,
).
WithPrompt(textResult.Text).
Execute()
fmt.Printf("%+v\n", invoice)
Real-World Examples
Document Processing Pipeline
func processDocument(imagePath string) (*DocumentData, error) {
// Step 1: OCR
ocrResult, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageFile(imagePath).
WithText("Extract all text, preserving structure").
Execute()
if err != nil {
return nil, err
}
// Step 2: Parse into structure
doc, err := sdk.NewGenerateObjectBuilder[DocumentData](
ctx, llm, logger, metrics,
).
WithPrompt(ocrResult.Text).
Execute()
return doc, err
}
Product Catalog from Images
func catalogProduct(imageURL string) (*Product, error) {
// Analyze image
analysis, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageURL(imageURL).
WithText(`
Analyze this product image:
- Product name and category
- Key features and specifications
- Color and style
- Suitable for what use cases
`).
Execute()
if err != nil {
return nil, err
}
// Extract structured data
product, err := sdk.NewGenerateObjectBuilder[Product](
ctx, llm, logger, metrics,
).
WithPrompt(analysis.Text).
Execute()
return product, err
}
Video Summarization
func summarizeVideo(videoPath string) (string, error) {
result, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithVideoFile(videoPath).
WithText(`
Create a comprehensive summary:
1. Main topics discussed
2. Key takeaways
3. Timeline of major events
4. Action items (if any)
`).
WithMaxTokens(1500).
Execute()
if err != nil {
return "", err
}
return result.Text, nil
}
Medical Image Analysis
func analyzeXRay(imagePath string) (*MedicalAnalysis, error) {
result, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithSystemMessage("You are a radiology AI assistant").
WithImageFile(imagePath).
WithText(`
Analyze this X-ray image:
1. Identify anatomical structures
2. Note any abnormalities
3. Provide preliminary observations
Note: This is for educational purposes only
`).
Execute()
if err != nil {
return nil, err
}
// Parse into structured format
analysis, err := sdk.NewGenerateObjectBuilder[MedicalAnalysis](
ctx, llm, logger, metrics,
).
WithPrompt(result.Text).
Execute()
return analysis, err
}
Content Type Support
| Type | Methods | Formats |
|---|---|---|
| Image | WithImageFile, WithImageURL, WithImage | JPEG, PNG, GIF, WebP |
| Audio | WithAudioFile, WithAudioURL, WithAudio | MP3, WAV, M4A, OGG |
| Video | WithVideoFile, WithVideoURL, WithVideo | MP4, MOV, AVI, WebM |
| Text | WithText | Plain text |
Model Support
| Provider | Vision | Audio | Video |
|---|---|---|---|
| OpenAI | GPT-4 Vision | Whisper | ❌ |
| Anthropic | Claude 3 | ❌ | ❌ |
| Google | Gemini Pro Vision | ❌ | Gemini Pro |
Best Practices
Image Quality
// Ensure good quality for better results
result, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageFile("./high_res_image.jpg"). // Use high resolution
WithText("Detailed analysis").
Execute()
Clear Instructions
// Be specific about what you want
result, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageFile("./receipt.jpg").
WithText(`
Extract from this receipt:
1. Store name
2. Date and time
3. All items with prices
4. Total amount
5. Payment method
`).
Execute()
Error Handling
result, err := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageFile("./image.jpg").
WithText("Analyze").
Execute()
if err != nil {
if errors.Is(err, sdk.ErrUnsupportedFormat) {
return fmt.Errorf("image format not supported")
}
if errors.Is(err, sdk.ErrFileTooLarge) {
return fmt.Errorf("image file too large (max 20MB)")
}
return err
}
Performance Optimization
Resize Large Images
import "image/jpeg"
// Resize before sending
img := resizeImage(originalImage, 1024, 1024)
buf := new(bytes.Buffer)
jpeg.Encode(buf, img, nil)
imageData := base64.StdEncoding.EncodeToString(buf.Bytes())
result, _ := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImage(imageData, "image/jpeg").
WithText("Analyze").
Execute()
Use URLs When Possible
// More efficient than uploading large files
result, _ := sdk.NewMultiModalBuilder(ctx, llm, logger, metrics).
WithImageURL("https://cdn.example.com/image.jpg").
WithText("Analyze").
Execute()
Next Steps
How is this guide?
Last updated on