Documentation Index

Fetch the complete documentation index at: https://docs.savegate.ai/llms.txt
Use this file to discover all available pages before exploring further.
Security
Never commit API keys to version control
Use environment variables
Rotate keys regularly
Use separate keys for dev/staging/production
import os

from openai import OpenAI

# Good: the key is read from the environment at runtime, never stored in code
client = OpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1",
)

# Bad - hardcoded key
client = OpenAI(api_key="sk-savegate-xxx...")
Never expose API keys in client-side code:
❌ Browser JavaScript
❌ Mobile apps
❌ Public repositories
✅ Backend servers
✅ Serverless functions
Use Async for Concurrency
Process multiple requests in parallel:

import asyncio
from openai import AsyncOpenAI

# Async client so independent requests can be awaited concurrently;
# the key comes from the environment, routed through the SaveGate base URL.
async_client = AsyncOpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1",
)


async def process_batch(messages):
    """Fire one chat completion per message and await them all in parallel."""
    pending = []
    for msg in messages:
        request = async_client.chat.completions.create(
            model="gpt-5.1",
            messages=[{"role": "user", "content": msg}],
        )
        pending.append(request)
    return await asyncio.gather(*pending)
Use streaming for better perceived performance:

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Long response"}],
    stream=True  # Enable streaming
)
for chunk in response:
    # Process chunks as they arrive
    pass
Cache common queries to save costs:

from functools import lru_cache
import hashlib
@lru_cache(maxsize=1000)
def cached_chat(message: str, model: str = "gpt-4"):
    """Memoize answers for repeated (message, model) pairs to cut API spend."""
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
Cost Optimization
Use cheaper models when appropriate:
Simple tasks : gpt-4.1-nano, claude-3.5-haiku
Complex reasoning : gpt-5.1, claude-sonnet-4.5, o3
Code : claude-sonnet-4.5, gpt-5.1
Fast & affordable : gpt-5.1-mini, gpt-4.2-mini
Shorter, more specific prompts cost less:

# Expensive - verbose prompt
prompt = "I would like you to please help me understand..."  # 200 tokens

# Cheaper - concise prompt
prompt = "Explain: ..."  # 50 tokens
Set max_tokens to avoid excessive output:

response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Explain AI"}],
    max_tokens=500  # Limit response length
)
Track costs in real-time:
Check Dashboard regularly
Set up billing alerts
Review usage patterns
Identify optimization opportunities
Error Handling
Always retry transient errors:

def chat_with_retry(message, max_retries=3):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(...)
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # exponential backoff
Have fallbacks ready:

try:
    response = client.chat.completions.create(...)
except Exception:
    # Fallback to cached response or default message
    return default_response
Track errors for debugging:

import logging

logger = logging.getLogger(__name__)

try:
    response = client.chat.completions.create(...)
except Exception as e:
    logger.error(f"API call failed: {e}", exc_info=True)
    raise
Production Checklist
Security
✅ API keys in environment variables
✅ No keys in client-side code
✅ Input validation
✅ Rate limiting on your API
Performance
✅ Async/concurrent requests
✅ Streaming enabled where appropriate
✅ Response caching
✅ Timeouts configured
Reliability
✅ Retry logic implemented
✅ Error handling
✅ Logging
✅ Monitoring/alerting
Cost
✅ Right model for each task
✅ Prompts optimized
✅ Output limits set
✅ Usage monitoring
Example Production Setup
import logging
import os
import random
import time
from functools import lru_cache
from typing import Optional

from openai import OpenAI, APIError
# Configure logging once at module import; INFO is a sane production default
logging.basicConfig( level = logging. INFO )
# Module-level logger named after this module, per logging convention
logger = logging.getLogger( __name__ )
class ProductionSaveGateClient:
    """Production-ready SaveGate client with best practices.

    Wraps the OpenAI-compatible client with input validation, retries with
    jittered exponential backoff, streaming support, logging, and a bounded
    per-instance response cache.
    """

    def __init__(self):
        """Build the underlying client from the environment.

        Raises:
            ValueError: if SAVEGATE_API_KEY is not set.
        """
        api_key = os.getenv("SAVEGATE_API_KEY")
        if not api_key:
            raise ValueError("SAVEGATE_API_KEY not set")
        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.savegate.ai/v1",
            timeout=30.0,  # never let a production request hang indefinitely
        )
        # Per-instance cache for cached_chat(). A @lru_cache on the method
        # would key on `self` and keep every instance alive for the cache's
        # lifetime (ruff B019), so we cache explicitly instead.
        self._chat_cache = {}

    def chat(
        self,
        message: str,
        model: str = "gpt-5.1",
        max_tokens: int = 1000,
        stream: bool = False,
        max_retries: int = 3
    ) -> Optional[str]:
        """Chat with comprehensive error handling and retries.

        Args:
            message: user prompt; must be non-empty and <= 10000 characters.
            model: model identifier to route the request to.
            max_tokens: hard cap on the response length (cost control).
            stream: when True, the return value is a generator of text
                chunks from _handle_stream, not a plain string.
            max_retries: attempts made before the last APIError is re-raised.

        Raises:
            ValueError: on invalid message length.
            APIError: after max_retries failed attempts.
        """
        # Validate input before spending tokens.
        if not message or len(message) > 10000:
            raise ValueError("Invalid message length")

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": message}],
                    max_tokens=max_tokens,
                    stream=stream
                )
                if stream:
                    return self._handle_stream(response)
                return response.choices[0].message.content
            except APIError as e:
                logger.error(f"API error (attempt {attempt + 1}): {e}")
                if attempt == max_retries - 1:
                    raise
                # Exponential backoff with jitter to avoid thundering herds.
                wait = (2 ** attempt) * (1 + random.random())
                time.sleep(wait)
            except Exception as e:
                # Non-API failures are not retried — surface them immediately.
                logger.exception(f"Unexpected error: {e}")
                raise
        return None

    def _handle_stream(self, stream):
        """Yield content chunks as they arrive from a streaming response."""
        full_response = ""
        try:
            for chunk in stream:
                if chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    full_response += content
                    yield content
        except Exception as e:
            logger.error(f"Stream error: {e}")
            raise

    def cached_chat(self, message: str) -> str:
        """Cached version for common queries (bounded at 100 entries)."""
        if message not in self._chat_cache:
            if len(self._chat_cache) >= 100:
                # Crude FIFO bound: drop the oldest entry (dicts preserve
                # insertion order) so the cache cannot grow without limit.
                self._chat_cache.pop(next(iter(self._chat_cache)))
            self._chat_cache[message] = self.chat(message)
        return self._chat_cache[message]
# Usage examples — construct one client and reuse it across calls
client = ProductionSaveGateClient()
# Simple call with all defaults
response = client.chat( "Hello!" )
# With options tuned for cost
response = client.chat(
"Explain AI" ,
model = "gpt-5.1-mini" , # Cheaper model
max_tokens = 500 # Limit cost
)
# Cached call — repeated identical queries hit the cache, not the API
response = client.cached_chat( "What is 2+2?" ) # Cached
Error Handling — comprehensive error handling guide
API Reference — complete API documentation