Security

  • Never commit API keys to version control
  • Use environment variables
  • Rotate keys regularly
  • Use separate keys for dev/staging/production
Load the key from the environment rather than hardcoding it:
import os
from openai import OpenAI

# Good
client = OpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

# Bad - hardcoded key
client = OpenAI(api_key="sk-savegate-xxx...")
Never expose API keys in client-side code; a minimal backend proxy is sketched after this list:
  • ❌ Browser JavaScript
  • ❌ Mobile apps
  • ❌ Public repositories
  • ✅ Backend servers
  • ✅ Serverless functions
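A minimal sketch of the backend-server pattern, using Flask purely for illustration (any server framework works); the key stays on the server and the browser only ever talks to your endpoint:
import os
from flask import Flask, request, jsonify
from openai import OpenAI

app = Flask(__name__)

# The key lives only in the server environment
client = OpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

@app.post("/api/chat")
def chat():
    user_message = (request.get_json(silent=True) or {}).get("message", "")
    if not user_message:
        return jsonify({"error": "message is required"}), 400

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_message}]
    )
    return jsonify({"reply": response.choices[0].message.content})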
Always validate user input:
def safe_chat(user_input: str, max_length: int = 4000):
    # Validate input
    if not user_input or not user_input.strip():
        raise ValueError("Empty input")

    if len(user_input) > max_length:
        raise ValueError(f"Input too long (max {max_length})")

    # Sanitize if needed
    user_input = user_input.strip()

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_input}]
    )
    return response.choices[0].message.content

Performance

Process multiple requests in parallel:
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

async def process_batch(messages):
    tasks = [
        async_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": msg}]
        )
        for msg in messages
    ]
    return await asyncio.gather(*tasks)
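To drive the batch helper from synchronous code (the prompts here are placeholders):
prompts = ["Summarize HTTP caching in one line", "Translate 'hello' to French"]
results = asyncio.run(process_batch(prompts))
for r in results:
    print(r.choices[0].message.content)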
Use streaming for better perceived performance:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Long response"}],
    stream=True  # Enable streaming
)

for chunk in response:
    # Print each token as it arrives
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
Cache common queries to save costs:
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_chat(message: str, model: str = "gpt-4"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}]
    )
    return response.choices[0].message.content

Cost Optimization

Use cheaper models when appropriate (a simple routing helper is sketched after this list):
  • Simple tasks: gpt-3.5-turbo, claude-3-haiku
  • Complex reasoning: gpt-4, claude-3-5-sonnet
  • Code: claude-3-5-sonnet
  • Long context: gemini-1.5-pro
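One way to encode that as a routing helper; the task labels and the cheap default are assumptions of this sketch, so adjust the mapping to your own workload:
MODEL_BY_TASK = {
    "simple": "gpt-3.5-turbo",
    "reasoning": "gpt-4",
    "code": "claude-3-5-sonnet",
    "long_context": "gemini-1.5-pro",
}

def pick_model(task: str) -> str:
    # Unknown task types fall back to the cheapest model
    return MODEL_BY_TASK.get(task, "gpt-3.5-turbo")

response = client.chat.completions.create(
    model=pick_model("code"),
    messages=[{"role": "user", "content": "Write a binary search in Python"}]
)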
Shorter, more specific prompts cost less:
# Expensive - verbose prompt
prompt = "I would like you to please help me understand..."  # 200 tokens

# Cheaper - concise prompt
prompt = "Explain: ..."  # 50 tokens
Set max_tokens to avoid excessive output:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Explain AI"}],
    max_tokens=500  # Limit response length
)
Track costs in real time (a token-usage logger is sketched after this list):
  • Check Dashboard regularly
  • Set up billing alerts
  • Review usage patterns
  • Identify optimization opportunities
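To support that review locally, you can log the token counts returned on each non-streaming response (the usage field is part of the OpenAI-compatible response format); this is a sketch, not a substitute for the Dashboard:
def chat_with_usage(message: str, model: str = "gpt-4") -> str:
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}]
    )
    # Record token counts per call for later cost review
    u = response.usage
    print(f"{model}: prompt={u.prompt_tokens}, "
          f"completion={u.completion_tokens}, total={u.total_tokens}")
    return response.choices[0].message.content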

Error Handling

Always retry transient errors:
import time

def chat_with_retry(message, max_retries=3):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(...)
        except Exception:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff: 1s, 2s, 4s between attempts
            time.sleep(2 ** attempt)
Have fallbacks ready:
try:
    response = client.chat.completions.create(...)
except Exception:
    # Fallback to cached response or default message
    return default_response
Track errors for debugging:
import logging

logger = logging.getLogger(__name__)

try:
    response = client.chat.completions.create(...)
except Exception as e:
    logger.error(f"API call failed: {e}", exc_info=True)
    raise

Production Checklist

1. Security
  • ✅ API keys in environment variables
  • ✅ No keys in client-side code
  • ✅ Input validation
  • ✅ Rate limiting on your API

2. Performance
  • ✅ Async/concurrent requests
  • ✅ Streaming enabled where appropriate
  • ✅ Response caching
  • ✅ Timeouts configured

3. Reliability
  • ✅ Retry logic implemented
  • ✅ Error handling
  • ✅ Logging
  • ✅ Monitoring/alerting

4. Cost
  • ✅ Right model for each task
  • ✅ Prompts optimized
  • ✅ Output limits set
  • ✅ Usage monitoring

Example Production Setup

import os
import logging
import random
import time
from functools import lru_cache
from typing import Optional

from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ProductionSaveGateClient:
    """Production-ready SaveGate client with best practices"""

    def __init__(self):
        api_key = os.getenv("SAVEGATE_API_KEY")
        if not api_key:
            raise ValueError("SAVEGATE_API_KEY not set")

        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.savegate.ai/v1",
            timeout=30.0
        )

    def chat(
        self,
        message: str,
        model: str = "gpt-4",
        max_tokens: int = 1000,
        stream: bool = False,
        max_retries: int = 3
    ) -> Optional[str]:
        """Chat with comprehensive error handling and retries"""

        # Validate input
        if not message or len(message) > 10000:
            raise ValueError("Invalid message length")

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": message}],
                    max_tokens=max_tokens,
                    stream=stream
                )

                if stream:
                    return self._handle_stream(response)
                else:
                    return response.choices[0].message.content

            except APIError as e:
                logger.error(f"API error (attempt {attempt + 1}): {e}")
                if attempt == max_retries - 1:
                    raise

                # Exponential backoff with random jitter
                wait = (2 ** attempt) * (1 + random.random())
                time.sleep(wait)

            except Exception as e:
                logger.exception(f"Unexpected error: {e}")
                raise

        return None

    def _handle_stream(self, stream):
        """Collect a streaming response into a single string"""
        full_response = ""
        try:
            for chunk in stream:
                content = chunk.choices[0].delta.content
                if content:
                    full_response += content
            return full_response
        except Exception as e:
            logger.error(f"Stream error: {e}")
            raise

    @lru_cache(maxsize=100)
    def cached_chat(self, message: str) -> str:
        """Cached version for common queries"""
        return self.chat(message)

# Usage
client = ProductionSaveGateClient()

# Simple call
response = client.chat("Hello!")

# With options
response = client.chat(
    "Explain AI",
    model="gpt-3.5-turbo",  # Cheaper model
    max_tokens=500  # Limit cost
)

# Cached call
response = client.cached_chat("What is 2+2?")  # Cached