Security

  • Never commit API keys to version control
  • Use environment variables
  • Rotate keys regularly
  • Use separate keys for dev/staging/production
Load the key from the environment rather than hardcoding it:
import os
from openai import OpenAI

# Good
client = OpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

# Bad - hardcoded key
client = OpenAI(api_key="sk-savegate-xxx...")
Never expose API keys in client-side code; a minimal backend proxy is sketched after this list:
  • ❌ Browser JavaScript
  • ❌ Mobile apps
  • ❌ Public repositories
  • ✅ Backend servers
  • ✅ Serverless functions
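A minimal sketch of the backend-server pattern, using Flask purely for illustration (any server framework works); the key stays on the server and the browser only ever talks to your endpoint:
import os
from flask import Flask, request, jsonify
from openai import OpenAI

app = Flask(__name__)

# The key lives only in the server environment
client = OpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

@app.post("/api/chat")
def chat():
    user_message = (request.get_json(silent=True) or {}).get("message", "")
    if not user_message:
        return jsonify({"error": "message is required"}), 400

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_message}]
    )
    return jsonify({"reply": response.choices[0].message.content})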
Always validate user input:
def safe_chat(user_input: str, max_length: int = 4000):
    # Validate input
    if not user_input or not user_input.strip():
        raise ValueError("Empty input")

    if len(user_input) > max_length:
        raise ValueError(f"Input too long (max {max_length})")

    # Sanitize if needed
    user_input = user_input.strip()

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_input}]
    )
    return response.choices[0].message.content

Performance

Process multiple requests in parallel:
import asyncio
from openai import AsyncOpenAI

async_client = AsyncOpenAI(
    api_key=os.getenv("SAVEGATE_API_KEY"),
    base_url="https://api.savegate.ai/v1"
)

async def process_batch(messages):
    tasks = [
        async_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": msg}]
        )
        for msg in messages
    ]
    return await asyncio.gather(*tasks)
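To drive the batch helper from synchronous code (the prompts here are placeholders):
prompts = ["Summarize HTTP caching in one line", "Translate 'hello' to French"]
results = asyncio.run(process_batch(prompts))
for r in results:
    print(r.choices[0].message.content)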
Use streaming for better perceived performance:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Long response"}],
    stream=True  # Enable streaming
)

for chunk in response:
    # Print each token as it arrives
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
Cache common queries to save costs:
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_chat(message: str, model: str = "gpt-4"):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}]
    )
    return response.choices[0].message.content

Cost Optimization

Use cheaper models when appropriate (a simple routing helper is sketched after this list):
  • Simple tasks: gpt-3.5-turbo, claude-3-haiku
  • Complex reasoning: gpt-4, claude-3-5-sonnet
  • Code: claude-3-5-sonnet
  • Long context: gemini-1.5-pro
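One way to encode that as a routing helper; the task labels and the cheap default are assumptions of this sketch, so adjust the mapping to your own workload:
MODEL_BY_TASK = {
    "simple": "gpt-3.5-turbo",
    "reasoning": "gpt-4",
    "code": "claude-3-5-sonnet",
    "long_context": "gemini-1.5-pro",
}

def pick_model(task: str) -> str:
    # Unknown task types fall back to the cheapest model
    return MODEL_BY_TASK.get(task, "gpt-3.5-turbo")

response = client.chat.completions.create(
    model=pick_model("code"),
    messages=[{"role": "user", "content": "Write a binary search in Python"}]
)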
Shorter, more specific prompts cost less:
# Expensive - verbose prompt
prompt = "I would like you to please help me understand..."  # 200 tokens

# Cheaper - concise prompt
prompt = "Explain: ..."  # 50 tokens
Set max_tokens to avoid excessive output:
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Explain AI"}],
    max_tokens=500  # Limit response length
)
Track costs in real time (a token-usage logger is sketched after this list):
  • Check Dashboard regularly
  • Set up billing alerts
  • Review usage patterns
  • Identify optimization opportunities
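To support that review locally, you can log the token counts returned on each non-streaming response (the usage field is part of the OpenAI-compatible response format); this is a sketch, not a substitute for the Dashboard:
def chat_with_usage(message: str, model: str = "gpt-4") -> str:
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": message}]
    )
    # Record token counts per call for later cost review
    u = response.usage
    print(f"{model}: prompt={u.prompt_tokens}, "
          f"completion={u.completion_tokens}, total={u.total_tokens}")
    return response.choices[0].message.content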

Error Handling

Always retry transient errors:
import time

def chat_with_retry(message, max_retries=3):
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(...)
        except Exception:
            if attempt == max_retries - 1:
                raise
            # Exponential backoff: 1s, 2s, 4s between attempts
            time.sleep(2 ** attempt)
Have fallbacks ready:
try:
    response = client.chat.completions.create(...)
except Exception:
    # Fallback to cached response or default message
    return default_response
Track errors for debugging:
import logging

logger = logging.getLogger(__name__)

try:
    response = client.chat.completions.create(...)
except Exception as e:
    logger.error(f"API call failed: {e}", exc_info=True)
    raise

Production Checklist

1. Security
  • ✅ API keys in environment variables
  • ✅ No keys in client-side code
  • ✅ Input validation
  • ✅ Rate limiting on your API

2. Performance
  • ✅ Async/concurrent requests
  • ✅ Streaming enabled where appropriate
  • ✅ Response caching
  • ✅ Timeouts configured

3. Reliability
  • ✅ Retry logic implemented
  • ✅ Error handling
  • ✅ Logging
  • ✅ Monitoring/alerting

4. Cost
  • ✅ Right model for each task
  • ✅ Prompts optimized
  • ✅ Output limits set
  • ✅ Usage monitoring

Example Production Setup

import os
import logging
import random
import time
from functools import lru_cache
from typing import Optional

from openai import OpenAI, APIError

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ProductionSaveGateClient:
    """Production-ready SaveGate client with best practices"""

    def __init__(self):
        api_key = os.getenv("SAVEGATE_API_KEY")
        if not api_key:
            raise ValueError("SAVEGATE_API_KEY not set")

        self.client = OpenAI(
            api_key=api_key,
            base_url="https://api.savegate.ai/v1",
            timeout=30.0
        )

    def chat(
        self,
        message: str,
        model: str = "gpt-4",
        max_tokens: int = 1000,
        stream: bool = False,
        max_retries: int = 3
    ) -> Optional[str]:
        """Chat with comprehensive error handling and retries"""

        # Validate input
        if not message or len(message) > 10000:
            raise ValueError("Invalid message length")

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=model,
                    messages=[{"role": "user", "content": message}],
                    max_tokens=max_tokens,
                    stream=stream
                )

                if stream:
                    return self._handle_stream(response)
                else:
                    return response.choices[0].message.content

            except APIError as e:
                logger.error(f"API error (attempt {attempt + 1}): {e}")
                if attempt == max_retries - 1:
                    raise

                # Exponential backoff with random jitter
                wait = (2 ** attempt) * (1 + random.random())
                time.sleep(wait)

            except Exception as e:
                logger.exception(f"Unexpected error: {e}")
                raise

        return None

    def _handle_stream(self, stream):
        """Collect a streaming response into a single string"""
        full_response = ""
        try:
            for chunk in stream:
                content = chunk.choices[0].delta.content
                if content:
                    full_response += content
            return full_response
        except Exception as e:
            logger.error(f"Stream error: {e}")
            raise

    @lru_cache(maxsize=100)
    def cached_chat(self, message: str) -> str:
        """Cached version for common queries"""
        return self.chat(message)

# Usage
client = ProductionSaveGateClient()

# Simple call
response = client.chat("Hello!")

# With options
response = client.chat(
    "Explain AI",
    model="gpt-3.5-turbo",  # Cheaper model
    max_tokens=500  # Limit cost
)

# Cached call
response = client.cached_chat("What is 2+2?")  # Cached