Overview

The NanoGPT API offers multiple ways to generate text, including OpenAI-compatible endpoints and our legacy endpoint. This guide covers all available text generation methods.

If you are using a TEE-backed model (e.g., prefixed with TEE/), you can also verify the enclave attestation and signatures for your chat completions. See the TEE Model Verification guide for more details.

OpenAI Compatible Endpoints

Chat Completions (v1/chat/completions)

This endpoint mimics OpenAI’s chat completions API:

import requests
import json

BASE_URL = "https://nano-gpt.com/api/v1"
API_KEY = "YOUR_API_KEY"  # Replace with your API key

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
    "Accept": "text/event-stream"  # Required for SSE streaming
}

def stream_chat_completion(messages, model="chatgpt-4o-latest"):
    """
    Send a streaming chat completion request using the OpenAI-compatible endpoint.
    """
    data = {
        "model": model,
        "messages": messages,
        "stream": True  # Enable streaming
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=data,
        stream=True
    )

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                line = line[6:]
            if line == '[DONE]':
                break
            try:
                chunk = json.loads(line)
                if chunk['choices'][0]['delta'].get('content'):
                    yield chunk['choices'][0]['delta']['content']
            except json.JSONDecodeError:
                continue

# Example usage
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Please explain the concept of artificial intelligence."}
]

try:
    print("Assistant's Response:")
    for content_chunk in stream_chat_completion(messages):
        print(content_chunk, end='', flush=True)
    print("")
except Exception as e:
    print(f"Error: {str(e)}")

Text Completions (v1/completions)

This endpoint mimics OpenAI’s legacy text completions API:

import requests
import json

BASE_URL = "https://nano-gpt.com/api/v1"
API_KEY = "YOUR_API_KEY"  # Replace with your API key

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

def get_completion(prompt, model="chatgpt-4o-latest"):
    """
    Send a text completion request using the OpenAI-compatible endpoint.
    """
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": 1000,  # Optional: maximum number of tokens to generate
        "temperature": 0.7,  # Optional: controls randomness (0-2)
        "top_p": 1,         # Optional: nucleus sampling parameter
        "stream": False     # Set to True for streaming responses
    }

    response = requests.post(
        f"{BASE_URL}/completions",
        headers=headers,
        json=data
    )

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    return response.json()

# Example usage
prompt = "Write a short story about a robot learning to paint:"
try:
    response = get_completion(prompt)
    print("Completion:", response['choices'][0]['text'])
except Exception as e:
    print(f"Error: {str(e)}")

Legacy Text Completions

For the older, non-OpenAI compatible endpoint:

import requests
import json

BASE_URL = "https://nano-gpt.com/api"
API_KEY = "YOUR_API_KEY"

headers = {
    "x-api-key": API_KEY,
    "Content-Type": "application/json"
}

def talk_to_gpt(prompt, model="chatgpt-4o-latest", messages=None):
    data = {
        "prompt": prompt,
        "model": model,
        "messages": messages or []  # Avoid a shared mutable default argument
    }
    response = requests.post(f"{BASE_URL}/talk-to-gpt", headers=headers, json=data)
    return response.text if response.status_code == 200 else None

# Example usage
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing well, thank you! How can I assist you today?"}
]
prompt = "Please explain the concept of artificial intelligence."
response = talk_to_gpt(prompt, messages=messages)
if response:
    # Split the response to separate the text and NanoGPT info
    parts = response.split('<NanoGPT>')
    text_response = parts[0].strip()
    nano_info = json.loads(parts[1].split('</NanoGPT>')[0])

    print("NanoGPT Response:", text_response)
    print("Cost:", nano_info['cost'])
    print("Input Tokens:", nano_info['inputTokens'])
    print("Output Tokens:", nano_info['outputTokens'])
else:
    print("Failed to get response from GPT")

Chat Completions with Cache Control (Claude Models)

For Claude models, you can enable caching to improve performance and reduce costs for repeated conversations. The 5-minute cache costs 1.25x the normal input price on the initial message and the 1-hour cache costs 2x; both give a 90% discount on subsequent cache hits.

def chat_completion_with_cache(messages, model="claude-3-5-sonnet-20241022"):
    """
    Send a chat completion request with caching enabled (Claude models only).
    """
    data = {
        "model": model,
        "messages": messages,
        "stream": False,
        "temperature": 0.7,
        "max_tokens": 4000,
        "cache_control": {
            "enabled": True,
            "ttl": "5m"  # Cache for 5 minutes, or 1h for 1 hour.
        }
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=data
    )

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    return response.json()

# Example with cache control
messages_with_context = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Previous conversation context..."},
    {"role": "assistant", "content": "Previous response..."},
    {"role": "user", "content": "New question here"}
]

try:
    response = chat_completion_with_cache(messages_with_context)
    print("Assistant's Response:", response['choices'][0]['message']['content'])
except Exception as e:
    print(f"Error: {str(e)}")

Chat Completions with Web Search

Enable real-time web access for any model by appending special suffixes to the model name:

def chat_completion_with_web_search(messages, model="chatgpt-4o-latest", search_depth="standard"):
    """
    Send a chat completion request with web search enabled.
    
    Args:
        messages: List of message objects
        model: Base model name
        search_depth: "standard" ($0.006) or "deep" ($0.06)
    """
    # Append the appropriate suffix for web search
    if search_depth == "deep":
        model_with_search = f"{model}:online/linkup-deep"
    else:
        model_with_search = f"{model}:online"
    
    data = {
        "model": model_with_search,
        "messages": messages,
        "stream": True
    }
    
    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=data,
        stream=True
    )
    
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")
    
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                line = line[6:]
            if line == '[DONE]':
                break
            try:
                chunk = json.loads(line)
                if chunk['choices'][0]['delta'].get('content'):
                    yield chunk['choices'][0]['delta']['content']
            except json.JSONDecodeError:
                continue

# Example: Get current information
messages = [
    {"role": "user", "content": "What happened in the tech industry this week?"}
]

print("Standard web search:")
for content in chat_completion_with_web_search(messages):
    print(content, end='', flush=True)

# Example: Deep research
research_messages = [
    {"role": "user", "content": "Provide a comprehensive analysis of the latest developments in quantum computing"}
]

print("\n\nDeep web search:")
for content in chat_completion_with_web_search(research_messages, search_depth="deep"):
    print(content, end='', flush=True)

Web Search Options

  • :online - Standard search with 10 results ($0.006 per request)
  • :online/linkup-deep - Deep iterative search ($0.06 per request)

Web search dramatically improves factuality: GPT-4o-mini with web access shows a 10x improvement in accuracy, making it twice as accurate as models like o1 without web access.

For more advanced web search capabilities including structured output, domain filtering, and date filtering, see the Web Search API.