Overview

The NanoGPT API offers multiple ways to generate text, including OpenAI-compatible endpoints and a legacy endpoint. This guide covers all available text generation methods. If you are using a TEE-backed model (e.g., prefixed with TEE/), you can also verify the enclave attestation and signatures for your chat completions. See the TEE Model Verification guide for more details.

OpenAI Compatible Endpoints

Chat Completions (v1/chat/completions)

This endpoint mimics OpenAI’s chat completions API:
import requests
import json

BASE_URL = "https://nano-gpt.com/api/v1"
API_KEY = "YOUR_API_KEY"  # Replace with your API key

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
    "Accept": "text/event-stream"  # Required for SSE streaming
}

def stream_chat_completion(messages, model="chatgpt-4o-latest"):
    """
    Send a streaming chat completion request using the OpenAI-compatible endpoint.
    """
    data = {
        "model": model,
        "messages": messages,
        "stream": True  # Enable streaming
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=data,
        stream=True
    )

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                line = line[6:]
            if line == '[DONE]':
                break
            try:
                chunk = json.loads(line)
                if chunk['choices'][0]['delta'].get('content'):
                    yield chunk['choices'][0]['delta']['content']
            except json.JSONDecodeError:
                continue

# Example usage
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Please explain the concept of artificial intelligence."}
]

try:
    print("Assistant's Response:")
    for content_chunk in stream_chat_completion(messages):
        print(content_chunk, end='', flush=True)
    print("")
except Exception as e:
    print(f"Error: {str(e)}")

Text Completions (v1/completions)

This endpoint mimics OpenAI’s legacy text completions API:
import requests
import json

BASE_URL = "https://nano-gpt.com/api/v1"
API_KEY = "YOUR_API_KEY"  # Replace with your API key

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

def get_completion(prompt, model="chatgpt-4o-latest"):
    """
    Send a text completion request using the OpenAI-compatible endpoint.
    """
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": 1000,  # Optional: maximum number of tokens to generate
        "temperature": 0.7,  # Optional: controls randomness (0-2)
        "top_p": 1,         # Optional: nucleus sampling parameter
        "stream": False     # Set to True for streaming responses
    }

    response = requests.post(
        f"{BASE_URL}/completions",
        headers=headers,
        json=data
    )

    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")

    return response.json()

# Example usage
prompt = "Write a short story about a robot learning to paint:"
try:
    response = get_completion(prompt)
    print("Completion:", response['choices'][0]['text'])
except Exception as e:
    print(f"Error: {str(e)}")

Legacy Text Completions

For the older endpoint, which is not OpenAI-compatible:
import requests
import json

BASE_URL = "https://nano-gpt.com/api"
API_KEY = "YOUR_API_KEY"

headers = {
    "x-api-key": API_KEY,
    "Content-Type": "application/json"
}

def talk_to_gpt(prompt, model="chatgpt-4o-latest", messages=None):
    data = {
        "prompt": prompt,
        "model": model,
        "messages": messages or []  # avoid a shared mutable default argument
    }
    response = requests.post(f"{BASE_URL}/talk-to-gpt", headers=headers, json=data)
    return response.text if response.status_code == 200 else None

# Example usage
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing well, thank you! How can I assist you today?"}
]
prompt = "Please explain the concept of artificial intelligence."
response = talk_to_gpt(prompt, messages=messages)
if response:
    # Split the response to separate the text and NanoGPT info
    parts = response.split('<NanoGPT>')
    text_response = parts[0].strip()
    nano_info = json.loads(parts[1].split('</NanoGPT>')[0])

    print("NanoGPT Response:", text_response)
    print("Cost:", nano_info['cost'])
    print("Input Tokens:", nano_info['inputTokens'])
    print("Output Tokens:", nano_info['outputTokens'])
else:
    print("Failed to get response from GPT")

Prompt Caching (Claude Models)

Claude caching follows Anthropic’s Messages schema: cache_control lives on the message content blocks you want to reuse. NanoGPT simply forwards those markers to Anthropic, so you decide where each cache breakpoint sits. The first invocation is billed at 1.25× the normal input-token price (5-minute TTL) or 2× (1-hour TTL); cached replays discount those tokens by ~90%.
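
To make the math concrete: at a hypothetical base input rate of $3 per million tokens, a 10,000-token static prefix costs $0.03 uncached, about $0.0375 on the first 5-minute-TTL write (1.25 × $0.03), and roughly $0.003 on each cached replay thereafter.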

Explicit cache_control markers

def chat_completion_with_claude_cache(messages, model="claude-3-5-sonnet-20241022"):
    """
    Attach cache_control directly to the static prompt blocks you want Claude to reuse.
    """
    headers_with_cache = {
        **headers,  # reuse Authorization + Content-Type from above
        "anthropic-beta": "prompt-caching-2024-07-31"
    }

    payload = {
        "model": model,
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 4000,
        "stream": False
    }

    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers_with_cache,
        json=payload
    )
    response.raise_for_status()
    return response.json()

messages_with_breakpoint = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "You are a financial watchdog. Answer in JSON with rationale fields.",
                "cache_control": {"type": "ephemeral", "ttl": "5m"}
            }
        ]
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": (
                    "Context: <10 kB of policy + rubric that rarely changes>\n"
                    "A separate uncached message will carry the live question."
                ),
                "cache_control": {"type": "ephemeral", "ttl": "5m"}
            }
        ]
    },
    {"role": "user", "content": "What risks should I watch for in today's filing?"}
]

result = chat_completion_with_claude_cache(messages_with_breakpoint)
print(result["choices"][0]["message"]["content"])
  • cache_control belongs to the individual content blocks (system, user, tool definitions, etc.). Each marker caches the entire prefix up to and including that block, matching Anthropic’s behavior.
  • Supported TTLs are 5m and 1h. Omit ttl to use the default 5m window.
  • Set the anthropic-beta: prompt-caching-2024-07-31 header on any request that contains cache markers; Anthropic rejects cache requests without the beta flag.
  • Check usage.prompt_tokens_details.cached_tokens in NanoGPT’s response to confirm what was billed at the discounted rate (see the sketch below).
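
A quick way to confirm a cache hit, reusing result from the example above (field names follow the OpenAI-style usage object described in the last bullet):
usage = result.get("usage", {})
cached = (usage.get("prompt_tokens_details") or {}).get("cached_tokens", 0)
print(f"Prompt tokens: {usage.get('prompt_tokens', 0)} (cached: {cached})")
if cached == 0:
    print("No cache hit: first write, changed prefix, or expired TTL.")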

Using the prompt_caching helper

If you prefer not to duplicate cache_control entries manually, NanoGPT accepts a helper object that tags the leading prefix for you.
payload = {
    "model": "claude-3-5-sonnet-20241022",
    "messages": [
        {"role": "system", "content": "Summaries must be under 100 words."},
        {"role": "user", "content": "Cache the playbook for an hour."},
        {"role": "user", "content": "Live question goes here"}
    ],
    "prompt_caching": {
        "enabled": True,
        "ttl": "1h",
        "cut_after_message_index": 1  # cache everything through message index 1
    }
}

requests.post(
    f"{BASE_URL}/chat/completions",
    headers={
        **headers,
        "anthropic-beta": "prompt-caching-2024-07-31"
    },
    json=payload
)
cut_after_message_index is zero-based and points at the last message in the static prefix. NanoGPT attaches a cache_control block with your TTL to each message up to and including that index before forwarding the request to Anthropic; no additional heuristics are applied. If you need different cache durations or non-contiguous breakpoints, fall back to explicit cache_control markers in your messages array.

Web Search

Enable real-time web access for any model by appending special suffixes:
def chat_completion_with_web_search(messages, model="chatgpt-4o-latest", search_depth="standard"):
    """
    Send a chat completion request with web search enabled.
    
    Args:
        messages: List of message objects
        model: Base model name
        search_depth: "standard" ($0.006) or "deep" ($0.06)
    """
    # Append the appropriate suffix for web search
    if search_depth == "deep":
        model_with_search = f"{model}:online/linkup-deep"
    else:
        model_with_search = f"{model}:online"
    
    data = {
        "model": model_with_search,
        "messages": messages,
        "stream": True
    }
    
    response = requests.post(
        f"{BASE_URL}/chat/completions",
        headers=headers,
        json=data,
        stream=True
    )
    
    if response.status_code != 200:
        raise Exception(f"Error: {response.status_code}")
    
    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: '):
                line = line[6:]
            if line == '[DONE]':
                break
            try:
                chunk = json.loads(line)
                if chunk['choices'][0]['delta'].get('content'):
                    yield chunk['choices'][0]['delta']['content']
            except json.JSONDecodeError:
                continue

# Example: Get current information
messages = [
    {"role": "user", "content": "What happened in the tech industry this week?"}
]

print("Standard web search:")
for content in chat_completion_with_web_search(messages):
    print(content, end='', flush=True)

# Example: Deep research
research_messages = [
    {"role": "user", "content": "Provide a comprehensive analysis of the latest developments in quantum computing"}
]

print("\n\nDeep web search:")
for content in chat_completion_with_web_search(research_messages, search_depth="deep"):
    print(content, end='', flush=True)

Web Search Options

  • :online - Standard search with 10 results ($0.006 per request)
  • :online/linkup-deep - Deep iterative search ($0.06 per request)
Web search dramatically improves factuality: GPT-4o-mini with web access shows a 10x improvement in accuracy, making it twice as accurate as models like o1 without web access.

For more advanced web search capabilities, including structured output, domain filtering, and date filtering, see the Web Search API.