Overview

The NanoGPT TTS API allows you to convert text into natural-sounding speech using various models from different providers. The API supports multiple languages, voices, and customization options including speed control, voice instructions, and audio format selection.

Available Models

  • Kokoro-82m: High-quality multilingual model with 44 voices ($0.001/1k chars)
  • Dia-TTS: Multi-speaker dialogue model with script parsing ($0.05/1k chars)
  • Elevenlabs-Turbo-V2.5: Premium quality with 46 voices and style controls ($0.06/1k chars)
  • tts-1: OpenAI’s standard quality model with low latency ($0.015/1k chars)
  • tts-1-hd: OpenAI’s high definition model ($0.030/1k chars)
  • gpt-4o-mini-tts: Ultra-low cost OpenAI model ($0.0006/1k chars)

Authentication

All requests require authentication via API key:

x-api-key: YOUR_API_KEY

Basic Text-to-Speech

Simple TTS Request

import requests
import json

BASE_URL = "https://nano-gpt.com/api"
API_KEY = "YOUR_API_KEY"

def text_to_speech(text, model="Kokoro-82m", voice=None, **kwargs):
    """
    Convert text to speech using NanoGPT TTS API
    """
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json"
    }
    
    payload = {
        "text": text,
        "model": model
    }
    
    if voice:
        payload["voice"] = voice
    
    # Add any additional parameters
    payload.update(kwargs)
    
    response = requests.post(
        f"{BASE_URL}/tts",
        headers=headers,
        json=payload
    )
    
    if response.status_code == 200:
        # Check if response is JSON or binary
        content_type = response.headers.get('content-type', '')
        
        if 'application/json' in content_type:
            # JSON response with audio URL
            data = response.json()
            print(f"Audio URL: {data['audioUrl']}")
            
            # Download the audio file
            audio_response = requests.get(data['audioUrl'])
            with open('output.wav', 'wb') as f:
                f.write(audio_response.content)
                
        else:
            # Binary audio data (OpenAI models)
            with open('output.mp3', 'wb') as f:
                f.write(response.content)
                
        print("Audio saved successfully!")
        return response
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")

# Example usage
try:
    response = text_to_speech(
        "Hello! This is a test of the NanoGPT text-to-speech API.",
        model="Kokoro-82m",
        voice="af_bella",
        speed=1.2
    )
    print("TTS generation completed!")
except Exception as e:
    print(f"Error: {e}")

Model-Specific Examples

Kokoro-82m - Multilingual Voices

Kokoro supports 44 voices across 13 language groups:

# Available voice categories and examples
KOKORO_VOICES = {
    "american_female": ["af_alloy", "af_aoede", "af_bella", "af_jessica", "af_nova"],
    "american_male": ["am_adam", "am_echo", "am_eric", "am_liam", "am_onyx"],
    "british_female": ["bf_alice", "bf_emma", "bf_isabella", "bf_lily"],
    "british_male": ["bm_daniel", "bm_fable", "bm_george", "bm_lewis"],
    "japanese_female": ["jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro"],
    "mandarin_female": ["zf_xiaobei", "zf_xiaoni", "zf_xiaoxiao", "zf_xiaoyi"],
    "french_female": ["ff_siwis"],
    "italian_male": ["im_nicola"],
    "hindi_female": ["hf_alpha", "hf_beta"]
}

def generate_multilingual_samples():
    """Generate speech samples in different languages"""
    samples = [
        {"text": "Hello, welcome to our service!", "voice": "af_bella", "lang": "English"},
        {"text": "Bonjour, bienvenue dans notre service!", "voice": "ff_siwis", "lang": "French"},
        {"text": "こんにちは、私たちのサービスへようこそ!", "voice": "jf_alpha", "lang": "Japanese"},
        {"text": "你好,欢迎使用我们的服务!", "voice": "zf_xiaoxiao", "lang": "Chinese"},
        {"text": "Ciao, benvenuto nel nostro servizio!", "voice": "im_nicola", "lang": "Italian"}
    ]
    
    for i, sample in enumerate(samples):
        try:
            response = text_to_speech(
                text=sample["text"],
                model="Kokoro-82m",
                voice=sample["voice"],
                speed=1.0
            )
            
            # Save with descriptive filename
            filename = f"sample_{i+1}_{sample['lang'].lower()}.wav"
            print(f"Generated {sample['lang']} sample: {filename}")
            
        except Exception as e:
            print(f"Error generating {sample['lang']} sample: {e}")

generate_multilingual_samples()

Dia-TTS - Multi-Speaker Conversations

Dia-TTS excels at creating conversations between two speakers:

def create_dialogue(script_parts):
    """
    Create a dialogue using Dia-TTS with speaker tags
    """
    # Format script with speaker tags
    dialogue_text = ""
    for part in script_parts:
        speaker = part["speaker"]  # "S1" or "S2"
        text = part["text"]
        dialogue_text += f"[{speaker}] {text} "
    
    return text_to_speech(
        text=dialogue_text.strip(),
        model="Dia-TTS",
        speed=1.1
    )

# Example dialogue
conversation = [
    {"speaker": "S1", "text": "Welcome to our podcast!"},
    {"speaker": "S2", "text": "Thanks for having me on the show."},
    {"speaker": "S1", "text": "Let's start by talking about your background."},
    {"speaker": "S2", "text": "I've been working in AI for over a decade."},
    {"speaker": "S1", "text": "That's fascinating! What got you started?"},
    {"speaker": "S2", "text": "It all began with a computer science course in college."}
]

try:
    response = create_dialogue(conversation)
    print("Dialogue generated successfully!")
except Exception as e:
    print(f"Error: {e}")

# You can also use a single speaker
single_speaker_text = "[S1] This is a single speaker narration using the first voice."
text_to_speech(single_speaker_text, model="Dia-TTS", voice="S2")  # Use S2 voice

Elevenlabs-Turbo-V2.5 - Premium Quality with Controls

Elevenlabs offers advanced voice control options:

def generate_with_voice_controls(text, voice="Rachel", **controls):
    """
    Generate speech with advanced voice controls
    """
    return text_to_speech(
        text=text,
        model="Elevenlabs-Turbo-V2.5",
        voice=voice,
        speed=controls.get("speed", 1.0),
        stability=controls.get("stability", 0.5),
        similarity_boost=controls.get("similarity_boost", 0.75),
        style=controls.get("style", 0)
    )

# Different voice styles
examples = [
    {
        "text": "This is a very stable and consistent voice.",
        "controls": {"stability": 0.9, "similarity_boost": 0.8, "style": 0}
    },
    {
        "text": "This is an expressive and dynamic voice!",
        "controls": {"stability": 0.3, "similarity_boost": 0.7, "style": 0.8}
    },
    {
        "text": "This is a balanced, natural sounding voice.",
        "controls": {"stability": 0.5, "similarity_boost": 0.75, "style": 0.3}
    }
]

for i, example in enumerate(examples):
    try:
        response = generate_with_voice_controls(
            text=example["text"],
            voice="Rachel",
            **example["controls"]
        )
        print(f"Generated style example {i+1}")
    except Exception as e:
        print(f"Error generating example {i+1}: {e}")

# Available Elevenlabs voices
ELEVENLABS_VOICES = [
    "Adam", "Alice", "Antoni", "Aria", "Arnold", "Bella", "Bill", "Brian",
    "Callum", "Charlie", "Charlotte", "Chris", "Daniel", "Domi", "Dorothy",
    "Drew", "Elli", "Emily", "Eric", "Ethan", "Fin", "Freya", "George",
    "Gigi", "Giovanni", "Grace", "James", "Jeremy", "Jessica", "Joseph",
    "Josh", "Laura", "Liam", "Lily", "Matilda", "Matthew", "Michael",
    "Nicole", "Rachel", "River", "Roger", "Ryan", "Sam", "Sarah", "Thomas", "Will"
]

OpenAI Models - Multiple Formats and Instructions

OpenAI models support various audio formats and voice instructions:

def generate_openai_tts(text, model="tts-1", **options):
    """
    Generate speech using OpenAI models with format options
    """
    return text_to_speech(
        text=text,
        model=model,
        voice=options.get("voice", "nova"),
        speed=options.get("speed", 1.0),
        response_format=options.get("response_format", "mp3"),
        instructions=options.get("instructions")
    )

# Different format examples
formats_demo = [
    {"format": "mp3", "desc": "Compressed, good for web"},
    {"format": "wav", "desc": "Uncompressed, high quality"},
    {"format": "opus", "desc": "Efficient streaming codec"},
    {"format": "flac", "desc": "Lossless compression"}
]

text = "This audio demonstrates different format options."

for fmt in formats_demo:
    try:
        response = generate_openai_tts(
            text=text,
            model="tts-1-hd",
            voice="nova",
            response_format=fmt["format"]
        )
        print(f"Generated {fmt['format'].upper()} format: {fmt['desc']}")
    except Exception as e:
        print(f"Error with {fmt['format']}: {e}")

# Voice instructions example (tts-1-hd and gpt-4o-mini-tts)
instruction_examples = [
    {
        "text": "Welcome to our customer service line.",
        "instructions": "Speak warmly and professionally, like a friendly customer service representative"
    },
    {
        "text": "Breaking news: Scientists make major discovery!",
        "instructions": "Speak with the excitement and urgency of a news reporter"
    },
    {
        "text": "Once upon a time, in a faraway land...",
        "instructions": "Tell this like a bedtime story, gentle and soothing"
    }
]

for example in instruction_examples:
    try:
        response = generate_openai_tts(
            text=example["text"],
            model="gpt-4o-mini-tts",  # Supports instructions
            voice="alloy",
            instructions=example["instructions"]
        )
        print(f"Generated with instructions: {example['instructions'][:50]}...")
    except Exception as e:
        print(f"Error: {e}")

# Available OpenAI voices
OPENAI_VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "onyx", "nova", "sage", "shimmer", "verse"]

Complete TTS Class Implementation

Here’s a comprehensive TTS class with all model support:

import requests
import json
from pathlib import Path

class NanoGPTTTS:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://nano-gpt.com/api"
        
        # Model capabilities
        self.model_info = {
            "Kokoro-82m": {
                "cost_per_1k": 0.001,
                "max_chars": 10000,
                "supports_speed": True,
                "output_format": "wav",
                "binary_response": False
            },
            "Dia-TTS": {
                "cost_per_1k": 0.05,
                "max_chars": 10000,
                "supports_speed": True,
                "output_format": "wav",
                "binary_response": False
            },
            "Elevenlabs-Turbo-V2.5": {
                "cost_per_1k": 0.06,
                "max_chars": 10000,
                "supports_speed": True,
                "output_format": "mp3",
                "binary_response": False,
                "supports_voice_controls": True
            },
            "tts-1": {
                "cost_per_1k": 0.015,
                "max_chars": 4096,
                "supports_speed": True,
                "binary_response": True,
                "supports_formats": True
            },
            "tts-1-hd": {
                "cost_per_1k": 0.030,
                "max_chars": 4096,
                "supports_speed": True,
                "binary_response": True,
                "supports_formats": True,
                "supports_instructions": True
            },
            "gpt-4o-mini-tts": {
                "cost_per_1k": 0.0006,
                "max_chars": 4096,
                "supports_speed": False,  # Speed ignored
                "binary_response": True,
                "supports_formats": True,
                "supports_instructions": True
            }
        }
    
    def synthesize(self, text, model="Kokoro-82m", output_file=None, **kwargs):
        """
        Main synthesis method with automatic parameter handling
        """
        # Validate inputs
        self._validate_request(text, model, **kwargs)
        
        # Prepare request
        headers = {
            "x-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        
        payload = self._build_payload(text, model, **kwargs)
        
        # Make request
        response = requests.post(
            f"{self.base_url}/tts",
            headers=headers,
            json=payload
        )
        
        if response.status_code == 200:
            return self._handle_response(response, model, output_file)
        else:
            self._handle_error(response)
    
    def _validate_request(self, text, model, **kwargs):
        """Validate request parameters"""
        if not text.strip():
            raise ValueError("Text cannot be empty")
        
        if model not in self.model_info:
            raise ValueError(f"Unsupported model: {model}")
        
        model_config = self.model_info[model]
        
        if len(text) > model_config["max_chars"]:
            raise ValueError(f"Text too long for {model}. Max: {model_config['max_chars']} chars")
        
        # Validate speed parameter
        if kwargs.get("speed") and not model_config.get("supports_speed", True):
            print(f"Warning: Speed parameter ignored for {model}")
    
    def _build_payload(self, text, model, **kwargs):
        """Build request payload based on model capabilities"""
        payload = {
            "text": text,
            "model": model
        }
        
        model_config = self.model_info[model]
        
        # Add voice if specified
        if kwargs.get("voice"):
            payload["voice"] = kwargs["voice"]
        
        # Add speed if supported
        if kwargs.get("speed") and model_config.get("supports_speed"):
            payload["speed"] = kwargs["speed"]
        
        # Add OpenAI-specific parameters
        if model.startswith(("tts-", "gpt-")):
            if kwargs.get("response_format"):
                payload["response_format"] = kwargs["response_format"]
            if kwargs.get("instructions") and model_config.get("supports_instructions"):
                payload["instructions"] = kwargs["instructions"]
        
        # Add Elevenlabs-specific parameters
        elif model == "Elevenlabs-Turbo-V2.5":
            for param in ["stability", "similarity_boost", "style"]:
                if kwargs.get(param) is not None:
                    payload[param] = kwargs[param]
        
        return payload
    
    def _handle_response(self, response, model, output_file):
        """Handle different response types"""
        model_config = self.model_info[model]
        
        if model_config.get("binary_response"):
            # Binary audio data (OpenAI models)
            audio_data = response.content
            
            if output_file:
                with open(output_file, 'wb') as f:
                    f.write(audio_data)
                return {"audio_file": output_file, "size": len(audio_data)}
            else:
                return {"audio_data": audio_data, "size": len(audio_data)}
        
        else:
            # JSON response with URL
            data = response.json()
            
            if output_file:
                # Download and save audio
                audio_response = requests.get(data['audioUrl'])
                with open(output_file, 'wb') as f:
                    f.write(audio_response.content)
                data["local_file"] = output_file
            
            return data
    
    def _handle_error(self, response):
        """Handle API errors"""
        try:
            error_data = response.json()
            error_msg = error_data.get('error', 'Unknown error')
        except:
            error_msg = f"HTTP {response.status_code}"
        
        if response.status_code == 400:
            raise ValueError(f"Bad request: {error_msg}")
        elif response.status_code == 401:
            raise ValueError("Unauthorized: Check your API key")
        elif response.status_code == 402:
            raise ValueError("Insufficient balance")
        elif response.status_code == 413:
            raise ValueError("Text too long")
        else:
            raise Exception(f"API Error: {error_msg}")
    
    def get_model_info(self, model=None):
        """Get information about available models"""
        if model:
            return self.model_info.get(model, {})
        return self.model_info
    
    def batch_synthesize(self, texts, model="Kokoro-82m", **kwargs):
        """Synthesize multiple texts"""
        results = []
        
        for i, text in enumerate(texts):
            try:
                output_file = f"batch_output_{i+1}.wav" if kwargs.get("save_files") else None
                result = self.synthesize(text, model, output_file, **kwargs)
                results.append({"index": i, "success": True, "result": result})
            except Exception as e:
                results.append({"index": i, "success": False, "error": str(e)})
        
        return results

# Usage examples
tts = NanoGPTTTS("YOUR_API_KEY")

# Simple usage
result = tts.synthesize(
    "Hello world!",
    model="Kokoro-82m",
    voice="af_bella",
    output_file="hello.wav"
)

# Advanced Elevenlabs usage
result = tts.synthesize(
    "This is an expressive voice demonstration!",
    model="Elevenlabs-Turbo-V2.5",
    voice="Rachel",
    stability=0.3,
    similarity_boost=0.8,
    style=0.7,
    speed=1.1,
    output_file="expressive.mp3"
)

# OpenAI with instructions
result = tts.synthesize(
    "Welcome to our premium service.",
    model="tts-1-hd",
    voice="nova",
    instructions="Speak like a luxury brand representative",
    response_format="flac",
    output_file="premium.flac"
)

# Batch processing
texts = [
    "First audio clip.",
    "Second audio clip.",
    "Third audio clip."
]

batch_results = tts.batch_synthesize(
    texts,
    model="gpt-4o-mini-tts",
    voice="alloy",
    save_files=True
)

for result in batch_results:
    if result["success"]:
        print(f"Generated file {result['index']}: {result['result'].get('local_file')}")
    else:
        print(f"Failed file {result['index']}: {result['error']}")

Best Practices and Tips

Character Limits and Costs

def optimize_text_for_model(text, model="Kokoro-82m"):
    """
    Optimize text based on model limitations
    """
    tts = NanoGPTTTS("YOUR_API_KEY")
    model_info = tts.get_model_info(model)
    max_chars = model_info.get("max_chars", 10000)
    
    if len(text) <= max_chars:
        return [text]
    
    # Split long text into chunks
    sentences = text.split('. ')
    chunks = []
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk + sentence + '. ') <= max_chars:
            current_chunk += sentence + '. '
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + '. '
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

# Example usage
long_text = "Your very long text here..." * 100  # Simulate long text

chunks = optimize_text_for_model(long_text, "tts-1")
print(f"Split into {len(chunks)} chunks")

for i, chunk in enumerate(chunks):
    result = tts.synthesize(
        chunk,
        model="tts-1",
        voice="nova",
        output_file=f"chunk_{i+1}.mp3"
    )
    print(f"Generated chunk {i+1}")

Voice Selection Guide

def suggest_voice(content_type, model="Kokoro-82m"):
    """
    Suggest appropriate voices based on content type
    """
    suggestions = {
        "Kokoro-82m": {
            "professional": ["af_bella", "am_adam", "bf_alice", "bm_daniel"],
            "friendly": ["af_nova", "af_aoede", "am_eric", "bf_emma"],
            "authoritative": ["am_onyx", "bm_george", "am_liam"],
            "storytelling": ["af_jessica", "bf_lily", "am_echo"],
            "multilingual": {
                "japanese": ["jf_alpha", "jf_gongitsune", "jm_kumo"],
                "chinese": ["zf_xiaoxiao", "zf_xiaoyi", "zm_yunxi"],
                "french": ["ff_siwis"],
                "italian": ["im_nicola"]
            }
        },
        "Elevenlabs-Turbo-V2.5": {
            "professional": ["Rachel", "Sarah", "Matthew", "Daniel"],
            "friendly": ["Bella", "Grace", "Josh", "Ryan"],
            "authoritative": ["Adam", "James", "Michael"],
            "storytelling": ["Alice", "Emily", "Jeremy"]
        },
        "OpenAI": {
            "professional": ["nova", "echo", "onyx"],
            "friendly": ["alloy", "shimmer", "coral"],
            "authoritative": ["fable", "ash"],
            "storytelling": ["ballad", "sage", "verse"]
        }
    }
    
    model_key = "OpenAI" if model.startswith(("tts-", "gpt-")) else model
    return suggestions.get(model_key, {}).get(content_type, [])

# Example usage
print("Professional voices for Kokoro:", suggest_voice("professional", "Kokoro-82m"))
print("Storytelling voices for OpenAI:", suggest_voice("storytelling", "tts-1"))

Error Handling and Troubleshooting

import time
from requests.exceptions import RequestException, Timeout

def robust_tts_request(text, model="Kokoro-82m", max_retries=3, **kwargs):
    """
    TTS request with retry logic and comprehensive error handling
    """
    tts = NanoGPTTTS("YOUR_API_KEY")
    
    for attempt in range(max_retries):
        try:
            return tts.synthesize(text, model, **kwargs)
            
        except ValueError as e:
            # Parameter errors - don't retry
            print(f"Parameter error: {e}")
            break
            
        except RequestException as e:
            if attempt < max_retries - 1:
                wait_time = 2 ** attempt  # Exponential backoff
                print(f"Network error (attempt {attempt + 1}): {e}")
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Max retries exceeded: {e}")
                
        except Exception as e:
            print(f"Unexpected error: {e}")
            break
    
    return None

# Usage with error handling
result = robust_tts_request(
    "This request will retry on network errors.",
    model="Elevenlabs-Turbo-V2.5",
    voice="Rachel",
    output_file="robust_output.mp3"
)

if result:
    print("TTS generation successful!")
else:
    print("TTS generation failed after all retries.")

Pricing Summary

ModelCost per 1k charsMax LengthSpecial Features
Kokoro-82m$0.00110,00044 multilingual voices
Dia-TTS$0.0510,000Multi-speaker dialogue
Elevenlabs-Turbo-V2.5$0.0610,000Voice controls, 46 voices
tts-1$0.0154,096Multiple formats
tts-1-hd$0.0304,096HD quality, voice instructions
gpt-4o-mini-tts$0.00064,096Ultra-low cost