Speech-to-Text (STT)

Overview

The NanoGPT STT API allows you to transcribe audio files into text using state-of-the-art speech recognition models. The API supports multiple languages, speaker diarization, and various audio formats with both synchronous and asynchronous processing options.

Available Models

Whisper-Large-V3: OpenAI’s flagship model with high accuracy ($0.01/min)
Wizper: Fast and efficient model optimized for speed ($0.01/min)
Elevenlabs-STT: Premium model with speaker diarization and word-level timestamps ($0.03/min)

Authentication

All requests require authentication via API key:

x-api-key: YOUR_API_KEY

File Upload Methods

Method 1: Direct File Upload (≤3MB)

For smaller audio files, upload directly using multipart/form-data:

import requests

BASE_URL = "https://nano-gpt.com/api"
API_KEY = "YOUR_API_KEY"

def transcribe_file(file_path, model="Whisper-Large-V3", language="auto"):
    """
    Transcribe an audio file using direct upload
    """
    headers = {"x-api-key": API_KEY}
    
    with open(file_path, 'rb') as audio_file:
        files = {
            'audio': ('audio.mp3', audio_file, 'audio/mpeg')
        }
        data = {
            'model': model,
            'language': language
        }
        
        response = requests.post(
            f"{BASE_URL}/transcribe",
            headers=headers,
            files=files,
            data=data
        )
        
        if response.status_code == 200:
            return response.json()
        else:
            raise Exception(f"Error: {response.status_code} - {response.text}")

# Example usage
try:
    result = transcribe_file("meeting.mp3", model="Whisper-Large-V3", language="en")
    print("Transcription:", result['transcription'])
    print("Cost:", result['metadata']['cost'])
    print("Duration:", result['metadata']['chargedDuration'], "minutes")
except Exception as e:
    print(f"Error: {e}")

Method 2: URL Upload (Recommended for >3MB)

For larger files, use URL-based upload:

def transcribe_url(audio_url, model="Whisper-Large-V3", language="auto"):
    """
    Transcribe an audio file from URL
    """
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json"
    }
    
    data = {
        "audioUrl": audio_url,
        "model": model,
        "language": language
    }
    
    response = requests.post(
        f"{BASE_URL}/transcribe",
        headers=headers,
        json=data
    )
    
    if response.status_code == 200:
        return response.json()
    else:
        raise Exception(f"Error: {response.status_code} - {response.text}")

# Example usage
audio_url = "https://example.com/large-audio-file.mp3"
result = transcribe_url(audio_url, model="Wizper")
print("Transcription:", result['transcription'])

Advanced Features with Elevenlabs-STT

Speaker Diarization

Identify and label different speakers in conversations:

import time

def transcribe_with_diarization(audio_url):
    """
    Transcribe with speaker identification (async)
    """
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json"
    }
    
    # Submit transcription job
    data = {
        "audioUrl": audio_url,
        "model": "Elevenlabs-STT",
        "diarize": True,
        "tagAudioEvents": True,
        "language": "auto"
    }
    
    response = requests.post(
        f"{BASE_URL}/transcribe",
        headers=headers,
        json=data
    )
    
    if response.status_code == 202:
        job_data = response.json()
        return poll_for_results(job_data)
    else:
        raise Exception(f"Error: {response.status_code}")

def poll_for_results(job_data):
    """
    Poll for transcription results
    """
    headers = {
        "x-api-key": API_KEY,
        "Content-Type": "application/json"
    }
    
    status_data = {
        "runId": job_data['runId'],
        "cost": job_data.get('cost'),
        "paymentSource": job_data.get('paymentSource'),
        "isApiRequest": True,
        "fileName": job_data.get('fileName'),
        "fileSize": job_data.get('fileSize'),
        "chargedDuration": job_data.get('chargedDuration'),
        "diarize": job_data.get('diarize', False)
    }
    
    max_attempts = 60
    for attempt in range(max_attempts):
        print(f"Checking status... (attempt {attempt + 1}/{max_attempts})")
        
        response = requests.post(
            f"{BASE_URL}/transcribe/status",
            headers=headers,
            json=status_data
        )
        
        if response.status_code == 200:
            result = response.json()
            status = result.get('status')
            
            if status == 'completed':
                return result
            elif status == 'failed':
                raise Exception(f"Transcription failed: {result.get('error')}")
        
        time.sleep(5)  # Wait 5 seconds before next check
    
    raise Exception("Transcription timed out")

# Example usage
conversation_url = "https://example.com/meeting-recording.mp3"
try:
    result = transcribe_with_diarization(conversation_url)
    
    print("Full Transcription:", result['transcription'])
    print("\nSpeaker Breakdown:")
    
    if 'diarization' in result:
        for segment in result['diarization']['segments']:
            print(f"{segment['speaker']} ({segment['start']}-{segment['end']}s): {segment['text']}")
    
    # Word-level timestamps
    if 'words' in result:
        print("\nWord-level timestamps:")
        for word in result['words'][:10]:  # Show first 10 words
            if word['type'] == 'word':
                print(f"'{word['text']}' at {word['start']}-{word['end']}s")
                
except Exception as e:
    print(f"Error: {e}")

Language Support

The API supports 97+ languages with auto-detection:

# Common language codes
SUPPORTED_LANGUAGES = {
    "auto": "Auto-detect",
    "en": "English",
    "es": "Spanish", 
    "fr": "French",
    "de": "German",
    "it": "Italian",
    "pt": "Portuguese",
    "zh": "Chinese",
    "ja": "Japanese",
    "ko": "Korean",
    "ar": "Arabic",
    "hi": "Hindi",
    "ru": "Russian"
}

def transcribe_multilingual(audio_files):
    """
    Transcribe multiple files with different languages
    """
    results = []
    
    for file_info in audio_files:
        try:
            result = transcribe_url(
                file_info['url'], 
                language=file_info.get('language', 'auto')
            )
            
            results.append({
                'file': file_info['name'],
                'language': result['metadata']['language'],
                'transcription': result['transcription'],
                'cost': result['metadata']['cost']
            })
            
        except Exception as e:
            results.append({
                'file': file_info['name'],
                'error': str(e)
            })
    
    return results

# Example usage
audio_files = [
    {"name": "english.mp3", "url": "https://example.com/english.mp3", "language": "en"},
    {"name": "spanish.mp3", "url": "https://example.com/spanish.mp3", "language": "es"},
    {"name": "unknown.mp3", "url": "https://example.com/unknown.mp3", "language": "auto"}
]

results = transcribe_multilingual(audio_files)
for result in results:
    if 'error' not in result:
        print(f"{result['file']} ({result['language']}): {result['transcription'][:100]}...")
    else:
        print(f"{result['file']}: Error - {result['error']}")

Complete Class Implementation

Here’s a complete transcriber class with error handling and retry logic:

import requests
import time
import json
from pathlib import Path

class NanoGPTTranscriber:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://nano-gpt.com/api"
        
    def transcribe(self, audio_path=None, audio_url=None, **kwargs):
        """
        Transcribe audio with automatic method selection
        """
        if audio_path and audio_url:
            raise ValueError("Specify either audio_path or audio_url, not both")
        
        if audio_path:
            return self._transcribe_file(audio_path, **kwargs)
        elif audio_url:
            return self._transcribe_url(audio_url, **kwargs)
        else:
            raise ValueError("Either audio_path or audio_url must be provided")
    
    def _transcribe_file(self, audio_path, **kwargs):
        """Direct file upload transcription"""
        headers = {"x-api-key": self.api_key}
        
        path = Path(audio_path)
        if path.stat().st_size > 3 * 1024 * 1024:  # 3MB
            raise ValueError("File too large for direct upload. Use audio_url method.")
        
        with open(audio_path, 'rb') as f:
            files = {'audio': (path.name, f.read(), 'audio/mpeg')}
        
        data = self._prepare_request_data(**kwargs)
        
        response = requests.post(
            f"{self.base_url}/transcribe",
            headers=headers,
            files=files,
            data=data
        )
        
        return self._handle_response(response)
    
    def _transcribe_url(self, audio_url, **kwargs):
        """URL-based transcription"""
        headers = {
            "x-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        
        data = {"audioUrl": audio_url}
        data.update(self._prepare_request_data(**kwargs))
        
        response = requests.post(
            f"{self.base_url}/transcribe",
            headers=headers,
            json=data
        )
        
        return self._handle_response(response)
    
    def _prepare_request_data(self, **kwargs):
        """Prepare request data with defaults"""
        data = {
            "model": kwargs.get("model", "Whisper-Large-V3"),
            "language": kwargs.get("language", "auto")
        }
        
        # Add optional parameters
        if kwargs.get("diarize"):
            data["diarize"] = "true" if isinstance(kwargs["diarize"], bool) else kwargs["diarize"]
        if kwargs.get("tagAudioEvents"):
            data["tagAudioEvents"] = "true" if isinstance(kwargs["tagAudioEvents"], bool) else kwargs["tagAudioEvents"]
        if kwargs.get("actualDuration"):
            data["actualDuration"] = str(kwargs["actualDuration"])
            
        return data
    
    def _handle_response(self, response):
        """Handle API response"""
        if response.status_code == 200:
            return response.json()
        elif response.status_code == 202:
            return self._poll_async_job(response.json())
        else:
            try:
                error_data = response.json()
                raise Exception(f"API Error: {error_data.get('error', 'Unknown error')}")
            except json.JSONDecodeError:
                raise Exception(f"HTTP Error: {response.status_code}")
    
    def _poll_async_job(self, job_data):
        """Poll for async job completion"""
        headers = {
            "x-api-key": self.api_key,
            "Content-Type": "application/json"
        }
        
        status_data = {
            "runId": job_data['runId'],
            "cost": job_data.get('cost'),
            "paymentSource": job_data.get('paymentSource'),
            "isApiRequest": True,
            "fileName": job_data.get('fileName'),
            "fileSize": job_data.get('fileSize'),
            "chargedDuration": job_data.get('chargedDuration'),
            "diarize": job_data.get('diarize', False)
        }
        
        max_attempts = 60
        for attempt in range(max_attempts):
            time.sleep(5)
            
            response = requests.post(
                f"{self.base_url}/transcribe/status",
                headers=headers,
                json=status_data
            )
            
            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 'completed':
                    return result
                elif result.get('status') == 'failed':
                    raise Exception(f"Transcription failed: {result.get('error')}")
        
        raise Exception("Transcription timed out")
    
    def format_diarization(self, result):
        """Format transcription with speaker labels"""
        if 'diarization' in result and 'segments' in result['diarization']:
            segments = result['diarization']['segments']
            return '\n\n'.join([
                f"{seg['speaker']}: {seg['text']}"
                for seg in segments
            ])
        return result.get('transcription', '')

# Usage examples
transcriber = NanoGPTTranscriber("YOUR_API_KEY")

# Simple transcription
result = transcriber.transcribe(
    audio_path="meeting.mp3",
    model="Whisper-Large-V3",
    language="en"
)
print("Transcription:", result['transcription'])

# Advanced with speaker diarization
result = transcriber.transcribe(
    audio_url="https://example.com/conversation.mp3",
    model="Elevenlabs-STT",
    diarize=True,
    tagAudioEvents=True
)

print("Formatted conversation:")
print(transcriber.format_diarization(result))

Error Handling and Best Practices

Common Error Responses

def handle_transcription_errors(func):
    """Decorator for handling common transcription errors"""
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 400:
                print("Bad Request: Check file format or parameters")
            elif e.response.status_code == 401:
                print("Unauthorized: Check your API key")
            elif e.response.status_code == 402:
                print("Insufficient balance: Top up your account")
            elif e.response.status_code == 413:
                print("File too large: Use URL upload for files >3MB")
            else:
                print(f"HTTP Error {e.response.status_code}")
        except Exception as e:
            print(f"Error: {str(e)}")
            
    return wrapper

@handle_transcription_errors
def safe_transcribe(transcriber, **kwargs):
    return transcriber.transcribe(**kwargs)

File Format Support

SUPPORTED_FORMATS = {
    '.mp3': 'audio/mpeg',
    '.wav': 'audio/wav', 
    '.m4a': 'audio/mp4',
    '.ogg': 'audio/ogg',
    '.aac': 'audio/aac'
}

def validate_audio_file(file_path):
    """Validate audio file format and size"""
    path = Path(file_path)
    
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")
    
    if path.suffix.lower() not in SUPPORTED_FORMATS:
        raise ValueError(f"Unsupported format: {path.suffix}")
    
    size_mb = path.stat().st_size / (1024 * 1024)
    if size_mb > 3:
        print(f"Warning: File is {size_mb:.1f}MB. Consider using URL upload.")
    
    return True

Pricing and Billing

Whisper-Large-V3: $0.01 per minute
Wizper: $0.01 per minute
Elevenlabs-STT: $0.03 per minute

Costs are calculated based on audio duration.

Get Started

Endpoint Examples

API Reference

Miscellaneous

Integrations

Overview

Available Models

Authentication

File Upload Methods

Method 1: Direct File Upload (≤3MB)

Method 2: URL Upload (Recommended for >3MB)

Advanced Features with Elevenlabs-STT

Speaker Diarization

Language Support

Complete Class Implementation

Error Handling and Best Practices

Common Error Responses

File Format Support

Pricing and Billing

Get Started

Endpoint Examples

API Reference

Miscellaneous

Integrations

​Overview

​Available Models

​Authentication

​File Upload Methods

​Method 1: Direct File Upload (≤3MB)

​Method 2: URL Upload (Recommended for >3MB)

​Advanced Features with Elevenlabs-STT

​Speaker Diarization

​Language Support

​Complete Class Implementation

​Error Handling and Best Practices

​Common Error Responses

​File Format Support

​Pricing and Billing

Overview

Available Models

Authentication

File Upload Methods

Method 1: Direct File Upload (≤3MB)

Method 2: URL Upload (Recommended for >3MB)

Advanced Features with Elevenlabs-STT

Speaker Diarization

Language Support

Complete Class Implementation

Error Handling and Best Practices

Common Error Responses

File Format Support

Pricing and Billing