Get Started
Endpoint Examples
- GETModels
- GETModels (Old)
- POSTChat Completion
- POSTCompletions
- POSTImage Generation
- POSTImage Generation (OpenAI-Compatible)
- POSTSpeech-to-Text Transcription
- POSTSpeech-to-Text Status
- POSTText-to-Speech
- GETTEE Attestation
- GETTEE Signature
- POSTRetrieve Midjourney Generation Status
- POSTVideo Generation
- GETVideo Status
- POSTTalk to GPT (Legacy)
- POSTCheck Balance
- POSTReceive Nano
API Reference
Miscellaneous
API Reference
Speech-to-Text (STT)
Complete guide to speech-to-text transcription APIs
Overview
The NanoGPT STT API allows you to transcribe audio files into text using state-of-the-art speech recognition models. The API supports multiple languages, speaker diarization, and various audio formats with both synchronous and asynchronous processing options.
Available Models
- Whisper-Large-V3: OpenAI’s flagship model with high accuracy ($0.01/min)
- Wizper: Fast and efficient model optimized for speed ($0.01/min)
- Elevenlabs-STT: Premium model with speaker diarization and word-level timestamps ($0.03/min)
Authentication
All requests require authentication via API key:
Copy
x-api-key: YOUR_API_KEY
File Upload Methods
Method 1: Direct File Upload (≤3MB)
For smaller audio files, upload directly using multipart/form-data:
Copy
import requests
BASE_URL = "https://nano-gpt.com/api"
API_KEY = "YOUR_API_KEY"
def transcribe_file(file_path, model="Whisper-Large-V3", language="auto"):
"""
Transcribe an audio file using direct upload
"""
headers = {"x-api-key": API_KEY}
with open(file_path, 'rb') as audio_file:
files = {
'audio': ('audio.mp3', audio_file, 'audio/mpeg')
}
data = {
'model': model,
'language': language
}
response = requests.post(
f"{BASE_URL}/transcribe",
headers=headers,
files=files,
data=data
)
if response.status_code == 200:
return response.json()
else:
raise Exception(f"Error: {response.status_code} - {response.text}")
# Example usage
try:
result = transcribe_file("meeting.mp3", model="Whisper-Large-V3", language="en")
print("Transcription:", result['transcription'])
print("Cost:", result['metadata']['cost'])
print("Duration:", result['metadata']['chargedDuration'], "minutes")
except Exception as e:
print(f"Error: {e}")
Method 2: URL Upload (Recommended for >3MB)
For larger files, use URL-based upload:
Copy
def transcribe_url(audio_url, model="Whisper-Large-V3", language="auto"):
"""
Transcribe an audio file from URL
"""
headers = {
"x-api-key": API_KEY,
"Content-Type": "application/json"
}
data = {
"audioUrl": audio_url,
"model": model,
"language": language
}
response = requests.post(
f"{BASE_URL}/transcribe",
headers=headers,
json=data
)
if response.status_code == 200:
return response.json()
else:
raise Exception(f"Error: {response.status_code} - {response.text}")
# Example usage
audio_url = "https://example.com/large-audio-file.mp3"
result = transcribe_url(audio_url, model="Wizper")
print("Transcription:", result['transcription'])
Advanced Features with Elevenlabs-STT
Speaker Diarization
Identify and label different speakers in conversations:
Copy
import time
def transcribe_with_diarization(audio_url):
"""
Transcribe with speaker identification (async)
"""
headers = {
"x-api-key": API_KEY,
"Content-Type": "application/json"
}
# Submit transcription job
data = {
"audioUrl": audio_url,
"model": "Elevenlabs-STT",
"diarize": True,
"tagAudioEvents": True,
"language": "auto"
}
response = requests.post(
f"{BASE_URL}/transcribe",
headers=headers,
json=data
)
if response.status_code == 202:
job_data = response.json()
return poll_for_results(job_data)
else:
raise Exception(f"Error: {response.status_code}")
def poll_for_results(job_data):
"""
Poll for transcription results
"""
headers = {
"x-api-key": API_KEY,
"Content-Type": "application/json"
}
status_data = {
"runId": job_data['runId'],
"cost": job_data.get('cost'),
"paymentSource": job_data.get('paymentSource'),
"isApiRequest": True,
"fileName": job_data.get('fileName'),
"fileSize": job_data.get('fileSize'),
"chargedDuration": job_data.get('chargedDuration'),
"diarize": job_data.get('diarize', False)
}
max_attempts = 60
for attempt in range(max_attempts):
print(f"Checking status... (attempt {attempt + 1}/{max_attempts})")
response = requests.post(
f"{BASE_URL}/transcribe/status",
headers=headers,
json=status_data
)
if response.status_code == 200:
result = response.json()
status = result.get('status')
if status == 'completed':
return result
elif status == 'failed':
raise Exception(f"Transcription failed: {result.get('error')}")
time.sleep(5) # Wait 5 seconds before next check
raise Exception("Transcription timed out")
# Example usage
conversation_url = "https://example.com/meeting-recording.mp3"
try:
result = transcribe_with_diarization(conversation_url)
print("Full Transcription:", result['transcription'])
print("\nSpeaker Breakdown:")
if 'diarization' in result:
for segment in result['diarization']['segments']:
print(f"{segment['speaker']} ({segment['start']}-{segment['end']}s): {segment['text']}")
# Word-level timestamps
if 'words' in result:
print("\nWord-level timestamps:")
for word in result['words'][:10]: # Show first 10 words
if word['type'] == 'word':
print(f"'{word['text']}' at {word['start']}-{word['end']}s")
except Exception as e:
print(f"Error: {e}")
Language Support
The API supports 97+ languages with auto-detection:
Copy
# Common language codes
SUPPORTED_LANGUAGES = {
"auto": "Auto-detect",
"en": "English",
"es": "Spanish",
"fr": "French",
"de": "German",
"it": "Italian",
"pt": "Portuguese",
"zh": "Chinese",
"ja": "Japanese",
"ko": "Korean",
"ar": "Arabic",
"hi": "Hindi",
"ru": "Russian"
}
def transcribe_multilingual(audio_files):
"""
Transcribe multiple files with different languages
"""
results = []
for file_info in audio_files:
try:
result = transcribe_url(
file_info['url'],
language=file_info.get('language', 'auto')
)
results.append({
'file': file_info['name'],
'language': result['metadata']['language'],
'transcription': result['transcription'],
'cost': result['metadata']['cost']
})
except Exception as e:
results.append({
'file': file_info['name'],
'error': str(e)
})
return results
# Example usage
audio_files = [
{"name": "english.mp3", "url": "https://example.com/english.mp3", "language": "en"},
{"name": "spanish.mp3", "url": "https://example.com/spanish.mp3", "language": "es"},
{"name": "unknown.mp3", "url": "https://example.com/unknown.mp3", "language": "auto"}
]
results = transcribe_multilingual(audio_files)
for result in results:
if 'error' not in result:
print(f"{result['file']} ({result['language']}): {result['transcription'][:100]}...")
else:
print(f"{result['file']}: Error - {result['error']}")
Complete Class Implementation
Here’s a complete transcriber class with error handling and retry logic:
Copy
import requests
import time
import json
from pathlib import Path
class NanoGPTTranscriber:
def __init__(self, api_key):
self.api_key = api_key
self.base_url = "https://nano-gpt.com/api"
def transcribe(self, audio_path=None, audio_url=None, **kwargs):
"""
Transcribe audio with automatic method selection
"""
if audio_path and audio_url:
raise ValueError("Specify either audio_path or audio_url, not both")
if audio_path:
return self._transcribe_file(audio_path, **kwargs)
elif audio_url:
return self._transcribe_url(audio_url, **kwargs)
else:
raise ValueError("Either audio_path or audio_url must be provided")
def _transcribe_file(self, audio_path, **kwargs):
"""Direct file upload transcription"""
headers = {"x-api-key": self.api_key}
path = Path(audio_path)
if path.stat().st_size > 3 * 1024 * 1024: # 3MB
raise ValueError("File too large for direct upload. Use audio_url method.")
with open(audio_path, 'rb') as f:
files = {'audio': (path.name, f.read(), 'audio/mpeg')}
data = self._prepare_request_data(**kwargs)
response = requests.post(
f"{self.base_url}/transcribe",
headers=headers,
files=files,
data=data
)
return self._handle_response(response)
def _transcribe_url(self, audio_url, **kwargs):
"""URL-based transcription"""
headers = {
"x-api-key": self.api_key,
"Content-Type": "application/json"
}
data = {"audioUrl": audio_url}
data.update(self._prepare_request_data(**kwargs))
response = requests.post(
f"{self.base_url}/transcribe",
headers=headers,
json=data
)
return self._handle_response(response)
def _prepare_request_data(self, **kwargs):
"""Prepare request data with defaults"""
data = {
"model": kwargs.get("model", "Whisper-Large-V3"),
"language": kwargs.get("language", "auto")
}
# Add optional parameters
if kwargs.get("diarize"):
data["diarize"] = "true" if isinstance(kwargs["diarize"], bool) else kwargs["diarize"]
if kwargs.get("tagAudioEvents"):
data["tagAudioEvents"] = "true" if isinstance(kwargs["tagAudioEvents"], bool) else kwargs["tagAudioEvents"]
if kwargs.get("actualDuration"):
data["actualDuration"] = str(kwargs["actualDuration"])
return data
def _handle_response(self, response):
"""Handle API response"""
if response.status_code == 200:
return response.json()
elif response.status_code == 202:
return self._poll_async_job(response.json())
else:
try:
error_data = response.json()
raise Exception(f"API Error: {error_data.get('error', 'Unknown error')}")
except json.JSONDecodeError:
raise Exception(f"HTTP Error: {response.status_code}")
def _poll_async_job(self, job_data):
"""Poll for async job completion"""
headers = {
"x-api-key": self.api_key,
"Content-Type": "application/json"
}
status_data = {
"runId": job_data['runId'],
"cost": job_data.get('cost'),
"paymentSource": job_data.get('paymentSource'),
"isApiRequest": True,
"fileName": job_data.get('fileName'),
"fileSize": job_data.get('fileSize'),
"chargedDuration": job_data.get('chargedDuration'),
"diarize": job_data.get('diarize', False)
}
max_attempts = 60
for attempt in range(max_attempts):
time.sleep(5)
response = requests.post(
f"{self.base_url}/transcribe/status",
headers=headers,
json=status_data
)
if response.status_code == 200:
result = response.json()
if result.get('status') == 'completed':
return result
elif result.get('status') == 'failed':
raise Exception(f"Transcription failed: {result.get('error')}")
raise Exception("Transcription timed out")
def format_diarization(self, result):
"""Format transcription with speaker labels"""
if 'diarization' in result and 'segments' in result['diarization']:
segments = result['diarization']['segments']
return '\n\n'.join([
f"{seg['speaker']}: {seg['text']}"
for seg in segments
])
return result.get('transcription', '')
# Usage examples
transcriber = NanoGPTTranscriber("YOUR_API_KEY")
# Simple transcription
result = transcriber.transcribe(
audio_path="meeting.mp3",
model="Whisper-Large-V3",
language="en"
)
print("Transcription:", result['transcription'])
# Advanced with speaker diarization
result = transcriber.transcribe(
audio_url="https://example.com/conversation.mp3",
model="Elevenlabs-STT",
diarize=True,
tagAudioEvents=True
)
print("Formatted conversation:")
print(transcriber.format_diarization(result))
Error Handling and Best Practices
Common Error Responses
Copy
def handle_transcription_errors(func):
"""Decorator for handling common transcription errors"""
def wrapper(*args, **kwargs):
try:
return func(*args, **kwargs)
except requests.exceptions.HTTPError as e:
if e.response.status_code == 400:
print("Bad Request: Check file format or parameters")
elif e.response.status_code == 401:
print("Unauthorized: Check your API key")
elif e.response.status_code == 402:
print("Insufficient balance: Top up your account")
elif e.response.status_code == 413:
print("File too large: Use URL upload for files >3MB")
else:
print(f"HTTP Error {e.response.status_code}")
except Exception as e:
print(f"Error: {str(e)}")
return wrapper
@handle_transcription_errors
def safe_transcribe(transcriber, **kwargs):
return transcriber.transcribe(**kwargs)
File Format Support
Copy
SUPPORTED_FORMATS = {
'.mp3': 'audio/mpeg',
'.wav': 'audio/wav',
'.m4a': 'audio/mp4',
'.ogg': 'audio/ogg',
'.aac': 'audio/aac'
}
def validate_audio_file(file_path):
"""Validate audio file format and size"""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"File not found: {file_path}")
if path.suffix.lower() not in SUPPORTED_FORMATS:
raise ValueError(f"Unsupported format: {path.suffix}")
size_mb = path.stat().st_size / (1024 * 1024)
if size_mb > 3:
print(f"Warning: File is {size_mb:.1f}MB. Consider using URL upload.")
return True
Pricing and Billing
- Whisper-Large-V3: $0.01 per minute
- Wizper: $0.01 per minute
- Elevenlabs-STT: $0.03 per minute
Costs are calculated based on audio duration.
On this page
- Overview
- Available Models
- Authentication
- File Upload Methods
- Method 1: Direct File Upload (≤3MB)
- Method 2: URL Upload (Recommended for >3MB)
- Advanced Features with Elevenlabs-STT
- Speaker Diarization
- Language Support
- Complete Class Implementation
- Error Handling and Best Practices
- Common Error Responses
- File Format Support
- Pricing and Billing
Assistant
Responses are generated using AI and may contain mistakes.