{"service":"Omnia Speech-to-Text API","version":"2.1.0","endpoints":{"/stream":{"method":"WebSocket","description":"Real-time streaming transcription","authentication":{"primary":"Sec-WebSocket-Protocol: token, YOUR_API_KEY (Deepgram-style, recommended)","fallback":"Send {\"type\": \"auth\", \"apiKey\": \"YOUR_KEY\"} message after connect"},"audio_requirements":{"format":"PCM 16-bit signed little-endian (LINEAR16)","sample_rate":"16000 Hz","channels":1,"chunk_size":"640 bytes (20ms of 16kHz 16-bit mono audio)","chunk_interval":"20ms recommended for optimal real-time performance"},"languages":{"description":"Optional. Restrict or auto-detect spoken languages.","via_auth_message":"{\"type\": \"auth\", \"apiKey\": \"...\", \"languages\": [\"fi-FI\", \"en-US\"]}","via_query_param":"wss://stt.omnia-voice.com/stream?languages=fi-FI,en-US (for Sec-WebSocket-Protocol auth)","auto_detect":"Pass [\"auto\"] to detect the dominant language automatically","default":["fi-FI","sv-SE"]},"messages":{"client_to_server":{"audio":"Binary frames containing audio data","control":"{\"type\": \"end\"} to signal end of audio"},"server_to_client":{"ready":"{\"type\": \"ready\", \"sessionId\": \"...\", \"message\": \"...\"}","transcript":"{\"type\": \"transcript\", \"transcript\": \"text\", \"isFinal\": true/false, \"confidence\": 0.95, \"language\": \"fi-FI\"}","error":"{\"type\": \"error\", \"error\": \"message\"}"}},"example":{"javascript":"\n// Recommended: Sec-WebSocket-Protocol authentication (Deepgram-style)\nconst ws = new WebSocket('wss://stt.omnia-voice.com/stream', ['token', 'ck_YOUR_API_KEY']);\n\nws.onopen = () => {\n  // Ready immediately - start sending audio\n  console.log('Connected and authenticated');\n};\n\nws.onmessage = (event) => {\n  const msg = JSON.parse(event.data);\n  if (msg.type === 'ready') {\n    console.log('Session:', msg.sessionId);\n  } else if (msg.type === 'transcript') {\n    console.log(msg.isFinal ? 
'[FINAL]' : '[INTERIM]', msg.transcript);\n  }\n};\n\n// Send 20ms chunks (640 bytes for 16kHz 16-bit mono)\nfunction sendAudioChunk(pcmData) {\n  ws.send(pcmData); // Binary ArrayBuffer or Blob\n}\n"}},"/transcribe":{"method":"POST","description":"Synchronous file transcription (up to 60 seconds)","authentication":"X-API-Key: YOUR_API_KEY header","request":{"content_type":"audio/* (audio/mpeg, audio/wav, audio/flac, audio/ogg, audio/webm, audio/x-raw)","body":"Raw audio file bytes","max_size":"10 MB","max_duration":"60 seconds","languages_header":"X-Languages: fi-FI,en-US (optional; or \"auto\" to detect; also accepted as ?languages=)"},"supported_formats":{"audio/mpeg":"MP3 files","audio/wav":"WAV files (PCM)","audio/flac":"FLAC files","audio/ogg":"OGG Opus files","audio/webm":"WebM Opus files","audio/x-raw":"Raw PCM (16-bit, 16kHz, mono)"},"response":{"transcript":"Full transcription text","segments":"[{transcript, confidence, language, isFinal}]","audioSize":"Size in bytes","processingTime":"Processing time in ms"},"example":{"curl":"\ncurl -X POST https://stt.omnia-voice.com/transcribe \\\n  -H \"X-API-Key: YOUR_KEY\" \\\n  -H \"Content-Type: audio/mpeg\" \\\n  --data-binary @audio.mp3\n","javascript":"\nconst response = await fetch('https://stt.omnia-voice.com/transcribe', {\n  method: 'POST',\n  headers: {\n    'X-API-Key': 'YOUR_KEY',\n    'Content-Type': 'audio/mpeg'\n  },\n  body: audioFile // File or Blob\n});\nconst result = await response.json();\nconsole.log(result.transcript);\n"}},"/batch":{"method":"POST","description":"Asynchronous transcription for long audio (up to 8 hours / 500 MB)","authentication":"X-API-Key: YOUR_API_KEY header","request":{"content_type":"Audio MIME type (MP3, WAV, FLAC, OGG/WebM Opus, MP4/M4A AAC, raw PCM)","body":"Raw audio file bytes","max_size":"500 MB","max_duration":"8 hours","languages_header":"X-Languages: fi-FI,en-US (optional; or \"auto\"; also accepted as ?languages=)","metadata_header":"X-Metadata: {...} (optional, passed to 
usage webhooks)","diarization_header":"X-Diarization: true or \"minSpeakers,maxSpeakers\" (optional; requires X-Languages: auto or explicit codes from the supported set)"},"diarization":{"description":"Identifies who said what; adds speakerTurns to the completed result","languages":"Use X-Languages: auto (any language, auto-detected) or explicit codes from the supported set","supported_explicit_languages":["cmn-Hans-CN","de-DE","en-GB","en-IN","en-US","es-ES","es-US","fr-CA","fr-FR","hi-IN","it-IT","ja-JP","ko-KR","pt-BR"]},"response":"202 with {operationId, status: \"processing\", pollUrl}","large_files":{"description":"Direct POST bodies are capped at ~32MB by the platform. For larger files use the signed-URL flow","flow":["POST /batch/init (X-Content-Type + same option headers) -> {operationId, uploadUrl, startUrl}","PUT the audio to uploadUrl (Content-Type must match; no API key needed; works from browsers)","POST /batch/{operationId}/start -> 202, then poll as usual"]},"polling":{"endpoint":"GET /batch/{operationId} (same API key)","awaiting_upload":"{operationId, status: \"awaiting_upload\", startUrl} (signed-URL flow, before /start)","processing":"{operationId, status: \"processing\", progressPercent}","completed":"{operationId, status: \"completed\", transcript, segments, speakerTurns?, durationMs}","error":"{operationId, status: \"error\", error}","retention":"Results available for 24 hours"}},"/health":{"method":"GET","description":"Health check endpoint","response":{"status":"ok or error","activeConnections":"Number of active WebSocket connections"}},"/metrics":{"method":"GET","description":"Usage statistics and monitoring (requires API key). 
Metrics are persisted in Redis.","authentication":"Required - API key via ?apiKey=, X-API-Key header, or Bearer token","response":{"uptime":"Server uptime in ms, hours, and ISO timestamp","current":{"activeConnections":"Current WebSocket connections","activeStreams":"Current active streaming sessions"},"totals":{"streamingSessions":"Total streaming sessions since startup","transcribeRequests":"Total transcribe requests since startup","streamingDuration":"Total streaming session duration (ms, seconds, minutes)","audioDuration":"Estimated total audio duration processed (ms, seconds, minutes)","bytesProcessed":"Total bytes of audio processed (bytes, MB)"},"averages":{"streamingSessionDurationMs":"Average streaming session duration","audioDurationPerSessionMs":"Average audio duration per session","sessionsPerHour":"Average sessions per hour"}},"duration_tracking":{"streaming":{"description":"Duration is calculated when WebSocket connection closes","session_duration":"Wall clock time from connect to disconnect","audio_duration":"Estimated from bytes: bytes / (sampleRate * bytesPerSample * channels) * 1000","formula":"For PCM 16kHz 16-bit mono: bytes / 32000 * 1000 = milliseconds","example":"320000 bytes = 10 seconds of audio"},"transcribe":{"description":"Duration is estimated from file size when request completes","audio_duration":"Estimated based on encoding format and file size"}}}},"audio_recommendations":{"streaming":{"format":"PCM 16-bit signed little-endian","sample_rate":16000,"channels":1,"chunk_duration_ms":20,"chunk_size_bytes":640,"calculation":"sample_rate × bytes_per_sample × channels × duration = 16000 × 2 × 1 × 0.02 = 640 bytes"},"file_upload":{"preferred_formats":["MP3","FLAC","WAV"],"max_duration_seconds":60,"max_size_mb":10}},"languages":{"default":["fi-FI","sv-SE"],"option":"Pass explicit BCP-47 codes (up to 10) or [\"auto\"] for automatic 
detection","examples":["fi-FI","sv-SE","en-US","en-GB","de-DE","fr-FR","es-ES","ja-JP","pt-BR"],"note":"Auto-detect transcribes in the dominant language of the audio; detected language is returned on each transcript"},"errors":{"400":"Bad request - Invalid languages option or empty audio","401":"Unauthorized - Invalid or missing API key","413":"File too large - Maximum 10MB for /transcribe (use /batch for larger audio, up to 500 MB)","503":"Service unavailable - Server at capacity"}}