diff --git a/fern/docs.yml b/fern/docs.yml index 0a34784b..449c677d 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -133,7 +133,7 @@ navigation: path: pages/02-speech-to-text/pre-recorded-audio/delete-transcripts.mdx - section: Streaming Speech-to-text icon: microphone-lines - path: pages/02-speech-to-text/universal-streaming/universal-streaming.mdx + path: pages/02-speech-to-text/universal-streaming/universal-streaming/universal-streaming.mdx slug: /universal-streaming contents: - page: Authenticate with a temporary token @@ -141,9 +141,9 @@ navigation: - page: Select the region path: pages/02-speech-to-text/universal-streaming/select-the-region.mdx - page: Multichannel streams - path: pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx + path: pages/02-speech-to-text/universal-streaming/multichannel-streaming/multichannel-streaming.mdx - page: Multilingual transcription - path: pages/02-speech-to-text/universal-streaming/multilingual.mdx + path: pages/02-speech-to-text/universal-streaming/multilingual/multilingual.mdx - page: Turn detection path: pages/02-speech-to-text/universal-streaming/turn-detection.mdx slug: /turn-detection @@ -154,7 +154,7 @@ navigation: path: pages/02-speech-to-text/universal-streaming/common-errors.mdx slug: /common-session-errors-and-closures - page: Keyterms Prompting - path: pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms.mdx + path: pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/universal-streaming-keyterms.mdx slug: /keyterms-prompting - section: Voice agents path: pages/02-speech-to-text/universal-streaming/voice-agents/index.mdx diff --git a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript-sdk.js b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript-sdk.js new file mode 100644 index 00000000..0a784f29 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript-sdk.js @@ -0,0 +1,316 @@ +import { AssemblyAI } from 'assemblyai'; +import fs from 'fs'; +import { spawn } from 'child_process'; +import { Readable } from 'stream'; + +// Configuration +const YOUR_API_KEY = ''; +const AUDIO_FILE_PATH = ''; + +// Simple WAV file parser +class SimpleWavParser { + constructor(filePath) { + this.buffer = fs.readFileSync(filePath); + this.parseHeader(); + } + + parseHeader() { + // Read WAV header + this.channels = this.buffer.readUInt16LE(22); + this.sampleRate = this.buffer.readUInt32LE(24); + this.bitsPerSample = this.buffer.readUInt16LE(34); + + // Find data chunk + let dataOffset = 12; + while (dataOffset < this.buffer.length - 8) { + const chunkId = this.buffer.toString('ascii', dataOffset, dataOffset + 4); + const chunkSize = this.buffer.readUInt32LE(dataOffset + 4); + + if (chunkId === 'data') { + this.dataStart = dataOffset + 8; + this.dataSize = chunkSize; + break; + } + + dataOffset += 8 + chunkSize; + } + } + + getChannelData(channelIndex) { + if (this.channels !== 2) { + throw new Error('Audio file is not stereo'); + } + + const bytesPerSample = this.bitsPerSample / 8; + const samplesPerChannel = this.dataSize / (bytesPerSample * this.channels); + const channelData = []; + + // Extract samples for the specified channel + for (let i = 0; i < samplesPerChannel; i++) { + const sampleOffset = this.dataStart + (i * this.channels + channelIndex) * bytesPerSample; + + if (this.bitsPerSample === 16) { + const sample = 
this.buffer.readInt16LE(sampleOffset); + channelData.push(sample); + } else if (this.bitsPerSample === 8) { + const sample = this.buffer.readUInt8(sampleOffset) - 128; + channelData.push(sample * 256); // Convert to 16-bit range + } + } + + return channelData; + } +} + +class ChannelTranscriber { + constructor(client, channelId, channelName, sampleRate) { + this.client = client; + this.channelId = channelId; + this.channelName = channelName; + this.sampleRate = sampleRate; + this.transcriber = null; + this.audioData = []; + this.currentTurnLine = null; + this.lineCount = 0; + } + + loadAudioChannel() { + try { + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const channelSamples = wavParser.getChannelData(this.channelId); + + // Split into chunks for streaming (50ms chunks) + const FRAMES_PER_BUFFER = Math.floor(this.sampleRate * 0.05); // 50ms + + for (let i = 0; i < channelSamples.length; i += FRAMES_PER_BUFFER) { + const chunkArray = new Int16Array(FRAMES_PER_BUFFER); + + // Copy samples and pad if necessary + for (let j = 0; j < FRAMES_PER_BUFFER; j++) { + if (i + j < channelSamples.length) { + chunkArray[j] = channelSamples[i + j]; + } else { + chunkArray[j] = 0; // Pad with silence + } + } + + // Convert to Buffer (Little Endian) + const buffer = Buffer.from(chunkArray.buffer); + this.audioData.push(buffer); + } + } catch (error) { + throw error; + } + } + + clearCurrentLine() { + if (this.currentTurnLine !== null) { + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + } + } + + printPartialTranscript(words) { + this.clearCurrentLine(); + // Build transcript from individual words + const wordTexts = words.map(word => word.text || ''); + const transcript = wordTexts.join(' '); + const partialText = `${this.channelName}: ${transcript}`; + process.stdout.write(partialText); + this.currentTurnLine = partialText.length; + } + + printFinalTranscript(transcript) { + this.clearCurrentLine(); + const finalText = `${this.channelName}: ${transcript}`; + console.log(finalText); + this.currentTurnLine = null; + this.lineCount++; + } + + async startTranscription() { + try { + this.loadAudioChannel(); + } catch (error) { + throw error; + } + + const turnDetectionConfig = { + endOfTurnConfidenceThreshold: 0.4, + minEndOfTurnSilenceWhenConfident: 160, + maxTurnSilence: 400 + }; + + // Create transcriber with SDK + this.transcriber = this.client.streaming.transcriber({ + sampleRate: this.sampleRate, + formatTurns: true, + ...turnDetectionConfig + }); + + // Set up event handlers + this.transcriber.on('open', ({ id }) => { + // Session opened + }); + + this.transcriber.on('error', (error) => { + console.error(`\n${this.channelName}: Error:`, error); + }); + + this.transcriber.on('close', (code, reason) => { + this.clearCurrentLine(); + if (code !== 1000 && code !== 1001) { + console.log(`\n${this.channelName}: Connection closed unexpectedly`); + } + }); + + this.transcriber.on('turn', (turn) => { + const transcript = (turn.transcript || '').trim(); + const formatted = turn.turn_is_formatted || false; + const words = turn.words || []; + + if (transcript || words.length > 0) { + if (formatted) { + this.printFinalTranscript(transcript); + } else { + this.printPartialTranscript(words); + } + } + }); + + // Connect to the streaming service + await this.transcriber.connect(); + + // Create a readable stream from audio chunks + const audioStream = new Readable({ + async read() { + // This will be controlled by our manual push below + } + }); + + // Pipe audio stream to transcriber + 
Readable.toWeb(audioStream).pipeTo(this.transcriber.stream()); + + // Stream audio data + for (const chunk of this.audioData) { + audioStream.push(chunk); + await new Promise(resolve => setTimeout(resolve, 50)); // 50ms intervals + } + + // Signal end of stream + audioStream.push(null); + + // Wait a bit for final transcripts + await new Promise(resolve => setTimeout(resolve, 1000)); + + // Close the transcriber + await this.transcriber.close(); + } + + async close() { + if (this.transcriber) { + await this.transcriber.close(); + } + } +} + +function playAudioFile() { + return new Promise((resolve) => { + console.log(`Playing audio: ${AUDIO_FILE_PATH}`); + + // Use platform-specific audio player + let command; + let args; + + if (process.platform === 'darwin') { + // macOS + command = 'afplay'; + args = [AUDIO_FILE_PATH]; + } else if (process.platform === 'win32') { + // Windows - using PowerShell + command = 'powershell'; + args = ['-c', `(New-Object Media.SoundPlayer '${AUDIO_FILE_PATH}').PlaySync()`]; + } else { + // Linux - try aplay + command = 'aplay'; + args = [AUDIO_FILE_PATH]; + } + + try { + const player = spawn(command, args, { + stdio: ['ignore', 'ignore', 'ignore'] // Suppress all output from player + }); + + player.on('close', (code) => { + if (code === 0) { + console.log('Audio playback finished'); + } + resolve(); + }); + + player.on('error', (error) => { + // Silently continue without audio + resolve(); + }); + } catch (error) { + resolve(); + } + }); +} + +async function transcribeMultichannel() { + // Verify API key is set + if (YOUR_API_KEY === '') { + console.error('ERROR: Please set YOUR_API_KEY before running'); + process.exit(1); + } + + // Verify file exists + if (!fs.existsSync(AUDIO_FILE_PATH)) { + console.error(`ERROR: Audio file not found: ${AUDIO_FILE_PATH}`); + process.exit(1); + } + + // Get sample rate from file + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const sampleRate = wavParser.sampleRate; + + // Create SDK client + const client = new AssemblyAI({ + apiKey: YOUR_API_KEY + }); + + const transcriber1 = new ChannelTranscriber(client, 0, 'Speaker 1', sampleRate); + const transcriber2 = new ChannelTranscriber(client, 1, 'Speaker 2', sampleRate); + + try { + // Start audio playback (non-blocking) + const audioPromise = playAudioFile(); + + // Start both transcriptions + const transcriptionPromises = [ + transcriber1.startTranscription(), + transcriber2.startTranscription() + ]; + + // Wait for all to complete + await Promise.all([...transcriptionPromises, audioPromise]); + + } catch (error) { + console.error('\nError during transcription:', error.message); + + // Clean up + await transcriber1.close(); + await transcriber2.close(); + + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n'); // Clean line break before exit + process.exit(0); +}); + +// Main execution +transcribeMultichannel(); \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript.js b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript.js new file mode 100644 index 00000000..de6efe4b --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-javascript.js @@ -0,0 +1,323 @@ +const WebSocket = require('ws'); +const fs = require('fs'); +const { spawn } = require('child_process'); + +// Configuration +const YOUR_API_KEY = ''; +const AUDIO_FILE_PATH = 
''; +const API_BASE_URL = 'wss://streaming.assemblyai.com/v3/ws'; +const API_PARAMS = { + sample_rate: 8000, + format_turns: 'true', + end_of_turn_confidence_threshold: 0.4, + min_end_of_turn_silence_when_confident: 160, + max_turn_silence: 400, +}; + +// Build API endpoint with URL encoding +const queryString = new URLSearchParams(API_PARAMS).toString(); +const API_ENDPOINT = `${API_BASE_URL}?${queryString}`; + +// Simple WAV file parser +class SimpleWavParser { + constructor(filePath) { + this.buffer = fs.readFileSync(filePath); + this.parseHeader(); + } + + parseHeader() { + // Read WAV header + this.channels = this.buffer.readUInt16LE(22); + this.sampleRate = this.buffer.readUInt32LE(24); + this.bitsPerSample = this.buffer.readUInt16LE(34); + + // Find data chunk + let dataOffset = 12; + while (dataOffset < this.buffer.length - 8) { + const chunkId = this.buffer.toString('ascii', dataOffset, dataOffset + 4); + const chunkSize = this.buffer.readUInt32LE(dataOffset + 4); + + if (chunkId === 'data') { + this.dataStart = dataOffset + 8; + this.dataSize = chunkSize; + break; + } + + dataOffset += 8 + chunkSize; + } + } + + getChannelData(channelIndex) { + if (this.channels !== 2) { + throw new Error('Audio file is not stereo'); + } + + const bytesPerSample = this.bitsPerSample / 8; + const samplesPerChannel = this.dataSize / (bytesPerSample * this.channels); + const channelData = []; + + // Extract samples for the specified channel + for (let i = 0; i < samplesPerChannel; i++) { + const sampleOffset = this.dataStart + (i * this.channels + channelIndex) * bytesPerSample; + + if (this.bitsPerSample === 16) { + const sample = this.buffer.readInt16LE(sampleOffset); + channelData.push(sample); + } else if (this.bitsPerSample === 8) { + const sample = this.buffer.readUInt8(sampleOffset) - 128; + channelData.push(sample * 256); // Convert to 16-bit range + } + } + + return channelData; + } +} + +class ChannelTranscriber { + constructor(channelId, channelName) { + this.channelId = channelId; + this.channelName = channelName; + this.ws = null; + this.audioData = []; + this.currentTurnLine = null; + this.lineCount = 0; + this.isConnected = false; + } + + loadAudioChannel() { + try { + const wavParser = new SimpleWavParser(AUDIO_FILE_PATH); + const channelSamples = wavParser.getChannelData(this.channelId); + + // Split into chunks for streaming (50ms chunks at 8000Hz = 400 samples) + const FRAMES_PER_BUFFER = 400; + + for (let i = 0; i < channelSamples.length; i += FRAMES_PER_BUFFER) { + const chunkArray = new Int16Array(FRAMES_PER_BUFFER); + + // Copy samples and pad if necessary + for (let j = 0; j < FRAMES_PER_BUFFER; j++) { + if (i + j < channelSamples.length) { + chunkArray[j] = channelSamples[i + j]; + } else { + chunkArray[j] = 0; // Pad with silence + } + } + + // Convert to Buffer (Little Endian) + const buffer = Buffer.from(chunkArray.buffer); + this.audioData.push(buffer); + } + } catch (error) { + throw error; + } + } + + clearCurrentLine() { + if (this.currentTurnLine !== null) { + process.stdout.write('\r' + ' '.repeat(100) + '\r'); + } + } + + printPartialTranscript(words) { + this.clearCurrentLine(); + // Build transcript from individual words + const wordTexts = words.map(word => word.text || ''); + const transcript = wordTexts.join(' '); + const partialText = `${this.channelName}: ${transcript}`; + process.stdout.write(partialText); + this.currentTurnLine = partialText.length; + } + + printFinalTranscript(transcript) { + this.clearCurrentLine(); + const finalText = 
`${this.channelName}: ${transcript}`; + console.log(finalText); + this.currentTurnLine = null; + this.lineCount++; + } + + async streamAudio() { + // Wait a bit for connection to stabilize + await new Promise(resolve => setTimeout(resolve, 100)); + + for (const chunk of this.audioData) { + if (this.ws.readyState === WebSocket.OPEN) { + this.ws.send(chunk, { binary: true }); + await new Promise(resolve => setTimeout(resolve, 50)); // 50ms intervals + } else { + break; + } + } + + // Send termination message + if (this.ws.readyState === WebSocket.OPEN) { + const terminateMessage = { type: 'Terminate' }; + this.ws.send(JSON.stringify(terminateMessage)); + } + } + + startTranscription() { + return new Promise((resolve, reject) => { + try { + this.loadAudioChannel(); + } catch (error) { + reject(error); + return; + } + + this.ws = new WebSocket(API_ENDPOINT, { + headers: { + Authorization: YOUR_API_KEY + } + }); + + this.ws.on('open', () => { + this.isConnected = true; + // Start streaming audio + this.streamAudio().catch(error => {}); + }); + + this.ws.on('message', (data) => { + try { + const message = JSON.parse(data.toString()); + const msgType = message.type; + + if (msgType === 'Turn') { + const transcript = (message.transcript || '').trim(); + const formatted = message.turn_is_formatted || false; + const words = message.words || []; + + if (transcript || words.length > 0) { + if (formatted) { + this.printFinalTranscript(transcript); + } else { + this.printPartialTranscript(words); + } + } + } else if (msgType === 'error') { + console.error(`\n${this.channelName}: API Error:`, message.error); + } + } catch (error) { + // Silently ignore parse errors + } + }); + + this.ws.on('close', (code, reason) => { + this.clearCurrentLine(); + if (code !== 1000 && code !== 1001) { + console.log(`\n${this.channelName}: Connection closed unexpectedly`); + } + this.isConnected = false; + resolve(); + }); + + this.ws.on('error', (error) => { + console.error(`\n${this.channelName} WebSocket error:`, error.message); + this.isConnected = false; + reject(error); + }); + }); + } + + close() { + if (this.ws && this.isConnected) { + this.ws.close(); + } + } +} + +function playAudioFile() { + return new Promise((resolve) => { + console.log(`Playing audio: ${AUDIO_FILE_PATH}`); + + // Use platform-specific audio player + let command; + let args; + + if (process.platform === 'darwin') { + // macOS + command = 'afplay'; + args = [AUDIO_FILE_PATH]; + } else if (process.platform === 'win32') { + // Windows - using PowerShell + command = 'powershell'; + args = ['-c', `(New-Object Media.SoundPlayer '${AUDIO_FILE_PATH}').PlaySync()`]; + } else { + // Linux - try aplay + command = 'aplay'; + args = [AUDIO_FILE_PATH]; + } + + try { + const player = spawn(command, args, { + stdio: ['ignore', 'ignore', 'ignore'] // Suppress all output from player + }); + + player.on('close', (code) => { + if (code === 0) { + console.log('Audio playback finished'); + } + resolve(); + }); + + player.on('error', (error) => { + // Silently continue without audio + resolve(); + }); + } catch (error) { + resolve(); + } + }); +} + +async function transcribeMultichannel() { + const transcriber1 = new ChannelTranscriber(0, 'Speaker 1'); + const transcriber2 = new ChannelTranscriber(1, 'Speaker 2'); + + try { + // Verify API key is set + if (YOUR_API_KEY === '') { + console.error('ERROR: Please set YOUR_API_KEY before running'); + process.exit(1); + } + + // Verify file exists + if (!fs.existsSync(AUDIO_FILE_PATH)) { + console.error(`ERROR: Audio file 
not found: ${AUDIO_FILE_PATH}`); + process.exit(1); + } + + // Start audio playback (non-blocking) + const audioPromise = playAudioFile(); + + // Start both transcriptions + const transcriptionPromises = [ + transcriber1.startTranscription(), + transcriber2.startTranscription() + ]; + + // Wait for all to complete + await Promise.all([...transcriptionPromises, audioPromise]); + + } catch (error) { + console.error('\nError during transcription:', error.message); + + // Clean up + transcriber1.close(); + transcriber2.close(); + + process.exit(1); + } +} + +// Handle graceful shutdown +process.on('SIGINT', () => { + console.log('\n'); // Clean line break before exit + process.exit(0); +}); + +// Main execution +if (require.main === module) { + transcribeMultichannel(); +} \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python-sdk.py b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python-sdk.py new file mode 100644 index 00000000..906935e1 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python-sdk.py @@ -0,0 +1,210 @@ +import logging +from typing import Type +import threading +import time +import wave +import numpy as np +import pyaudio + +import assemblyai as aai +from assemblyai.streaming.v3 import ( + BeginEvent, + StreamingClient, + StreamingClientOptions, + StreamingError, + StreamingEvents, + StreamingParameters, + TerminationEvent, + TurnEvent, +) + +# Configuration +API_KEY = "" +AUDIO_FILE_PATH = "" + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class ChannelTranscriber: + def __init__(self, channel_id, channel_name, sample_rate): + self.channel_id = channel_id + self.channel_name = channel_name + self.sample_rate = sample_rate + self.client = None + self.audio_data = [] + self.current_turn_line = None + self.line_count = 0 + self.streaming_done = threading.Event() + + def load_audio_channel(self): + """Extract single channel from dual-channel audio file.""" + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + frames = wf.readframes(wf.getnframes()) + audio_array = np.frombuffer(frames, dtype=np.int16) + + if wf.getnchannels() == 2: + audio_array = audio_array.reshape(-1, 2) + channel_audio = audio_array[:, self.channel_id] + + # Split into chunks for streaming + FRAMES_PER_BUFFER = 400 # 50ms chunks + for i in range(0, len(channel_audio), FRAMES_PER_BUFFER): + chunk = channel_audio[i:i+FRAMES_PER_BUFFER] + if len(chunk) < FRAMES_PER_BUFFER: + chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant') + self.audio_data.append(chunk.astype(np.int16).tobytes()) + + def clear_current_line(self): + if self.current_turn_line is not None: + print("\r" + " " * 100 + "\r", end="", flush=True) + + def print_partial_transcript(self, words): + self.clear_current_line() + # Build transcript from individual words + word_texts = [word.text for word in words] + transcript = ' '.join(word_texts) + partial_text = f"{self.channel_name}: {transcript}" + print(partial_text, end="", flush=True) + self.current_turn_line = len(partial_text) + + def print_final_transcript(self, transcript): + self.clear_current_line() + final_text = f"{self.channel_name}: {transcript}" + print(final_text, flush=True) + self.current_turn_line = None + self.line_count += 1 + + def on_begin(self, client: Type[StreamingClient], event: BeginEvent): + """Called when the streaming session begins.""" + pass 
# Session started + + def on_turn(self, client: Type[StreamingClient], event: TurnEvent): + """Called when a turn is received.""" + transcript = event.transcript.strip() if event.transcript else '' + formatted = event.turn_is_formatted + words = event.words if event.words else [] + + if transcript or words: + if formatted: + self.print_final_transcript(transcript) + else: + self.print_partial_transcript(words) + + def on_terminated(self, client: Type[StreamingClient], event: TerminationEvent): + """Called when the session is terminated.""" + self.clear_current_line() + self.streaming_done.set() + + def on_error(self, client: Type[StreamingClient], error: StreamingError): + """Called when an error occurs.""" + print(f"\n{self.channel_name}: Error: {error}") + self.streaming_done.set() + + def start_transcription(self): + """Start the transcription for this channel.""" + self.load_audio_channel() + + # Create streaming client + self.client = StreamingClient( + StreamingClientOptions( + api_key=API_KEY, + api_host="streaming.assemblyai.com", + ) + ) + + # Register event handlers + self.client.on(StreamingEvents.Begin, self.on_begin) + self.client.on(StreamingEvents.Turn, self.on_turn) + self.client.on(StreamingEvents.Termination, self.on_terminated) + self.client.on(StreamingEvents.Error, self.on_error) + + # Connect to streaming service with turn detection configuration + self.client.connect( + StreamingParameters( + sample_rate=self.sample_rate, + format_turns=True, + end_of_turn_confidence_threshold=0.4, + min_end_of_turn_silence_when_confident=160, + max_turn_silence=400, + ) + ) + + # Create audio generator + def audio_generator(): + for chunk in self.audio_data: + yield chunk + time.sleep(0.05) # 50ms intervals + + try: + # Stream audio + self.client.stream(audio_generator()) + finally: + # Disconnect + self.client.disconnect(terminate=True) + self.streaming_done.set() + + def start_transcription_thread(self): + """Start transcription in a separate thread.""" + thread = threading.Thread(target=self.start_transcription, daemon=True) + thread.start() + return thread + + +def play_audio_file(): + try: + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + p = pyaudio.PyAudio() + + stream = p.open( + format=p.get_format_from_width(wf.getsampwidth()), + channels=wf.getnchannels(), + rate=wf.getframerate(), + output=True + ) + + print(f"Playing audio: {AUDIO_FILE_PATH}") + + # Play audio in chunks + chunk_size = 1024 + data = wf.readframes(chunk_size) + + while data: + stream.write(data) + data = wf.readframes(chunk_size) + + stream.stop_stream() + stream.close() + p.terminate() + + print("Audio playback finished") + + except Exception as e: + print(f"Error playing audio: {e}") + + +def transcribe_multichannel(): + # Get sample rate from file + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + sample_rate = wf.getframerate() + + # Create transcribers for each channel + transcriber_1 = ChannelTranscriber(0, "Speaker 1", sample_rate) + transcriber_2 = ChannelTranscriber(1, "Speaker 2", sample_rate) + + # Start audio playback + audio_thread = threading.Thread(target=play_audio_file, daemon=True) + audio_thread.start() + + # Start both transcriptions + thread_1 = transcriber_1.start_transcription_thread() + thread_2 = transcriber_2.start_transcription_thread() + + # Wait for completion + thread_1.join() + thread_2.join() + audio_thread.join() + + +if __name__ == "__main__": + transcribe_multichannel() \ No newline at end of file diff --git 
a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python.py b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python.py new file mode 100644 index 00000000..5a770dfa --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/code/quickstart-python.py @@ -0,0 +1,165 @@ +import websocket +import json +import threading +import numpy as np +import wave +import time +import pyaudio +from urllib.parse import urlencode + +# Configuration +YOUR_API_KEY = "" +AUDIO_FILE_PATH = "" +API_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" +API_PARAMS = { + "sample_rate": 8000, + "format_turns": "true", + "end_of_turn_confidence_threshold": 0.4, + "min_end_of_turn_silence_when_confident": 160, + "max_turn_silence": 400, +} +# Build API endpoint with URL encoding +API_ENDPOINT = f"{API_BASE_URL}?{urlencode(API_PARAMS)}" + +class ChannelTranscriber: + def __init__(self, channel_id, channel_name): + self.channel_id = channel_id + self.channel_name = channel_name + self.ws_app = None + self.audio_data = [] + self.current_turn_line = None + self.line_count = 0 + + def load_audio_channel(self): + """Extract single channel from dual-channel audio file.""" + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + frames = wf.readframes(wf.getnframes()) + audio_array = np.frombuffer(frames, dtype=np.int16) + + if wf.getnchannels() == 2: + audio_array = audio_array.reshape(-1, 2) + channel_audio = audio_array[:, self.channel_id] + + # Split into chunks for streaming + FRAMES_PER_BUFFER = 400 # 50ms chunks + for i in range(0, len(channel_audio), FRAMES_PER_BUFFER): + chunk = channel_audio[i:i+FRAMES_PER_BUFFER] + if len(chunk) < FRAMES_PER_BUFFER: + chunk = np.pad(chunk, (0, FRAMES_PER_BUFFER - len(chunk)), 'constant') + self.audio_data.append(chunk.astype(np.int16).tobytes()) + + def on_open(self, ws): + """Stream audio data when connection opens.""" + def stream_audio(): + for chunk in self.audio_data: + ws.send(chunk, websocket.ABNF.OPCODE_BINARY) + time.sleep(0.05) # 50ms intervals + + # Send termination message + terminate_message = {"type": "Terminate"} + ws.send(json.dumps(terminate_message)) + + threading.Thread(target=stream_audio, daemon=True).start() + + def clear_current_line(self): + if self.current_turn_line is not None: + print("\r" + " " * 100 + "\r", end="", flush=True) + + def print_partial_transcript(self, words): + self.clear_current_line() + # Build transcript from individual words + word_texts = [word.get('text', '') for word in words] + transcript = ' '.join(word_texts) + partial_text = f"{self.channel_name}: {transcript}" + print(partial_text, end="", flush=True) + self.current_turn_line = len(partial_text) + + def print_final_transcript(self, transcript): + self.clear_current_line() + final_text = f"{self.channel_name}: {transcript}" + print(final_text, flush=True) + self.current_turn_line = None + self.line_count += 1 + + def on_message(self, ws, message): + """Handle transcription results.""" + data = json.loads(message) + msg_type = data.get('type') + + if msg_type == "Turn": + transcript = data.get('transcript', '').strip() + formatted = data.get('turn_is_formatted', False) + words = data.get('words', []) + + if transcript or words: + if formatted: + self.print_final_transcript(transcript) + else: + self.print_partial_transcript(words) + + def start_transcription(self): + self.load_audio_channel() + + self.ws_app = websocket.WebSocketApp( + API_ENDPOINT, + header={"Authorization": 
YOUR_API_KEY}, + on_open=self.on_open, + on_message=self.on_message, + ) + + thread = threading.Thread(target=self.ws_app.run_forever, daemon=True) + thread.start() + return thread + +def play_audio_file(): + try: + with wave.open(AUDIO_FILE_PATH, 'rb') as wf: + p = pyaudio.PyAudio() + + stream = p.open( + format=p.get_format_from_width(wf.getsampwidth()), + channels=wf.getnchannels(), + rate=wf.getframerate(), + output=True + ) + + print(f"Playing audio: {AUDIO_FILE_PATH}") + + # Play audio in chunks + chunk_size = 1024 + data = wf.readframes(chunk_size) + + while data: + stream.write(data) + data = wf.readframes(chunk_size) + + stream.stop_stream() + stream.close() + p.terminate() + + print("Audio playback finished") + + except Exception as e: + print(f"Error playing audio: {e}") + + +def transcribe_multichannel(): + # Create transcribers for each channel + transcriber_1 = ChannelTranscriber(0, "Speaker 1") + transcriber_2 = ChannelTranscriber(1, "Speaker 2") + + # Start audio playback + audio_thread = threading.Thread(target=play_audio_file, daemon=True) + audio_thread.start() + + # Start both transcriptions + thread_1 = transcriber_1.start_transcription() + thread_2 = transcriber_2.start_transcription() + + # Wait for completion + thread_1.join() + thread_2.join() + audio_thread.join() + +if __name__ == "__main__": + transcribe_multichannel() \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/multichannel-streaming.mdx similarity index 95% rename from fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx rename to fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/multichannel-streaming.mdx index cf65015c..f6fd6ae7 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/multichannel-streaming/multichannel-streaming.mdx @@ -1106,65 +1106,12 @@ The examples above use turn detection settings optimized for short responses and For configuration examples tailored to different use cases, refer to our [Configuration examples](/docs/universal-streaming/turn-detection#quick-start-configurations). 
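For instance, a deployment expecting longer, less interactive utterances might relax these thresholds. This is a sketch only: the parameter names are the documented streaming options used throughout this guide, but the specific values below are illustrative assumptions, not vetted presets — see the linked configuration examples for recommended settings.

```python
# Illustrative only: relaxed turn detection for longer, less interactive speech.
# Parameter names are the documented streaming options; the values are assumed examples.
API_PARAMS = {
    "sample_rate": 8000,
    "format_turns": "true",
    "end_of_turn_confidence_threshold": 0.7,        # require higher confidence before ending a turn
    "min_end_of_turn_silence_when_confident": 560,  # ms of silence before closing a confident turn
    "max_turn_silence": 1280,                       # ms of silence that always ends a turn
}
```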
- - -Modify the `StreamingParameters` in the `start_transcription` method: -```python -# Connect to streaming service with turn detection configuration -self.client.connect( - StreamingParameters( - sample_rate=self.sample_rate, - format_turns=True, - end_of_turn_confidence_threshold=0.4, - min_end_of_turn_silence_when_confident=160, - max_turn_silence=400, - ) -) -``` - + - -Modify the turn detection parameters in `API_PARAMS`: -```python -API_PARAMS = { - "sample_rate": 8000, - "format_turns": "true", - "end_of_turn_confidence_threshold": 0.4, - "min_end_of_turn_silence_when_confident": 160, - "max_turn_silence": 400, -} -``` - + + + + - -Modify the turn detection configuration object: -```javascript -const turnDetectionConfig = { - endOfTurnConfidenceThreshold: 0.4, - minEndOfTurnSilenceWhenConfident: 160, - maxTurnSilence: 400 -}; - -// Create transcriber with SDK -this.transcriber = this.client.streaming.transcriber({ - sampleRate: this.sampleRate, - formatTurns: true, - ...turnDetectionConfig -}); -``` - - - -Modify the turn detection parameters in `API_PARAMS`: -```javascript -const API_PARAMS = { - sample_rate: 8000, - format_turns: 'true', - end_of_turn_confidence_threshold: 0.4, - min_end_of_turn_silence_when_confident: 160, - max_turn_silence: 400, -}; -``` - - + diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx b/fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-javascript.js similarity index 61% rename from fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx rename to fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-javascript.js index 1e36912d..f22f1c10 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-javascript.js @@ -1,166 +1,3 @@ ---- -title: "Multilingual streaming" -description: "Transcribe audio in multiple languages" ---- - - - English, Spanish, French, German, Italian, and Portuguese - - -Multilingual streaming allows you to transcribe audio streams in multiple languages. - -## Configuration - - - Keyterms prompting is not supported with multilingual streaming. - - -To utilize multilingual streaming, you need to include `"speech_model":"universal-streaming-multilingual"` as a query parameter in the WebSocket URL. - -### Supported languages - -Multilingual currently supports English, Spanish, French, German, Italian, and Portuguese. - -# Understanding formatting - -The multilingual model produces transcripts with punctuation and capitalization already built into the model outputs. This means you'll receive properly formatted text without requiring any additional post-processing. - - - While the API still returns the `turn_is_formatted` parameter to maintain interface consistency with other streaming models, the multilingual model doesn't perform additional formatting operations. All transcripts from the multilingual model are already formatted as they're generated. - - -In the future, this built-in formatting capability will be extended to our English-only streaming model as well. - -## Quickstart - - - -Firstly, install the required dependencies. 
- - - -```bash -pip install websockets pyaudio -``` - - - - - -```bash -npm install ws mic -``` - - - - - - - - - -```python {26} -import websockets -import asyncio -import json -from urllib.parse import urlencode - -import pyaudio - -FRAMES_PER_BUFFER = 3200 -FORMAT = pyaudio.paInt16 -CHANNELS = 1 -RATE = 48000 -p = pyaudio.PyAudio() - -stream = p.open( - format=FORMAT, - channels=CHANNELS, - rate=RATE, - input=True, - frames_per_buffer=FRAMES_PER_BUFFER -) - -BASE_URL = "wss://streaming.assemblyai.com/v3/ws" -CONNECTION_PARAMS = { - "sample_rate": RATE, - "format_turns": True, - "speech_model": "universal-streaming-multilingual", -} -URL = f"{BASE_URL}?{urlencode(CONNECTION_PARAMS)}" - -async def send_receive(): - - print(f'Connecting websocket to url ${URL}') - - async with websockets.connect( - URL, - extra_headers={"Authorization": "YOUR-API-KEY"}, - ping_interval=5, - ping_timeout=20 - ) as _ws: - await asyncio.sleep(0.1) - print("Receiving SessionBegins ...") - - session_begins = await _ws.recv() - print(session_begins) - print("Sending messages ...") - - async def send(): - while True: - try: - data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) - await _ws.send(data) - except websockets.exceptions.ConnectionClosedError as e: - print(e) - except Exception as e: - print(e) - await asyncio.sleep(0.01) - - async def receive(): - while True: - try: - result_str = await _ws.recv() - data = json.loads(result_str) - transcript = data['transcript'] - - if data['type'] == 'Turn': - if data.get('turn_is_formatted'): - print(f"\r{transcript}") - else: - print(f"\r{transcript}", end="") - print(data) - else: - pass - - except websockets.exceptions.ConnectionClosed: - break - except Exception as e: - print(f"\nError receiving data: {e}") - break - - try: - await asyncio.gather(send(), receive()) - except KeyboardInterrupt: - await _ws.send({"type": "Terminate"}) - # Wait for the server to close the connection after receiving the message - await _ws.wait_closed() - print("Session terminated and connection closed.") - -if __name__ == "__main__": - try: - asyncio.run(send_receive()) - finally: - stream.stop_stream() - stream.close() - p.terminate() -``` - - - - - -```js {11} const WebSocket = require("ws"); const mic = require("mic"); const querystring = require("querystring"); @@ -420,9 +257,4 @@ function setupTerminationHandlers() { } // Start the application -run(); -``` - - - - +run(); \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-python.py b/fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-python.py new file mode 100644 index 00000000..0f45fb0c --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual/code/quickstart-python.py @@ -0,0 +1,94 @@ +import websockets +import asyncio +import json +from urllib.parse import urlencode + +import pyaudio + +FRAMES_PER_BUFFER = 3200 +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 48000 +p = pyaudio.PyAudio() + +stream = p.open( + format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=FRAMES_PER_BUFFER +) + +BASE_URL = "wss://streaming.assemblyai.com/v3/ws" +CONNECTION_PARAMS = { + "sample_rate": RATE, + "format_turns": True, + "speech_model": "universal-streaming-multilingual", +} +URL = f"{BASE_URL}?{urlencode(CONNECTION_PARAMS)}" + +async def send_receive(): + + print(f'Connecting websocket to url ${URL}') + + async with websockets.connect( + URL, + extra_headers={"Authorization": 
"YOUR-API-KEY"}, + ping_interval=5, + ping_timeout=20 + ) as _ws: + await asyncio.sleep(0.1) + print("Receiving SessionBegins ...") + + session_begins = await _ws.recv() + print(session_begins) + print("Sending messages ...") + + async def send(): + while True: + try: + data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) + await _ws.send(data) + except websockets.exceptions.ConnectionClosedError as e: + print(e) + except Exception as e: + print(e) + await asyncio.sleep(0.01) + + async def receive(): + while True: + try: + result_str = await _ws.recv() + data = json.loads(result_str) + transcript = data['transcript'] + + if data['type'] == 'Turn': + if data.get('turn_is_formatted'): + print(f"\r{transcript}") + else: + print(f"\r{transcript}", end="") + print(data) + else: + pass + + except websockets.exceptions.ConnectionClosed: + break + except Exception as e: + print(f"\nError receiving data: {e}") + break + + try: + await asyncio.gather(send(), receive()) + except KeyboardInterrupt: + await _ws.send({"type": "Terminate"}) + # Wait for the server to close the connection after receiving the message + await _ws.wait_closed() + print("Session terminated and connection closed.") + +if __name__ == "__main__": + try: + asyncio.run(send_receive()) + finally: + stream.stop_stream() + stream.close() + p.terminate() \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual/multilingual.mdx b/fern/pages/02-speech-to-text/universal-streaming/multilingual/multilingual.mdx new file mode 100644 index 00000000..537a6f5d --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual/multilingual.mdx @@ -0,0 +1,63 @@ +--- +title: "Multilingual streaming" +description: "Transcribe audio in multiple languages" +--- + + + English, Spanish, French, German, Italian, and Portuguese + + +Multilingual streaming allows you to transcribe audio streams in multiple languages. + +## Configuration + + + Keyterms prompting is not supported with multilingual streaming. + + +To utilize multilingual streaming, you need to include `"speech_model":"universal-streaming-multilingual"` as a query parameter in the WebSocket URL. + +### Supported languages + +Multilingual currently supports English, Spanish, French, German, Italian, and Portuguese. + +# Understanding formatting + +The multilingual model produces transcripts with punctuation and capitalization already built into the model outputs. This means you'll receive properly formatted text without requiring any additional post-processing. + + + While the API still returns the `turn_is_formatted` parameter to maintain interface consistency with other streaming models, the multilingual model doesn't perform additional formatting operations. All transcripts from the multilingual model are already formatted as they're generated. + + +In the future, this built-in formatting capability will be extended to our English-only streaming model as well. + +## Quickstart + + + +Firstly, install the required dependencies. 
+ + + +```bash +pip install websockets pyaudio +``` + + + + + +```bash +npm install ws mic +``` + + + + + + + + + + + diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms.mdx deleted file mode 100644 index f7d6c231..00000000 --- a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms.mdx +++ /dev/null @@ -1,646 +0,0 @@ -# Keyterms prompting for Universal-Streaming - -The keyterms prompting feature helps improve recognition accuracy for specific words and phrases that are important to your use case. - - - -Keyterms Prompting costs an additional $0.04/hour. - - - -## Quickstart - - - -Firstly, install the required dependencies. - - - -```bash -pip install websocket-client pyaudio -``` - - - - - -```bash -npm install ws mic -``` - - - - - - - - - -```python {16} -import pyaudio -import websocket -import json -import threading -import time -import wave -from urllib.parse import urlencode -from datetime import datetime - -# --- Configuration --- -YOUR_API_KEY = "YOUR-API-KEY" # Replace with your actual API key - -CONNECTION_PARAMS = { - "sample_rate": 16000, - "format_turns": True, # Request formatted final transcripts - "keyterms_prompt": json.dumps(["Keanu Reeves", "AssemblyAI", "Universal-2"]) -} -API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" -API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}" - -# Audio Configuration -FRAMES_PER_BUFFER = 800 # 50ms of audio (0.05s * 16000Hz) -SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"] -CHANNELS = 1 -FORMAT = pyaudio.paInt16 - -# Global variables for audio stream and websocket -audio = None -stream = None -ws_app = None -audio_thread = None -stop_event = threading.Event() # To signal the audio thread to stop - -# WAV recording variables -recorded_frames = [] # Store audio frames for WAV file -recording_lock = threading.Lock() # Thread-safe access to recorded_frames - -# --- WebSocket Event Handlers --- - - -def on_open(ws): - """Called when the WebSocket connection is established.""" - print("WebSocket connection opened.") - print(f"Connected to: {API_ENDPOINT}") - - # Start sending audio data in a separate thread - def stream_audio(): - global stream - print("Starting audio streaming...") - while not stop_event.is_set(): - try: - audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) - - # Store audio data for WAV recording - with recording_lock: - recorded_frames.append(audio_data) - - # Send audio data as binary message - ws.send(audio_data, websocket.ABNF.OPCODE_BINARY) - except Exception as e: - print(f"Error streaming audio: {e}") - # If stream read fails, likely means it's closed, stop the loop - break - print("Audio streaming stopped.") - - global audio_thread - audio_thread = threading.Thread(target=stream_audio) - audio_thread.daemon = ( - True # Allow main thread to exit even if this thread is running - ) - audio_thread.start() - -def on_message(ws, message): - try: - data = json.loads(message) - msg_type = data.get('type') - - if msg_type == "Begin": - session_id = data.get('id') - expires_at = data.get('expires_at') - print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}") - elif msg_type == "Turn": - transcript = data.get('transcript', '') - formatted = data.get('turn_is_formatted', False) - - # Clear previous line for formatted messages - if formatted: - print('\r' + ' ' * 80 + '\r', end='') - 
print(transcript) - else: - print(f"\r{transcript}", end='') - elif msg_type == "Termination": - audio_duration = data.get('audio_duration_seconds', 0) - session_duration = data.get('session_duration_seconds', 0) - print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s") - except json.JSONDecodeError as e: - print(f"Error decoding message: {e}") - except Exception as e: - print(f"Error handling message: {e}") - -def on_error(ws, error): - """Called when a WebSocket error occurs.""" - print(f"\nWebSocket Error: {error}") - # Attempt to signal stop on error - stop_event.set() - - -def on_close(ws, close_status_code, close_msg): - """Called when the WebSocket connection is closed.""" - print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}") - - # Save recorded audio to WAV file - save_wav_file() - - # Ensure audio resources are released - global stream, audio - stop_event.set() # Signal audio thread just in case it's still running - - if stream: - if stream.is_active(): - stream.stop_stream() - stream.close() - stream = None - if audio: - audio.terminate() - audio = None - # Try to join the audio thread to ensure clean exit - if audio_thread and audio_thread.is_alive(): - audio_thread.join(timeout=1.0) - - -def save_wav_file(): - """Save recorded audio frames to a WAV file.""" - if not recorded_frames: - print("No audio data recorded.") - return - - # Generate filename with timestamp - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"recorded_audio_{timestamp}.wav" - - try: - with wave.open(filename, 'wb') as wf: - wf.setnchannels(CHANNELS) - wf.setsampwidth(2) # 16-bit = 2 bytes - wf.setframerate(SAMPLE_RATE) - - # Write all recorded frames - with recording_lock: - wf.writeframes(b''.join(recorded_frames)) - - print(f"Audio saved to: {filename}") - print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds") - - except Exception as e: - print(f"Error saving WAV file: {e}") - - -# --- Main Execution --- -def run(): - global audio, stream, ws_app - - # Initialize PyAudio - audio = pyaudio.PyAudio() - - # Open microphone stream - try: - stream = audio.open( - input=True, - frames_per_buffer=FRAMES_PER_BUFFER, - channels=CHANNELS, - format=FORMAT, - rate=SAMPLE_RATE, - ) - print("Microphone stream opened successfully.") - print("Speak into your microphone. Press Ctrl+C to stop.") - print("Audio will be saved to a WAV file when the session ends.") - except Exception as e: - print(f"Error opening microphone stream: {e}") - if audio: - audio.terminate() - return # Exit if microphone cannot be opened - - # Create WebSocketApp - ws_app = websocket.WebSocketApp( - API_ENDPOINT, - header={"Authorization": YOUR_API_KEY}, - on_open=on_open, - on_message=on_message, - on_error=on_error, - on_close=on_close, - ) - - # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt - ws_thread = threading.Thread(target=ws_app.run_forever) - ws_thread.daemon = True - ws_thread.start() - - try: - # Keep main thread alive until interrupted - while ws_thread.is_alive(): - time.sleep(0.1) - except KeyboardInterrupt: - print("\nCtrl+C received. 
Stopping...") - stop_event.set() # Signal audio thread to stop - - # Send termination message to the server - if ws_app and ws_app.sock and ws_app.sock.connected: - try: - terminate_message = {"type": "Terminate"} - print(f"Sending termination message: {json.dumps(terminate_message)}") - ws_app.send(json.dumps(terminate_message)) - # Give a moment for messages to process before forceful close - time.sleep(5) - except Exception as e: - print(f"Error sending termination message: {e}") - - # Close the WebSocket connection (will trigger on_close) - if ws_app: - ws_app.close() - - # Wait for WebSocket thread to finish - ws_thread.join(timeout=2.0) - - except Exception as e: - print(f"\nAn unexpected error occurred: {e}") - stop_event.set() - if ws_app: - ws_app.close() - ws_thread.join(timeout=2.0) - - finally: - # Final cleanup (already handled in on_close, but good as a fallback) - if stream and stream.is_active(): - stream.stop_stream() - if stream: - stream.close() - if audio: - audio.terminate() - print("Cleanup complete. Exiting.") - - -if __name__ == "__main__": - run() -``` - - - - - -```js {11} -const WebSocket = require("ws"); -const mic = require("mic"); -const querystring = require("querystring"); -const fs = require("fs"); - -// --- Configuration --- -const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key -const CONNECTION_PARAMS = { - sample_rate: 16000, - format_turns: true, // Request formatted final transcripts - keyterms_prompt: JSON.stringify([ - "Keanu Reeves", - "AssemblyAI", - "Universal-2", - ]), -}; -const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"; -const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`; - -// Audio Configuration -const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate; -const CHANNELS = 1; - -// Global variables -let micInstance = null; -let micInputStream = null; -let ws = null; -let stopRequested = false; - -// WAV recording variables -let recordedFrames = []; // Store audio frames for WAV file - -// --- Helper functions --- -function clearLine() { - process.stdout.write("\r" + " ".repeat(80) + "\r"); -} - -function formatTimestamp(timestamp) { - return new Date(timestamp * 1000).toISOString(); -} - -function createWavHeader(sampleRate, channels, dataLength) { - const buffer = Buffer.alloc(44); - - // RIFF header - buffer.write("RIFF", 0); - buffer.writeUInt32LE(36 + dataLength, 4); - buffer.write("WAVE", 8); - - // fmt chunk - buffer.write("fmt ", 12); - buffer.writeUInt32LE(16, 16); // fmt chunk size - buffer.writeUInt16LE(1, 20); // PCM format - buffer.writeUInt16LE(channels, 22); - buffer.writeUInt32LE(sampleRate, 24); - buffer.writeUInt32LE(sampleRate * channels * 2, 28); // byte rate - buffer.writeUInt16LE(channels * 2, 32); // block align - buffer.writeUInt16LE(16, 34); // bits per sample - - // data chunk - buffer.write("data", 36); - buffer.writeUInt32LE(dataLength, 40); - - return buffer; -} - -function saveWavFile() { - if (recordedFrames.length === 0) { - console.log("No audio data recorded."); - return; - } - - // Generate filename with timestamp - const timestamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); - const filename = `recorded_audio_${timestamp}.wav`; - - try { - // Combine all recorded frames - const audioData = Buffer.concat(recordedFrames); - const dataLength = audioData.length; - - // Create WAV header - const wavHeader = createWavHeader(SAMPLE_RATE, CHANNELS, dataLength); - - // Write WAV file - const wavFile = Buffer.concat([wavHeader, 
audioData]); - fs.writeFileSync(filename, wavFile); - - console.log(`Audio saved to: ${filename}`); - console.log( - `Duration: ${(dataLength / (SAMPLE_RATE * CHANNELS * 2)).toFixed(2)} seconds` - ); - } catch (error) { - console.error(`Error saving WAV file: ${error}`); - } -} - -// --- Main function --- -async function run() { - console.log("Starting AssemblyAI real-time transcription..."); - console.log("Audio will be saved to a WAV file when the session ends."); - - // Initialize WebSocket connection - ws = new WebSocket(API_ENDPOINT, { - headers: { - Authorization: YOUR_API_KEY, - }, - }); - - // Setup WebSocket event handlers - ws.on("open", () => { - console.log("WebSocket connection opened."); - console.log(`Connected to: ${API_ENDPOINT}`); - // Start the microphone - startMicrophone(); - }); - - ws.on("message", (message) => { - try { - const data = JSON.parse(message); - const msgType = data.type; - - if (msgType === "Begin") { - const sessionId = data.id; - const expiresAt = data.expires_at; - console.log( - `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}` - ); - } else if (msgType === "Turn") { - const transcript = data.transcript || ""; - const formatted = data.turn_is_formatted; - - if (formatted) { - clearLine(); - console.log(transcript); - } else { - process.stdout.write(`\r${transcript}`); - } - } else if (msgType === "Termination") { - const audioDuration = data.audio_duration_seconds; - const sessionDuration = data.session_duration_seconds; - console.log( - `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s` - ); - } - } catch (error) { - console.error(`\nError handling message: ${error}`); - console.error(`Message data: ${message}`); - } - }); - - ws.on("error", (error) => { - console.error(`\nWebSocket Error: ${error}`); - cleanup(); - }); - - ws.on("close", (code, reason) => { - console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`); - cleanup(); - }); - - // Handle process termination - setupTerminationHandlers(); -} - -function startMicrophone() { - try { - micInstance = mic({ - rate: SAMPLE_RATE.toString(), - channels: CHANNELS.toString(), - debug: false, - exitOnSilence: 6, // This won't actually exit, just a parameter for mic - }); - - micInputStream = micInstance.getAudioStream(); - - micInputStream.on("data", (data) => { - if (ws && ws.readyState === WebSocket.OPEN && !stopRequested) { - // Store audio data for WAV recording - recordedFrames.push(Buffer.from(data)); - - // Send audio data to WebSocket - ws.send(data); - } - }); - - micInputStream.on("error", (err) => { - console.error(`Microphone Error: ${err}`); - cleanup(); - }); - - micInstance.start(); - console.log("Microphone stream opened successfully."); - console.log("Speak into your microphone. 
Press Ctrl+C to stop."); - } catch (error) { - console.error(`Error opening microphone stream: ${error}`); - cleanup(); - } -} - -function cleanup() { - stopRequested = true; - - // Save recorded audio to WAV file - saveWavFile(); - - // Stop microphone if it's running - if (micInstance) { - try { - micInstance.stop(); - } catch (error) { - console.error(`Error stopping microphone: ${error}`); - } - micInstance = null; - } - - // Close WebSocket connection if it's open - if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) { - try { - // Send termination message if possible - if (ws.readyState === WebSocket.OPEN) { - const terminateMessage = { type: "Terminate" }; - console.log( - `Sending termination message: ${JSON.stringify(terminateMessage)}` - ); - ws.send(JSON.stringify(terminateMessage)); - } - ws.close(); - } catch (error) { - console.error(`Error closing WebSocket: ${error}`); - } - ws = null; - } - - console.log("Cleanup complete."); -} - -function setupTerminationHandlers() { - // Handle Ctrl+C and other termination signals - process.on("SIGINT", () => { - console.log("\nCtrl+C received. Stopping..."); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(0), 1000); - }); - - process.on("SIGTERM", () => { - console.log("\nTermination signal received. Stopping..."); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(0), 1000); - }); - - // Handle uncaught exceptions - process.on("uncaughtException", (error) => { - console.error(`\nUncaught exception: ${error}`); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(1), 1000); - }); -} - -// Start the application -run(); -``` - - - - - -## Configuration - -To utilize keyterms prompting, you need to include your desired keyterms as query parameters in the WebSocket URL. - -- You can include a maximum of 100 keyterms per session. -- Each individual keyterm string must be 50 characters or less in length. - -## How it works - -Streaming Keyterms Prompting has two components to improve accuracy for your terms. - -### Word-level boosting - -The streaming model itself is biased during inference to be more accurate at identifying words from your keyterms list. This happens in real-time as words are emitted during the streaming process, providing immediate improvements to recognition accuracy. This component is enabled by default. - -### Turn-level boosting - -After each turn is completed, an additional boosting pass analyzes the full transcript using your keyterms list. This post-processing step, similar to formatting, provides a second layer of accuracy improvement by examining the complete context of the turn. To enable this component, set `format_turns` to `True`. - -Both stages work together to maximize recognition accuracy for your keyterms throughout the streaming process. - -## Dynamic keyterms prompting - -Dynamic keyterms prompting allows you to update keyterms during an active streaming session using the `UpdateConfiguration` message. This enables you to adapt the recognition context in real-time based on conversation flow or changing requirements. 
- -### Updating keyterms during a session - -To update keyterms while streaming, send an `UpdateConfiguration` message with a new `keyterms_prompt` array: - - - - - -```python -# Replace or establish new set of keyterms -websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": ["Universal-3"]}') - -# Remove keyterms and reset context biasing -websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": []}') -``` - - - - - -```javascript -// Replace or establish new set of keyterms -websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": ["Universal-3"]}'); - -// Remove keyterms and reset context biasing -websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": []}'); -``` - - - - - -### How dynamic keyterms work - -When you send an `UpdateConfiguration` message: - -- **Replacing keyterms**: Providing a new array of keyterms completely replaces the existing set. The new keyterms take effect immediately for subsequent audio processing. -- **Clearing keyterms**: Sending an empty array `[]` removes all keyterms and resets context biasing to the default state. -- **Both boosting stages**: Dynamic keyterms work with both word-level boosting (native context biasing) and turn-level boosting (metaphone-based), just like initial keyterms. - -### Use cases for dynamic keyterms - -Dynamic keyterms are particularly useful for: - -- **Context-aware voice agents**: Update keyterms based on conversation stage (e.g., switching from menu items to payment terms) -- **Multi-topic conversations**: Adapt vocabulary as the conversation topic changes -- **Progressive disclosure**: Add relevant keyterms as new information becomes available -- **Cleanup**: Remove keyterms that are no longer relevant to reduce processing overhead - -## Important notes - -- Keyterms prompts longer than 50 characters are ignored. -- Requests containing more than 100 keyterms will result in an error. - -## Best practices - -To maximize the effectiveness of keyterms prompting: - -- Specify Unique Terminology: Include proper names, company names, technical terms, or vocabulary specific to your domain that might not be commonly recognized. -- Exact Spelling and Capitalization: Provide keyterms with the precise spelling and capitalization you expect to see in the output transcript. This helps the system accurately identify the terms. -- Avoid Common Words: Do not include single, common English words (e.g., "information") as keyterms. The system is generally proficient with such words, and adding them as keyterms can be redundant. 
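The dynamic keyterms section above sends `UpdateConfiguration` payloads as raw JSON strings. A small helper that enforces the documented limits (at most 100 keyterms per session, 50 characters per term, with longer terms ignored) before sending can avoid malformed requests. This is a minimal sketch, assuming the `websocket-client` style `ws.send()` used in the keyterms Python example above; the helper name is hypothetical.

```python
import json

MAX_KEYTERMS = 100        # documented per-session limit
MAX_KEYTERM_LENGTH = 50   # documented per-term character limit; longer terms are ignored

def update_keyterms(ws, keyterms):
    """Send an UpdateConfiguration message over an open streaming WebSocket.

    Sketch only: assumes `ws` is a websocket-client connection whose send()
    accepts a JSON string, as in the keyterms Python quickstart above.
    """
    valid = [k for k in keyterms if len(k) <= MAX_KEYTERM_LENGTH][:MAX_KEYTERMS]
    ws.send(json.dumps({"type": "UpdateConfiguration", "keyterms_prompt": valid}))

# Example usage (hypothetical terms): swap in new vocabulary mid-conversation,
# then clear it to reset context biasing.
# update_keyterms(ws_app, ["AssemblyAI", "Universal-2"])
# update_keyterms(ws_app, [])  # empty list removes all keyterms
```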
diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-javascript.js b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-javascript.js new file mode 100644 index 00000000..611aef96 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-javascript.js @@ -0,0 +1,264 @@ +const WebSocket = require("ws"); +const mic = require("mic"); +const querystring = require("querystring"); +const fs = require("fs"); + +// --- Configuration --- +const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key +const CONNECTION_PARAMS = { + sample_rate: 16000, + format_turns: true, // Request formatted final transcripts + keyterms_prompt: JSON.stringify([ + "Keanu Reeves", + "AssemblyAI", + "Universal-2", + ]), +}; +const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"; +const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`; + +// Audio Configuration +const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate; +const CHANNELS = 1; + +// Global variables +let micInstance = null; +let micInputStream = null; +let ws = null; +let stopRequested = false; + +// WAV recording variables +let recordedFrames = []; // Store audio frames for WAV file + +// --- Helper functions --- +function clearLine() { + process.stdout.write("\r" + " ".repeat(80) + "\r"); +} + +function formatTimestamp(timestamp) { + return new Date(timestamp * 1000).toISOString(); +} + +function createWavHeader(sampleRate, channels, dataLength) { + const buffer = Buffer.alloc(44); + + // RIFF header + buffer.write("RIFF", 0); + buffer.writeUInt32LE(36 + dataLength, 4); + buffer.write("WAVE", 8); + + // fmt chunk + buffer.write("fmt ", 12); + buffer.writeUInt32LE(16, 16); // fmt chunk size + buffer.writeUInt16LE(1, 20); // PCM format + buffer.writeUInt16LE(channels, 22); + buffer.writeUInt32LE(sampleRate, 24); + buffer.writeUInt32LE(sampleRate * channels * 2, 28); // byte rate + buffer.writeUInt16LE(channels * 2, 32); // block align + buffer.writeUInt16LE(16, 34); // bits per sample + + // data chunk + buffer.write("data", 36); + buffer.writeUInt32LE(dataLength, 40); + + return buffer; +} + +function saveWavFile() { + if (recordedFrames.length === 0) { + console.log("No audio data recorded."); + return; + } + + // Generate filename with timestamp + const timestamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + const filename = `recorded_audio_${timestamp}.wav`; + + try { + // Combine all recorded frames + const audioData = Buffer.concat(recordedFrames); + const dataLength = audioData.length; + + // Create WAV header + const wavHeader = createWavHeader(SAMPLE_RATE, CHANNELS, dataLength); + + // Write WAV file + const wavFile = Buffer.concat([wavHeader, audioData]); + fs.writeFileSync(filename, wavFile); + + console.log(`Audio saved to: ${filename}`); + console.log( + `Duration: ${(dataLength / (SAMPLE_RATE * CHANNELS * 2)).toFixed(2)} seconds` + ); + } catch (error) { + console.error(`Error saving WAV file: ${error}`); + } +} + +// --- Main function --- +async function run() { + console.log("Starting AssemblyAI real-time transcription..."); + console.log("Audio will be saved to a WAV file when the session ends."); + + // Initialize WebSocket connection + ws = new WebSocket(API_ENDPOINT, { + headers: { + Authorization: YOUR_API_KEY, + }, + }); + + // Setup WebSocket event handlers + ws.on("open", () => { + console.log("WebSocket 
connection opened."); + console.log(`Connected to: ${API_ENDPOINT}`); + // Start the microphone + startMicrophone(); + }); + + ws.on("message", (message) => { + try { + const data = JSON.parse(message); + const msgType = data.type; + + if (msgType === "Begin") { + const sessionId = data.id; + const expiresAt = data.expires_at; + console.log( + `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}` + ); + } else if (msgType === "Turn") { + const transcript = data.transcript || ""; + const formatted = data.turn_is_formatted; + + if (formatted) { + clearLine(); + console.log(transcript); + } else { + process.stdout.write(`\r${transcript}`); + } + } else if (msgType === "Termination") { + const audioDuration = data.audio_duration_seconds; + const sessionDuration = data.session_duration_seconds; + console.log( + `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s` + ); + } + } catch (error) { + console.error(`\nError handling message: ${error}`); + console.error(`Message data: ${message}`); + } + }); + + ws.on("error", (error) => { + console.error(`\nWebSocket Error: ${error}`); + cleanup(); + }); + + ws.on("close", (code, reason) => { + console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`); + cleanup(); + }); + + // Handle process termination + setupTerminationHandlers(); +} + +function startMicrophone() { + try { + micInstance = mic({ + rate: SAMPLE_RATE.toString(), + channels: CHANNELS.toString(), + debug: false, + exitOnSilence: 6, // This won't actually exit, just a parameter for mic + }); + + micInputStream = micInstance.getAudioStream(); + + micInputStream.on("data", (data) => { + if (ws && ws.readyState === WebSocket.OPEN && !stopRequested) { + // Store audio data for WAV recording + recordedFrames.push(Buffer.from(data)); + + // Send audio data to WebSocket + ws.send(data); + } + }); + + micInputStream.on("error", (err) => { + console.error(`Microphone Error: ${err}`); + cleanup(); + }); + + micInstance.start(); + console.log("Microphone stream opened successfully."); + console.log("Speak into your microphone. Press Ctrl+C to stop."); + } catch (error) { + console.error(`Error opening microphone stream: ${error}`); + cleanup(); + } +} + +function cleanup() { + stopRequested = true; + + // Save recorded audio to WAV file + saveWavFile(); + + // Stop microphone if it's running + if (micInstance) { + try { + micInstance.stop(); + } catch (error) { + console.error(`Error stopping microphone: ${error}`); + } + micInstance = null; + } + + // Close WebSocket connection if it's open + if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) { + try { + // Send termination message if possible + if (ws.readyState === WebSocket.OPEN) { + const terminateMessage = { type: "Terminate" }; + console.log( + `Sending termination message: ${JSON.stringify(terminateMessage)}` + ); + ws.send(JSON.stringify(terminateMessage)); + } + ws.close(); + } catch (error) { + console.error(`Error closing WebSocket: ${error}`); + } + ws = null; + } + + console.log("Cleanup complete."); +} + +function setupTerminationHandlers() { + // Handle Ctrl+C and other termination signals + process.on("SIGINT", () => { + console.log("\nCtrl+C received. Stopping..."); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(0), 1000); + }); + + process.on("SIGTERM", () => { + console.log("\nTermination signal received. 
Stopping..."); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(0), 1000); + }); + + // Handle uncaught exceptions + process.on("uncaughtException", (error) => { + console.error(`\nUncaught exception: ${error}`); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(1), 1000); + }); +} + +// Start the application +run(); \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-python.py b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-python.py new file mode 100644 index 00000000..e8356e68 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/code/quickstart-python.py @@ -0,0 +1,244 @@ +import pyaudio +import websocket +import json +import threading +import time +import wave +from urllib.parse import urlencode +from datetime import datetime + +# --- Configuration --- +YOUR_API_KEY = "YOUR-API-KEY" # Replace with your actual API key + +CONNECTION_PARAMS = { + "sample_rate": 16000, + "format_turns": True, # Request formatted final transcripts + "keyterms_prompt": json.dumps(["Keanu Reeves", "AssemblyAI", "Universal-2"]) +} +API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" +API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}" + +# Audio Configuration +FRAMES_PER_BUFFER = 800 # 50ms of audio (0.05s * 16000Hz) +SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"] +CHANNELS = 1 +FORMAT = pyaudio.paInt16 + +# Global variables for audio stream and websocket +audio = None +stream = None +ws_app = None +audio_thread = None +stop_event = threading.Event() # To signal the audio thread to stop + +# WAV recording variables +recorded_frames = [] # Store audio frames for WAV file +recording_lock = threading.Lock() # Thread-safe access to recorded_frames + +# --- WebSocket Event Handlers --- + + +def on_open(ws): + """Called when the WebSocket connection is established.""" + print("WebSocket connection opened.") + print(f"Connected to: {API_ENDPOINT}") + + # Start sending audio data in a separate thread + def stream_audio(): + global stream + print("Starting audio streaming...") + while not stop_event.is_set(): + try: + audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) + + # Store audio data for WAV recording + with recording_lock: + recorded_frames.append(audio_data) + + # Send audio data as binary message + ws.send(audio_data, websocket.ABNF.OPCODE_BINARY) + except Exception as e: + print(f"Error streaming audio: {e}") + # If stream read fails, likely means it's closed, stop the loop + break + print("Audio streaming stopped.") + + global audio_thread + audio_thread = threading.Thread(target=stream_audio) + audio_thread.daemon = ( + True # Allow main thread to exit even if this thread is running + ) + audio_thread.start() + +def on_message(ws, message): + try: + data = json.loads(message) + msg_type = data.get('type') + + if msg_type == "Begin": + session_id = data.get('id') + expires_at = data.get('expires_at') + print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}") + elif msg_type == "Turn": + transcript = data.get('transcript', '') + formatted = data.get('turn_is_formatted', False) + + # Clear previous line for formatted messages + if formatted: + print('\r' + ' ' * 80 + '\r', end='') + print(transcript) + else: + print(f"\r{transcript}", end='') + elif msg_type == "Termination": 
+ audio_duration = data.get('audio_duration_seconds', 0) + session_duration = data.get('session_duration_seconds', 0) + print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s") + except json.JSONDecodeError as e: + print(f"Error decoding message: {e}") + except Exception as e: + print(f"Error handling message: {e}") + +def on_error(ws, error): + """Called when a WebSocket error occurs.""" + print(f"\nWebSocket Error: {error}") + # Attempt to signal stop on error + stop_event.set() + + +def on_close(ws, close_status_code, close_msg): + """Called when the WebSocket connection is closed.""" + print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}") + + # Save recorded audio to WAV file + save_wav_file() + + # Ensure audio resources are released + global stream, audio + stop_event.set() # Signal audio thread just in case it's still running + + if stream: + if stream.is_active(): + stream.stop_stream() + stream.close() + stream = None + if audio: + audio.terminate() + audio = None + # Try to join the audio thread to ensure clean exit + if audio_thread and audio_thread.is_alive(): + audio_thread.join(timeout=1.0) + + +def save_wav_file(): + """Save recorded audio frames to a WAV file.""" + if not recorded_frames: + print("No audio data recorded.") + return + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"recorded_audio_{timestamp}.wav" + + try: + with wave.open(filename, 'wb') as wf: + wf.setnchannels(CHANNELS) + wf.setsampwidth(2) # 16-bit = 2 bytes + wf.setframerate(SAMPLE_RATE) + + # Write all recorded frames + with recording_lock: + wf.writeframes(b''.join(recorded_frames)) + + print(f"Audio saved to: {filename}") + print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds") + + except Exception as e: + print(f"Error saving WAV file: {e}") + + +# --- Main Execution --- +def run(): + global audio, stream, ws_app + + # Initialize PyAudio + audio = pyaudio.PyAudio() + + # Open microphone stream + try: + stream = audio.open( + input=True, + frames_per_buffer=FRAMES_PER_BUFFER, + channels=CHANNELS, + format=FORMAT, + rate=SAMPLE_RATE, + ) + print("Microphone stream opened successfully.") + print("Speak into your microphone. Press Ctrl+C to stop.") + print("Audio will be saved to a WAV file when the session ends.") + except Exception as e: + print(f"Error opening microphone stream: {e}") + if audio: + audio.terminate() + return # Exit if microphone cannot be opened + + # Create WebSocketApp + ws_app = websocket.WebSocketApp( + API_ENDPOINT, + header={"Authorization": YOUR_API_KEY}, + on_open=on_open, + on_message=on_message, + on_error=on_error, + on_close=on_close, + ) + + # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt + ws_thread = threading.Thread(target=ws_app.run_forever) + ws_thread.daemon = True + ws_thread.start() + + try: + # Keep main thread alive until interrupted + while ws_thread.is_alive(): + time.sleep(0.1) + except KeyboardInterrupt: + print("\nCtrl+C received. 
Stopping...") + stop_event.set() # Signal audio thread to stop + + # Send termination message to the server + if ws_app and ws_app.sock and ws_app.sock.connected: + try: + terminate_message = {"type": "Terminate"} + print(f"Sending termination message: {json.dumps(terminate_message)}") + ws_app.send(json.dumps(terminate_message)) + # Give a moment for messages to process before forceful close + time.sleep(5) + except Exception as e: + print(f"Error sending termination message: {e}") + + # Close the WebSocket connection (will trigger on_close) + if ws_app: + ws_app.close() + + # Wait for WebSocket thread to finish + ws_thread.join(timeout=2.0) + + except Exception as e: + print(f"\nAn unexpected error occurred: {e}") + stop_event.set() + if ws_app: + ws_app.close() + ws_thread.join(timeout=2.0) + + finally: + # Final cleanup (already handled in on_close, but good as a fallback) + if stream and stream.is_active(): + stream.stop_stream() + if stream: + stream.close() + if audio: + audio.terminate() + print("Cleanup complete. Exiting.") + + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/universal-streaming-keyterms.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/universal-streaming-keyterms.mdx new file mode 100644 index 00000000..a9fe8d8c --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming-keyterms/universal-streaming-keyterms.mdx @@ -0,0 +1,127 @@ +# Keyterms prompting for Universal-Streaming + +The keyterms prompting feature helps improve recognition accuracy for specific words and phrases that are important to your use case. + + + +Keyterms Prompting costs an additional $0.04/hour. + + + +## Quickstart + + + +Firstly, install the required dependencies. + + + +```bash +pip install websocket-client pyaudio +``` + + + + + +```bash +npm install ws mic +``` + + + + + + + + + + + + +## Configuration + +To utilize keyterms prompting, you need to include your desired keyterms as query parameters in the WebSocket URL. + +- You can include a maximum of 100 keyterms per session. +- Each individual keyterm string must be 50 characters or less in length. + +## How it works + +Streaming Keyterms Prompting has two components to improve accuracy for your terms. + +### Word-level boosting + +The streaming model itself is biased during inference to be more accurate at identifying words from your keyterms list. This happens in real-time as words are emitted during the streaming process, providing immediate improvements to recognition accuracy. This component is enabled by default. + +### Turn-level boosting + +After each turn is completed, an additional boosting pass analyzes the full transcript using your keyterms list. This post-processing step, similar to formatting, provides a second layer of accuracy improvement by examining the complete context of the turn. To enable this component, set `format_turns` to `True`. + +Both stages work together to maximize recognition accuracy for your keyterms throughout the streaming process. + +## Dynamic keyterms prompting + +Dynamic keyterms prompting allows you to update keyterms during an active streaming session using the `UpdateConfiguration` message. This enables you to adapt the recognition context in real-time based on conversation flow or changing requirements. 
+ +### Updating keyterms during a session + +To update keyterms while streaming, send an `UpdateConfiguration` message with a new `keyterms_prompt` array: + + + + + +```python +# Replace or establish new set of keyterms +websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": ["Universal-3"]}') + +# Remove keyterms and reset context biasing +websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": []}') +``` + + + + + +```javascript +// Replace or establish new set of keyterms +websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": ["Universal-3"]}'); + +// Remove keyterms and reset context biasing +websocket.send('{"type": "UpdateConfiguration", "keyterms_prompt": []}'); +``` + + + + + +### How dynamic keyterms work + +When you send an `UpdateConfiguration` message: + +- **Replacing keyterms**: Providing a new array of keyterms completely replaces the existing set. The new keyterms take effect immediately for subsequent audio processing. +- **Clearing keyterms**: Sending an empty array `[]` removes all keyterms and resets context biasing to the default state. +- **Both boosting stages**: Dynamic keyterms work with both word-level boosting (native context biasing) and turn-level boosting (metaphone-based), just like initial keyterms. + +### Use cases for dynamic keyterms + +Dynamic keyterms are particularly useful for: + +- **Context-aware voice agents**: Update keyterms based on conversation stage (e.g., switching from menu items to payment terms) +- **Multi-topic conversations**: Adapt vocabulary as the conversation topic changes +- **Progressive disclosure**: Add relevant keyterms as new information becomes available +- **Cleanup**: Remove keyterms that are no longer relevant to reduce processing overhead + +## Important notes + +- Keyterms prompts longer than 50 characters are ignored. +- Requests containing more than 100 keyterms will result in an error. + +## Best practices + +To maximize the effectiveness of keyterms prompting: + +- Specify Unique Terminology: Include proper names, company names, technical terms, or vocabulary specific to your domain that might not be commonly recognized. +- Exact Spelling and Capitalization: Provide keyterms with the precise spelling and capitalization you expect to see in the output transcript. This helps the system accurately identify the terms. +- Avoid Common Words: Do not include single, common English words (e.g., "information") as keyterms. The system is generally proficient with such words, and adding them as keyterms can be redundant. diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx deleted file mode 100644 index 235c087b..00000000 --- a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx +++ /dev/null @@ -1,1074 +0,0 @@ ---- -title: "Streaming Audio" -description: "Transcribe live audio with Streaming Speech-to-Text" ---- - - - By default, Universal-Streaming is set to transcribe English audio. - If you'd like to enable multilingual streaming (support for English, Spanish, French, German, Italian, and Portuguese), enable [multilingual transcription](/docs/speech-to-text/universal-streaming/multilingual-transcription) instead. - - - - Streaming is now available in EU-West via `streaming.eu.assemblyai.com`. To use the EU streaming endpoint, replace `streaming.assemblyai.com` with `streaming.eu.assemblyai.com` in your connection configuration. 
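If you need to switch between the default and EU-West endpoints, the only change is the hostname in your connection configuration. A minimal sketch of selecting the base URL by region follows; the `use_eu_endpoint` flag is illustrative and not an API parameter.

```python
# Illustrative region switch: only the hostname changes between regions.
use_eu_endpoint = False  # set to True to stream via the EU-West endpoint

host = "streaming.eu.assemblyai.com" if use_eu_endpoint else "streaming.assemblyai.com"
api_endpoint_base_url = f"wss://{host}/v3/ws"
print(api_endpoint_base_url)
```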
- - -## Quickstart - -In this quick guide you will learn how to use AssemblyAI's Streaming Speech-to-Text feature to transcribe audio from your microphone. - -To run this quickstart you will need: - -- Python or JavaScript installed -- A valid AssemblyAI API key - -To run the quickstart: - - - - - - - Create a new Python file (for example, `main.py`) and paste the code provided below inside. - - - Insert your API key to line 17. - - - Install the necessary libraries - - ```bash - pip install assemblyai pyaudio - ``` - - - - Run with `python main.py` - - - - - - - - - - Create a new Python file (for example, `main.py`) and paste the code provided below inside. - - - Insert your API key to line 11. - - - Install the necessary libraries - - ```bash - pip install websocket-client pyaudio - ``` - - - - Run with `python main.py` - - - - - - - - - - Create a new JavaScript file (for example, `main.js`) and paste the code provided below inside. - - - Insert your API key to line 7. - - - Install the necessary libraries - - ```bash - npm install assemblyai node-record-lpcm16 - ``` - - - - Run with `node main.js` - - - - - - - - - Create a new JavaScript file (for example, `main.js`) and paste the code provided below inside. - - - Insert your API key to line 7. - - - Install the necessary libraries - - ```bash - npm install ws mic - ``` - - - - Run with `node main.js` - - - - - - - - - -```python -import logging -from typing import Type - -import assemblyai as aai -from assemblyai.streaming.v3 import ( - BeginEvent, - StreamingClient, - StreamingClientOptions, - StreamingError, - StreamingEvents, - StreamingParameters, - StreamingSessionParameters, - TerminationEvent, - TurnEvent, -) - -api_key = "" - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(**name**) - -def on_begin(self: Type[StreamingClient], event: BeginEvent): -print(f"Session started: {event.id}") - -def on_turn(self: Type[StreamingClient], event: TurnEvent): -print(f"{event.transcript} ({event.end_of_turn})") - - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - -def on_terminated(self: Type[StreamingClient], event: TerminationEvent): -print( -f"Session terminated: {event.audio_duration_seconds} seconds of audio processed" -) - -def on_error(self: Type[StreamingClient], error: StreamingError): -print(f"Error occurred: {error}") - -def main(): -client = StreamingClient( -StreamingClientOptions( -api_key=api_key, -api_host="streaming.assemblyai.com", -) -) - - client.on(StreamingEvents.Begin, on_begin) - client.on(StreamingEvents.Turn, on_turn) - client.on(StreamingEvents.Termination, on_terminated) - client.on(StreamingEvents.Error, on_error) - - client.connect( - StreamingParameters( - sample_rate=16000, - format_turns=True, - ) - ) - - try: - client.stream( - aai.extras.MicrophoneStream(sample_rate=16000) - ) - finally: - client.disconnect(terminate=True) - -if **name** == "**main**": -main() - -```` - - - - -```python -import pyaudio -import websocket -import json -import threading -import time -import wave -from urllib.parse import urlencode -from datetime import datetime - -# --- Configuration --- -YOUR_API_KEY = "YOUR-API-KEY" # Replace with your actual API key - -CONNECTION_PARAMS = { - "sample_rate": 16000, - "format_turns": True, # Request formatted final transcripts -} -API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" -API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}" - -# 
Audio Configuration -FRAMES_PER_BUFFER = 800 # 50ms of audio (0.05s * 16000Hz) -SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"] -CHANNELS = 1 -FORMAT = pyaudio.paInt16 - -# Global variables for audio stream and websocket -audio = None -stream = None -ws_app = None -audio_thread = None -stop_event = threading.Event() # To signal the audio thread to stop - -# WAV recording variables -recorded_frames = [] # Store audio frames for WAV file -recording_lock = threading.Lock() # Thread-safe access to recorded_frames - -# --- WebSocket Event Handlers --- - - -def on_open(ws): - """Called when the WebSocket connection is established.""" - print("WebSocket connection opened.") - print(f"Connected to: {API_ENDPOINT}") - - # Start sending audio data in a separate thread - def stream_audio(): - global stream - print("Starting audio streaming...") - while not stop_event.is_set(): - try: - audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) - - # Store audio data for WAV recording - with recording_lock: - recorded_frames.append(audio_data) - - # Send audio data as binary message - ws.send(audio_data, websocket.ABNF.OPCODE_BINARY) - except Exception as e: - print(f"Error streaming audio: {e}") - # If stream read fails, likely means it's closed, stop the loop - break - print("Audio streaming stopped.") - - global audio_thread - audio_thread = threading.Thread(target=stream_audio) - audio_thread.daemon = ( - True # Allow main thread to exit even if this thread is running - ) - audio_thread.start() - -def on_message(ws, message): - try: - data = json.loads(message) - msg_type = data.get('type') - - if msg_type == "Begin": - session_id = data.get('id') - expires_at = data.get('expires_at') - print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}") - elif msg_type == "Turn": - transcript = data.get('transcript', '') - formatted = data.get('turn_is_formatted', False) - - # Clear previous line for formatted messages - if formatted: - print('\r' + ' ' * 80 + '\r', end='') - print(transcript) - else: - print(f"\r{transcript}", end='') - elif msg_type == "Termination": - audio_duration = data.get('audio_duration_seconds', 0) - session_duration = data.get('session_duration_seconds', 0) - print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s") - except json.JSONDecodeError as e: - print(f"Error decoding message: {e}") - except Exception as e: - print(f"Error handling message: {e}") - -def on_error(ws, error): - """Called when a WebSocket error occurs.""" - print(f"\nWebSocket Error: {error}") - # Attempt to signal stop on error - stop_event.set() - - -def on_close(ws, close_status_code, close_msg): - """Called when the WebSocket connection is closed.""" - print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}") - - # Save recorded audio to WAV file - save_wav_file() - - # Ensure audio resources are released - global stream, audio - stop_event.set() # Signal audio thread just in case it's still running - - if stream: - if stream.is_active(): - stream.stop_stream() - stream.close() - stream = None - if audio: - audio.terminate() - audio = None - # Try to join the audio thread to ensure clean exit - if audio_thread and audio_thread.is_alive(): - audio_thread.join(timeout=1.0) - - -def save_wav_file(): - """Save recorded audio frames to a WAV file.""" - if not recorded_frames: - print("No audio data recorded.") - return - - # Generate filename with timestamp - timestamp = 
datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"recorded_audio_{timestamp}.wav" - - try: - with wave.open(filename, 'wb') as wf: - wf.setnchannels(CHANNELS) - wf.setsampwidth(2) # 16-bit = 2 bytes - wf.setframerate(SAMPLE_RATE) - - # Write all recorded frames - with recording_lock: - wf.writeframes(b''.join(recorded_frames)) - - print(f"Audio saved to: {filename}") - print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds") - - except Exception as e: - print(f"Error saving WAV file: {e}") - - -# --- Main Execution --- -def run(): - global audio, stream, ws_app - - # Initialize PyAudio - audio = pyaudio.PyAudio() - - # Open microphone stream - try: - stream = audio.open( - input=True, - frames_per_buffer=FRAMES_PER_BUFFER, - channels=CHANNELS, - format=FORMAT, - rate=SAMPLE_RATE, - ) - print("Microphone stream opened successfully.") - print("Speak into your microphone. Press Ctrl+C to stop.") - print("Audio will be saved to a WAV file when the session ends.") - except Exception as e: - print(f"Error opening microphone stream: {e}") - if audio: - audio.terminate() - return # Exit if microphone cannot be opened - - # Create WebSocketApp - ws_app = websocket.WebSocketApp( - API_ENDPOINT, - header={"Authorization": YOUR_API_KEY}, - on_open=on_open, - on_message=on_message, - on_error=on_error, - on_close=on_close, - ) - - # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt - ws_thread = threading.Thread(target=ws_app.run_forever) - ws_thread.daemon = True - ws_thread.start() - - try: - # Keep main thread alive until interrupted - while ws_thread.is_alive(): - time.sleep(0.1) - except KeyboardInterrupt: - print("\nCtrl+C received. Stopping...") - stop_event.set() # Signal audio thread to stop - - # Send termination message to the server - if ws_app and ws_app.sock and ws_app.sock.connected: - try: - terminate_message = {"type": "Terminate"} - print(f"Sending termination message: {json.dumps(terminate_message)}") - ws_app.send(json.dumps(terminate_message)) - # Give a moment for messages to process before forceful close - time.sleep(5) - except Exception as e: - print(f"Error sending termination message: {e}") - - # Close the WebSocket connection (will trigger on_close) - if ws_app: - ws_app.close() - - # Wait for WebSocket thread to finish - ws_thread.join(timeout=2.0) - - except Exception as e: - print(f"\nAn unexpected error occurred: {e}") - stop_event.set() - if ws_app: - ws_app.close() - ws_thread.join(timeout=2.0) - - finally: - # Final cleanup (already handled in on_close, but good as a fallback) - if stream and stream.is_active(): - stream.stop_stream() - if stream: - stream.close() - if audio: - audio.terminate() - print("Cleanup complete. 
Exiting.") - - -if __name__ == "__main__": - run() -```` - - - - - -```javascript -import { Readable } from "stream"; -import { AssemblyAI } from "assemblyai"; -import recorder from "node-record-lpcm16"; - -const run = async () => { - const client = new AssemblyAI({ - apiKey: "", - }); - - const transcriber = client.streaming.transcriber({ - sampleRate: 16_000, - formatTurns: true, - }); - - transcriber.on("open", ({ id }) => { - console.log(`Session opened with ID: ${id}`); - }); - - transcriber.on("error", (error) => { - console.error("Error:", error); - }); - - transcriber.on("close", (code, reason) => - console.log("Session closed:", code, reason) - ); - - transcriber.on("turn", (turn) => { - if (!turn.transcript) { - return; - } - - console.log("Turn:", turn.transcript); - }); - - try { - console.log("Connecting to streaming transcript service"); - - await transcriber.connect(); - - console.log("Starting recording"); - - const recording = recorder.record({ - channels: 1, - sampleRate: 16_000, - audioType: "wav", // Linear PCM - }); - - Readable.toWeb(recording.stream()).pipeTo(transcriber.stream()); - - // Stop recording and close connection using Ctrl-C. - - process.on("SIGINT", async function () { - console.log(); - console.log("Stopping recording"); - recording.stop(); - - console.log("Closing streaming transcript connection"); - await transcriber.close(); - - process.exit(); - }); - } catch (error) { - console.error(error); - } -}; - -run(); -``` - - - - - -```javascript -const WebSocket = require("ws"); -const mic = require("mic"); -const querystring = require("querystring"); -const fs = require("fs"); - -// --- Configuration --- -const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key -const CONNECTION_PARAMS = { - sample_rate: 16000, - format_turns: true, // Request formatted final transcripts -}; -const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"; -const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`; - -// Audio Configuration -const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate; -const CHANNELS = 1; - -// Global variables -let micInstance = null; -let micInputStream = null; -let ws = null; -let stopRequested = false; - -// WAV recording variables -let recordedFrames = []; // Store audio frames for WAV file - -// --- Helper functions --- -function clearLine() { - process.stdout.write("\r" + " ".repeat(80) + "\r"); -} - -function formatTimestamp(timestamp) { - return new Date(timestamp * 1000).toISOString(); -} - -function createWavHeader(sampleRate, channels, dataLength) { - const buffer = Buffer.alloc(44); - - // RIFF header - buffer.write("RIFF", 0); - buffer.writeUInt32LE(36 + dataLength, 4); - buffer.write("WAVE", 8); - - // fmt chunk - buffer.write("fmt ", 12); - buffer.writeUInt32LE(16, 16); // fmt chunk size - buffer.writeUInt16LE(1, 20); // PCM format - buffer.writeUInt16LE(channels, 22); - buffer.writeUInt32LE(sampleRate, 24); - buffer.writeUInt32LE(sampleRate * channels * 2, 28); // byte rate - buffer.writeUInt16LE(channels * 2, 32); // block align - buffer.writeUInt16LE(16, 34); // bits per sample - - // data chunk - buffer.write("data", 36); - buffer.writeUInt32LE(dataLength, 40); - - return buffer; -} - -function saveWavFile() { - if (recordedFrames.length === 0) { - console.log("No audio data recorded."); - return; - } - - // Generate filename with timestamp - const timestamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); - const filename = `recorded_audio_${timestamp}.wav`; 
- - try { - // Combine all recorded frames - const audioData = Buffer.concat(recordedFrames); - const dataLength = audioData.length; - - // Create WAV header - const wavHeader = createWavHeader(SAMPLE_RATE, CHANNELS, dataLength); - - // Write WAV file - const wavFile = Buffer.concat([wavHeader, audioData]); - fs.writeFileSync(filename, wavFile); - - console.log(`Audio saved to: ${filename}`); - console.log( - `Duration: ${(dataLength / (SAMPLE_RATE * CHANNELS * 2)).toFixed(2)} seconds` - ); - } catch (error) { - console.error(`Error saving WAV file: ${error}`); - } -} - -// --- Main function --- -async function run() { - console.log("Starting AssemblyAI real-time transcription..."); - console.log("Audio will be saved to a WAV file when the session ends."); - - // Initialize WebSocket connection - ws = new WebSocket(API_ENDPOINT, { - headers: { - Authorization: YOUR_API_KEY, - }, - }); - - // Setup WebSocket event handlers - ws.on("open", () => { - console.log("WebSocket connection opened."); - console.log(`Connected to: ${API_ENDPOINT}`); - // Start the microphone - startMicrophone(); - }); - - ws.on("message", (message) => { - try { - const data = JSON.parse(message); - const msgType = data.type; - - if (msgType === "Begin") { - const sessionId = data.id; - const expiresAt = data.expires_at; - console.log( - `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}` - ); - } else if (msgType === "Turn") { - const transcript = data.transcript || ""; - const formatted = data.turn_is_formatted; - - if (formatted) { - clearLine(); - console.log(transcript); - } else { - process.stdout.write(`\r${transcript}`); - } - } else if (msgType === "Termination") { - const audioDuration = data.audio_duration_seconds; - const sessionDuration = data.session_duration_seconds; - console.log( - `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s` - ); - } - } catch (error) { - console.error(`\nError handling message: ${error}`); - console.error(`Message data: ${message}`); - } - }); - - ws.on("error", (error) => { - console.error(`\nWebSocket Error: ${error}`); - cleanup(); - }); - - ws.on("close", (code, reason) => { - console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`); - cleanup(); - }); - - // Handle process termination - setupTerminationHandlers(); -} - -function startMicrophone() { - try { - micInstance = mic({ - rate: SAMPLE_RATE.toString(), - channels: CHANNELS.toString(), - debug: false, - exitOnSilence: 6, // This won't actually exit, just a parameter for mic - }); - - micInputStream = micInstance.getAudioStream(); - - micInputStream.on("data", (data) => { - if (ws && ws.readyState === WebSocket.OPEN && !stopRequested) { - // Store audio data for WAV recording - recordedFrames.push(Buffer.from(data)); - - // Send audio data to WebSocket - ws.send(data); - } - }); - - micInputStream.on("error", (err) => { - console.error(`Microphone Error: ${err}`); - cleanup(); - }); - - micInstance.start(); - console.log("Microphone stream opened successfully."); - console.log("Speak into your microphone. 
Press Ctrl+C to stop."); - } catch (error) { - console.error(`Error opening microphone stream: ${error}`); - cleanup(); - } -} - -function cleanup() { - stopRequested = true; - - // Save recorded audio to WAV file - saveWavFile(); - - // Stop microphone if it's running - if (micInstance) { - try { - micInstance.stop(); - } catch (error) { - console.error(`Error stopping microphone: ${error}`); - } - micInstance = null; - } - - // Close WebSocket connection if it's open - if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) { - try { - // Send termination message if possible - if (ws.readyState === WebSocket.OPEN) { - const terminateMessage = { type: "Terminate" }; - console.log( - `Sending termination message: ${JSON.stringify(terminateMessage)}` - ); - ws.send(JSON.stringify(terminateMessage)); - } - ws.close(); - } catch (error) { - console.error(`Error closing WebSocket: ${error}`); - } - ws = null; - } - - console.log("Cleanup complete."); -} - -function setupTerminationHandlers() { - // Handle Ctrl+C and other termination signals - process.on("SIGINT", () => { - console.log("\nCtrl+C received. Stopping..."); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(0), 1000); - }); - - process.on("SIGTERM", () => { - console.log("\nTermination signal received. Stopping..."); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(0), 1000); - }); - - // Handle uncaught exceptions - process.on("uncaughtException", (error) => { - console.error(`\nUncaught exception: ${error}`); - cleanup(); - // Give time for cleanup before exiting - setTimeout(() => process.exit(1), 1000); - }); -} - -// Start the application -run(); -``` - - - - -## Core concepts - - - For a message-by-message breakdown of a turn, see our [Streaming API: Message - Sequence Breakdown](/docs/speech-to-text/universal-streaming/message-sequence) - guide. - - -Universal-Streaming is built based upon two core concepts: Turn objects and immutable transcriptions. - -### Turn object - -A Turn object is intended to correspond to a speaking turn in the context of voice agent applications, and therefore it roughly corresponds to an utterance in a broader context. We assign a unique ID to each Turn object, which is included in our response. Specifically, the Universal-Streaming response is formatted as follows: - -```json -{ - "turn_order": 1, - "turn_is_formatted": false, - "end_of_turn": false, - "transcript": "modern medicine is", - "end_of_turn_confidence": 0.7, - "words": [ - { "text": "modern", "word_is_final": true, ... }, - { "text": "medicine", "word_is_final": true, ... }, - { "text": "is", "word_is_final": true, ... }, - { "text": "amazing", "word_is_final": false, ... } - ] -} -``` - -- `turn_order`: Integer that increments with each new turn -- `turn_is_formatted`: Boolean indicating if the text in the transcript field is formatted. Text formatting is enabled when `format_turns` is set to `true`. 
It adds punctuation as well as performs casing and inverse text normalization to display various entities, such as dates, times, and phone numbers, in a human-friendly format -- `end_of_turn`: Boolean indicating if this is the end of the current turn -- `transcript`: String containing only finalized words -- `end_of_turn_confidence`: Floating number (0-1) representing the confidence that the current turn has finished, i.e., the current speaker has completed their turn -- `words`: List of Word objects with individual metadata - -Each Word object in the `words` array includes: - -- `text`: The string representation of the word -- `word_is_final`: Boolean indicating if the word is finalized, where a finalized word means the word won't be altered in future transcription responses -- `start`: Timestamp for word start -- `end`: Timestamp for word end -- `confidence`: Confidence score for the word - -### Immutable transcription - -AssemblyAI's streaming system receives audio in a streaming fashion, it returns transcription responses in real-time using the format specified above. Unlike many other streaming speech-to-text models that implement the concept of partial/variable transcriptions to show transcripts in an ongoing manner, Universal-Streaming transcriptions are immutable. In other words, the text that has already been produced will not be overwritten in future transcription responses. Therefore, with Universal-Streaming, the transcriptions will be delivered in the following way: - -```json -→ hello my na -→ hello my name -→ hello my name -→ hello my name is -→ hello my name is zac -→ hello my name is zack -``` - -When an end of the current turn is detected, you then receive a message with `end_of_turn` being `true`. Additionally, if you enable text formatting, you will also receive a transcription response with `turn_is_formatted` being `true`. - -```json -→ hello my name is zack (unformatted) -→ Hello my name is Zack. (formatted) -``` - -In this example, you may have noticed that the last word of each transcript may occasionally be a subword ("zac" in the example shown above). Each Word object has the `word_is_final` field to indicate whether the model is confident that the last word is a completed word. Note that, except for the last word, `word_is_final` is always true. - -## Use-case specific recommendations - -### Live captioning - -The default setting for Streaming Speech-to-Text is optimized for the Voice Agent use case, where you expect one person speaking with long silences happening during the agent's speaking turn. -For applications such as live captioning, where the input audio stream typically contains multiple people speaking, it is usually beneficial to wait longer before detecting turns, which trigger text formatting. - -When captioning conversations with multiple speakers, we recommend setting `min_end_of_turn_silence_when_confident` to 560 ms. By default, this is set to 400 ms. - -### Voice agents - -To optimize for latency when building a voice agent, we recommend using the unformatted transcript as it’s received more quickly than the formatted version. In typical voice agent applications involving large language models (LLMs), the lack of formatting makes little impact on the subsequent LLM processing. For more information, see [Voice agents](/docs/speech-to-text/universal-streaming/voice-agents). - -## API Reference - -### Connection parameters - - - Authenticate the session using a generated temporary token. - - - - The sample rate of the audio stream. 
- - - - The encoding of the audio stream. Allowed values: `pcm_s16le`, `pcm_mulaw` - - - - Whether to return formatted final transcripts. - - If enabled, formatted final transcripts will be emitted shortly following an - end-of-turn detection. - - - - - - A list of words and phrases to improve recognition accuracy for. - - - [Keyterms prompts](/docs/speech-to-text/universal-streaming/keyterms-prompting) longer than 50 characters are ignored. Requests containing more than 100 keyterms will result in an error. - - - - - - - The speech model for the Streaming session. If not specified, defaults to `universal-streaming-english`. - Allowed values: `universal-streaming-english`, `universal-streaming-multi`. - - - [Multilingual transcription](/docs/speech-to-text/universal-streaming/multilingual-transcription) is currently in beta and supports English, Spanish, French, German, Italian, and Portuguese. - - - - - - The confidence threshold `(0.0 to 1.0)` to use when determining if the end of a turn has been - reached. - - Raise or lower the threshold based on how confident you’d like us to be before triggering end of turn based on confidence score - - - - The minimum amount of silence in `milliseconds` required to detect end of turn - when confident. - - Increase or decrease the amount of time we wait to trigger end of turn when confident - - - - The maximum amount of silence in `milliseconds` allowed in a turn before end of - turn is triggered. - - Lower or raise the amount of time needed to trigger end of turn when end of turn isn't triggered by a high confidence score - - -### Audio requirements - -The audio format must conform to the following requirements: - -- PCM16 or Mu-law encoding (See Specify the encoding) -- A sample rate that matches the value of the `sample_rate` parameter -- Single-channel -- 50 milliseconds of audio per message (recommended) - -### Message types - -You send: - - - - -``` -"\x52\x49\x46\x46\xd8\xc8\x00\x00\x57\x41\x56\x45\x46" -``` - - - - - -```json -{ - "type": "UpdateConfiguration", - "end_of_turn_confidence_threshold": 0.5 -} -``` - - - - - -```json -{ "type": "Terminate" } -``` - - - - - -```json -{ "type": "ForceEndpoint" } -``` - - - - - -You receive: - - - - - ```json - { - "type": "Begin", - "id": "cfd280c7-5a9b-4dd6-8c05-235ccfa3c97f", - "expires_at": 1745483367 - } - ``` - - - - - ```json - { - "turn_order": 0, - "turn_is_formatted": true, - "end_of_turn": true, - "transcript": "Hi, my name is Sonny.", - "end_of_turn_confidence": 0.8095446228981018, - "words": - [ - { - "start": 1440, - "end": 1520, - "text": "Hi,", - "confidence": 0.9967870712280273, - "word_is_final": true - }, - { - "start": 1600, - "end": 1680, - "text": "my", - "confidence": 0.999546468257904, - "word_is_final": true - }, - { - "start": 1600, - "end": 1680, - "text": "name", - "confidence": 0.9597182273864746, - "word_is_final": true - }, - { - "start": 1680, - "end": 1760, - "text": "is", - "confidence": 0.8261497616767883, - "word_is_final": true - }, - { - "start": 2320, - "end": 3040, - "text": "Sonny.", - "confidence": 0.5737350583076477, - "word_is_final": true - } - ], - "type": "Turn" - } - ``` - - For the full breakdown of the message sequence for a turn, see the [Message sequence breakdown guide](/docs/speech-to-text/universal-streaming/message-sequence). 
- - - - - - ```json - { - "type": "Termination", - "audio_duration_seconds": 2000, - "session_duration_seconds": 2000 - } - ``` - - - diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript-sdk.js b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript-sdk.js new file mode 100644 index 00000000..4b4dc50f --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript-sdk.js @@ -0,0 +1,67 @@ +import { Readable } from "stream"; +import { AssemblyAI } from "assemblyai"; +import recorder from "node-record-lpcm16"; + +const run = async () => { + const client = new AssemblyAI({ + apiKey: "", + }); + + const transcriber = client.streaming.transcriber({ + sampleRate: 16_000, + formatTurns: true, + }); + + transcriber.on("open", ({ id }) => { + console.log(`Session opened with ID: ${id}`); + }); + + transcriber.on("error", (error) => { + console.error("Error:", error); + }); + + transcriber.on("close", (code, reason) => + console.log("Session closed:", code, reason) + ); + + transcriber.on("turn", (turn) => { + if (!turn.transcript) { + return; + } + + console.log("Turn:", turn.transcript); + }); + + try { + console.log("Connecting to streaming transcript service"); + + await transcriber.connect(); + + console.log("Starting recording"); + + const recording = recorder.record({ + channels: 1, + sampleRate: 16_000, + audioType: "wav", // Linear PCM + }); + + Readable.toWeb(recording.stream()).pipeTo(transcriber.stream()); + + // Stop recording and close connection using Ctrl-C. + + process.on("SIGINT", async function () { + console.log(); + console.log("Stopping recording"); + recording.stop(); + + console.log("Closing streaming transcript connection"); + await transcriber.close(); + + process.exit(); + }); + } catch (error) { + console.error(error); + } +}; + +run(); \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript.js b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript.js new file mode 100644 index 00000000..cbb9a3eb --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-javascript.js @@ -0,0 +1,259 @@ +const WebSocket = require("ws"); +const mic = require("mic"); +const querystring = require("querystring"); +const fs = require("fs"); + +// --- Configuration --- +const YOUR_API_KEY = "YOUR-API-KEY"; // Replace with your actual API key +const CONNECTION_PARAMS = { + sample_rate: 16000, + format_turns: true, // Request formatted final transcripts +}; +const API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws"; +const API_ENDPOINT = `${API_ENDPOINT_BASE_URL}?${querystring.stringify(CONNECTION_PARAMS)}`; + +// Audio Configuration +const SAMPLE_RATE = CONNECTION_PARAMS.sample_rate; +const CHANNELS = 1; + +// Global variables +let micInstance = null; +let micInputStream = null; +let ws = null; +let stopRequested = false; + +// WAV recording variables +let recordedFrames = []; // Store audio frames for WAV file + +// --- Helper functions --- +function clearLine() { + process.stdout.write("\r" + " ".repeat(80) + "\r"); +} + +function formatTimestamp(timestamp) { + return new Date(timestamp * 1000).toISOString(); +} + +function createWavHeader(sampleRate, channels, dataLength) { + const buffer = Buffer.alloc(44); + + // RIFF header + buffer.write("RIFF", 0); + 
buffer.writeUInt32LE(36 + dataLength, 4); + buffer.write("WAVE", 8); + + // fmt chunk + buffer.write("fmt ", 12); + buffer.writeUInt32LE(16, 16); // fmt chunk size + buffer.writeUInt16LE(1, 20); // PCM format + buffer.writeUInt16LE(channels, 22); + buffer.writeUInt32LE(sampleRate, 24); + buffer.writeUInt32LE(sampleRate * channels * 2, 28); // byte rate + buffer.writeUInt16LE(channels * 2, 32); // block align + buffer.writeUInt16LE(16, 34); // bits per sample + + // data chunk + buffer.write("data", 36); + buffer.writeUInt32LE(dataLength, 40); + + return buffer; +} + +function saveWavFile() { + if (recordedFrames.length === 0) { + console.log("No audio data recorded."); + return; + } + + // Generate filename with timestamp + const timestamp = new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19); + const filename = `recorded_audio_${timestamp}.wav`; + + try { + // Combine all recorded frames + const audioData = Buffer.concat(recordedFrames); + const dataLength = audioData.length; + + // Create WAV header + const wavHeader = createWavHeader(SAMPLE_RATE, CHANNELS, dataLength); + + // Write WAV file + const wavFile = Buffer.concat([wavHeader, audioData]); + fs.writeFileSync(filename, wavFile); + + console.log(`Audio saved to: ${filename}`); + console.log( + `Duration: ${(dataLength / (SAMPLE_RATE * CHANNELS * 2)).toFixed(2)} seconds` + ); + } catch (error) { + console.error(`Error saving WAV file: ${error}`); + } +} + +// --- Main function --- +async function run() { + console.log("Starting AssemblyAI real-time transcription..."); + console.log("Audio will be saved to a WAV file when the session ends."); + + // Initialize WebSocket connection + ws = new WebSocket(API_ENDPOINT, { + headers: { + Authorization: YOUR_API_KEY, + }, + }); + + // Setup WebSocket event handlers + ws.on("open", () => { + console.log("WebSocket connection opened."); + console.log(`Connected to: ${API_ENDPOINT}`); + // Start the microphone + startMicrophone(); + }); + + ws.on("message", (message) => { + try { + const data = JSON.parse(message); + const msgType = data.type; + + if (msgType === "Begin") { + const sessionId = data.id; + const expiresAt = data.expires_at; + console.log( + `\nSession began: ID=${sessionId}, ExpiresAt=${formatTimestamp(expiresAt)}` + ); + } else if (msgType === "Turn") { + const transcript = data.transcript || ""; + const formatted = data.turn_is_formatted; + + if (formatted) { + clearLine(); + console.log(transcript); + } else { + process.stdout.write(`\r${transcript}`); + } + } else if (msgType === "Termination") { + const audioDuration = data.audio_duration_seconds; + const sessionDuration = data.session_duration_seconds; + console.log( + `\nSession Terminated: Audio Duration=${audioDuration}s, Session Duration=${sessionDuration}s` + ); + } + } catch (error) { + console.error(`\nError handling message: ${error}`); + console.error(`Message data: ${message}`); + } + }); + + ws.on("error", (error) => { + console.error(`\nWebSocket Error: ${error}`); + cleanup(); + }); + + ws.on("close", (code, reason) => { + console.log(`\nWebSocket Disconnected: Status=${code}, Msg=${reason}`); + cleanup(); + }); + + // Handle process termination + setupTerminationHandlers(); +} + +function startMicrophone() { + try { + micInstance = mic({ + rate: SAMPLE_RATE.toString(), + channels: CHANNELS.toString(), + debug: false, + exitOnSilence: 6, // This won't actually exit, just a parameter for mic + }); + + micInputStream = micInstance.getAudioStream(); + + micInputStream.on("data", (data) => { + if (ws && 
ws.readyState === WebSocket.OPEN && !stopRequested) { + // Store audio data for WAV recording + recordedFrames.push(Buffer.from(data)); + + // Send audio data to WebSocket + ws.send(data); + } + }); + + micInputStream.on("error", (err) => { + console.error(`Microphone Error: ${err}`); + cleanup(); + }); + + micInstance.start(); + console.log("Microphone stream opened successfully."); + console.log("Speak into your microphone. Press Ctrl+C to stop."); + } catch (error) { + console.error(`Error opening microphone stream: ${error}`); + cleanup(); + } +} + +function cleanup() { + stopRequested = true; + + // Save recorded audio to WAV file + saveWavFile(); + + // Stop microphone if it's running + if (micInstance) { + try { + micInstance.stop(); + } catch (error) { + console.error(`Error stopping microphone: ${error}`); + } + micInstance = null; + } + + // Close WebSocket connection if it's open + if (ws && [WebSocket.OPEN, WebSocket.CONNECTING].includes(ws.readyState)) { + try { + // Send termination message if possible + if (ws.readyState === WebSocket.OPEN) { + const terminateMessage = { type: "Terminate" }; + console.log( + `Sending termination message: ${JSON.stringify(terminateMessage)}` + ); + ws.send(JSON.stringify(terminateMessage)); + } + ws.close(); + } catch (error) { + console.error(`Error closing WebSocket: ${error}`); + } + ws = null; + } + + console.log("Cleanup complete."); +} + +function setupTerminationHandlers() { + // Handle Ctrl+C and other termination signals + process.on("SIGINT", () => { + console.log("\nCtrl+C received. Stopping..."); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(0), 1000); + }); + + process.on("SIGTERM", () => { + console.log("\nTermination signal received. Stopping..."); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(0), 1000); + }); + + // Handle uncaught exceptions + process.on("uncaughtException", (error) => { + console.error(`\nUncaught exception: ${error}`); + cleanup(); + // Give time for cleanup before exiting + setTimeout(() => process.exit(1), 1000); + }); +} + +// Start the application +run(); \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python-sdk.py b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python-sdk.py new file mode 100644 index 00000000..f7fcaa95 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python-sdk.py @@ -0,0 +1,71 @@ +import logging +from typing import Type + +import assemblyai as aai +from assemblyai.streaming.v3 import ( + BeginEvent, + StreamingClient, + StreamingClientOptions, + StreamingError, + StreamingEvents, + StreamingParameters, + StreamingSessionParameters, + TerminationEvent, + TurnEvent, +) + +api_key = "" + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(**name**) + +def on_begin(self: Type[StreamingClient], event: BeginEvent): +print(f"Session started: {event.id}") + +def on_turn(self: Type[StreamingClient], event: TurnEvent): +print(f"{event.transcript} ({event.end_of_turn})") + + if event.end_of_turn and not event.turn_is_formatted: + params = StreamingSessionParameters( + format_turns=True, + ) + + self.set_params(params) + +def on_terminated(self: Type[StreamingClient], event: TerminationEvent): +print( +f"Session terminated: {event.audio_duration_seconds} seconds of audio processed" +) + +def on_error(self: 
Type[StreamingClient], error: StreamingError): +    print(f"Error occurred: {error}") + +def main(): +    client = StreamingClient( +        StreamingClientOptions( +            api_key=api_key, +            api_host="streaming.assemblyai.com", +        ) +    ) + + client.on(StreamingEvents.Begin, on_begin) + client.on(StreamingEvents.Turn, on_turn) + client.on(StreamingEvents.Termination, on_terminated) + client.on(StreamingEvents.Error, on_error) + + client.connect( + StreamingParameters( + sample_rate=16000, + format_turns=True, + ) + ) + + try: + client.stream( + aai.extras.MicrophoneStream(sample_rate=16000) + ) + finally: + client.disconnect(terminate=True) + +if __name__ == "__main__": +    main() diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python.py b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python.py new file mode 100644 index 00000000..7ed43780 --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/code/quickstart-python.py @@ -0,0 +1,243 @@ +import pyaudio +import websocket +import json +import threading +import time +import wave +from urllib.parse import urlencode +from datetime import datetime + +# --- Configuration --- +YOUR_API_KEY = "YOUR-API-KEY" # Replace with your actual API key + +CONNECTION_PARAMS = { + "sample_rate": 16000, + "format_turns": True, # Request formatted final transcripts +} +API_ENDPOINT_BASE_URL = "wss://streaming.assemblyai.com/v3/ws" +API_ENDPOINT = f"{API_ENDPOINT_BASE_URL}?{urlencode(CONNECTION_PARAMS)}" + +# Audio Configuration +FRAMES_PER_BUFFER = 800 # 50ms of audio (0.05s * 16000Hz) +SAMPLE_RATE = CONNECTION_PARAMS["sample_rate"] +CHANNELS = 1 +FORMAT = pyaudio.paInt16 + +# Global variables for audio stream and websocket +audio = None +stream = None +ws_app = None +audio_thread = None +stop_event = threading.Event() # To signal the audio thread to stop + +# WAV recording variables +recorded_frames = [] # Store audio frames for WAV file +recording_lock = threading.Lock() # Thread-safe access to recorded_frames + +# --- WebSocket Event Handlers --- + + +def on_open(ws): + """Called when the WebSocket connection is established.""" + print("WebSocket connection opened.") + print(f"Connected to: {API_ENDPOINT}") + + # Start sending audio data in a separate thread + def stream_audio(): + global stream + print("Starting audio streaming...") + while not stop_event.is_set(): + try: + audio_data = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False) + + # Store audio data for WAV recording + with recording_lock: + recorded_frames.append(audio_data) + + # Send audio data as binary message + ws.send(audio_data, websocket.ABNF.OPCODE_BINARY) + except Exception as e: + print(f"Error streaming audio: {e}") + # If stream read fails, likely means it's closed, stop the loop + break + print("Audio streaming stopped.") + + global audio_thread + audio_thread = threading.Thread(target=stream_audio) + audio_thread.daemon = ( + True # Allow main thread to exit even if this thread is running + ) + audio_thread.start() + +def on_message(ws, message): + try: + data = json.loads(message) + msg_type = data.get('type') + + if msg_type == "Begin": + session_id = data.get('id') + expires_at = data.get('expires_at') + print(f"\nSession began: ID={session_id}, ExpiresAt={datetime.fromtimestamp(expires_at)}") + elif msg_type == "Turn": + transcript = data.get('transcript', '') + formatted = data.get('turn_is_formatted', False) + + # Clear previous line for formatted messages + if formatted: +
print('\r' + ' ' * 80 + '\r', end='') + print(transcript) + else: + print(f"\r{transcript}", end='') + elif msg_type == "Termination": + audio_duration = data.get('audio_duration_seconds', 0) + session_duration = data.get('session_duration_seconds', 0) + print(f"\nSession Terminated: Audio Duration={audio_duration}s, Session Duration={session_duration}s") + except json.JSONDecodeError as e: + print(f"Error decoding message: {e}") + except Exception as e: + print(f"Error handling message: {e}") + +def on_error(ws, error): + """Called when a WebSocket error occurs.""" + print(f"\nWebSocket Error: {error}") + # Attempt to signal stop on error + stop_event.set() + + +def on_close(ws, close_status_code, close_msg): + """Called when the WebSocket connection is closed.""" + print(f"\nWebSocket Disconnected: Status={close_status_code}, Msg={close_msg}") + + # Save recorded audio to WAV file + save_wav_file() + + # Ensure audio resources are released + global stream, audio + stop_event.set() # Signal audio thread just in case it's still running + + if stream: + if stream.is_active(): + stream.stop_stream() + stream.close() + stream = None + if audio: + audio.terminate() + audio = None + # Try to join the audio thread to ensure clean exit + if audio_thread and audio_thread.is_alive(): + audio_thread.join(timeout=1.0) + + +def save_wav_file(): + """Save recorded audio frames to a WAV file.""" + if not recorded_frames: + print("No audio data recorded.") + return + + # Generate filename with timestamp + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"recorded_audio_{timestamp}.wav" + + try: + with wave.open(filename, 'wb') as wf: + wf.setnchannels(CHANNELS) + wf.setsampwidth(2) # 16-bit = 2 bytes + wf.setframerate(SAMPLE_RATE) + + # Write all recorded frames + with recording_lock: + wf.writeframes(b''.join(recorded_frames)) + + print(f"Audio saved to: {filename}") + print(f"Duration: {len(recorded_frames) * FRAMES_PER_BUFFER / SAMPLE_RATE:.2f} seconds") + + except Exception as e: + print(f"Error saving WAV file: {e}") + + +# --- Main Execution --- +def run(): + global audio, stream, ws_app + + # Initialize PyAudio + audio = pyaudio.PyAudio() + + # Open microphone stream + try: + stream = audio.open( + input=True, + frames_per_buffer=FRAMES_PER_BUFFER, + channels=CHANNELS, + format=FORMAT, + rate=SAMPLE_RATE, + ) + print("Microphone stream opened successfully.") + print("Speak into your microphone. Press Ctrl+C to stop.") + print("Audio will be saved to a WAV file when the session ends.") + except Exception as e: + print(f"Error opening microphone stream: {e}") + if audio: + audio.terminate() + return # Exit if microphone cannot be opened + + # Create WebSocketApp + ws_app = websocket.WebSocketApp( + API_ENDPOINT, + header={"Authorization": YOUR_API_KEY}, + on_open=on_open, + on_message=on_message, + on_error=on_error, + on_close=on_close, + ) + + # Run WebSocketApp in a separate thread to allow main thread to catch KeyboardInterrupt + ws_thread = threading.Thread(target=ws_app.run_forever) + ws_thread.daemon = True + ws_thread.start() + + try: + # Keep main thread alive until interrupted + while ws_thread.is_alive(): + time.sleep(0.1) + except KeyboardInterrupt: + print("\nCtrl+C received. 
Stopping...") + stop_event.set() # Signal audio thread to stop + + # Send termination message to the server + if ws_app and ws_app.sock and ws_app.sock.connected: + try: + terminate_message = {"type": "Terminate"} + print(f"Sending termination message: {json.dumps(terminate_message)}") + ws_app.send(json.dumps(terminate_message)) + # Give a moment for messages to process before forceful close + time.sleep(5) + except Exception as e: + print(f"Error sending termination message: {e}") + + # Close the WebSocket connection (will trigger on_close) + if ws_app: + ws_app.close() + + # Wait for WebSocket thread to finish + ws_thread.join(timeout=2.0) + + except Exception as e: + print(f"\nAn unexpected error occurred: {e}") + stop_event.set() + if ws_app: + ws_app.close() + ws_thread.join(timeout=2.0) + + finally: + # Final cleanup (already handled in on_close, but good as a fallback) + if stream and stream.is_active(): + stream.stop_stream() + if stream: + stream.close() + if audio: + audio.terminate() + print("Cleanup complete. Exiting.") + + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/universal-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/universal-streaming.mdx new file mode 100644 index 00000000..4f52513c --- /dev/null +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming/universal-streaming.mdx @@ -0,0 +1,414 @@ +--- +title: "Streaming Audio" +description: "Transcribe live audio with Streaming Speech-to-Text" +--- + + + By default, Universal-Streaming is set to transcribe English audio. + If you'd like to enable multilingual streaming (support for English, Spanish, French, German, Italian, and Portuguese), enable [multilingual transcription](/docs/speech-to-text/universal-streaming/multilingual-transcription) instead. + + + + Streaming is now available in EU-West via `streaming.eu.assemblyai.com`. To use the EU streaming endpoint, replace `streaming.assemblyai.com` with `streaming.eu.assemblyai.com` in your connection configuration. + + +## Quickstart + +In this quick guide you will learn how to use AssemblyAI's Streaming Speech-to-Text feature to transcribe audio from your microphone. + +To run this quickstart you will need: + +- Python or JavaScript installed +- A valid AssemblyAI API key + +To run the quickstart: + + + + + + + Create a new Python file (for example, `main.py`) and paste the code provided below inside. + + + Insert your API key to line 17. + + + Install the necessary libraries + + ```bash + pip install assemblyai pyaudio + ``` + + + + Run with `python main.py` + + + + + + + + + + Create a new Python file (for example, `main.py`) and paste the code provided below inside. + + + Insert your API key to line 11. + + + Install the necessary libraries + + ```bash + pip install websocket-client pyaudio + ``` + + + + Run with `python main.py` + + + + + + + + + + Create a new JavaScript file (for example, `main.js`) and paste the code provided below inside. + + + Insert your API key to line 7. + + + Install the necessary libraries + + ```bash + npm install assemblyai node-record-lpcm16 + ``` + + + + Run with `node main.js` + + + + + + + + + Create a new JavaScript file (for example, `main.js`) and paste the code provided below inside. + + + Insert your API key to line 7. 
+ + + Install the necessary libraries + + ```bash + npm install ws mic + ``` + + + + Run with `node main.js` + + + + + + + + + + + + + + + + +## Core concepts + + + For a message-by-message breakdown of a turn, see our [Streaming API: Message + Sequence Breakdown](/docs/speech-to-text/universal-streaming/message-sequence) + guide. + + +Universal-Streaming is built upon two core concepts: Turn objects and immutable transcriptions. + +### Turn object + +A Turn object corresponds to a speaking turn in the context of voice agent applications, and roughly to an utterance in a broader context. We assign a unique ID to each Turn object, which is included in our response. Specifically, the Universal-Streaming response is formatted as follows: + +```json +{ + "turn_order": 1, + "turn_is_formatted": false, + "end_of_turn": false, + "transcript": "modern medicine is", + "end_of_turn_confidence": 0.7, + "words": [ + { "text": "modern", "word_is_final": true, ... }, + { "text": "medicine", "word_is_final": true, ... }, + { "text": "is", "word_is_final": true, ... }, + { "text": "amazing", "word_is_final": false, ... } + ] +} +``` + +- `turn_order`: Integer that increments with each new turn +- `turn_is_formatted`: Boolean indicating if the text in the transcript field is formatted. Text formatting is enabled when `format_turns` is set to `true`. It adds punctuation and performs casing and inverse text normalization so that entities such as dates, times, and phone numbers are displayed in a human-friendly format +- `end_of_turn`: Boolean indicating if this is the end of the current turn +- `transcript`: String containing only finalized words +- `end_of_turn_confidence`: Floating-point number (0-1) representing the confidence that the current turn has finished, i.e., that the current speaker has completed their turn +- `words`: List of Word objects with individual metadata + +Each Word object in the `words` array includes: + +- `text`: The string representation of the word +- `word_is_final`: Boolean indicating if the word is finalized, meaning it won't be altered in future transcription responses +- `start`: Timestamp for word start +- `end`: Timestamp for word end +- `confidence`: Confidence score for the word + +### Immutable transcription + +As AssemblyAI's streaming system receives audio, it returns transcription responses in real time using the format specified above. Unlike many other streaming speech-to-text models, which emit partial transcriptions that may later be revised, Universal-Streaming transcriptions are immutable: text that has already been produced will not be overwritten in future transcription responses. With Universal-Streaming, transcriptions are therefore delivered in the following way: + +```json +→ hello my na +→ hello my name +→ hello my name +→ hello my name is +→ hello my name is zac +→ hello my name is zack +``` + +When the end of the current turn is detected, you receive a message with `end_of_turn` set to `true`. Additionally, if you enable text formatting, you will also receive a transcription response with `turn_is_formatted` set to `true`. + +```json +→ hello my name is zack (unformatted) +→ Hello my name is Zack. (formatted) +``` + +As this example shows, the last word of a transcript may occasionally be a subword ("zac" above). Each Word object includes the `word_is_final` field, which indicates whether the model is confident that the word is complete. Note that, except for the last word, `word_is_final` is always `true`.
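Because finalized text is never rewritten, a client can simply commit words as soon as they are flagged `word_is_final` and append from there. The sketch below is illustrative only (it is not part of the SDK or the quickstarts above): it assumes Turn messages have already been parsed into dictionaries, as in the raw WebSocket quickstart, and that previously finalized words reappear as a prefix of the `words` list in later responses for the same turn.

```python
from collections import defaultdict

# Finalized words committed so far, keyed by turn_order.
finalized_by_turn = defaultdict(list)


def handle_turn_message(turn: dict) -> None:
    """Commit newly finalized words from a parsed Turn message (illustrative only)."""
    order = turn.get("turn_order", 0)
    # Words flagged word_is_final will not be altered by later responses.
    finalized = [w["text"] for w in turn.get("words", []) if w.get("word_is_final")]
    committed = finalized_by_turn[order]
    # Assumption: earlier finalized words form a prefix of later ones,
    # so only the new suffix needs to be appended.
    committed.extend(finalized[len(committed):])
    if turn.get("end_of_turn"):
        print(f"Turn {order} complete: {' '.join(committed)}")
```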
+ +## Use-case specific recommendations + +### Live captioning + +The default setting for Streaming Speech-to-Text is optimized for the Voice Agent use case, where you expect a single speaker, with long silences occurring during the agent's speaking turn. +For applications such as live captioning, where the input audio stream typically contains multiple people speaking, it is usually beneficial to wait longer before detecting the end of a turn, which is what triggers text formatting. + +When captioning conversations with multiple speakers, we recommend setting `min_end_of_turn_silence_when_confident` to 560 ms. By default, this is set to 400 ms. + +### Voice agents + +To optimize for latency when building a voice agent, we recommend using the unformatted transcript, as it's received more quickly than the formatted version. In typical voice agent applications involving large language models (LLMs), the lack of formatting has little impact on the subsequent LLM processing. For more information, see [Voice agents](/docs/speech-to-text/universal-streaming/voice-agents). + +## API Reference + +### Connection parameters + + + Authenticate the session using a generated temporary token. + + + + The sample rate of the audio stream. + + + + The encoding of the audio stream. Allowed values: `pcm_s16le`, `pcm_mulaw`. + + + + Whether to return formatted final transcripts. + + If enabled, formatted final transcripts will be emitted shortly following an + end-of-turn detection. + + + + + + A list of words and phrases to improve recognition accuracy for. + + + [Keyterms prompts](/docs/speech-to-text/universal-streaming/keyterms-prompting) longer than 50 characters are ignored. Requests containing more than 100 keyterms will result in an error. + + + + + + + The speech model for the Streaming session. If not specified, defaults to `universal-streaming-english`. + Allowed values: `universal-streaming-english`, `universal-streaming-multi`. + + + [Multilingual transcription](/docs/speech-to-text/universal-streaming/multilingual-transcription) is currently in beta and supports English, Spanish, French, German, Italian, and Portuguese. + + + + + + The confidence threshold `(0.0 to 1.0)` to use when determining if the end of a turn has been + reached. + + Raise or lower the threshold depending on how confident the model should be before triggering end of turn based on its confidence score + + + + The minimum amount of silence in `milliseconds` required to detect end of turn + when confident. + + Increase or decrease the amount of time we wait to trigger end of turn when confident + + + + The maximum amount of silence in `milliseconds` allowed in a turn before end of + turn is triggered.
+ + Lower or raise the amount of time needed to trigger end of turn when end of turn isn't triggered by a high confidence score + + +### Audio requirements + +The audio format must conform to the following requirements: + +- PCM16 or Mu-law encoding (See Specify the encoding) +- A sample rate that matches the value of the `sample_rate` parameter +- Single-channel +- 50 milliseconds of audio per message (recommended) + +### Message types + +You send: + + + + +``` +"\x52\x49\x46\x46\xd8\xc8\x00\x00\x57\x41\x56\x45\x46" +``` + + + + + +```json +{ + "type": "UpdateConfiguration", + "end_of_turn_confidence_threshold": 0.5 +} +``` + + + + + +```json +{ "type": "Terminate" } +``` + + + + + +```json +{ "type": "ForceEndpoint" } +``` + + + + + +You receive: + + + + + ```json + { + "type": "Begin", + "id": "cfd280c7-5a9b-4dd6-8c05-235ccfa3c97f", + "expires_at": 1745483367 + } + ``` + + + + + ```json + { + "turn_order": 0, + "turn_is_formatted": true, + "end_of_turn": true, + "transcript": "Hi, my name is Sonny.", + "end_of_turn_confidence": 0.8095446228981018, + "words": + [ + { + "start": 1440, + "end": 1520, + "text": "Hi,", + "confidence": 0.9967870712280273, + "word_is_final": true + }, + { + "start": 1600, + "end": 1680, + "text": "my", + "confidence": 0.999546468257904, + "word_is_final": true + }, + { + "start": 1600, + "end": 1680, + "text": "name", + "confidence": 0.9597182273864746, + "word_is_final": true + }, + { + "start": 1680, + "end": 1760, + "text": "is", + "confidence": 0.8261497616767883, + "word_is_final": true + }, + { + "start": 2320, + "end": 3040, + "text": "Sonny.", + "confidence": 0.5737350583076477, + "word_is_final": true + } + ], + "type": "Turn" + } + ``` + + For the full breakdown of the message sequence for a turn, see the [Message sequence breakdown guide](/docs/speech-to-text/universal-streaming/message-sequence). + + + + + + ```json + { + "type": "Termination", + "audio_duration_seconds": 2000, + "session_duration_seconds": 2000 + } + ``` + + +
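The session messages above are plain JSON text frames sent over the same WebSocket connection that carries your audio. As a rough sketch (not an official snippet), the helpers below assume `ws_app` is the connected `websocket.WebSocketApp` from the raw Python quickstart earlier on this page; the function names and the example threshold value are illustrative.

```python
import json


def update_turn_detection(ws_app, threshold: float = 0.7) -> None:
    # Adjust end-of-turn sensitivity mid-session with an UpdateConfiguration message.
    ws_app.send(json.dumps({
        "type": "UpdateConfiguration",
        "end_of_turn_confidence_threshold": threshold,
    }))


def force_endpoint(ws_app) -> None:
    # Manually close out the current turn.
    ws_app.send(json.dumps({"type": "ForceEndpoint"}))


def terminate_session(ws_app) -> None:
    # Gracefully end the session; a Termination message follows.
    ws_app.send(json.dumps({"type": "Terminate"}))
```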