# Configure OpenAI speech-to-text with an explicit Whisper config,
# then build the transcriber with that config.
from agenticaiframework.speech import SpeechToText, OpenAISTTConfig

config = OpenAISTTConfig(
    model="whisper-1",
    language="en",  # Optional, auto-detect if not set
    temperature=0.0,
    response_format="verbose_json",
)

stt = SpeechToText(provider="openai", config=config)
# Minimal text-to-speech round trip: synthesize, save to disk,
# and get the raw audio bytes.
from agenticaiframework.speech import TextToSpeech

# Initialize TTS
tts = TextToSpeech(provider="openai")

# Generate speech
audio = tts.synthesize("Hello! How can I help you today?")

# Save to file
audio.save("output.mp3")

# Get audio bytes
audio_bytes = audio.to_bytes()
# Configure OpenAI text-to-speech with an explicit model/voice config,
# then build the synthesizer with that config.
from agenticaiframework.speech import TextToSpeech, OpenAITTSConfig

config = OpenAITTSConfig(
    model="tts-1-hd",  # or "tts-1" for faster
    voice="alloy",  # alloy, echo, fable, onyx, nova, shimmer
    speed=1.0,  # 0.25 to 4.0
    response_format="mp3",
)

tts = TextToSpeech(provider="openai", config=config)
# Enumerate available TTS voices and log their metadata,
# then filter the list by language.
# NOTE: uses lazy %-style logging args instead of f-strings so the
# formatting cost is only paid when the log level is enabled.
import logging

logger = logging.getLogger(__name__)

# List available voices (assumes `tts` was created earlier in the docs)
voices = tts.list_voices()
for voice in voices:
    logger.info("ID: %s", voice.id)
    logger.info("Name: %s", voice.name)
    logger.info("Language: %s", voice.language)
    logger.info("Gender: %s", voice.gender)
    logger.info("---")

# Filter by language
english_voices = tts.list_voices(language="en")
# Use SSML for advanced control over prosody, pauses, and emphasis.
# The SSML payload is kept byte-identical to the documented example.
ssml = """<speak> <prosody rate="slow" pitch="+2st"> Welcome to the AgenticAI Framework. </prosody> <break time="500ms"/> <emphasis level="strong">This is important!</emphasis></speak>"""

audio = tts.synthesize_ssml(ssml)
fromagenticaiframework.speechimportVoiceActivityDetectorvad=VoiceActivityDetector(sensitivity=0.5,# 0.0 to 1.0min_speech_duration=0.25,min_silence_duration=0.5)asyncforaudio_chunkinmicrophone.record():ifvad.is_speech(audio_chunk):# Process speechtranscription=awaitstt.transcribe_chunk(audio_chunk)else:# Silence detectedifvad.is_end_of_speech():# User finished speakingprocess_complete_utterance()
# Full-duplex voice chat loop: listen, run the agent, speak the reply.
# Imports are grouped at the top (stdlib first), and logging uses lazy
# %-args rather than f-strings.
import logging

from agenticaiframework.speech import VoiceChat

logger = logging.getLogger(__name__)

# Create voice chat session (assumes `my_agent` exists in the caller's scope)
chat = VoiceChat(
    stt_provider="deepgram",
    tts_provider="elevenlabs",
    agent=my_agent,
)


# Run voice conversation until interrupted (Ctrl-C stops the session)
async def voice_conversation():
    await chat.start()
    try:
        while True:
            # Listen for user speech
            user_text = await chat.listen()
            logger.info("User: %s", user_text)

            # Get agent response
            response = await chat.respond(user_text)
            logger.info("Agent: %s", response)

            # Speak response
            await chat.speak(response)
    except KeyboardInterrupt:
        await chat.stop()
# Persist a voice conversation: store both sides of the exchange with
# per-turn metadata, then read the history back by session id.
# Fixed mojibake in the agent reply ("72Β°F" -> "72°F").
from agenticaiframework import SpeechMemoryManager

# Initialize speech memory
speech_memory = SpeechMemoryManager()

# Store transcript
speech_memory.store_transcript(
    session_id="voice_001",
    speaker="user",
    text="What's the weather like today?",
    timestamp="2024-01-15T10:30:00Z",
    audio_metadata={
        "duration_ms": 2500,
        "sample_rate": 16000,
        "format": "wav",
    },
)

# Store agent response
speech_memory.store_transcript(
    session_id="voice_001",
    speaker="agent",
    text="The weather today is sunny with a high of 72°F.",
    timestamp="2024-01-15T10:30:05Z",
    tts_metadata={"voice": "alloy", "model": "tts-1-hd"},
)

# Get conversation history
history = speech_memory.get_conversation(session_id="voice_001")
# Store a per-user voice profile, read it back, and use the stored
# preferences to build a personalized TTS instance.
# Assumes `speech_memory`, `voice_embedding`, and `TextToSpeech` are
# already in scope from earlier snippets.

# Store voice profile for user
speech_memory.store_voice_profile(
    user_id="user_123",
    profile={
        "voice_embedding": voice_embedding,  # For speaker recognition
        "language": "en-US",
        "accent": "american",
        "speaking_rate": 1.2,
        "preferred_tts_voice": "nova",
        "preferred_tts_speed": 1.0,
    },
)

# Get user's voice preferences
profile = speech_memory.get_voice_profile(user_id="user_123")

# Use profile for personalized TTS
tts = TextToSpeech(
    provider="openai",
    voice=profile["preferred_tts_voice"],
    speed=profile["preferred_tts_speed"],
)
importlogginglogger=logging.getLogger(__name__)fromagenticaiframeworkimportAgent,AgentConfig# Agent that handles both text and voiceagent=Agent(config=AgentConfig(name="multimodal_assistant",role="Multi-Modal Assistant",modalities=["text","voice"]))# Process based on input typeifinput_type=="voice":transcript=awaitagent.listen()response=agent.execute(transcript)awaitagent.speak(response.output)else:response=agent.execute(text_input)logger.info(response.output)
fromagenticaiframework.speechimportAudioChunkerchunker=AudioChunker(chunk_duration_ms=1000,overlap_ms=100)# Chunk audio for processingchunks=chunker.chunk(audio_data)forchunkinchunks:result=awaitstt.transcribe_chunk(chunk)
# Clean up raw audio (denoise, normalize, resample to 16 kHz) before
# handing it to the transcriber.
# Assumes `audio_data` and `stt` are defined by earlier snippets.
from agenticaiframework.speech import AudioPreprocessor

preprocessor = AudioPreprocessor(
    noise_reduction=True,
    normalize=True,
    target_sample_rate=16000,
)

# Preprocess before transcription
clean_audio = preprocessor.process(audio_data)
result = stt.transcribe_bytes(clean_audio)
# For long audio files, use chunked processingfromagenticaiframework.speechimportLongAudioProcessorprocessor=LongAudioProcessor(stt=stt,chunk_duration=30,# 30 second chunksoverlap=2# 2 second overlap)result=awaitprocessor.transcribe("long_recording.wav")