diff --git a/apps/desktop/src/components/settings/ai/stt/shared.tsx b/apps/desktop/src/components/settings/ai/stt/shared.tsx index c48d22dc71..80ff5b64b8 100644 --- a/apps/desktop/src/components/settings/ai/stt/shared.tsx +++ b/apps/desktop/src/components/settings/ai/stt/shared.tsx @@ -200,6 +200,16 @@ const _PROVIDERS = [ models: ["voxtral-mini-2602"], requirements: [{ kind: "requires_config", fields: ["api_key"] }], }, + { + disabled: false, + id: "bedrock", + displayName: "Amazon Bedrock", + badge: "Beta", + icon: , + baseUrl: "https://bedrock-mantle.us-east-1.api.aws/v1", + models: ["openai.gpt-4o-transcribe", "openai.whisper-1"], + requirements: [{ kind: "requires_config", fields: ["api_key"] }], + }, { disabled: false, id: "custom", diff --git a/crates/listener-core/src/actors/listener/adapters.rs b/crates/listener-core/src/actors/listener/adapters.rs index 041f876386..41fd439473 100644 --- a/crates/listener-core/src/actors/listener/adapters.rs +++ b/crates/listener-core/src/actors/listener/adapters.rs @@ -4,7 +4,7 @@ use bytes::Bytes; use ractor::{ActorProcessingErr, ActorRef}; use owhisper_client::{ - AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, CactusAdapter, DashScopeAdapter, + AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, BedrockAdapter, CactusAdapter, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, FireworksAdapter, GladiaAdapter, HyprnoteAdapter, MistralAdapter, OpenAIAdapter, RealtimeSttAdapter, SonioxAdapter, }; @@ -95,6 +95,12 @@ pub(super) async fn spawn_rx_task( (AdapterKind::Mistral, true) => { spawn_rx_task_dual_with_adapter::(args, myself).await } + (AdapterKind::Bedrock, false) => { + spawn_rx_task_single_with_adapter::(args, myself).await + } + (AdapterKind::Bedrock, true) => { + spawn_rx_task_dual_with_adapter::(args, myself).await + } (AdapterKind::Hyprnote, false) => { spawn_rx_task_single_with_adapter::(args, myself).await } diff --git a/crates/listener2-core/src/batch.rs b/crates/listener2-core/src/batch.rs index 4f952286ed..783406b0a6 100644 --- a/crates/listener2-core/src/batch.rs +++ b/crates/listener2-core/src/batch.rs @@ -4,7 +4,7 @@ use std::time::Duration; use futures_util::StreamExt; use owhisper_client::{ - AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, BatchSttAdapter, CactusAdapter, + AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, BatchSttAdapter, BedrockAdapter, CactusAdapter, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, FireworksAdapter, GladiaAdapter, HyprnoteAdapter, MistralAdapter, OpenAIAdapter, RealtimeSttAdapter, SonioxAdapter, }; @@ -390,6 +390,7 @@ async fn spawn_batch_task( spawn_batch_task_with_adapter::(args, myself).await } AdapterKind::Mistral => spawn_batch_task_with_adapter::(args, myself).await, + AdapterKind::Bedrock => spawn_batch_task_with_adapter::(args, myself).await, AdapterKind::Hyprnote => { spawn_batch_task_with_adapter::(args, myself).await } diff --git a/crates/listener2-core/src/lib.rs b/crates/listener2-core/src/lib.rs index fdeb2ba3ab..f724470a0c 100644 --- a/crates/listener2-core/src/lib.rs +++ b/crates/listener2-core/src/lib.rs @@ -41,6 +41,7 @@ pub fn suggest_providers_for_languages_batch(languages: &[hypr_language::Languag AdapterKind::ElevenLabs, AdapterKind::DashScope, AdapterKind::Mistral, + AdapterKind::Bedrock, ]; let mut with_support: Vec<_> = all_providers diff --git a/crates/owhisper-client/src/adapter/bedrock/batch.rs b/crates/owhisper-client/src/adapter/bedrock/batch.rs new file mode 100644 index 0000000000..31c840a76d --- /dev/null +++ b/crates/owhisper-client/src/adapter/bedrock/batch.rs @@ -0,0 +1,164 @@ +use std::path::Path; + +use owhisper_interface::ListenParams; +use owhisper_interface::batch::{ + Alternatives as BatchAlternatives, Channel as BatchChannel, Response as BatchResponse, + Results as BatchResults, Word, +}; +use serde::Deserialize; + +use super::BedrockAdapter; +use crate::adapter::http::mime_type_from_extension; +use crate::adapter::{BatchFuture, BatchSttAdapter, ClientWithMiddleware}; +use crate::error::Error; +use crate::providers::{Provider, is_meta_model}; + +// Amazon Bedrock supports OpenAI-compatible audio transcription via +// the bedrock-mantle endpoint: POST /v1/audio/transcriptions +// https://docs.aws.amazon.com/bedrock/latest/userguide/apis.html +impl BatchSttAdapter for BedrockAdapter { + fn is_supported_languages( + &self, + languages: &[hypr_language::Language], + _model: Option<&str>, + ) -> bool { + BedrockAdapter::language_support_batch(languages).is_supported() + } + + fn transcribe_file<'a, P: AsRef + Send + 'a>( + &'a self, + client: &'a ClientWithMiddleware, + api_base: &'a str, + api_key: &'a str, + params: &'a ListenParams, + file_path: P, + ) -> BatchFuture<'a> { + let path = file_path.as_ref().to_path_buf(); + Box::pin(async move { do_transcribe_file(client, api_base, api_key, params, &path).await }) + } +} + +#[derive(Debug, Deserialize)] +struct BedrockWord { + word: String, + start: f64, + end: f64, +} + +#[derive(Debug, Deserialize)] +struct BedrockTranscriptionResponse { + text: String, + #[serde(default)] + words: Option>, + #[serde(default)] + #[allow(dead_code)] + language: Option, + #[serde(default)] + #[allow(dead_code)] + duration: Option, +} + +async fn do_transcribe_file( + client: &ClientWithMiddleware, + api_base: &str, + api_key: &str, + params: &ListenParams, + file_path: &Path, +) -> Result { + let fallback_name = match file_path.extension().and_then(|e| e.to_str()) { + Some(ext) => format!("audio.{}", ext), + None => "audio".to_string(), + }; + + let file_name = file_path + .file_name() + .and_then(|n| n.to_str()) + .map(ToOwned::to_owned) + .unwrap_or(fallback_name); + + let file_bytes = tokio::fs::read(file_path) + .await + .map_err(|e| Error::AudioProcessing(e.to_string()))?; + + let mime_type = mime_type_from_extension(file_path); + + let file_part = reqwest::multipart::Part::bytes(file_bytes) + .file_name(file_name) + .mime_str(mime_type) + .map_err(|e| Error::AudioProcessing(e.to_string()))?; + + let default = Provider::Bedrock.default_batch_model(); + let model = match params.model.as_deref() { + Some(m) if is_meta_model(m) => default, + Some(m) => m, + None => default, + }; + + let mut form = reqwest::multipart::Form::new() + .part("file", file_part) + .text("model", model.to_string()); + + form = form.text("response_format", "verbose_json"); + form = form.text("timestamp_granularities[]", "word"); + + if let Some(lang) = params.languages.first() { + form = form.text("language", lang.iso639().code().to_string()); + } + + let base = if api_base.is_empty() { + Provider::Bedrock.default_api_base() + } else { + api_base.trim_end_matches('/') + }; + let url = format!("{}/audio/transcriptions", base); + + let response = client + .post(&url) + .header("Authorization", format!("Bearer {}", api_key)) + .multipart(form) + .send() + .await?; + + let status = response.status(); + if !status.is_success() { + let body = response.text().await.unwrap_or_default(); + return Err(Error::UnexpectedStatus { status, body }); + } + + let bedrock_response: BedrockTranscriptionResponse = response.json().await?; + + let words: Vec = bedrock_response + .words + .unwrap_or_default() + .into_iter() + .map(|w| Word { + word: w.word.clone(), + start: w.start, + end: w.end, + confidence: 1.0, + speaker: None, + punctuated_word: Some(w.word), + }) + .collect(); + + let alternatives = BatchAlternatives { + transcript: bedrock_response.text.trim().to_string(), + confidence: 1.0, + words, + }; + + let channel = BatchChannel { + alternatives: vec![alternatives], + }; + + let metadata = serde_json::json!({ + "language": bedrock_response.language, + }); + + Ok(BatchResponse { + metadata, + results: BatchResults { + channels: vec![channel], + }, + }) +} diff --git a/crates/owhisper-client/src/adapter/bedrock/live.rs b/crates/owhisper-client/src/adapter/bedrock/live.rs new file mode 100644 index 0000000000..df582e60e3 --- /dev/null +++ b/crates/owhisper-client/src/adapter/bedrock/live.rs @@ -0,0 +1,372 @@ +use hypr_ws_client::client::Message; +use owhisper_interface::ListenParams; +use owhisper_interface::stream::{Alternatives, Channel, Metadata, StreamResponse}; +use serde::{Deserialize, Serialize}; + +use super::BedrockAdapter; +use crate::adapter::RealtimeSttAdapter; +use crate::adapter::parsing::{WordBuilder, calculate_time_span}; + +// Amazon Bedrock via bedrock-mantle exposes an OpenAI-compatible Realtime API. +// This adapter follows the same protocol as the OpenAI adapter. +impl RealtimeSttAdapter for BedrockAdapter { + fn provider_name(&self) -> &'static str { + "bedrock" + } + + fn is_supported_languages( + &self, + languages: &[hypr_language::Language], + _model: Option<&str>, + ) -> bool { + BedrockAdapter::is_supported_languages_live(languages) + } + + fn supports_native_multichannel(&self) -> bool { + false + } + + fn build_ws_url(&self, api_base: &str, _params: &ListenParams, _channels: u8) -> url::Url { + let (mut url, existing_params) = Self::build_ws_url_from_base(api_base); + + if !existing_params.is_empty() { + let mut query_pairs = url.query_pairs_mut(); + for (key, value) in &existing_params { + query_pairs.append_pair(key, value); + } + } + + url + } + + fn build_auth_header(&self, api_key: Option<&str>) -> Option<(&'static str, String)> { + api_key.and_then(|k| crate::providers::Provider::Bedrock.build_auth_header(k)) + } + + fn keep_alive_message(&self) -> Option { + None + } + + fn audio_to_message(&self, audio: bytes::Bytes) -> Message { + use base64::Engine; + let base64_audio = base64::engine::general_purpose::STANDARD.encode(&audio); + let event = InputAudioBufferAppend { + event_type: "input_audio_buffer.append".to_string(), + audio: base64_audio, + }; + Message::Text(serde_json::to_string(&event).unwrap().into()) + } + + fn initial_message( + &self, + _api_key: Option<&str>, + params: &ListenParams, + _channels: u8, + ) -> Option { + let language = params + .languages + .first() + .map(|l| l.iso639().code().to_string()); + + let default = crate::providers::Provider::Bedrock.default_live_model(); + let model = match params.model.as_deref() { + Some(m) if crate::providers::is_meta_model(m) => default, + Some(m) => m, + None => default, + }; + + let session_config = SessionUpdateEvent { + event_type: "session.update".to_string(), + session: SessionConfig { + session_type: "transcription".to_string(), + audio: Some(AudioConfig { + input: Some(AudioInputConfig { + format: Some(AudioFormat { + format_type: "audio/pcm".to_string(), + rate: params.sample_rate, + }), + transcription: Some(TranscriptionConfig { + model: model.to_string(), + language, + }), + turn_detection: Some(TurnDetection { + detection_type: "server_vad".to_string(), + threshold: Some(0.5), + prefix_padding_ms: Some(300), + silence_duration_ms: Some(500), + }), + }), + }), + include: Some(vec!["item.input_audio_transcription.logprobs".to_string()]), + }, + }; + + let json = serde_json::to_string(&session_config).ok()?; + tracing::debug!(payload = %json, "bedrock_session_update_payload"); + Some(Message::Text(json.into())) + } + + fn finalize_message(&self) -> Message { + let commit = InputAudioBufferCommit { + event_type: "input_audio_buffer.commit".to_string(), + }; + Message::Text(serde_json::to_string(&commit).unwrap().into()) + } + + fn parse_response(&self, raw: &str) -> Vec { + let event: BedrockEvent = match serde_json::from_str(raw) { + Ok(e) => e, + Err(e) => { + tracing::warn!(error = ?e, raw = raw, "bedrock_json_parse_failed"); + return vec![]; + } + }; + + match event { + BedrockEvent::SessionCreated { session } => { + tracing::debug!(session_id = %session.id, "bedrock_session_created"); + vec![] + } + BedrockEvent::SessionUpdated { session } => { + tracing::debug!(session_id = %session.id, "bedrock_session_updated"); + vec![] + } + BedrockEvent::InputAudioBufferCommitted { item_id } => { + tracing::debug!(item_id = %item_id, "bedrock_audio_buffer_committed"); + vec![] + } + BedrockEvent::InputAudioBufferCleared => { + tracing::debug!("bedrock_audio_buffer_cleared"); + vec![] + } + BedrockEvent::InputAudioBufferSpeechStarted { item_id } => { + tracing::debug!(item_id = %item_id, "bedrock_speech_started"); + vec![] + } + BedrockEvent::InputAudioBufferSpeechStopped { item_id } => { + tracing::debug!(item_id = %item_id, "bedrock_speech_stopped"); + vec![] + } + BedrockEvent::ConversationItemInputAudioTranscriptionCompleted { + item_id, + content_index, + transcript, + } => { + tracing::debug!( + item_id = %item_id, + content_index = content_index, + transcript = %transcript, + "bedrock_transcription_completed" + ); + Self::build_transcript_response(&transcript, true, true) + } + BedrockEvent::ConversationItemInputAudioTranscriptionDelta { + item_id, + content_index, + delta, + } => { + tracing::debug!( + item_id = %item_id, + content_index = content_index, + delta = %delta, + "bedrock_transcription_delta" + ); + Self::build_transcript_response(&delta, false, false) + } + BedrockEvent::ConversationItemInputAudioTranscriptionFailed { + item_id, error, .. + } => { + tracing::error!( + item_id = %item_id, + error_type = %error.error_type, + error_message = %error.message, + "bedrock_transcription_failed" + ); + vec![StreamResponse::ErrorResponse { + error_code: None, + error_message: format!("{}: {}", error.error_type, error.message), + provider: "bedrock".to_string(), + }] + } + BedrockEvent::Error { error } => { + tracing::error!( + error_type = %error.error_type, + error_message = %error.message, + "bedrock_error" + ); + vec![StreamResponse::ErrorResponse { + error_code: None, + error_message: format!("{}: {}", error.error_type, error.message), + provider: "bedrock".to_string(), + }] + } + BedrockEvent::Unknown => { + tracing::debug!(raw = raw, "bedrock_unknown_event"); + vec![] + } + } + } +} + +#[derive(Debug, Serialize)] +struct SessionUpdateEvent { + #[serde(rename = "type")] + event_type: String, + session: SessionConfig, +} + +#[derive(Debug, Serialize)] +struct SessionConfig { + #[serde(rename = "type")] + session_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + audio: Option, + #[serde(skip_serializing_if = "Option::is_none")] + include: Option>, +} + +#[derive(Debug, Serialize)] +struct AudioConfig { + #[serde(skip_serializing_if = "Option::is_none")] + input: Option, +} + +#[derive(Debug, Serialize)] +struct AudioInputConfig { + #[serde(skip_serializing_if = "Option::is_none")] + format: Option, + #[serde(skip_serializing_if = "Option::is_none")] + transcription: Option, + #[serde(skip_serializing_if = "Option::is_none")] + turn_detection: Option, +} + +#[derive(Debug, Serialize)] +struct AudioFormat { + #[serde(rename = "type")] + format_type: String, + rate: u32, +} + +#[derive(Debug, Serialize)] +struct TranscriptionConfig { + model: String, + #[serde(skip_serializing_if = "Option::is_none")] + language: Option, +} + +#[derive(Debug, Serialize)] +struct TurnDetection { + #[serde(rename = "type")] + detection_type: String, + #[serde(skip_serializing_if = "Option::is_none")] + threshold: Option, + #[serde(skip_serializing_if = "Option::is_none")] + prefix_padding_ms: Option, + #[serde(skip_serializing_if = "Option::is_none")] + silence_duration_ms: Option, +} + +#[derive(Debug, Serialize)] +struct InputAudioBufferAppend { + #[serde(rename = "type")] + event_type: String, + audio: String, +} + +#[derive(Debug, Serialize)] +struct InputAudioBufferCommit { + #[serde(rename = "type")] + event_type: String, +} + +#[derive(Debug, Deserialize)] +#[serde(tag = "type")] +#[allow(dead_code)] +enum BedrockEvent { + #[serde(rename = "session.created")] + SessionCreated { session: SessionInfo }, + #[serde(rename = "session.updated")] + SessionUpdated { session: SessionInfo }, + #[serde(rename = "input_audio_buffer.committed")] + InputAudioBufferCommitted { item_id: String }, + #[serde(rename = "input_audio_buffer.cleared")] + InputAudioBufferCleared, + #[serde(rename = "input_audio_buffer.speech_started")] + InputAudioBufferSpeechStarted { item_id: String }, + #[serde(rename = "input_audio_buffer.speech_stopped")] + InputAudioBufferSpeechStopped { item_id: String }, + #[serde(rename = "conversation.item.input_audio_transcription.completed")] + ConversationItemInputAudioTranscriptionCompleted { + item_id: String, + content_index: u32, + transcript: String, + }, + #[serde(rename = "conversation.item.input_audio_transcription.delta")] + ConversationItemInputAudioTranscriptionDelta { + item_id: String, + content_index: u32, + delta: String, + }, + #[serde(rename = "conversation.item.input_audio_transcription.failed")] + ConversationItemInputAudioTranscriptionFailed { + item_id: String, + content_index: u32, + error: BedrockError, + }, + #[serde(rename = "error")] + Error { error: BedrockError }, + #[serde(other)] + Unknown, +} + +#[derive(Debug, Deserialize)] +struct SessionInfo { + id: String, +} + +#[derive(Debug, Deserialize)] +struct BedrockError { + #[serde(rename = "type")] + error_type: String, + message: String, +} + +impl BedrockAdapter { + fn build_transcript_response( + transcript: &str, + is_final: bool, + speech_final: bool, + ) -> Vec { + if transcript.is_empty() { + return vec![]; + } + + let words: Vec<_> = transcript + .split_whitespace() + .map(|word| WordBuilder::new(word).confidence(1.0).build()) + .collect(); + + let (start, duration) = calculate_time_span(&words); + + let channel = Channel { + alternatives: vec![Alternatives { + transcript: transcript.to_string(), + words, + confidence: 1.0, + languages: vec![], + }], + }; + + vec![StreamResponse::TranscriptResponse { + is_final, + speech_final, + from_finalize: false, + start, + duration, + channel, + metadata: Metadata::default(), + channel_index: vec![0, 1], + }] + } +} diff --git a/crates/owhisper-client/src/adapter/bedrock/mod.rs b/crates/owhisper-client/src/adapter/bedrock/mod.rs new file mode 100644 index 0000000000..f58c3e925f --- /dev/null +++ b/crates/owhisper-client/src/adapter/bedrock/mod.rs @@ -0,0 +1,65 @@ +mod batch; +mod live; + +use crate::providers::Provider; + +use super::{LanguageQuality, LanguageSupport}; + +#[derive(Clone, Default)] +pub struct BedrockAdapter; + +impl BedrockAdapter { + pub fn language_support_live(_languages: &[hypr_language::Language]) -> LanguageSupport { + LanguageSupport::Supported { + quality: LanguageQuality::NoData, + } + } + + pub fn language_support_batch(_languages: &[hypr_language::Language]) -> LanguageSupport { + Self::language_support_live(_languages) + } + + pub fn is_supported_languages_live(languages: &[hypr_language::Language]) -> bool { + Self::language_support_live(languages).is_supported() + } + + pub fn is_supported_languages_batch(languages: &[hypr_language::Language]) -> bool { + Self::language_support_batch(languages).is_supported() + } + + pub(crate) fn build_ws_url_from_base(api_base: &str) -> (url::Url, Vec<(String, String)>) { + // Bedrock Mantle is OpenAI-compatible and uses the same Realtime API surface. + // We follow the OpenAI adapter's URL behavior (including intent=transcription). + if api_base.is_empty() { + return ( + Provider::Bedrock + .default_ws_url() + .parse() + .expect("invalid_default_ws_url"), + vec![("intent".to_string(), "transcription".to_string())], + ); + } + + if let Some(proxy_result) = super::build_proxy_ws_url(api_base) { + return proxy_result; + } + + let parsed: url::Url = api_base.parse().expect("invalid_api_base"); + let mut existing_params = super::extract_query_params(&parsed); + + if !existing_params.iter().any(|(k, _)| k == "intent") { + existing_params.push(("intent".to_string(), "transcription".to_string())); + } + + let host = parsed + .host_str() + .unwrap_or(Provider::Bedrock.default_ws_host()); + let mut url: url::Url = format!("wss://{}{}", host, Provider::Bedrock.ws_path()) + .parse() + .expect("invalid_ws_url"); + + super::set_scheme_from_host(&mut url); + + (url, existing_params) + } +} diff --git a/crates/owhisper-client/src/adapter/mod.rs b/crates/owhisper-client/src/adapter/mod.rs index 3eaf1cc694..2350a60aeb 100644 --- a/crates/owhisper-client/src/adapter/mod.rs +++ b/crates/owhisper-client/src/adapter/mod.rs @@ -1,5 +1,6 @@ mod argmax; pub(crate) mod assemblyai; +mod bedrock; mod cactus; mod dashscope; pub mod deepgram; @@ -19,6 +20,7 @@ mod url_builder; pub use argmax::*; pub use assemblyai::*; +pub use bedrock::*; pub use cactus::*; pub use dashscope::*; pub use deepgram::*; @@ -362,6 +364,8 @@ pub enum AdapterKind { DashScope, #[strum(serialize = "mistral")] Mistral, + #[strum(serialize = "bedrock")] + Bedrock, #[strum(serialize = "hyprnote")] Hyprnote, #[strum(serialize = "cactus")] @@ -413,6 +417,7 @@ impl AdapterKind { Self::DashScope => DashScopeAdapter::language_support_live(languages), Self::Argmax => ArgmaxAdapter::language_support_live(languages, model), Self::Mistral => MistralAdapter::language_support_live(languages), + Self::Bedrock => BedrockAdapter::language_support_live(languages), Self::Hyprnote | Self::Cactus => LanguageSupport::Supported { quality: LanguageQuality::NoData, }, @@ -438,6 +443,7 @@ impl AdapterKind { Self::DashScope => DashScopeAdapter::language_support_batch(languages), Self::Argmax => ArgmaxAdapter::language_support_batch(languages, model), Self::Mistral => MistralAdapter::language_support_batch(languages), + Self::Bedrock => BedrockAdapter::language_support_batch(languages), Self::Hyprnote | Self::Cactus => LanguageSupport::Supported { quality: LanguageQuality::NoData, }, @@ -484,6 +490,7 @@ impl From for AdapterKind { Provider::ElevenLabs => Self::ElevenLabs, Provider::DashScope => Self::DashScope, Provider::Mistral => Self::Mistral, + Provider::Bedrock => Self::Bedrock, } } } diff --git a/crates/owhisper-client/src/lib.rs b/crates/owhisper-client/src/lib.rs index 34b8e0fba3..eca4d05447 100644 --- a/crates/owhisper-client/src/lib.rs +++ b/crates/owhisper-client/src/lib.rs @@ -17,10 +17,10 @@ use std::marker::PhantomData; pub use adapter::deepgram::DeepgramModel; pub use adapter::{ - AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, BatchSttAdapter, CactusAdapter, CallbackResult, - CallbackSttAdapter, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, FireworksAdapter, - GladiaAdapter, HyprnoteAdapter, LanguageQuality, LanguageSupport, MistralAdapter, - OpenAIAdapter, RealtimeSttAdapter, SonioxAdapter, append_provider_param, + AdapterKind, ArgmaxAdapter, AssemblyAIAdapter, BatchSttAdapter, BedrockAdapter, CactusAdapter, + CallbackResult, CallbackSttAdapter, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, + FireworksAdapter, GladiaAdapter, HyprnoteAdapter, LanguageQuality, LanguageSupport, + MistralAdapter, OpenAIAdapter, RealtimeSttAdapter, SonioxAdapter, append_provider_param, documented_language_codes_batch, documented_language_codes_live, is_hyprnote_proxy, is_local_host, normalize_languages, }; diff --git a/crates/owhisper-client/src/providers.rs b/crates/owhisper-client/src/providers.rs index 3f3581c0ba..3cde3561c9 100644 --- a/crates/owhisper-client/src/providers.rs +++ b/crates/owhisper-client/src/providers.rs @@ -84,10 +84,12 @@ pub enum Provider { DashScope, #[strum(serialize = "mistral")] Mistral, + #[strum(serialize = "bedrock")] + Bedrock, } impl Provider { - const ALL: [Provider; 9] = [ + const ALL: [Provider; 10] = [ Self::Deepgram, Self::AssemblyAI, Self::Soniox, @@ -97,6 +99,7 @@ impl Provider { Self::ElevenLabs, Self::DashScope, Self::Mistral, + Self::Bedrock, ]; pub fn from_host(host: &str) -> Option { @@ -139,6 +142,10 @@ impl Provider { name: "Authorization", prefix: Some("Bearer "), }, + Self::Bedrock => Auth::Header { + name: "Authorization", + prefix: Some("Bearer "), + }, } } @@ -161,6 +168,7 @@ impl Provider { Self::ElevenLabs => "api.elevenlabs.io", Self::DashScope => "dashscope-intl.aliyuncs.com", Self::Mistral => "api.mistral.ai", + Self::Bedrock => "bedrock-mantle.us-east-1.api.aws", } } @@ -175,6 +183,7 @@ impl Provider { Self::ElevenLabs => "api.elevenlabs.io", Self::DashScope => "dashscope-intl.aliyuncs.com", Self::Mistral => "api.mistral.ai", + Self::Bedrock => "bedrock-mantle.us-east-1.api.aws", } } @@ -189,6 +198,7 @@ impl Provider { Self::ElevenLabs => "/v1/speech-to-text/realtime", Self::DashScope => "/api-ws/v1/realtime", Self::Mistral => "/v1/audio/transcriptions/realtime", + Self::Bedrock => "/v1/realtime", } } @@ -203,6 +213,7 @@ impl Provider { Self::ElevenLabs => Some("https://api.elevenlabs.io/v1"), Self::DashScope => None, Self::Mistral => None, + Self::Bedrock => None, } } @@ -217,6 +228,7 @@ impl Provider { Self::ElevenLabs => "https://api.elevenlabs.io", Self::DashScope => "https://dashscope-intl.aliyuncs.com", Self::Mistral => "https://api.mistral.ai/v1", + Self::Bedrock => "https://bedrock-mantle.us-east-1.api.aws/v1", } } @@ -231,6 +243,7 @@ impl Provider { Self::ElevenLabs => "elevenlabs.io", Self::DashScope => "aliyuncs.com", Self::Mistral => "mistral.ai", + Self::Bedrock => "api.aws", } } @@ -263,6 +276,7 @@ impl Provider { Self::ElevenLabs => "ELEVENLABS_API_KEY", Self::DashScope => "DASHSCOPE_API_KEY", Self::Mistral => "MISTRAL_API_KEY", + Self::Bedrock => "BEDROCK_API_KEY", } } @@ -277,6 +291,7 @@ impl Provider { Self::ElevenLabs => "scribe_v2_realtime", Self::DashScope => "qwen3-asr-flash-realtime", Self::Mistral => "voxtral-mini-transcribe-realtime-2602", + Self::Bedrock => "openai.gpt-4o-transcribe", } } @@ -284,6 +299,7 @@ impl Provider { match self { Self::OpenAI => 24000, Self::ElevenLabs | Self::DashScope | Self::Mistral => 16000, + Self::Bedrock => 24000, _ => 16000, } } @@ -299,6 +315,7 @@ impl Provider { Self::ElevenLabs => "scribe_v2", Self::DashScope => "qwen3-asr-flash-filetrans", Self::Mistral => "voxtral-mini-2602", + Self::Bedrock => "amazon.nova-sonic-v1:0", } } @@ -306,7 +323,7 @@ impl Provider { match self { Self::Deepgram => &[("model", "nova-3-general"), ("mip_opt_out", "false")], Self::OpenAI => &[("intent", "transcription")], - Self::DashScope | Self::Mistral => &[], + Self::DashScope | Self::Mistral | Self::Bedrock => &[], _ => &[], } } @@ -320,7 +337,8 @@ impl Provider { | Self::OpenAI | Self::ElevenLabs | Self::DashScope - | Self::Mistral => false, + | Self::Mistral + | Self::Bedrock => false, } } @@ -333,7 +351,7 @@ impl Provider { Self::OpenAI => &[], Self::Gladia => &[], Self::ElevenLabs => &["commit"], - Self::DashScope | Self::Mistral => &[], + Self::DashScope | Self::Mistral | Self::Bedrock => &[], } } @@ -352,7 +370,7 @@ impl Provider { "words_accurate_timestamps": true } })), - Self::Mistral => None, + Self::Mistral | Self::Bedrock => None, _ => None, } } @@ -363,7 +381,12 @@ impl Provider { Self::Soniox => soniox::error::detect_error(data), Self::ElevenLabs => elevenlabs::error::detect_error(data), Self::AssemblyAI => assemblyai::error::detect_error(data), - Self::Fireworks | Self::OpenAI | Self::Gladia | Self::DashScope | Self::Mistral => None, + Self::Fireworks + | Self::OpenAI + | Self::Gladia + | Self::DashScope + | Self::Mistral + | Self::Bedrock => None, } } diff --git a/crates/transcribe-proxy/src/env.rs b/crates/transcribe-proxy/src/env.rs index 22449de50a..dccd1a3677 100644 --- a/crates/transcribe-proxy/src/env.rs +++ b/crates/transcribe-proxy/src/env.rs @@ -23,6 +23,8 @@ pub struct SttApiKeysEnv { pub dashscope_api_key: Option, #[serde(default)] pub mistral_api_key: Option, + #[serde(default)] + pub bedrock_api_key: Option, } #[derive(Deserialize, Default)] @@ -88,6 +90,9 @@ impl From<&SttApiKeysEnv> for ApiKeys { if let Some(key) = env.mistral_api_key.as_ref().filter(|s| !s.is_empty()) { map.insert(Provider::Mistral, key.clone()); } + if let Some(key) = env.bedrock_api_key.as_ref().filter(|s| !s.is_empty()) { + map.insert(Provider::Bedrock, key.clone()); + } Self(map) } } diff --git a/crates/transcribe-proxy/src/routes/batch/sync.rs b/crates/transcribe-proxy/src/routes/batch/sync.rs index 37592902b3..572a537e46 100644 --- a/crates/transcribe-proxy/src/routes/batch/sync.rs +++ b/crates/transcribe-proxy/src/routes/batch/sync.rs @@ -8,8 +8,8 @@ use axum::{ }; use backon::{ExponentialBuilder, Retryable}; use owhisper_client::{ - AssemblyAIAdapter, BatchClient, DeepgramAdapter, ElevenLabsAdapter, GladiaAdapter, - MistralAdapter, OpenAIAdapter, Provider, SonioxAdapter, + AssemblyAIAdapter, BatchClient, BedrockAdapter, DeepgramAdapter, ElevenLabsAdapter, + GladiaAdapter, MistralAdapter, OpenAIAdapter, Provider, SonioxAdapter, }; use owhisper_interface::ListenParams; use owhisper_interface::batch::Response as BatchResponse; @@ -185,6 +185,7 @@ pub(super) async fn transcribe_with_provider( Provider::Gladia => batch_transcribe!(GladiaAdapter), Provider::ElevenLabs => batch_transcribe!(ElevenLabsAdapter), Provider::Mistral => batch_transcribe!(MistralAdapter), + Provider::Bedrock => batch_transcribe!(BedrockAdapter), Provider::Fireworks | Provider::DashScope => { return Err(format!( "{:?} does not support batch transcription", diff --git a/crates/transcribe-proxy/src/routes/streaming/hyprnote.rs b/crates/transcribe-proxy/src/routes/streaming/hyprnote.rs index 9969c474d8..8154adf137 100644 --- a/crates/transcribe-proxy/src/routes/streaming/hyprnote.rs +++ b/crates/transcribe-proxy/src/routes/streaming/hyprnote.rs @@ -1,7 +1,7 @@ use std::sync::Arc; use owhisper_client::{ - AssemblyAIAdapter, Auth, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, + AssemblyAIAdapter, Auth, BedrockAdapter, DashScopeAdapter, DeepgramAdapter, ElevenLabsAdapter, FireworksAdapter, GladiaAdapter, MistralAdapter, OpenAIAdapter, Provider, RealtimeSttAdapter, SonioxAdapter, }; @@ -47,6 +47,7 @@ fn build_upstream_url_with_adapter( Provider::ElevenLabs => ElevenLabsAdapter.build_ws_url(api_base, params, channels), Provider::DashScope => DashScopeAdapter.build_ws_url(api_base, params, channels), Provider::Mistral => MistralAdapter::default().build_ws_url(api_base, params, channels), + Provider::Bedrock => BedrockAdapter.build_ws_url(api_base, params, channels), } } @@ -66,6 +67,7 @@ fn build_initial_message_with_adapter( Provider::ElevenLabs => ElevenLabsAdapter.initial_message(api_key, params, channels), Provider::DashScope => DashScopeAdapter.initial_message(api_key, params, channels), Provider::Mistral => MistralAdapter::default().initial_message(api_key, params, channels), + Provider::Bedrock => BedrockAdapter.initial_message(api_key, params, channels), }; msg.and_then(|m| match m { @@ -89,6 +91,7 @@ fn build_response_transformer( Provider::ElevenLabs => ElevenLabsAdapter.parse_response(raw), Provider::DashScope => DashScopeAdapter.parse_response(raw), Provider::Mistral => mistral_adapter.parse_response(raw), + Provider::Bedrock => BedrockAdapter.parse_response(raw), }; if responses.is_empty() { diff --git a/crates/transcribe-proxy/tests/common/mod.rs b/crates/transcribe-proxy/tests/common/mod.rs index ca3bad2b6c..e0c00ca01c 100644 --- a/crates/transcribe-proxy/tests/common/mod.rs +++ b/crates/transcribe-proxy/tests/common/mod.rs @@ -87,6 +87,7 @@ pub fn env_with_provider(provider: Provider, api_key: String) -> transcribe_prox Provider::ElevenLabs => env.stt.elevenlabs_api_key = Some(api_key), Provider::DashScope => env.stt.dashscope_api_key = Some(api_key), Provider::Mistral => env.stt.mistral_api_key = Some(api_key), + Provider::Bedrock => env.stt.bedrock_api_key = Some(api_key), } env }