askidmobile
diff --git a/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/runtime/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎crates/runtime/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/runtime/src/pipeline.rs‎
Lines changed: 60 additions & 20 deletions b/‎crates/runtime/src/pipeline.rs‎
Lines changed: 60 additions & 20 deletions
diff --git a/‎crates/tts-app/Cargo.toml‎
Lines changed: 53 additions & 0 deletions b/‎crates/tts-app/Cargo.toml‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎crates/tts-app/build.rs‎
Lines changed: 3 additions & 0 deletions b/‎crates/tts-app/build.rs‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎crates/tts-app/gen/schemas/acl-manifests.json‎
Lines changed: 1 addition & 0 deletions b/‎crates/tts-app/gen/schemas/acl-manifests.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/tts-app/gen/schemas/capabilities.json‎
Lines changed: 1 addition & 0 deletions b/‎crates/tts-app/gen/schemas/capabilities.json‎
Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ members = [
     "crates/runtime",
     "crates/tts-cli",
     "crates/tts-server",
+    "crates/tts-app",
 ]
 
 [workspace.package]
 
@@ -34,6 +34,7 @@ metrics-exporter-prometheus.workspace = true
 
 # Serialization
 serde.workspace = true
+serde_json.workspace = true
 toml.workspace = true
 
 # Utilities
 
@@ -41,6 +41,8 @@ pub struct PipelineConfig {
     pub max_seq_len: usize,
     /// Default speaker for CustomVoice models (e.g., "vivian", "ryan").
     pub default_speaker: Option<String>,
+    /// Whether the model is a CustomVoice model (requires speaker prompt format).
+    pub is_custom_voice: bool,
 }
 
 impl Default for PipelineConfig {
@@ -52,6 +54,7 @@ impl Default for PipelineConfig {
             chunk_tokens: 10,
             max_seq_len: 4096, // Max sequence length for audio generation
             default_speaker: None,
+            is_custom_voice: false,
         }
     }
 }
@@ -220,9 +223,16 @@ impl TtsPipeline {
             AcousticBackend::Mock
         };
 
-        let config = PipelineConfig::neural();
+        // Check if this is a CustomVoice model by looking for spk_id in config.json
+        let is_custom_voice = Self::detect_custom_voice_model(talker_dir);
 
-        info!("Pipeline created with {:?} acoustic backend", acoustic);
+        let mut config = PipelineConfig::neural();
+        config.is_custom_voice = is_custom_voice;
+
+        info!(
+            "Pipeline created with {:?} acoustic backend, is_custom_voice={}",
+            acoustic, is_custom_voice
+        );
 
         Ok(Self {
             normalizer: Normalizer::new(),
@@ -235,6 +245,27 @@ impl TtsPipeline {
         })
     }
 
+    /// Detect a CustomVoice model: returns `true` only when `config.json` in `model_dir` has a non-empty `talker_config.spk_id` object.
+    fn detect_custom_voice_model(model_dir: &Path) -> bool {
+        let config_path = model_dir.join("config.json");
+        if let Ok(content) = std::fs::read_to_string(&config_path) {
+            if let Ok(config) = serde_json::from_str::<serde_json::Value>(&content) {
+                // CustomVoice models carry their speaker mappings in a `talker_config.spk_id` JSON object; any non-object value is ignored.
+                if let Some(spk_id) = config
+                    .get("talker_config")
+                    .and_then(|t| t.get("spk_id"))
+                    .and_then(|s| s.as_object())
+                {
+                    if !spk_id.is_empty() {
+                        info!("Detected CustomVoice model with {} speakers", spk_id.len());
+                        return true;
+                    }
+                }
+            }
+        }
+        false // best-effort: unreadable file, invalid JSON, or missing/empty `spk_id` all mean "not CustomVoice"
+    }
+
     /// Try to load CodePredictor from the same weights file as the main model.
     fn try_load_code_predictor(
         weights_path: &Path,
@@ -454,8 +485,9 @@ impl TtsPipeline {
             seed: None,
         };
 
-        // Minimum tokens based on text length
-        let min_tokens = (text_tokens.len() * 5).max(20);
+        // Match Python SDK: min_new_tokens = 2
+        // This lets generation stop at EOS early if the model decides the text is complete
+        let min_tokens = 2;
 
         // Generate using embeddings
         if let Some(cp) = code_predictor {
@@ -701,24 +733,20 @@ impl TtsPipeline {
             "Combined embeddings built (non_streaming format)"
         );
 
-        // Configure sampling - use greedy for debugging to compare with reference
+        // Configure sampling - match Python SDK parameters
         // Python SDK: temperature=0.9, top_p=1.0, top_k=50, repetition_penalty=1.05
-        // TODO: Make this configurable, use temp=0 for greedy comparison
         let sampling_config = SamplingConfig {
-            temperature: 0.0, // Greedy for debugging
+            temperature: 0.9,
             top_p: 1.0,
             top_k: 50,
-            repetition_penalty: 1.0, // No penalty for greedy
+            repetition_penalty: 1.05,
             seed: None,
         };
 
-        // min_new_tokens based on text length: ~5-10 audio tokens per text token
-        let min_tokens = (text_tokens.len() * 5).max(20);
-        info!(
-            "Setting min_new_tokens={} based on {} text tokens",
-            min_tokens,
-            text_tokens.len()
-        );
+        // Match Python SDK: min_new_tokens = 2
+        // This lets generation stop at EOS early if the model decides the text is complete
+        let min_tokens = 2;
+        info!("min_new_tokens={} (matching Python SDK)", min_tokens);
 
         // ========== COMPUTE trailing_text_hidden ==========
         // Python SDK (modeling_qwen3_tts.py:2230-2232):
@@ -985,8 +1013,9 @@ impl TtsPipeline {
             seed: None,
         };
 
-        // min_new_tokens based on text length
-        let min_tokens = (text_tokens.len() * 5).max(20);
+        // Match Python SDK: min_new_tokens = 2
+        // This lets generation stop at EOS early if the model decides the text is complete
+        let min_tokens = 2;
 
         // If no CodePredictor, generate only zeroth codebook
         let Some(cp) = code_predictor else {
@@ -1108,11 +1137,21 @@ impl TtsPipeline {
                 model,
                 code_predictor,
             } => {
-                // Use CustomVoice format if speaker is provided or if we have speaker configured
-                let use_speaker = speaker.is_some() || self.config.default_speaker.is_some();
+                // Use CustomVoice format if:
+                // 1. Speaker is explicitly provided, OR
+                // 2. We have a default speaker configured, OR
+                // 3. The model is a CustomVoice model (it needs the proper prompt format even without a speaker)
+                let use_speaker_format = speaker.is_some()
+                    || self.config.default_speaker.is_some()
+                    || self.config.is_custom_voice;
+
                 let actual_speaker = speaker.or(self.config.default_speaker.as_deref());
 
-                if use_speaker {
+                if use_speaker_format {
+                    info!(
+                        "Using CustomVoice format: speaker={:?}, is_custom_voice={}",
+                        actual_speaker, self.config.is_custom_voice
+                    );
                     self.generate_acoustic_with_speaker(
                         model,
                         code_predictor.as_deref(),
@@ -1122,6 +1161,7 @@ impl TtsPipeline {
                         max_tokens,
                     )
                 } else {
+                    info!("Using simple format (non-CustomVoice model)");
                     self.generate_acoustic_neural(
                         model,
                         code_predictor.as_deref(),
 
@@ -0,0 +1,53 @@
+[package]
+name = "tts-app"
+version = "0.1.0"
+description = "Qwen3-TTS Desktop Application"
+authors = ["RustTTS Team"]
+license = "MIT"
+edition = "2021"
+rust-version = "1.70"
+
+[lib]
+name = "tts_app_lib"
+crate-type = ["staticlib", "cdylib", "rlib"]
+
+[build-dependencies]
+tauri-build = { version = "2", features = [] }
+
+[dependencies]
+# Tauri
+tauri = { version = "2", features = [] }
+tauri-plugin-shell = "2"
+
+# Our TTS runtime
+runtime = { path = "../runtime" }
+tts-core = { path = "../tts-core" }
+audio-codec-12hz = { path = "../audio-codec-12hz" }
+
+# Async runtime
+tokio = { version = "1", features = ["rt-multi-thread", "sync", "macros"] }
+
+# Audio playback
+rodio = { version = "0.19", default-features = false, features = ["wav"] }
+
+# Serialization
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+
+# Error handling
+anyhow = "1"
+thiserror = "1"
+
+# Logging
+tracing = "0.1"
+tracing-subscriber = { version = "0.3", features = ["env-filter"] }
+
+# Tensor operations (for device selection)
+candle-core = "0.8"
+
+# System directories
+dirs = "5"
+
+[features]
+default = ["custom-protocol"]
+custom-protocol = ["tauri/custom-protocol"]
@@ -0,0 +1,3 @@
+fn main() {
+    tauri_build::build() // build-script entry point for crates/tts-app: delegates to the `tauri-build` build-dependency (presumably Tauri's build-time setup — confirm against its docs)
+}
@@ -0,0 +1 @@
+{}
Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ members = [`
`9`	`9`	`"crates/runtime",`
`10`	`10`	`"crates/tts-cli",`
`11`	`11`	`"crates/tts-server",`
	`12`	`+ "crates/tts-app",`
`12`	`13`	`]`
`13`	`14`
`14`	`15`	`[workspace.package]`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+fn main() {`
	`2`	`+ tauri_build::build()`
	`3`	`+}`