From 4f409037c21e1e355ac10fee178d008262427c70 Mon Sep 17 00:00:00 2001 From: kewton Date: Wed, 25 Mar 2026 19:50:49 +0900 Subject: [PATCH] fix(search): add semantic fallback when BM25 returns 0 results in hybrid search When BM25 returns 0 results, RRF merge compressed semantic scores to ~0.016 (1/61), making hybrid search nearly useless for queries without keyword matches. This adds a fallback path in try_hybrid_search() that returns semantic results with their original cosine similarity scores when BM25 is empty, consistent with the existing fallback in suggest.rs. Closes #178 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/cli/search.rs | 13 +++- src/search/hybrid.rs | 39 ++++++++++++ tests/e2e_semantic_hybrid.rs | 119 +++++++++++++++++++++++++++++++++++ 3 files changed, 169 insertions(+), 2 deletions(-) diff --git a/src/cli/search.rs b/src/cli/search.rs index 253e0f8..22df1cb 100644 --- a/src/cli/search.rs +++ b/src/cli/search.rs @@ -932,8 +932,17 @@ fn try_hybrid_search( }) .collect(); - // 8. RRFマージ - Ok(rrf_merge(&bm25_results, &filtered_semantic, options.limit)) + // 8. 
RRFマージ(BM25=0件の場合はセマンティックフォールバック) + if bm25_results.is_empty() && !filtered_semantic.is_empty() { + eprintln!("[hybrid] BM25 returned 0 results, using semantic-only results."); + Ok(crate::search::hybrid::semantic_fallback( + &filtered_semantic, + &similar_results, + options.limit, + )) + } else { + Ok(rrf_merge(&bm25_results, &filtered_semantic, options.limit)) + } } /// セマンティック検索結果をSearchResult型に変換する(ハイブリッド検索用) diff --git a/src/search/hybrid.rs b/src/search/hybrid.rs index f4ab294..753a7b5 100644 --- a/src/search/hybrid.rs +++ b/src/search/hybrid.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; +use crate::embedding::store::EmbeddingSimilarityResult; use crate::indexer::reader::SearchResult; /// RRF定数(業界標準値) @@ -60,6 +61,44 @@ pub fn rrf_merge( rrf_merge_multiple(&[bm25_results.to_vec(), semantic_results.to_vec()], limit) } +/// Fallback for when BM25 returns 0 results: returns the semantic results scored by cosine similarity. +/// +/// `filtered_semantic` holds the semantic results already converted to tantivy's SearchResult type. +/// `similar_results` holds the original EmbeddingSimilarityResult values (which carry the cosine similarity). +/// Replaces each score with the cosine similarity looked up by (path, heading) — entries with no match keep their existing score — then sorts by score in descending order and truncates to `limit`. +pub fn semantic_fallback( + filtered_semantic: &[SearchResult], + similar_results: &[EmbeddingSimilarityResult], + limit: usize, +) -> Vec { + let similarity_map: HashMap<(String, String), f32> = similar_results + .iter() + .map(|r| { + ( + (r.file_path.clone(), r.section_heading.clone()), + r.similarity, + ) + }) + .collect(); + let mut results: Vec = filtered_semantic + .iter() + .map(|r| { + let mut result = r.clone(); + if let Some(&sim) = similarity_map.get(&(r.path.clone(), r.heading.clone())) { + result.score = sim; + } + result + }) + .collect(); + results.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + results.truncate(limit); + results +} + /// ファイルキーのRRFスコアを計算する内部ヘルパー fn compute_file_rrf_scores(ranked_lists: &[&[(String, f32)]]) -> HashMap { let mut scores = HashMap::new(); diff --git a/tests/e2e_semantic_hybrid.rs b/tests/e2e_semantic_hybrid.rs index 
8483c51..cc8874f 100644 --- a/tests/e2e_semantic_hybrid.rs +++ b/tests/e2e_semantic_hybrid.rs @@ -569,6 +569,125 @@ fn test_rerank_fallback_llm_comment() { ); } +// =========================================================================== +// BM25=0 semantic fallback tests (Issue #178) +// =========================================================================== + +#[test] +fn test_hybrid_bm25_zero_semantic_fallback() { + // When BM25 returns 0 results but semantic has hits, + // the fallback should return semantic results with cosine similarity scores. + // This test calls semantic_fallback directly, so no BM25 result list is constructed. + let semantic = vec![ + make_search_result("alpha.md", "Alpha Document", 0.95), + make_search_result("beta.md", "Beta Document", 0.80), + ]; + + // Raw similarity results, passed as the second argument just as try_hybrid_search does + let similar_results = vec![ + commandindex::embedding::store::EmbeddingSimilarityResult { + file_path: "alpha.md".to_string(), + section_heading: "Alpha Document".to_string(), + similarity: 0.95, + }, + commandindex::embedding::store::EmbeddingSimilarityResult { + file_path: "beta.md".to_string(), + section_heading: "Beta Document".to_string(), + similarity: 0.80, + }, + ]; + + // Call the fallback under test with limit = 10 + let results = commandindex::search::hybrid::semantic_fallback(&semantic, &similar_results, 10); + + assert!( + !results.is_empty(), + "test_hybrid_bm25_zero_semantic_fallback: should return results when BM25 is empty" + ); + assert_eq!( + results.len(), + 2, + "test_hybrid_bm25_zero_semantic_fallback: should return 2 results" + ); + + // Scores should be cosine similarity values (0.0 to 1.0 range) + for r in &results { + assert!( + r.score >= 0.0 && r.score <= 1.0, + "test_hybrid_bm25_zero_semantic_fallback: score {} should be in [0.0, 1.0]", + r.score + ); + } + + // alpha should rank first (higher similarity) + assert_eq!( + results[0].path, "alpha.md", + "test_hybrid_bm25_zero_semantic_fallback: alpha (0.95) should rank 
first" + ); + assert_eq!( + results[1].path, "beta.md", + "test_hybrid_bm25_zero_semantic_fallback: beta (0.80) should rank second" + ); + + // Verify the returned scores equal the similarity values supplied in similar_results + assert!( + (results[0].score - 0.95).abs() < 1e-6, + "test_hybrid_bm25_zero_semantic_fallback: alpha score {} should be ~0.95", + results[0].score + ); + assert!( + (results[1].score - 0.80).abs() < 1e-6, + "test_hybrid_bm25_zero_semantic_fallback: beta score {} should be ~0.80", + results[1].score + ); +} + +#[test] +fn test_hybrid_bm25_zero_semantic_zero() { + // When both BM25 and semantic return 0 results, the result should be empty. + // semantic_fallback is called with two empty slices here. + let semantic: Vec = vec![]; + let similar_results: Vec = vec![]; + + let results = commandindex::search::hybrid::semantic_fallback(&semantic, &similar_results, 10); + + assert!( + results.is_empty(), + "test_hybrid_bm25_zero_semantic_zero: should return empty when both are empty" + ); +} + +#[test] +fn test_hybrid_bm25_zero_respects_limit() { + // When BM25=0, semantic fallback should respect the limit parameter (5 inputs, limit = 3). + let semantic: Vec = (0..5) + .map(|i| { + make_search_result( + &format!("doc{i}.md"), + &format!("Doc {i}"), + 0.9 - i as f32 * 0.1, + ) + }) + .collect(); + let similar_results: Vec = (0..5) + .map( + |i| commandindex::embedding::store::EmbeddingSimilarityResult { + file_path: format!("doc{i}.md"), + section_heading: format!("Doc {i}"), + similarity: 0.9 - i as f32 * 0.1, + }, + ) + .collect(); + + let results = commandindex::search::hybrid::semantic_fallback(&semantic, &similar_results, 3); + + assert_eq!( + results.len(), + 3, + "test_hybrid_bm25_zero_respects_limit: should truncate to limit=3" + ); +} + // =========================================================================== // Environment-dependent tests (require Ollama) // ===========================================================================