From 9aef0ad6c09333d278b5cfcdeab53edc8e72180b Mon Sep 17 00:00:00 2001 From: Lincoln Stein Date: Wed, 20 May 2026 16:20:45 -0400 Subject: [PATCH] chore(dedupe): collapse low-risk redundancies from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend: * drop the duplicate `/filetree/home` endpoint from `routers/album.py`; `routers/filetree.py` already serves the canonical one (whichever router was included last had been winning) * extract `_compute_curation()` and `_validate_curation_request()` — `/curate` (async background task) and `/curate_sync` were carrying ~150 lines of identical Monte Carlo / mapping / selection logic. The async path passes a small `on_iteration` callback to drive its ProgressTracker * route the three index endpoints (`delete_image`, `move_images`, `copy_images`) through the existing `get_embeddings_for_album` helper instead of each re-instantiating `Embeddings` directly * add `_l2_normalize(x, axis, eps=1e-12)` to `embeddings.py` and use it in all three numpy-normalize sites — also closes the no-epsilon NaN risk in `find_duplicate_clusters` * add `_normalized_filtered_embeddings()` shared loader for FPS / KMeans (npz open + valid mask + normalize); each function is now ~15 lines shorter and the "load" half is provably identical Frontend: * delete the duplicate `createPathField` in `album-manager.js` (defined twice with identical bodies — the second shadowed the first) * share the exported `scoreDisplay` singleton in `seek-slider.js` instead of constructing a second `ScoreDisplay`. The duplicate instance was double-binding the star click handler on `#scoreText`, so a bookmark toggle fired twice and immediately cancelled itself Net: +172 / −263 across 6 files. Co-Authored-By: Claude Opus 4.7 (1M context) --- photomap/backend/embeddings.py | 93 +++--- photomap/backend/routers/album.py | 12 - photomap/backend/routers/curation.py | 278 +++++++----------- photomap/backend/routers/index.py | 22 +- .../static/javascript/album-manager.js | 23 -- .../frontend/static/javascript/seek-slider.js | 7 +- 6 files changed, 172 insertions(+), 263 deletions(-) diff --git a/photomap/backend/embeddings.py b/photomap/backend/embeddings.py index cab98bb5..06299720 100644 --- a/photomap/backend/embeddings.py +++ b/photomap/backend/embeddings.py @@ -69,6 +69,44 @@ def _get_indexing_semaphore() -> asyncio.Semaphore: return _indexing_semaphore +def _l2_normalize(x: np.ndarray, axis: int = -1, eps: float = 1e-12) -> np.ndarray: + """L2-normalize ``x`` along ``axis`` with an epsilon guard against zero vectors. + + Several call sites previously open-coded ``x / (norm + 1e-10)`` (or worse, + no epsilon at all in ``find_duplicate_clusters``, which would NaN on any + all-zero embedding). Funneling through one helper makes the epsilon + consistent and the intent obvious. + """ + norms = np.linalg.norm(x, axis=axis, keepdims=True) + return x / (norms + eps) + + +def _normalized_filtered_embeddings( + embeddings_path: Path, + ignore_indices: list[int] | None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """Open the .npz, drop ``ignore_indices``, and L2-normalize the survivors. + + Returns ``(normalized_vectors, valid_global_indices, filenames)``. When + every row is masked out, ``normalized_vectors`` is empty (shape ``(0,)``) + and the caller is expected to return early. + """ + data = _open_npz_file(embeddings_path) + embeddings = data["embeddings"] + filenames = data["filenames"] + + valid_mask = np.ones(len(embeddings), dtype=bool) + if ignore_indices: + valid_mask[ignore_indices] = False + valid_global_indices = np.where(valid_mask)[0] + filtered = embeddings[valid_global_indices] + + if len(filtered) == 0: + return filtered, valid_global_indices, filenames + + return _l2_normalize(filtered, axis=1), valid_global_indices, filenames + + register_heif_opener() # Register HEIF opener for PIL SUPPORTED_EXTENSIONS = { ".jpg", @@ -104,33 +142,15 @@ def get_fps_indices_global( Returns: List of selected filenames. """ - data = _open_npz_file(embeddings_path) - embeddings = data["embeddings"] - filenames = data["filenames"] - - n_total = len(embeddings) - - # Create a mask of VALID indices (True = Keep, False = Ignore) - valid_mask = np.ones(n_total, dtype=bool) - if ignore_indices: - valid_mask[ignore_indices] = False - - # Get the indices that are actually available - valid_global_indices = np.where(valid_mask)[0] - - # Filter embeddings to only valid ones - filtered_embeddings = embeddings[valid_global_indices] - - n_samples = len(filtered_embeddings) + vectors, valid_global_indices, filenames = _normalized_filtered_embeddings( + embeddings_path, ignore_indices + ) + n_samples = len(vectors) if n_samples == 0: return [] if n_target >= n_samples: return filenames[valid_global_indices].tolist() - # Normalize - norms = np.linalg.norm(filtered_embeddings, axis=1, keepdims=True) - vectors = filtered_embeddings / (norms + 1e-10) - # Standard FPS Logic on the FILTERED set rng = np.random.RandomState(seed) # Pick random index relative to the FILTERED set @@ -175,29 +195,15 @@ def get_kmeans_indices_global( Returns: List of selected filenames. """ - data = _open_npz_file(embeddings_path) - embeddings = data["embeddings"] - filenames = data["filenames"] - - n_total = len(embeddings) - - valid_mask = np.ones(n_total, dtype=bool) - if ignore_indices: - valid_mask[ignore_indices] = False - - valid_global_indices = np.where(valid_mask)[0] - filtered_embeddings = embeddings[valid_global_indices] - - n_samples = len(filtered_embeddings) + vectors, valid_global_indices, filenames = _normalized_filtered_embeddings( + embeddings_path, ignore_indices + ) + n_samples = len(vectors) if n_samples == 0: return [] if n_target >= n_samples: return filenames[valid_global_indices].tolist() - # Normalize - norms = np.linalg.norm(filtered_embeddings, axis=1, keepdims=True) - vectors = filtered_embeddings / (norms + 1e-10) - # Cluster kmeans = KMeans(n_clusters=n_target, random_state=seed, n_init=10) labels = kmeans.fit_predict(vectors) @@ -1363,10 +1369,9 @@ def find_duplicate_clusters(self, similarity_threshold=0.995): embeddings = data["embeddings"] filenames = data["filenames"] - # Normalize embeddings - norm_embeddings = embeddings / np.linalg.norm( - embeddings, axis=-1, keepdims=True - ) + # Normalize embeddings. ``_l2_normalize`` carries an epsilon guard so + # an all-zero row can't produce NaN here. + norm_embeddings = _l2_normalize(embeddings, axis=-1) assert isinstance( norm_embeddings, np.ndarray ), "Normalization failed, expected np.ndarray" diff --git a/photomap/backend/routers/album.py b/photomap/backend/routers/album.py index e06f0270..0450d6fd 100644 --- a/photomap/backend/routers/album.py +++ b/photomap/backend/routers/album.py @@ -311,15 +311,3 @@ def validate_image_access(album_config, image_path: Path) -> bool: ) -@album_router.get("/filetree/home", tags=["File Management"]) -async def get_home_directory(): - """Get the home directory path for the current user.""" - check_album_lock() # May raise a 403 exception - # In a real application, you would determine the home directory based on the user's - # profile or configuration. Here, we just return a fixed path for demonstration. - try: - home_dir = str(Path.home()) - return {"homePath": home_dir} - except Exception as e: - logger.error(f"Error getting home directory: {e}") - return {"homePath": ""} diff --git a/photomap/backend/routers/curation.py b/photomap/backend/routers/curation.py index 225e26bc..6442f9cc 100644 --- a/photomap/backend/routers/curation.py +++ b/photomap/backend/routers/curation.py @@ -5,6 +5,7 @@ import threading import uuid from collections import Counter +from collections.abc import Callable from pathlib import Path from typing import Any @@ -44,97 +45,120 @@ class ExportRequest(BaseModel): filenames: list[str] output_folder: str +def _validate_curation_request(request: CurationRequest) -> None: + """Clamp / reject obviously bad inputs. Shared by the async and sync endpoints.""" + if request.target_count <= 0: + raise HTTPException(status_code=400, detail="target_count must be positive") + if request.target_count > 100000: + raise HTTPException(status_code=400, detail="target_count exceeds reasonable limit") + if request.iterations < 1: + request.iterations = 1 + if request.iterations > 30: + request.iterations = 30 + + +def _compute_curation( + request: CurationRequest, + on_iteration: Callable[[int], None] | None = None, +) -> dict[str, Any]: + """Run the Monte Carlo curation and build the structured response dict. + + ``on_iteration(i)`` (1-indexed) fires after each Monte Carlo iteration so + the async endpoint can advance its ProgressTracker; the sync endpoint + passes ``None`` and just runs to completion. + + Raises ``LookupError`` if the album key doesn't resolve — callers decide + whether that surfaces as a 404 (sync endpoint) or a ProgressTracker error + (background task). + """ + config_manager = get_config_manager() + album_config = config_manager.get_album(request.album) + if not album_config: + raise LookupError("Album not found") + + index_path = Path(album_config.index) + vote_counter: Counter = Counter() + + for i in range(request.iterations): + run_seed = random.randint(0, 1000000) + if request.method == "kmeans": + selected_files = get_kmeans_indices_global( + index_path, request.target_count, run_seed, request.excluded_indices + ) + else: + selected_files = get_fps_indices_global( + index_path, request.target_count, run_seed, request.excluded_indices + ) + vote_counter.update(selected_files) + if on_iteration is not None: + on_iteration(i + 1) + + data = _open_npz_file(index_path) + filename_map = data["filename_map"] + norm_map = {os.path.normpath(k).lower(): v for k, v in filename_map.items()} + + # Every image that received a vote goes into the analysis table; the + # exclusion check defends against algorithms that returned an excluded + # index (e.g. from index drift after a recent re-index). + analysis_results = [] + for filepath, count in vote_counter.most_common(): + f_norm = os.path.normpath(filepath).lower() + if f_norm in norm_map: + idx = int(norm_map[f_norm]) + if idx in request.excluded_indices: + continue + + analysis_results.append({ + "filename": os.path.basename(filepath), + "subfolder": os.path.basename(os.path.dirname(filepath)), + "filepath": filepath, + "index": idx, + "count": count, + "frequency": round((count / request.iterations) * 100, 1), + }) + + # Top-N consensus winners. + consensus_files = [x["filepath"] for x in analysis_results[: request.target_count]] + selected_indices: list[int] = [] + final_file_list: list[str] = [] + for f in consensus_files: + f_norm = os.path.normpath(f).lower() + if f_norm in norm_map: + selected_indices.append(int(norm_map[f_norm])) + final_file_list.append(f) + + return { + "status": "success", + "count": len(selected_indices), + "target_count": request.target_count, + "selected_indices": selected_indices, + "selected_files": final_file_list, + "analysis_results": analysis_results, + } + + def _run_curation_task(job_id: str, request: CurationRequest): """ Background task to run curation process with progress tracking. """ try: - # Start progress tracking progress_tracker.start_operation(job_id, request.iterations, "curating") - - config_manager = get_config_manager() - album_config = config_manager.get_album(request.album) - if not album_config: - progress_tracker.set_error(job_id, "Album not found") - return - - index_path = Path(album_config.index) logger.info(f"Curation Job {job_id}: Running {request.method.upper()} x{request.iterations}...") - vote_counter = Counter() - - # Run Monte Carlo with progress updates - for i in range(request.iterations): - run_seed = random.randint(0, 1000000) - if request.method == "kmeans": - selected_files = get_kmeans_indices_global( - index_path, request.target_count, run_seed, request.excluded_indices - ) - else: - selected_files = get_fps_indices_global( - index_path, request.target_count, run_seed, request.excluded_indices - ) - vote_counter.update(selected_files) - - # Update progress after each iteration + def _on_iter(i: int) -> None: progress_tracker.update_progress( - job_id, - i + 1, - f"Iteration {i + 1}/{request.iterations}" + job_id, i, f"Iteration {i}/{request.iterations}" ) - # Prepare Data needed for mapping - data = _open_npz_file(index_path) - filename_map = data["filename_map"] - norm_map = {os.path.normpath(k).lower(): v for k, v in filename_map.items()} - - # Generate CSV Data (Analysis Results) - analysis_results = [] - for filepath, count in vote_counter.most_common(): - f_norm = os.path.normpath(filepath).lower() - if f_norm in norm_map: - idx = int(norm_map[f_norm]) - - if idx in request.excluded_indices: - continue - - subfolder = os.path.basename(os.path.dirname(filepath)) - - analysis_results.append({ - "filename": os.path.basename(filepath), - "subfolder": subfolder, - "filepath": filepath, - "index": idx, - "count": count, - "frequency": round((count / request.iterations) * 100, 1) - }) - - # Generate Selection (top N winners) - consensus_files = [x['filepath'] for x in analysis_results[:request.target_count]] - - selected_indices = [] - final_file_list = [] - - for f in consensus_files: - f_norm = os.path.normpath(f).lower() - if f_norm in norm_map: - selected_indices.append(int(norm_map[f_norm])) - final_file_list.append(f) - - result = { - "status": "success", - "count": len(selected_indices), - "target_count": request.target_count, - "selected_indices": selected_indices, - "selected_files": final_file_list, - "analysis_results": analysis_results - } + try: + result = _compute_curation(request, on_iteration=_on_iter) + except LookupError as exc: + progress_tracker.set_error(job_id, str(exc)) + return - # Store result with _curation_results_lock: _curation_results[job_id] = result - # Mark as completed progress_tracker.complete_operation(job_id, "Curation completed") logger.info(f"Curation Job {job_id}: Completed successfully") @@ -154,17 +178,7 @@ async def run_curation(request: CurationRequest, background_tasks: BackgroundTas Returns a job_id that can be used to poll for progress and results. """ try: - # Validate target_count parameter - if request.target_count <= 0: - raise HTTPException(status_code=400, detail="target_count must be positive") - if request.target_count > 100000: - raise HTTPException(status_code=400, detail="target_count exceeds reasonable limit") - - # Validate and cap iterations - if request.iterations < 1: - request.iterations = 1 - if request.iterations > 30: - request.iterations = 30 + _validate_curation_request(request) # Generate unique job ID job_id = f"curation_{uuid.uuid4().hex[:8]}" @@ -178,6 +192,8 @@ async def run_curation(request: CurationRequest, background_tasks: BackgroundTas "iterations": request.iterations } + except HTTPException: + raise except Exception as e: logger.error(f"Failed to start curation: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) from e @@ -239,90 +255,14 @@ async def run_curation_sync(request: CurationRequest): JSON response with status, selected indices, files, and analysis results. """ try: - # Validate target_count parameter - if request.target_count <= 0: - raise HTTPException(status_code=400, detail="target_count must be positive") - if request.target_count > 100000: - raise HTTPException(status_code=400, detail="target_count exceeds reasonable limit") - - # Validate and cap iterations - if request.iterations < 1: - request.iterations = 1 - if request.iterations > 30: - request.iterations = 30 - - config_manager = get_config_manager() - album_config = config_manager.get_album(request.album) - if not album_config: - raise HTTPException(status_code=404, detail="Album not found") - index_path = Path(album_config.index) - + _validate_curation_request(request) logger.info(f"Curation: Running {request.method.upper()} x{request.iterations}...") + return _compute_curation(request) - vote_counter = Counter() - - # 1. Run Monte Carlo - for _i in range(request.iterations): - run_seed = random.randint(0, 1000000) - if request.method == "kmeans": - selected_files = get_kmeans_indices_global( - index_path, request.target_count, run_seed, request.excluded_indices - ) - else: - selected_files = get_fps_indices_global( - index_path, request.target_count, run_seed, request.excluded_indices - ) - vote_counter.update(selected_files) - - # 2. Prepare Data needed for mapping - data = _open_npz_file(index_path) - filename_map = data["filename_map"] - norm_map = {os.path.normpath(k).lower(): v for k, v in filename_map.items()} - - # 3. Generate CSV Data (Analysis Results) - Includes EVERY image that got a vote - analysis_results = [] - for filepath, count in vote_counter.most_common(): - f_norm = os.path.normpath(filepath).lower() - if f_norm in norm_map: - idx = int(norm_map[f_norm]) - - # CRITICAL FIX: Strictly enforce exclusion. - # Even if the algo returned it (e.g. due to index drift), we MUST drop it here. - if idx in request.excluded_indices: - continue - - subfolder = os.path.basename(os.path.dirname(filepath)) - - analysis_results.append({ - "filename": os.path.basename(filepath), - "subfolder": subfolder, - "filepath": filepath, - "index": idx, - "count": count, - "frequency": round((count / request.iterations) * 100, 1) - }) - - # 4. Generate Selection (Green Dots) - Just the top N winners - consensus_files = [x['filepath'] for x in analysis_results[:request.target_count]] - - selected_indices = [] - final_file_list = [] - - for f in consensus_files: - f_norm = os.path.normpath(f).lower() - if f_norm in norm_map: - selected_indices.append(int(norm_map[f_norm])) - final_file_list.append(f) - - return { - "status": "success", - "count": len(selected_indices), - "target_count": request.target_count, - "selected_indices": selected_indices, - "selected_files": final_file_list, - "analysis_results": analysis_results - } - + except HTTPException: + raise + except LookupError as e: + raise HTTPException(status_code=404, detail=str(e)) from e except Exception as e: logger.error(f"Curation Error: {e}") raise HTTPException(status_code=500, detail=str(e)) from e diff --git a/photomap/backend/routers/index.py b/photomap/backend/routers/index.py index 6f9e479b..30db6875 100644 --- a/photomap/backend/routers/index.py +++ b/photomap/backend/routers/index.py @@ -16,7 +16,12 @@ from ..config import get_config_manager from ..embeddings import Embeddings, peek_encoder_spec from ..progress import progress_tracker -from .album import check_album_lock, validate_album_exists, validate_image_access +from .album import ( + check_album_lock, + get_embeddings_for_album, + validate_album_exists, + validate_image_access, +) index_router = APIRouter() @@ -253,10 +258,7 @@ async def delete_image(album_key: str, index: int) -> JSONResponse: check_album_lock() # May raise a 403 exception try: album_config = validate_album_exists(album_key) - embeddings = Embeddings( - embeddings_path=Path(album_config.index), - encoder_spec=album_config.encoder_spec, - ) + embeddings = get_embeddings_for_album(album_key) image_path = embeddings.get_image_path(index) if not validate_image_access(album_config, image_path): @@ -289,10 +291,7 @@ async def move_images(album_key: str, req: MoveImagesRequest) -> JSONResponse: check_album_lock() # May raise a 403 exception try: album_config = validate_album_exists(album_key) - embeddings = Embeddings( - embeddings_path=Path(album_config.index), - encoder_spec=album_config.encoder_spec, - ) + embeddings = get_embeddings_for_album(album_key) target_dir = Path(req.target_directory) @@ -380,10 +379,7 @@ async def copy_images(album_key: str, req: CopyImagesRequest) -> JSONResponse: # Note: No album lock check - copying doesn't modify the album try: album_config = validate_album_exists(album_key) - embeddings = Embeddings( - embeddings_path=Path(album_config.index), - encoder_spec=album_config.encoder_spec, - ) + embeddings = get_embeddings_for_album(album_key) target_dir = Path(req.target_directory) diff --git a/photomap/frontend/static/javascript/album-manager.js b/photomap/frontend/static/javascript/album-manager.js index 6eb37713..338046c6 100644 --- a/photomap/frontend/static/javascript/album-manager.js +++ b/photomap/frontend/static/javascript/album-manager.js @@ -931,29 +931,6 @@ export class AlbumManager { } // Path field methods - createPathField(path = "", cardElement) { - const container = cardElement.querySelector(".edit-album-paths-container"); - return this._createAlbumPathRow({ - path, - container, - onAddRow: () => this.addPathField("", cardElement), - onRemoveRow: () => { - if (container && container.children.length === 0) { - this.addPathField("", cardElement); - } - }, - onFolderPick: (currentPath, setPath) => { - createSimpleDirectoryPicker( - (selectedPath) => { - setPath(selectedPath); - }, - currentPath, - { showCreateFolder: true } - ); - }, - }); - } - addPathField(path = "", cardElement) { const container = cardElement.querySelector(".edit-album-paths-container"); if (container) { diff --git a/photomap/frontend/static/javascript/seek-slider.js b/photomap/frontend/static/javascript/seek-slider.js index f334c156..29f269ab 100644 --- a/photomap/frontend/static/javascript/seek-slider.js +++ b/photomap/frontend/static/javascript/seek-slider.js @@ -1,6 +1,6 @@ import { backStack } from "./back-stack.js"; import { bookmarkManager } from "./bookmarks.js"; -import { ScoreDisplay } from "./score-display.js"; +import { scoreDisplay } from "./score-display.js"; import { getCurrentSlideIndex, slideState } from "./slide-state.js"; import { state } from "./state.js"; import { debounce } from "./utils.js"; @@ -35,7 +35,10 @@ class SeekSlider { this.sliderContainer = document.getElementById("sliderWithTicksContainer"); this.scoreDisplayElement = document.getElementById("fixedScoreDisplay"); this.scoreSliderRow = document.getElementById("scoreSliderRow"); - this.scoreDisplayObj = new ScoreDisplay(); + // Share the singleton from score-display.js. Constructing a second + // instance double-binds the star click handler on #scoreText, so a + // bookmark toggle would fire twice and immediately cancel itself. + this.scoreDisplayObj = scoreDisplay; this.hoverStrip = document.getElementById("sliderHoverStrip"); this.scoreText = document.getElementById("scoreText");