From 3be518676b1cf96b496d6c893dadba1c91f66349 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 15:53:16 +0000 Subject: [PATCH] feat: add retry handling for transient sync failures https://claude.ai/code/session_01V5c1rKZeg6R373XJw2rYHd --- CHANGELOG.md | 15 ++ nexanote/api/routes.py | 9 + nexanote/sync/client.py | 426 ++++++++++++++++++++++++++------- nexanote/sync/sync_log.py | 24 ++ tests/test_sync_reliability.py | 260 +++++++++++++++++++- 5 files changed, 645 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1e1dd4..895dfc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,21 @@ This project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). note body content, passwords, tokens, or server URLs — error strings are scrubbed and the plan/report only ever hold metadata. +- **Retry & backoff for transient failures.** Every WebDAV network + operation (GET, PROPFIND, PUT, MKCOL) is now retried on transient + conditions — timeouts, connection drops, and HTTP 429/502/503/504 — + with small conservative defaults (3 attempts, 0.5s/1s/2s backoff). + Auth failures (401/403) and 404 are never retried. Both the attempt + budget and per-call timeout are configurable on `SyncConfig` + (`max_attempts`, `backoff_seconds`, `timeout_seconds`). + +- **Retryable sync reports.** When a sync fails for a transient reason the + report (and `POST /sync/trigger`) carries `retryable: true`, a short + `transient_reason`, and a suggested `next_retry_after_seconds`. The + diagnostic log additionally records per-operation attempt counts so a + flaky NAS/Cloudflare/mobile link is easy to spot — still with no body + content, credentials, or server URLs. + ### Improved - **Conflict safety.** When a note changed both locally and remotely, the diff --git a/nexanote/api/routes.py b/nexanote/api/routes.py index a4f87bf..ab1f4a8 100644 --- a/nexanote/api/routes.py +++ b/nexanote/api/routes.py @@ -176,6 +176,12 @@ class SyncReportSchema(BaseModel): conflicts: list[dict] = Field(default_factory=list) warnings: list[str] = Field(default_factory=list) plan: dict = Field(default_factory=dict) + # Network reliability — additive too. ``retryable`` flags a transient + # failure (timeout/connection/429/502/503/504) so clients can back off + # and retry instead of surfacing it as a hard error. + retryable: bool = False + next_retry_after_seconds: Optional[float] = None + transient_reason: Optional[str] = None class ExportRequestSchema(BaseModel): @@ -603,6 +609,9 @@ def trigger_sync(dry_run: bool = Query(False)): conflicts=[c.to_dict() for c in plan.conflicts] if plan else [], warnings=list(plan.warnings) if plan else [], plan=plan.to_dict() if plan else {}, + retryable=report.retryable, + next_retry_after_seconds=report.next_retry_after_seconds, + transient_reason=report.transient_reason, ) # A dry-run is a preview — it must not clobber the last *real* status. if not dry_run: diff --git a/nexanote/sync/client.py b/nexanote/sync/client.py index b0ec7c5..a21200a 100644 --- a/nexanote/sync/client.py +++ b/nexanote/sync/client.py @@ -76,6 +76,71 @@ def _sanitize_request_error(exc: BaseException) -> str: return f"WebDAV upload failed: {name}" +# --------------------------------------------------------------------------- +# Retry / transient-failure classification +# --------------------------------------------------------------------------- + +# EN: HTTP statuses that signal a *transient* server/proxy condition worth +# retrying (rate limit + the bad-gateway/unavailable/timeout family that +# a NAS behind Cloudflare or a flaky mobile link routinely emits). +# FR: Statuts HTTP transitoires qui méritent une nouvelle tentative. +RETRYABLE_STATUS_CODES = frozenset({429, 502, 503, 504}) + +# EN: Statuses we must NEVER retry — auth failures and a missing resource are +# deterministic, so retrying only wastes time and can lock an account. +# FR: Statuts à ne jamais retenter (auth / ressource absente). +NON_RETRYABLE_STATUS_CODES = frozenset({401, 403, 404}) + +# EN: Small, conservative defaults. 3 attempts total with 0.5s/1s/2s waits +# between them keeps a flaky link usable without hammering the server. +# FR: Réglages prudents : 3 tentatives, attentes 0.5s/1s/2s. +DEFAULT_MAX_ATTEMPTS = 3 +DEFAULT_BACKOFF_SECONDS: tuple[float, ...] = (0.5, 1.0, 2.0) + + +def _is_transient_exception(exc: BaseException) -> bool: + """True for network conditions that are worth retrying.""" + return isinstance(exc, (requests.Timeout, requests.ConnectionError)) + + +def _transient_exception_reason(exc: BaseException) -> str: + """ + EN: Short, URL-free reason for a transient network exception. We use the + class name only (``ConnectTimeout``, ``ReadTimeout``, ``ConnectionError``) + because ``str(exc)`` from requests embeds the full URL/host. + FR: Motif court et sans URL pour une exception réseau transitoire. + """ + return type(exc).__name__ + + +@dataclass +class OperationAttempt: + """ + EN: One diagnostic record per WebDAV operation: how many attempts it + took, whether the final outcome was a *retryable* (transient) + failure, and a short sanitized reason. ``path`` is the relative DAV + slug path only — never the full URL, host, or credentials. + FR: Un enregistrement de diagnostic par opération WebDAV (tentatives, + caractère retentable, motif court). ``path`` = chemin DAV relatif + seulement, jamais l'URL/hôte/identifiants. + """ + + operation: str + path: str + attempts: int + retryable: bool + reason: Optional[str] = None + + def to_dict(self) -> dict: + return { + "operation": self.operation, + "path": self.path, + "attempts": self.attempts, + "retryable": self.retryable, + "reason": self.reason, + } + + # --------------------------------------------------------------------------- # Config # --------------------------------------------------------------------------- @@ -89,6 +154,15 @@ class SyncConfig: timeout_seconds: int = 15 conflict_strategy: ConflictStrategy = ConflictStrategy.MERGE_STROKES verify_ssl: bool = True + # EN: Retry budget for transient WebDAV failures. ``max_attempts`` counts + # the first try, so 3 = one initial call + two retries. The backoff + # tuple gives the wait *before* each retry; it is clamped to its last + # value when there are more retries than entries. + # FR: Budget de retries pour les échecs transitoires. ``max_attempts`` + # inclut le premier essai. ``backoff_seconds`` = attente avant chaque + # nouvelle tentative. + max_attempts: int = DEFAULT_MAX_ATTEMPTS + backoff_seconds: tuple[float, ...] = DEFAULT_BACKOFF_SECONDS # --------------------------------------------------------------------------- @@ -140,6 +214,19 @@ class SyncReport: # EN: True when this report describes a dry-run (no files/state written). # FR: Vrai quand le rapport décrit un dry-run (aucune écriture). dry_run: bool = False + # EN: Network-reliability metadata. ``retryable`` is True when the sync + # failed (or partially failed) for a *transient* reason — a timeout, + # connection drop, or 429/502/503/504 — so retrying later is sensible. + # ``next_retry_after_seconds`` is a suggested wait before that retry, + # and ``transient_reason`` is a short sanitized cause (no URL/secret). + # ``operation_attempts`` lists per-operation attempt counts for the log. + # FR: Métadonnées de fiabilité réseau. ``retryable`` = vrai si la sync a + # échoué pour une raison transitoire ; ``next_retry_after_seconds`` = + # délai suggéré ; ``transient_reason`` = motif court assaini. + retryable: bool = False + next_retry_after_seconds: Optional[float] = None + transient_reason: Optional[str] = None + operation_attempts: list[dict] = field(default_factory=list) def finish(self) -> None: self.finished_at = datetime.now(timezone.utc) @@ -183,12 +270,106 @@ def __init__(self, config: SyncConfig) -> None: self.session = requests.Session() self.session.auth = self.auth self.session.verify = config.verify_ssl + self.max_attempts = max(1, int(getattr(config, "max_attempts", DEFAULT_MAX_ATTEMPTS))) + backoff = getattr(config, "backoff_seconds", DEFAULT_BACKOFF_SECONDS) + self.backoff_seconds: tuple[float, ...] = tuple(backoff) or (0.0,) + # EN: Diagnostics buffer — one entry per attempted operation, read by + # the engine after a sync to surface attempts/retryable in the log. + # ``last_transient`` is the most recent retryable failure (if any). + # FR: Tampon de diagnostic — une entrée par opération, lu par le moteur. + self.op_attempts: list[OperationAttempt] = [] + self.last_transient: Optional[OperationAttempt] = None def _url(self, *parts: str) -> str: """Construit une URL à partir de parties encodées.""" path = "/".join(quote(p, safe="") for p in parts if p) return urljoin(self.base_url, path) + # ------------------------------------------------------------------ + # Retry executor — the single choke point for every network call. + # ------------------------------------------------------------------ + + def _backoff(self, attempt: int) -> None: + """Sleep before retry ``attempt`` (1-based), clamped to the schedule.""" + idx = min(attempt - 1, len(self.backoff_seconds) - 1) + delay = self.backoff_seconds[idx] + if delay and delay > 0: + time.sleep(delay) + + def _record( + self, + operation: str, + path: str, + attempts: int, + retryable: bool, + reason: Optional[str], + ) -> None: + record = OperationAttempt( + operation=operation, + path=path, + attempts=attempts, + retryable=retryable, + reason=reason, + ) + self.op_attempts.append(record) + if retryable: + self.last_transient = record + + def _execute(self, operation: str, path: str, send) -> Optional["requests.Response"]: + """ + EN: Run ``send()`` (which performs one HTTP request and returns a + ``Response``) with bounded retry/backoff. Retries only transient + failures: timeouts, connection errors, and the 429/502/503/504 + family. Auth (401/403) and 404 are returned to the caller + immediately — never retried. Returns the final ``Response``, or + ``None`` when the request could not be completed (network error + or exhausted transient retries). Every call appends one + ``OperationAttempt`` diagnostic record. + FR: Exécute ``send()`` avec retry/backoff borné. Ne retente que les + échecs transitoires (timeouts, erreurs de connexion, 429/502/ + 503/504). 401/403/404 sont rendus immédiatement, jamais retentés. + Retourne la ``Response`` finale ou ``None`` si la requête n'a pas + abouti. Chaque appel ajoute un enregistrement de diagnostic. + """ + last_reason: Optional[str] = None + for attempt in range(1, self.max_attempts + 1): + try: + resp = send() + except requests.RequestException as exc: + if _is_transient_exception(exc): + last_reason = _transient_exception_reason(exc) + if attempt < self.max_attempts: + self._backoff(attempt) + continue + self._record(operation, path, attempt, True, last_reason) + return None + # Non-transient request error (e.g. malformed URL) — terminal. + self._record( + operation, path, attempt, False, _sanitize_request_error(exc) + ) + return None + + status = resp.status_code + if status in RETRYABLE_STATUS_CODES and attempt < self.max_attempts: + last_reason = f"{status} {resp.reason or ''}".strip() + self._backoff(attempt) + continue + + # Terminal response — either a success, a non-retryable status, or + # an exhausted transient one. A still-transient status here means + # we ran out of attempts, so it is reported as retryable. + retryable = status in RETRYABLE_STATUS_CODES + reason = None + if retryable: + reason = f"{status} {resp.reason or ''}".strip() + elif attempt > 1: + reason = last_reason + self._record(operation, path, attempt, retryable, reason) + return resp + + # Unreachable (loop always returns), but keeps type-checkers happy. + return None + @staticmethod def _is_mkcol_success(status_code: int) -> bool: """ @@ -204,51 +385,64 @@ def _is_mkcol_success(status_code: int) -> bool: return status_code in (200, 201, 405) def ping(self) -> bool: - """Vérifie que le serveur est accessible.""" - try: - resp = self.session.request( + """Vérifie que le serveur est accessible (avec retry transitoire).""" + resp = self._execute( + "OPTIONS", + "/", + lambda: self.session.request( "OPTIONS", self.base_url, timeout=self.config.timeout_seconds, - ) - return resp.status_code < 400 - except requests.RequestException: - return False + ), + ) + return resp is not None and resp.status_code < 400 def list_notebooks(self) -> list[dict]: """PROPFIND sur / — retourne la liste des carnets.""" - return self._propfind(self.base_url, depth=1) + return self._propfind(self.base_url, depth=1, path_label="/") def list_notes(self, notebook_slug: str) -> list[dict]: """PROPFIND sur /{notebook} — retourne la liste des notes.""" - return self._propfind(self._url(notebook_slug), depth=1) + return self._propfind( + self._url(notebook_slug), depth=1, path_label=notebook_slug + ) def list_note_files(self, notebook_slug: str, note_slug: str) -> list[dict]: """PROPFIND sur /{notebook}/{note} — retourne les fichiers.""" - return self._propfind(self._url(notebook_slug, note_slug), depth=1) + return self._propfind( + self._url(notebook_slug, note_slug), + depth=1, + path_label=f"{notebook_slug}/{note_slug}", + ) def get_note_meta(self, notebook_slug: str, note_slug: str) -> Optional[dict]: - """GET /{notebook}/{note}/note.json""" + """GET /{notebook}/{note}/note.json (avec retry transitoire).""" url = self._url(notebook_slug, note_slug, "note.json") - try: - resp = self.session.get(url, timeout=self.config.timeout_seconds) - if resp.status_code == 200: - return resp.json() + path = f"{notebook_slug}/{note_slug}/note.json" + resp = self._execute( + "GET", path, lambda: self.session.get(url, timeout=self.config.timeout_seconds) + ) + if resp is None or resp.status_code != 200: return None - except (requests.RequestException, json.JSONDecodeError) as e: - logger.error(f"GET note.json échoué ({url}): {e}") + try: + return resp.json() + except ValueError as e: + logger.error(f"GET note.json invalid JSON ({path}): {e}") return None def get_ink_page(self, notebook_slug: str, note_slug: str, page_num: int) -> Optional[dict]: - """GET /{notebook}/{note}/page_N.ink""" + """GET /{notebook}/{note}/page_N.ink (avec retry transitoire).""" url = self._url(notebook_slug, note_slug, f"page_{page_num}.ink") - try: - resp = self.session.get(url, timeout=self.config.timeout_seconds) - if resp.status_code == 200: - return resp.json() + path = f"{notebook_slug}/{note_slug}/page_{page_num}.ink" + resp = self._execute( + "GET", path, lambda: self.session.get(url, timeout=self.config.timeout_seconds) + ) + if resp is None or resp.status_code != 200: return None - except (requests.RequestException, json.JSONDecodeError) as e: - logger.error(f"GET page.ink échoué ({url}): {e}") + try: + return resp.json() + except ValueError as e: + logger.error(f"GET page.ink invalid JSON ({path}): {e}") return None def put_note_meta( @@ -303,37 +497,52 @@ def _put_json( FR: Helper interne pour PUT avec récupération MKCOL+retry sur 409. """ prefix = f"{label}: " if page_num is None else f"page {page_num}: " - - def _do_put() -> requests.Response: - return self.session.put( - url, - json=data, - headers={"Content-Type": "application/json"}, - timeout=self.config.timeout_seconds, + put_path = f"{mkcol_paths[-1]}/{label}" if mkcol_paths else label + + def _do_put() -> Optional[requests.Response]: + # Each PUT carries its own transient retry/backoff via _execute. + return self._execute( + "PUT", + put_path, + lambda: self.session.put( + url, + json=data, + headers={"Content-Type": "application/json"}, + timeout=self.config.timeout_seconds, + ), ) - try: + resp = _do_put() + if resp is not None and resp.status_code in (200, 201, 204): + return True, None + + if ( + resp is not None + and resp.status_code == 409 + and self._mkcol_chain(mkcol_paths) + ): + logger.info( + "PUT %s returned 409 — MKCOL'd parents, retrying once", + label, + ) resp = _do_put() - if resp.status_code in (200, 201, 204): + if resp is not None and resp.status_code in (200, 201, 204): return True, None - if resp.status_code == 409 and self._mkcol_chain(mkcol_paths): - logger.info( - "PUT %s returned 409 — MKCOL'd parents, retrying once", - label, - ) - resp = _do_put() - if resp.status_code in (200, 201, 204): - return True, None - - reason = self._format_http_failure(resp, prefix) - logger.error(f"PUT {label} échoué ({url}): {reason}") - return False, reason - except requests.RequestException as exc: - reason = f"{prefix}{_sanitize_request_error(exc)}" - logger.error(f"PUT {label} échoué ({url}): {reason}") + if resp is None: + # Transient/network failure — already recorded by _execute. Build + # a sanitized reason from the last diagnostic record so the report + # carries a useful (URL-free) cause. + last = self.op_attempts[-1] if self.op_attempts else None + detail = last.reason if last and last.reason else "network error" + reason = f"{prefix}WebDAV upload failed: {detail}" + logger.error(f"PUT {label} échoué ({put_path}): {reason}") return False, reason + reason = self._format_http_failure(resp, prefix) + logger.error(f"PUT {label} échoué ({put_path}): {reason}") + return False, reason + @staticmethod def _format_http_failure(resp: requests.Response, prefix: str) -> str: """ @@ -368,14 +577,15 @@ def _mkcol_chain(self, paths: tuple[str, ...]) -> bool: """ for path in paths: url = urljoin(self.base_url, "/".join(quote(p, safe="") for p in path.split("/") if p)) - try: - resp = self.session.request( - "MKCOL", - url, - timeout=self.config.timeout_seconds, - ) - except requests.RequestException as exc: - logger.warning(f"MKCOL chain failed at {path}: {exc}") + resp = self._execute( + "MKCOL", + path, + lambda u=url: self.session.request( + "MKCOL", u, timeout=self.config.timeout_seconds + ), + ) + if resp is None: + logger.warning("MKCOL chain failed at %s (network/transient)", path) return False if not self._is_mkcol_success(resp.status_code): logger.warning( @@ -390,53 +600,56 @@ def _mkcol_chain(self, paths: tuple[str, ...]) -> bool: def create_notebook_dir(self, notebook_slug: str) -> bool: """MKCOL /{notebook} — creates a notebook folder on the remote server.""" url = self._url(notebook_slug) - try: - resp = self.session.request("MKCOL", url, timeout=self.config.timeout_seconds) - return self._is_mkcol_success(resp.status_code) - except requests.RequestException as e: - logger.error(f"MKCOL failed ({url}): {e}") - return False + resp = self._execute( + "MKCOL", + notebook_slug, + lambda: self.session.request("MKCOL", url, timeout=self.config.timeout_seconds), + ) + return resp is not None and self._is_mkcol_success(resp.status_code) def create_note_dir(self, notebook_slug: str, note_slug: str) -> bool: """MKCOL /{notebook}/{note} — creates a note folder on the remote server.""" url = self._url(notebook_slug, note_slug) - try: - resp = self.session.request("MKCOL", url, timeout=self.config.timeout_seconds) - return self._is_mkcol_success(resp.status_code) - except requests.RequestException as e: - logger.error(f"MKCOL note failed ({url}): {e}") - return False + path = f"{notebook_slug}/{note_slug}" + resp = self._execute( + "MKCOL", + path, + lambda: self.session.request("MKCOL", url, timeout=self.config.timeout_seconds), + ) + return resp is not None and self._is_mkcol_success(resp.status_code) - def _propfind(self, url: str, depth: int = 1) -> list[dict]: + def _propfind( + self, url: str, depth: int = 1, path_label: Optional[str] = None + ) -> list[dict]: """ - PROPFIND WebDAV — liste les ressources à un niveau donné. + PROPFIND WebDAV — liste les ressources à un niveau donné (avec retry). Retourne une liste simplifiée de {name, href, is_collection, last_modified}. """ - try: - resp = self.session.request( - "PROPFIND", - url, - headers={ - "Depth": str(depth), - "Content-Type": "application/xml", - }, - data=b""" + body = b""" -""", +""" + resp = self._execute( + "PROPFIND", + path_label or "/", + lambda: self.session.request( + "PROPFIND", + url, + headers={ + "Depth": str(depth), + "Content-Type": "application/xml", + }, + data=body, timeout=self.config.timeout_seconds, - ) - - if resp.status_code == 207: # Multi-Status - return self._parse_propfind(resp.text, url) - return [] - except requests.RequestException as e: - logger.error(f"PROPFIND échoué ({url}): {e}") - return [] + ), + ) + if resp is not None and resp.status_code == 207: # Multi-Status + return self._parse_propfind(resp.text, url) + return [] def _parse_propfind(self, xml_text: str, base_url: str) -> list[dict]: """Parse la réponse XML PROPFIND en liste de ressources.""" @@ -710,6 +923,7 @@ def sync(self) -> SyncReport: msg = f"Impossible de joindre le serveur : {self.config.server_url}" logger.error(msg) report.errors.append(msg) + self._collect_reliability_diagnostics(report) report.finish() self._write_log(report) return report @@ -733,11 +947,47 @@ def sync(self) -> SyncReport: except Exception: logger.exception("could not persist sync state") + self._collect_reliability_diagnostics(report) report.finish() self._write_log(report) logger.info(report.summary()) return report + def _collect_reliability_diagnostics(self, report: SyncReport) -> None: + """ + EN: Fold the WebDAV client's per-operation attempt records into the + report. If the session did not fully succeed *and* at least one + operation failed transiently (timeout / connection drop / + 429/502/503/504), mark the report ``retryable`` and suggest a + wait before the next attempt. Never raises and never mutates + sync state — purely diagnostic. + FR: Replie les enregistrements d'opérations du client dans le rapport. + Si la session n'a pas réussi et qu'au moins une opération a échoué + de façon transitoire, marque le rapport ``retryable`` avec un délai + suggéré. Ne lève jamais, ne touche pas l'état de sync. + """ + client = self.client + attempts = list(getattr(client, "op_attempts", []) or []) + if attempts: + report.operation_attempts = [ + a.to_dict() if hasattr(a, "to_dict") else dict(a) for a in attempts + ] + + transient = [a for a in attempts if getattr(a, "retryable", False)] + if transient and not report.success(): + report.retryable = True + report.transient_reason = getattr(transient[-1], "reason", None) + report.next_retry_after_seconds = self._suggested_retry_delay() + + def _suggested_retry_delay(self) -> float: + """Largest configured backoff — a sensible wait before a whole-sync retry.""" + backoff = getattr(self.config, "backoff_seconds", DEFAULT_BACKOFF_SECONDS) + try: + values = [float(b) for b in backoff if b is not None] + except (TypeError, ValueError): + values = [] + return max(values) if values else max(DEFAULT_BACKOFF_SECONDS) + def _write_log(self, report: SyncReport) -> None: """ EN: Write the sanitized diagnostic log, unless this was a dry-run diff --git a/nexanote/sync/sync_log.py b/nexanote/sync/sync_log.py index 5b1db18..880c503 100644 --- a/nexanote/sync/sync_log.py +++ b/nexanote/sync/sync_log.py @@ -101,6 +101,26 @@ def build_sync_log( errors = [sanitize_error(e) for e in getattr(report, "errors", [])] + # Network-reliability diagnostics — attempts per operation and whether the + # session is worth retrying. Each entry is already URL-free (relative DAV + # slug path only); we still scrub the reason defensively. + operation_attempts = [] + for entry in getattr(report, "operation_attempts", []) or []: + if not isinstance(entry, dict): + continue + reason = entry.get("reason") + operation_attempts.append( + { + "operation": entry.get("operation"), + "path": entry.get("path"), + "attempts": entry.get("attempts"), + "retryable": bool(entry.get("retryable")), + "reason": sanitize_error(reason) if reason else None, + } + ) + + transient_reason = getattr(report, "transient_reason", None) + payload = { "version": SYNC_LOG_VERSION, "timestamp": timestamp.isoformat() if timestamp else None, @@ -109,6 +129,10 @@ def build_sync_log( "duration_seconds": round(report.duration_seconds(), 3), "dry_run": bool(dry_run), "success": report.success(), + "retryable": bool(getattr(report, "retryable", False)), + "next_retry_after_seconds": getattr(report, "next_retry_after_seconds", None), + "transient_reason": sanitize_error(transient_reason) if transient_reason else None, + "operation_attempts": operation_attempts, "counts": { "pulled": getattr(report, "notes_pulled", 0), "pushed": getattr(report, "notes_pushed", 0), diff --git a/tests/test_sync_reliability.py b/tests/test_sync_reliability.py index 83d0de6..efbec6b 100644 --- a/tests/test_sync_reliability.py +++ b/tests/test_sync_reliability.py @@ -26,9 +26,16 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) +import requests + from nexanote.models.note import Note, Notebook, NoteType, SyncStatus from nexanote.storage import FileNoteStore -from nexanote.sync.client import NexaNoteSyncEngine, SyncConfig, SyncReport +from nexanote.sync.client import ( + NexaNoteSyncEngine, + SyncConfig, + SyncReport, + WebDAVClient, +) from nexanote.sync.plan import SyncPlan from nexanote.sync.sync_log import ( SYNC_LOG_FILENAME, @@ -514,3 +521,254 @@ def test_unreachable_server_writes_failure_log_without_corruption(self, tmp_path state_path = data_dir / SYNC_STATE_FILENAME if state_path.exists(): json.loads(state_path.read_text("utf-8")) + + +# --------------------------------------------------------------------------- +# 7. Retry / backoff for transient WebDAV failures +# --------------------------------------------------------------------------- + +# Minimal, empty WebDAV multistatus so a PROPFIND parses to an empty listing. +_EMPTY_MULTISTATUS = ( + '' + '' +) + + +class FakeResponse: + """A stand-in for ``requests.Response`` with just what the client reads.""" + + def __init__(self, status_code=200, reason="", json_data=None, text=""): + self.status_code = status_code + self.reason = reason + self._json = json_data + self.text = text + + def json(self): + if self._json is None: + raise ValueError("no json body") + return self._json + + +class FakeSession: + """ + EN: Scriptable replacement for ``requests.Session``. ``scripts`` maps an + HTTP method to a list of items; each call pops the next one (the last + item repeats once exhausted). An item may be a ``FakeResponse`` or an + exception (class or instance) to raise — used to simulate timeouts. + FR: Remplaçant scriptable de ``requests.Session`` pour simuler réponses, + timeouts et erreurs réseau par méthode HTTP. + """ + + def __init__(self, scripts=None): + self.auth = None + self.verify = True + self.scripts = scripts or {} + self.calls: list[str] = [] + + def _next(self, method): + self.calls.append(method) + items = self.scripts.get(method) + if not items: + return FakeResponse(200) + item = items.pop(0) if len(items) > 1 else items[0] + if isinstance(item, BaseException): + raise item + if isinstance(item, type) and issubclass(item, BaseException): + raise item() + return item + + def request(self, method, url, **kwargs): + return self._next(method.upper()) + + def get(self, url, **kwargs): + return self._next("GET") + + def put(self, url, **kwargs): + return self._next("PUT") + + def count(self, method): + return self.calls.count(method.upper()) + + +def _client(scripts): + """A real ``WebDAVClient`` wired to a scripted session (zero backoff).""" + config = SyncConfig( + server_url="http://stub.invalid/", + username="u", + password="hunter2-secret", + timeout_seconds=1, + backoff_seconds=(0.0, 0.0, 0.0), + ) + client = WebDAVClient(config) + fake = FakeSession(scripts) + client.session = fake + return client, fake + + +class TestTransientRetry: + def test_transient_503_succeeds_after_retry(self): + # First PUT 503, second PUT 201 → overall success after one retry. + client, fake = _client( + {"PUT": [FakeResponse(503, "Service Unavailable"), FakeResponse(201)]} + ) + ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {}) + assert ok is True + assert reason is None + assert fake.count("PUT") == 2 + # Diagnostics record two attempts and a non-retryable final outcome. + last = client.op_attempts[-1] + assert last.operation == "PUT" + assert last.attempts == 2 + assert last.retryable is False + assert client.last_transient is None + + def test_transient_503_on_get_is_retried(self): + client, fake = _client( + {"GET": [FakeResponse(503, "Service Unavailable"), FakeResponse(200, json_data={"id": "x"})]} + ) + meta = client.get_note_meta("nb__01234567", "note__abcd1234") + assert meta == {"id": "x"} + assert fake.count("GET") == 2 + + def test_401_is_not_retried(self): + client, fake = _client({"PUT": [FakeResponse(401, "Unauthorized")]}) + ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {}) + assert ok is False + assert "401" in reason # status preserved + # Auth failure must be attempted exactly once — never retried. + assert fake.count("PUT") == 1 + last = client.op_attempts[-1] + assert last.attempts == 1 + assert last.retryable is False + + def test_404_is_not_retried(self): + client, fake = _client({"PUT": [FakeResponse(404, "Not Found")]}) + ok, _ = client.put_note_meta("nb__01234567", "note__abcd1234", {}) + assert ok is False + assert fake.count("PUT") == 1 + + def test_timeout_is_retried_then_reported(self): + client, fake = _client({"PUT": [requests.ConnectTimeout]}) + ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {}) + assert ok is False + assert reason + # Exhausted the full attempt budget (1 initial + 2 retries = 3). + assert fake.count("PUT") == client.max_attempts == 3 + last = client.op_attempts[-1] + assert last.attempts == 3 + assert last.retryable is True + assert client.last_transient is last + # The recorded reason names the transient cause and leaks no URL/host. + assert "ConnectTimeout" in (last.reason or "") + assert "stub.invalid" not in (last.reason or "") + + def test_connection_error_is_retried(self): + client, fake = _client({"PUT": [requests.ConnectionError]}) + ok, _ = client.put_note_meta("nb__01234567", "note__abcd1234", {}) + assert ok is False + assert fake.count("PUT") == client.max_attempts + + +def _engine_with_real_client(data_dir, scripts): + """Build an engine backed by a real WebDAVClient on a scripted session.""" + config = SyncConfig( + server_url="http://stub.invalid/", + username="u", + password="hunter2-secret", + timeout_seconds=1, + backoff_seconds=(0.0, 0.0, 0.0), + ) + db = FileNoteStore(data_dir) + engine = NexaNoteSyncEngine(db, config, dry_run=False) + fake = FakeSession(scripts) + engine.client.session = fake + return engine, fake + + +class TestRetryableSyncReport: + def _push_only_scripts(self, put_item): + # ping ok, empty remote, MKCOL ok, PUT scripted to ``put_item``. + return { + "OPTIONS": [FakeResponse(200)], + "PROPFIND": [FakeResponse(207, text=_EMPTY_MULTISTATUS)], + "MKCOL": [FakeResponse(201)], + "PUT": [put_item], + } + + def test_timeout_marks_report_retryable_without_corrupting_state(self, tmp_path): + data_dir = tmp_path / "client_retry" + engine, _ = _engine_with_real_client( + data_dir, self._push_only_scripts(requests.ConnectTimeout) + ) + local = Note(title="Pushable", note_type=NoteType.TYPED) + local.add_page().typed_content = "body" + engine.db.save_note(local) + + report = engine.sync() + + # Transient push failure → not successful but explicitly retryable. + assert not report.success() + assert report.retryable is True + assert report.next_retry_after_seconds is not None + assert report.transient_reason + assert "stub.invalid" not in report.transient_reason + # Per-operation attempt records made it into the report. + assert any(a["operation"] == "PUT" and a["attempts"] == 3 + for a in report.operation_attempts) + + # The local note was NOT marked synced — state stays consistent. + assert engine.db.get_note(local.id).sync_status == SyncStatus.LOCAL_ONLY + # The sync-state file (if written) is still valid JSON. + state_path = data_dir / SYNC_STATE_FILENAME + if state_path.exists(): + json.loads(state_path.read_text("utf-8")) + + def test_retryable_state_in_log_without_secrets(self, tmp_path): + data_dir = tmp_path / "client_retry_log" + engine, _ = _engine_with_real_client( + data_dir, self._push_only_scripts(requests.ConnectTimeout) + ) + local = Note(title="Pushable", note_type=NoteType.TYPED) + local.add_page().typed_content = "TOP-SECRET-BODY-9999" + engine.db.save_note(local) + + engine.sync() + + payload = read_sync_log(data_dir) + assert payload is not None + assert payload["retryable"] is True + assert payload["next_retry_after_seconds"] is not None + assert payload["transient_reason"] + assert any(a["operation"] == "PUT" for a in payload["operation_attempts"]) + + raw = sync_log_path(data_dir).read_text("utf-8") + # No credential, no body content, no server host in the diagnostic log. + assert "hunter2-secret" not in raw + assert "TOP-SECRET-BODY-9999" not in raw + assert "stub.invalid" not in raw + + def test_successful_retry_still_produces_normal_report(self, tmp_path): + data_dir = tmp_path / "client_retry_ok" + # PUT 503 once, then succeeds — push should complete normally. + engine, fake = _engine_with_real_client( + data_dir, + self._push_only_scripts(None), + ) + # Replace the PUT script: transient 503 then 201, repeating the 201. + fake.scripts["PUT"] = [FakeResponse(503, "Service Unavailable"), FakeResponse(201)] + + local = Note(title="Pushable", note_type=NoteType.TYPED) + local.add_page().typed_content = "body" + engine.db.save_note(local) + + report = engine.sync() + + assert report.success(), report.errors + # A successful sync is never flagged retryable. + assert report.retryable is False + assert report.notes_pushed == 1 + # The note is now synced and the report reads like any normal one. + assert engine.db.get_note(local.id).sync_status == SyncStatus.SYNCED + # The retried PUT is still visible in the diagnostics (2 attempts). + assert any(a["operation"] == "PUT" and a["attempts"] == 2 + for a in report.operation_attempts)