diff --git a/CHANGELOG.md b/CHANGELOG.md
index f1e1dd4..895dfc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,21 @@ This project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
note body content, passwords, tokens, or server URLs — error strings are
scrubbed and the plan/report only ever hold metadata.
+- **Retry & backoff for transient failures.** Every WebDAV network
+ operation (GET, PROPFIND, PUT, MKCOL) is now retried on transient
+ conditions — timeouts, connection drops, and HTTP 429/502/503/504 —
+ with small conservative defaults (3 attempts, 0.5s/1s/2s backoff).
+ Auth failures (401/403) and 404 are never retried. Both the attempt
+ budget and per-call timeout are configurable on `SyncConfig`
+ (`max_attempts`, `backoff_seconds`, `timeout_seconds`).
+
+- **Retryable sync reports.** When a sync fails for a transient reason the
+ report (and `POST /sync/trigger`) carries `retryable: true`, a short
+ `transient_reason`, and a suggested `next_retry_after_seconds`. The
+ diagnostic log additionally records per-operation attempt counts so a
+ flaky NAS/Cloudflare/mobile link is easy to spot — still with no body
+ content, credentials, or server URLs.
+
### Improved
- **Conflict safety.** When a note changed both locally and remotely, the
diff --git a/nexanote/api/routes.py b/nexanote/api/routes.py
index a4f87bf..ab1f4a8 100644
--- a/nexanote/api/routes.py
+++ b/nexanote/api/routes.py
@@ -176,6 +176,12 @@ class SyncReportSchema(BaseModel):
conflicts: list[dict] = Field(default_factory=list)
warnings: list[str] = Field(default_factory=list)
plan: dict = Field(default_factory=dict)
+ # Network reliability — additive too. ``retryable`` flags a transient
+ # failure (timeout/connection/429/502/503/504) so clients can back off
+ # and retry instead of surfacing it as a hard error.
+ retryable: bool = False
+ next_retry_after_seconds: Optional[float] = None
+ transient_reason: Optional[str] = None
class ExportRequestSchema(BaseModel):
@@ -603,6 +609,9 @@ def trigger_sync(dry_run: bool = Query(False)):
conflicts=[c.to_dict() for c in plan.conflicts] if plan else [],
warnings=list(plan.warnings) if plan else [],
plan=plan.to_dict() if plan else {},
+ retryable=report.retryable,
+ next_retry_after_seconds=report.next_retry_after_seconds,
+ transient_reason=report.transient_reason,
)
# A dry-run is a preview — it must not clobber the last *real* status.
if not dry_run:
diff --git a/nexanote/sync/client.py b/nexanote/sync/client.py
index b0ec7c5..a21200a 100644
--- a/nexanote/sync/client.py
+++ b/nexanote/sync/client.py
@@ -76,6 +76,71 @@ def _sanitize_request_error(exc: BaseException) -> str:
return f"WebDAV upload failed: {name}"
+# ---------------------------------------------------------------------------
+# Retry / transient-failure classification
+# ---------------------------------------------------------------------------
+
+# EN: HTTP statuses that signal a *transient* server/proxy condition worth
+# retrying (rate limit + the bad-gateway/unavailable/timeout family that
+# a NAS behind Cloudflare or a flaky mobile link routinely emits).
+# FR: Statuts HTTP transitoires qui méritent une nouvelle tentative.
+RETRYABLE_STATUS_CODES = frozenset({429, 502, 503, 504})
+
+# EN: Statuses we must NEVER retry — auth failures and a missing resource are
+# deterministic, so retrying only wastes time and can lock an account.
+# FR: Statuts à ne jamais retenter (auth / ressource absente).
+NON_RETRYABLE_STATUS_CODES = frozenset({401, 403, 404})
+
+# EN: Small, conservative defaults. 3 attempts total with 0.5s/1s/2s waits
+# between them keeps a flaky link usable without hammering the server.
+# FR: Réglages prudents : 3 tentatives, attentes 0.5s/1s/2s.
+DEFAULT_MAX_ATTEMPTS = 3
+DEFAULT_BACKOFF_SECONDS: tuple[float, ...] = (0.5, 1.0, 2.0)
+
+
+def _is_transient_exception(exc: BaseException) -> bool:
+ """True for network conditions that are worth retrying."""
+ return isinstance(exc, (requests.Timeout, requests.ConnectionError))
+
+
+def _transient_exception_reason(exc: BaseException) -> str:
+ """
+ EN: Short, URL-free reason for a transient network exception. We use the
+ class name only (``ConnectTimeout``, ``ReadTimeout``, ``ConnectionError``)
+ because ``str(exc)`` from requests embeds the full URL/host.
+ FR: Motif court et sans URL pour une exception réseau transitoire.
+ """
+ return type(exc).__name__
+
+
+@dataclass
+class OperationAttempt:
+ """
+ EN: One diagnostic record per WebDAV operation: how many attempts it
+ took, whether the final outcome was a *retryable* (transient)
+ failure, and a short sanitized reason. ``path`` is the relative DAV
+ slug path only — never the full URL, host, or credentials.
+ FR: Un enregistrement de diagnostic par opération WebDAV (tentatives,
+ caractère retentable, motif court). ``path`` = chemin DAV relatif
+ seulement, jamais l'URL/hôte/identifiants.
+ """
+
+ operation: str
+ path: str
+ attempts: int
+ retryable: bool
+ reason: Optional[str] = None
+
+ def to_dict(self) -> dict:
+ return {
+ "operation": self.operation,
+ "path": self.path,
+ "attempts": self.attempts,
+ "retryable": self.retryable,
+ "reason": self.reason,
+ }
+
+
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
@@ -89,6 +154,15 @@ class SyncConfig:
timeout_seconds: int = 15
conflict_strategy: ConflictStrategy = ConflictStrategy.MERGE_STROKES
verify_ssl: bool = True
+ # EN: Retry budget for transient WebDAV failures. ``max_attempts`` counts
+ # the first try, so 3 = one initial call + two retries. The backoff
+ # tuple gives the wait *before* each retry; it is clamped to its last
+ # value when there are more retries than entries.
+ # FR: Budget de retries pour les échecs transitoires. ``max_attempts``
+ # inclut le premier essai. ``backoff_seconds`` = attente avant chaque
+ # nouvelle tentative.
+ max_attempts: int = DEFAULT_MAX_ATTEMPTS
+ backoff_seconds: tuple[float, ...] = DEFAULT_BACKOFF_SECONDS
# ---------------------------------------------------------------------------
@@ -140,6 +214,19 @@ class SyncReport:
# EN: True when this report describes a dry-run (no files/state written).
# FR: Vrai quand le rapport décrit un dry-run (aucune écriture).
dry_run: bool = False
+ # EN: Network-reliability metadata. ``retryable`` is True when the sync
+ # failed (or partially failed) for a *transient* reason — a timeout,
+ # connection drop, or 429/502/503/504 — so retrying later is sensible.
+ # ``next_retry_after_seconds`` is a suggested wait before that retry,
+ # and ``transient_reason`` is a short sanitized cause (no URL/secret).
+ # ``operation_attempts`` lists per-operation attempt counts for the log.
+ # FR: Métadonnées de fiabilité réseau. ``retryable`` = vrai si la sync a
+ # échoué pour une raison transitoire ; ``next_retry_after_seconds`` =
+ # délai suggéré ; ``transient_reason`` = motif court assaini.
+ retryable: bool = False
+ next_retry_after_seconds: Optional[float] = None
+ transient_reason: Optional[str] = None
+ operation_attempts: list[dict] = field(default_factory=list)
def finish(self) -> None:
self.finished_at = datetime.now(timezone.utc)
@@ -183,12 +270,106 @@ def __init__(self, config: SyncConfig) -> None:
self.session = requests.Session()
self.session.auth = self.auth
self.session.verify = config.verify_ssl
+ self.max_attempts = max(1, int(getattr(config, "max_attempts", DEFAULT_MAX_ATTEMPTS)))
+ backoff = getattr(config, "backoff_seconds", DEFAULT_BACKOFF_SECONDS)
+ self.backoff_seconds: tuple[float, ...] = tuple(backoff) or (0.0,)
+ # EN: Diagnostics buffer — one entry per attempted operation, read by
+ # the engine after a sync to surface attempts/retryable in the log.
+ # ``last_transient`` is the most recent retryable failure (if any).
+ # FR: Tampon de diagnostic — une entrée par opération, lu par le moteur.
+ self.op_attempts: list[OperationAttempt] = []
+ self.last_transient: Optional[OperationAttempt] = None
def _url(self, *parts: str) -> str:
"""Construit une URL à partir de parties encodées."""
path = "/".join(quote(p, safe="") for p in parts if p)
return urljoin(self.base_url, path)
+ # ------------------------------------------------------------------
+ # Retry executor — the single choke point for every network call.
+ # ------------------------------------------------------------------
+
+ def _backoff(self, attempt: int) -> None:
+ """Sleep before retry ``attempt`` (1-based), clamped to the schedule."""
+ idx = min(attempt - 1, len(self.backoff_seconds) - 1)
+ delay = self.backoff_seconds[idx]
+ if delay and delay > 0:
+ time.sleep(delay)
+
+ def _record(
+ self,
+ operation: str,
+ path: str,
+ attempts: int,
+ retryable: bool,
+ reason: Optional[str],
+ ) -> None:
+ record = OperationAttempt(
+ operation=operation,
+ path=path,
+ attempts=attempts,
+ retryable=retryable,
+ reason=reason,
+ )
+ self.op_attempts.append(record)
+ if retryable:
+ self.last_transient = record
+
+ def _execute(self, operation: str, path: str, send) -> Optional["requests.Response"]:
+ """
+ EN: Run ``send()`` (which performs one HTTP request and returns a
+ ``Response``) with bounded retry/backoff. Retries only transient
+ failures: timeouts, connection errors, and the 429/502/503/504
+ family. Auth (401/403) and 404 are returned to the caller
+ immediately — never retried. Returns the final ``Response``, or
+ ``None`` when the request could not be completed (network error
+ or exhausted transient retries). Every call appends one
+ ``OperationAttempt`` diagnostic record.
+ FR: Exécute ``send()`` avec retry/backoff borné. Ne retente que les
+ échecs transitoires (timeouts, erreurs de connexion, 429/502/
+ 503/504). 401/403/404 sont rendus immédiatement, jamais retentés.
+ Retourne la ``Response`` finale ou ``None`` si la requête n'a pas
+ abouti. Chaque appel ajoute un enregistrement de diagnostic.
+ """
+ last_reason: Optional[str] = None
+ for attempt in range(1, self.max_attempts + 1):
+ try:
+ resp = send()
+ except requests.RequestException as exc:
+ if _is_transient_exception(exc):
+ last_reason = _transient_exception_reason(exc)
+ if attempt < self.max_attempts:
+ self._backoff(attempt)
+ continue
+ self._record(operation, path, attempt, True, last_reason)
+ return None
+ # Non-transient request error (e.g. malformed URL) — terminal.
+ self._record(
+ operation, path, attempt, False, _sanitize_request_error(exc)
+ )
+ return None
+
+ status = resp.status_code
+ if status in RETRYABLE_STATUS_CODES and attempt < self.max_attempts:
+ last_reason = f"{status} {resp.reason or ''}".strip()
+ self._backoff(attempt)
+ continue
+
+ # Terminal response — either a success, a non-retryable status, or
+ # an exhausted transient one. A still-transient status here means
+ # we ran out of attempts, so it is reported as retryable.
+ retryable = status in RETRYABLE_STATUS_CODES
+ reason = None
+ if retryable:
+ reason = f"{status} {resp.reason or ''}".strip()
+ elif attempt > 1:
+ reason = last_reason
+ self._record(operation, path, attempt, retryable, reason)
+ return resp
+
+ # Unreachable (loop always returns), but keeps type-checkers happy.
+ return None
+
@staticmethod
def _is_mkcol_success(status_code: int) -> bool:
"""
@@ -204,51 +385,64 @@ def _is_mkcol_success(status_code: int) -> bool:
return status_code in (200, 201, 405)
def ping(self) -> bool:
- """Vérifie que le serveur est accessible."""
- try:
- resp = self.session.request(
+ """Vérifie que le serveur est accessible (avec retry transitoire)."""
+ resp = self._execute(
+ "OPTIONS",
+ "/",
+ lambda: self.session.request(
"OPTIONS",
self.base_url,
timeout=self.config.timeout_seconds,
- )
- return resp.status_code < 400
- except requests.RequestException:
- return False
+ ),
+ )
+ return resp is not None and resp.status_code < 400
def list_notebooks(self) -> list[dict]:
"""PROPFIND sur / — retourne la liste des carnets."""
- return self._propfind(self.base_url, depth=1)
+ return self._propfind(self.base_url, depth=1, path_label="/")
def list_notes(self, notebook_slug: str) -> list[dict]:
"""PROPFIND sur /{notebook} — retourne la liste des notes."""
- return self._propfind(self._url(notebook_slug), depth=1)
+ return self._propfind(
+ self._url(notebook_slug), depth=1, path_label=notebook_slug
+ )
def list_note_files(self, notebook_slug: str, note_slug: str) -> list[dict]:
"""PROPFIND sur /{notebook}/{note} — retourne les fichiers."""
- return self._propfind(self._url(notebook_slug, note_slug), depth=1)
+ return self._propfind(
+ self._url(notebook_slug, note_slug),
+ depth=1,
+ path_label=f"{notebook_slug}/{note_slug}",
+ )
def get_note_meta(self, notebook_slug: str, note_slug: str) -> Optional[dict]:
- """GET /{notebook}/{note}/note.json"""
+ """GET /{notebook}/{note}/note.json (avec retry transitoire)."""
url = self._url(notebook_slug, note_slug, "note.json")
- try:
- resp = self.session.get(url, timeout=self.config.timeout_seconds)
- if resp.status_code == 200:
- return resp.json()
+ path = f"{notebook_slug}/{note_slug}/note.json"
+ resp = self._execute(
+ "GET", path, lambda: self.session.get(url, timeout=self.config.timeout_seconds)
+ )
+ if resp is None or resp.status_code != 200:
return None
- except (requests.RequestException, json.JSONDecodeError) as e:
- logger.error(f"GET note.json échoué ({url}): {e}")
+ try:
+ return resp.json()
+ except ValueError as e:
+ logger.error(f"GET note.json invalid JSON ({path}): {e}")
return None
def get_ink_page(self, notebook_slug: str, note_slug: str, page_num: int) -> Optional[dict]:
- """GET /{notebook}/{note}/page_N.ink"""
+ """GET /{notebook}/{note}/page_N.ink (avec retry transitoire)."""
url = self._url(notebook_slug, note_slug, f"page_{page_num}.ink")
- try:
- resp = self.session.get(url, timeout=self.config.timeout_seconds)
- if resp.status_code == 200:
- return resp.json()
+ path = f"{notebook_slug}/{note_slug}/page_{page_num}.ink"
+ resp = self._execute(
+ "GET", path, lambda: self.session.get(url, timeout=self.config.timeout_seconds)
+ )
+ if resp is None or resp.status_code != 200:
return None
- except (requests.RequestException, json.JSONDecodeError) as e:
- logger.error(f"GET page.ink échoué ({url}): {e}")
+ try:
+ return resp.json()
+ except ValueError as e:
+ logger.error(f"GET page.ink invalid JSON ({path}): {e}")
return None
def put_note_meta(
@@ -303,37 +497,52 @@ def _put_json(
FR: Helper interne pour PUT avec récupération MKCOL+retry sur 409.
"""
prefix = f"{label}: " if page_num is None else f"page {page_num}: "
-
- def _do_put() -> requests.Response:
- return self.session.put(
- url,
- json=data,
- headers={"Content-Type": "application/json"},
- timeout=self.config.timeout_seconds,
+ put_path = f"{mkcol_paths[-1]}/{label}" if mkcol_paths else label
+
+ def _do_put() -> Optional[requests.Response]:
+ # Each PUT carries its own transient retry/backoff via _execute.
+ return self._execute(
+ "PUT",
+ put_path,
+ lambda: self.session.put(
+ url,
+ json=data,
+ headers={"Content-Type": "application/json"},
+ timeout=self.config.timeout_seconds,
+ ),
)
- try:
+ resp = _do_put()
+ if resp is not None and resp.status_code in (200, 201, 204):
+ return True, None
+
+ if (
+ resp is not None
+ and resp.status_code == 409
+ and self._mkcol_chain(mkcol_paths)
+ ):
+ logger.info(
+ "PUT %s returned 409 — MKCOL'd parents, retrying once",
+ label,
+ )
resp = _do_put()
- if resp.status_code in (200, 201, 204):
+ if resp is not None and resp.status_code in (200, 201, 204):
return True, None
- if resp.status_code == 409 and self._mkcol_chain(mkcol_paths):
- logger.info(
- "PUT %s returned 409 — MKCOL'd parents, retrying once",
- label,
- )
- resp = _do_put()
- if resp.status_code in (200, 201, 204):
- return True, None
-
- reason = self._format_http_failure(resp, prefix)
- logger.error(f"PUT {label} échoué ({url}): {reason}")
- return False, reason
- except requests.RequestException as exc:
- reason = f"{prefix}{_sanitize_request_error(exc)}"
- logger.error(f"PUT {label} échoué ({url}): {reason}")
+ if resp is None:
+ # Transient/network failure — already recorded by _execute. Build
+ # a sanitized reason from the last diagnostic record so the report
+ # carries a useful (URL-free) cause.
+ last = self.op_attempts[-1] if self.op_attempts else None
+ detail = last.reason if last and last.reason else "network error"
+ reason = f"{prefix}WebDAV upload failed: {detail}"
+ logger.error(f"PUT {label} échoué ({put_path}): {reason}")
return False, reason
+ reason = self._format_http_failure(resp, prefix)
+ logger.error(f"PUT {label} échoué ({put_path}): {reason}")
+ return False, reason
+
@staticmethod
def _format_http_failure(resp: requests.Response, prefix: str) -> str:
"""
@@ -368,14 +577,15 @@ def _mkcol_chain(self, paths: tuple[str, ...]) -> bool:
"""
for path in paths:
url = urljoin(self.base_url, "/".join(quote(p, safe="") for p in path.split("/") if p))
- try:
- resp = self.session.request(
- "MKCOL",
- url,
- timeout=self.config.timeout_seconds,
- )
- except requests.RequestException as exc:
- logger.warning(f"MKCOL chain failed at {path}: {exc}")
+ resp = self._execute(
+ "MKCOL",
+ path,
+ lambda u=url: self.session.request(
+ "MKCOL", u, timeout=self.config.timeout_seconds
+ ),
+ )
+ if resp is None:
+ logger.warning("MKCOL chain failed at %s (network/transient)", path)
return False
if not self._is_mkcol_success(resp.status_code):
logger.warning(
@@ -390,53 +600,56 @@ def _mkcol_chain(self, paths: tuple[str, ...]) -> bool:
def create_notebook_dir(self, notebook_slug: str) -> bool:
"""MKCOL /{notebook} — creates a notebook folder on the remote server."""
url = self._url(notebook_slug)
- try:
- resp = self.session.request("MKCOL", url, timeout=self.config.timeout_seconds)
- return self._is_mkcol_success(resp.status_code)
- except requests.RequestException as e:
- logger.error(f"MKCOL failed ({url}): {e}")
- return False
+ resp = self._execute(
+ "MKCOL",
+ notebook_slug,
+ lambda: self.session.request("MKCOL", url, timeout=self.config.timeout_seconds),
+ )
+ return resp is not None and self._is_mkcol_success(resp.status_code)
def create_note_dir(self, notebook_slug: str, note_slug: str) -> bool:
"""MKCOL /{notebook}/{note} — creates a note folder on the remote server."""
url = self._url(notebook_slug, note_slug)
- try:
- resp = self.session.request("MKCOL", url, timeout=self.config.timeout_seconds)
- return self._is_mkcol_success(resp.status_code)
- except requests.RequestException as e:
- logger.error(f"MKCOL note failed ({url}): {e}")
- return False
+ path = f"{notebook_slug}/{note_slug}"
+ resp = self._execute(
+ "MKCOL",
+ path,
+ lambda: self.session.request("MKCOL", url, timeout=self.config.timeout_seconds),
+ )
+ return resp is not None and self._is_mkcol_success(resp.status_code)
- def _propfind(self, url: str, depth: int = 1) -> list[dict]:
+ def _propfind(
+ self, url: str, depth: int = 1, path_label: Optional[str] = None
+ ) -> list[dict]:
"""
- PROPFIND WebDAV — liste les ressources à un niveau donné.
+ PROPFIND WebDAV — liste les ressources à un niveau donné (avec retry).
Retourne une liste simplifiée de {name, href, is_collection, last_modified}.
"""
- try:
- resp = self.session.request(
- "PROPFIND",
- url,
- headers={
- "Depth": str(depth),
- "Content-Type": "application/xml",
- },
- data=b"""
+ body = b"""
-""",
+"""
+ resp = self._execute(
+ "PROPFIND",
+ path_label or "/",
+ lambda: self.session.request(
+ "PROPFIND",
+ url,
+ headers={
+ "Depth": str(depth),
+ "Content-Type": "application/xml",
+ },
+ data=body,
timeout=self.config.timeout_seconds,
- )
-
- if resp.status_code == 207: # Multi-Status
- return self._parse_propfind(resp.text, url)
- return []
- except requests.RequestException as e:
- logger.error(f"PROPFIND échoué ({url}): {e}")
- return []
+ ),
+ )
+ if resp is not None and resp.status_code == 207: # Multi-Status
+ return self._parse_propfind(resp.text, url)
+ return []
def _parse_propfind(self, xml_text: str, base_url: str) -> list[dict]:
"""Parse la réponse XML PROPFIND en liste de ressources."""
@@ -710,6 +923,7 @@ def sync(self) -> SyncReport:
msg = f"Impossible de joindre le serveur : {self.config.server_url}"
logger.error(msg)
report.errors.append(msg)
+ self._collect_reliability_diagnostics(report)
report.finish()
self._write_log(report)
return report
@@ -733,11 +947,47 @@ def sync(self) -> SyncReport:
except Exception:
logger.exception("could not persist sync state")
+ self._collect_reliability_diagnostics(report)
report.finish()
self._write_log(report)
logger.info(report.summary())
return report
+ def _collect_reliability_diagnostics(self, report: SyncReport) -> None:
+ """
+ EN: Fold the WebDAV client's per-operation attempt records into the
+ report. If the session did not fully succeed *and* at least one
+ operation failed transiently (timeout / connection drop /
+ 429/502/503/504), mark the report ``retryable`` and suggest a
+ wait before the next attempt. Never raises and never mutates
+ sync state — purely diagnostic.
+ FR: Replie les enregistrements d'opérations du client dans le rapport.
+ Si la session n'a pas réussi et qu'au moins une opération a échoué
+ de façon transitoire, marque le rapport ``retryable`` avec un délai
+ suggéré. Ne lève jamais, ne touche pas l'état de sync.
+ """
+ client = self.client
+ attempts = list(getattr(client, "op_attempts", []) or [])
+ if attempts:
+ report.operation_attempts = [
+ a.to_dict() if hasattr(a, "to_dict") else dict(a) for a in attempts
+ ]
+
+ transient = [a for a in attempts if getattr(a, "retryable", False)]
+ if transient and not report.success():
+ report.retryable = True
+ report.transient_reason = getattr(transient[-1], "reason", None)
+ report.next_retry_after_seconds = self._suggested_retry_delay()
+
+ def _suggested_retry_delay(self) -> float:
+ """Largest configured backoff — a sensible wait before a whole-sync retry."""
+ backoff = getattr(self.config, "backoff_seconds", DEFAULT_BACKOFF_SECONDS)
+ try:
+ values = [float(b) for b in backoff if b is not None]
+ except (TypeError, ValueError):
+ values = []
+ return max(values) if values else max(DEFAULT_BACKOFF_SECONDS)
+
def _write_log(self, report: SyncReport) -> None:
"""
EN: Write the sanitized diagnostic log, unless this was a dry-run
diff --git a/nexanote/sync/sync_log.py b/nexanote/sync/sync_log.py
index 5b1db18..880c503 100644
--- a/nexanote/sync/sync_log.py
+++ b/nexanote/sync/sync_log.py
@@ -101,6 +101,26 @@ def build_sync_log(
errors = [sanitize_error(e) for e in getattr(report, "errors", [])]
+ # Network-reliability diagnostics — attempts per operation and whether the
+ # session is worth retrying. Each entry is already URL-free (relative DAV
+ # slug path only); we still scrub the reason defensively.
+ operation_attempts = []
+ for entry in getattr(report, "operation_attempts", []) or []:
+ if not isinstance(entry, dict):
+ continue
+ reason = entry.get("reason")
+ operation_attempts.append(
+ {
+ "operation": entry.get("operation"),
+ "path": entry.get("path"),
+ "attempts": entry.get("attempts"),
+ "retryable": bool(entry.get("retryable")),
+ "reason": sanitize_error(reason) if reason else None,
+ }
+ )
+
+ transient_reason = getattr(report, "transient_reason", None)
+
payload = {
"version": SYNC_LOG_VERSION,
"timestamp": timestamp.isoformat() if timestamp else None,
@@ -109,6 +129,10 @@ def build_sync_log(
"duration_seconds": round(report.duration_seconds(), 3),
"dry_run": bool(dry_run),
"success": report.success(),
+ "retryable": bool(getattr(report, "retryable", False)),
+ "next_retry_after_seconds": getattr(report, "next_retry_after_seconds", None),
+ "transient_reason": sanitize_error(transient_reason) if transient_reason else None,
+ "operation_attempts": operation_attempts,
"counts": {
"pulled": getattr(report, "notes_pulled", 0),
"pushed": getattr(report, "notes_pushed", 0),
diff --git a/tests/test_sync_reliability.py b/tests/test_sync_reliability.py
index 83d0de6..efbec6b 100644
--- a/tests/test_sync_reliability.py
+++ b/tests/test_sync_reliability.py
@@ -26,9 +26,16 @@
sys.path.insert(0, str(Path(__file__).parent.parent))
+import requests
+
from nexanote.models.note import Note, Notebook, NoteType, SyncStatus
from nexanote.storage import FileNoteStore
-from nexanote.sync.client import NexaNoteSyncEngine, SyncConfig, SyncReport
+from nexanote.sync.client import (
+ NexaNoteSyncEngine,
+ SyncConfig,
+ SyncReport,
+ WebDAVClient,
+)
from nexanote.sync.plan import SyncPlan
from nexanote.sync.sync_log import (
SYNC_LOG_FILENAME,
@@ -514,3 +521,254 @@ def test_unreachable_server_writes_failure_log_without_corruption(self, tmp_path
state_path = data_dir / SYNC_STATE_FILENAME
if state_path.exists():
json.loads(state_path.read_text("utf-8"))
+
+
+# ---------------------------------------------------------------------------
+# 7. Retry / backoff for transient WebDAV failures
+# ---------------------------------------------------------------------------
+
+# Minimal, empty WebDAV multistatus so a PROPFIND parses to an empty listing.
+_EMPTY_MULTISTATUS = (
+ ''
+ ''
+)
+
+
+class FakeResponse:
+ """A stand-in for ``requests.Response`` with just what the client reads."""
+
+ def __init__(self, status_code=200, reason="", json_data=None, text=""):
+ self.status_code = status_code
+ self.reason = reason
+ self._json = json_data
+ self.text = text
+
+ def json(self):
+ if self._json is None:
+ raise ValueError("no json body")
+ return self._json
+
+
+class FakeSession:
+ """
+ EN: Scriptable replacement for ``requests.Session``. ``scripts`` maps an
+ HTTP method to a list of items; each call pops the next one (the last
+ item repeats once exhausted). An item may be a ``FakeResponse`` or an
+ exception (class or instance) to raise — used to simulate timeouts.
+ FR: Remplaçant scriptable de ``requests.Session`` pour simuler réponses,
+ timeouts et erreurs réseau par méthode HTTP.
+ """
+
+ def __init__(self, scripts=None):
+ self.auth = None
+ self.verify = True
+ self.scripts = scripts or {}
+ self.calls: list[str] = []
+
+ def _next(self, method):
+ self.calls.append(method)
+ items = self.scripts.get(method)
+ if not items:
+ return FakeResponse(200)
+ item = items.pop(0) if len(items) > 1 else items[0]
+ if isinstance(item, BaseException):
+ raise item
+ if isinstance(item, type) and issubclass(item, BaseException):
+ raise item()
+ return item
+
+ def request(self, method, url, **kwargs):
+ return self._next(method.upper())
+
+ def get(self, url, **kwargs):
+ return self._next("GET")
+
+ def put(self, url, **kwargs):
+ return self._next("PUT")
+
+ def count(self, method):
+ return self.calls.count(method.upper())
+
+
+def _client(scripts):
+ """A real ``WebDAVClient`` wired to a scripted session (zero backoff)."""
+ config = SyncConfig(
+ server_url="http://stub.invalid/",
+ username="u",
+ password="hunter2-secret",
+ timeout_seconds=1,
+ backoff_seconds=(0.0, 0.0, 0.0),
+ )
+ client = WebDAVClient(config)
+ fake = FakeSession(scripts)
+ client.session = fake
+ return client, fake
+
+
+class TestTransientRetry:
+ def test_transient_503_succeeds_after_retry(self):
+ # First PUT 503, second PUT 201 → overall success after one retry.
+ client, fake = _client(
+ {"PUT": [FakeResponse(503, "Service Unavailable"), FakeResponse(201)]}
+ )
+ ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {})
+ assert ok is True
+ assert reason is None
+ assert fake.count("PUT") == 2
+ # Diagnostics record two attempts and a non-retryable final outcome.
+ last = client.op_attempts[-1]
+ assert last.operation == "PUT"
+ assert last.attempts == 2
+ assert last.retryable is False
+ assert client.last_transient is None
+
+ def test_transient_503_on_get_is_retried(self):
+ client, fake = _client(
+ {"GET": [FakeResponse(503, "Service Unavailable"), FakeResponse(200, json_data={"id": "x"})]}
+ )
+ meta = client.get_note_meta("nb__01234567", "note__abcd1234")
+ assert meta == {"id": "x"}
+ assert fake.count("GET") == 2
+
+ def test_401_is_not_retried(self):
+ client, fake = _client({"PUT": [FakeResponse(401, "Unauthorized")]})
+ ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {})
+ assert ok is False
+ assert "401" in reason # status preserved
+ # Auth failure must be attempted exactly once — never retried.
+ assert fake.count("PUT") == 1
+ last = client.op_attempts[-1]
+ assert last.attempts == 1
+ assert last.retryable is False
+
+ def test_404_is_not_retried(self):
+ client, fake = _client({"PUT": [FakeResponse(404, "Not Found")]})
+ ok, _ = client.put_note_meta("nb__01234567", "note__abcd1234", {})
+ assert ok is False
+ assert fake.count("PUT") == 1
+
+ def test_timeout_is_retried_then_reported(self):
+ client, fake = _client({"PUT": [requests.ConnectTimeout]})
+ ok, reason = client.put_note_meta("nb__01234567", "note__abcd1234", {})
+ assert ok is False
+ assert reason
+ # Exhausted the full attempt budget (1 initial + 2 retries = 3).
+ assert fake.count("PUT") == client.max_attempts == 3
+ last = client.op_attempts[-1]
+ assert last.attempts == 3
+ assert last.retryable is True
+ assert client.last_transient is last
+ # The recorded reason names the transient cause and leaks no URL/host.
+ assert "ConnectTimeout" in (last.reason or "")
+ assert "stub.invalid" not in (last.reason or "")
+
+ def test_connection_error_is_retried(self):
+ client, fake = _client({"PUT": [requests.ConnectionError]})
+ ok, _ = client.put_note_meta("nb__01234567", "note__abcd1234", {})
+ assert ok is False
+ assert fake.count("PUT") == client.max_attempts
+
+
+def _engine_with_real_client(data_dir, scripts):
+ """Build an engine backed by a real WebDAVClient on a scripted session."""
+ config = SyncConfig(
+ server_url="http://stub.invalid/",
+ username="u",
+ password="hunter2-secret",
+ timeout_seconds=1,
+ backoff_seconds=(0.0, 0.0, 0.0),
+ )
+ db = FileNoteStore(data_dir)
+ engine = NexaNoteSyncEngine(db, config, dry_run=False)
+ fake = FakeSession(scripts)
+ engine.client.session = fake
+ return engine, fake
+
+
+class TestRetryableSyncReport:
+ def _push_only_scripts(self, put_item):
+ # ping ok, empty remote, MKCOL ok, PUT scripted to ``put_item``.
+ return {
+ "OPTIONS": [FakeResponse(200)],
+ "PROPFIND": [FakeResponse(207, text=_EMPTY_MULTISTATUS)],
+ "MKCOL": [FakeResponse(201)],
+ "PUT": [put_item],
+ }
+
+ def test_timeout_marks_report_retryable_without_corrupting_state(self, tmp_path):
+ data_dir = tmp_path / "client_retry"
+ engine, _ = _engine_with_real_client(
+ data_dir, self._push_only_scripts(requests.ConnectTimeout)
+ )
+ local = Note(title="Pushable", note_type=NoteType.TYPED)
+ local.add_page().typed_content = "body"
+ engine.db.save_note(local)
+
+ report = engine.sync()
+
+ # Transient push failure → not successful but explicitly retryable.
+ assert not report.success()
+ assert report.retryable is True
+ assert report.next_retry_after_seconds is not None
+ assert report.transient_reason
+ assert "stub.invalid" not in report.transient_reason
+ # Per-operation attempt records made it into the report.
+ assert any(a["operation"] == "PUT" and a["attempts"] == 3
+ for a in report.operation_attempts)
+
+ # The local note was NOT marked synced — state stays consistent.
+ assert engine.db.get_note(local.id).sync_status == SyncStatus.LOCAL_ONLY
+ # The sync-state file (if written) is still valid JSON.
+ state_path = data_dir / SYNC_STATE_FILENAME
+ if state_path.exists():
+ json.loads(state_path.read_text("utf-8"))
+
+ def test_retryable_state_in_log_without_secrets(self, tmp_path):
+ data_dir = tmp_path / "client_retry_log"
+ engine, _ = _engine_with_real_client(
+ data_dir, self._push_only_scripts(requests.ConnectTimeout)
+ )
+ local = Note(title="Pushable", note_type=NoteType.TYPED)
+ local.add_page().typed_content = "TOP-SECRET-BODY-9999"
+ engine.db.save_note(local)
+
+ engine.sync()
+
+ payload = read_sync_log(data_dir)
+ assert payload is not None
+ assert payload["retryable"] is True
+ assert payload["next_retry_after_seconds"] is not None
+ assert payload["transient_reason"]
+ assert any(a["operation"] == "PUT" for a in payload["operation_attempts"])
+
+ raw = sync_log_path(data_dir).read_text("utf-8")
+ # No credential, no body content, no server host in the diagnostic log.
+ assert "hunter2-secret" not in raw
+ assert "TOP-SECRET-BODY-9999" not in raw
+ assert "stub.invalid" not in raw
+
+ def test_successful_retry_still_produces_normal_report(self, tmp_path):
+ data_dir = tmp_path / "client_retry_ok"
+ # PUT 503 once, then succeeds — push should complete normally.
+ engine, fake = _engine_with_real_client(
+ data_dir,
+ self._push_only_scripts(None),
+ )
+ # Replace the PUT script: transient 503 then 201, repeating the 201.
+ fake.scripts["PUT"] = [FakeResponse(503, "Service Unavailable"), FakeResponse(201)]
+
+ local = Note(title="Pushable", note_type=NoteType.TYPED)
+ local.add_page().typed_content = "body"
+ engine.db.save_note(local)
+
+ report = engine.sync()
+
+ assert report.success(), report.errors
+ # A successful sync is never flagged retryable.
+ assert report.retryable is False
+ assert report.notes_pushed == 1
+ # The note is now synced and the report reads like any normal one.
+ assert engine.db.get_note(local.id).sync_status == SyncStatus.SYNCED
+ # The retried PUT is still visible in the diagnostics (2 attempts).
+ assert any(a["operation"] == "PUT" and a["attempts"] == 2
+ for a in report.operation_attempts)