From e697f1d541c01082c5796cb50eac271ff7223f1d Mon Sep 17 00:00:00 2001 From: ahanot Date: Mon, 11 May 2026 16:42:24 -0400 Subject: [PATCH] feat(ads-client): retry callback requests once on transient transport failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `recordImpression`, `recordClick`, and `reportAd` are fire-and-forget calls that consumers can't easily redo. On unreliable mobile networks they were silently lost on a single timeout, connection reset, or DNS hiccup. Retry the request once after a 500ms wait when viaduct reports a transport-level error. HTTP status errors are intentionally not retried — those are application decisions, not transient blips. Extracted as a small `retry_once` helper so the retry policy is testable without needing to fake a viaduct backend. --- CHANGELOG.md | 1 + components/ads-client/src/mars/transport.rs | 70 ++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 315e730130..5d0fa95162 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ ### Ads Client * HTTP cache TTL is now resolved by priority — explicit per-request TTL (if any) wins, otherwise the response's `Cache-Control: max-age` is used, otherwise the configured `default_ttl`. The resolved TTL is capped at 7 days as a safety net against misconfigured server values. Previously the layer took the minimum of all three, which effectively ignored the server's `max-age` signal. +* Callback requests (`recordImpression`, `recordClick`, `reportAd`) now retry once after a short delay on transient transport-level failures (timeouts, connection resets, DNS), to better survive flaky mobile networks. HTTP status errors are not retried. ### Remote Settings * Add uptake telemetry support ([#7288](https://github.com/mozilla/application-services/pull/7288)) diff --git a/components/ads-client/src/mars/transport.rs b/components/ads-client/src/mars/transport.rs index 594ca2a7a8..babbb1469c 100644 --- a/components/ads-client/src/mars/transport.rs +++ b/components/ads-client/src/mars/transport.rs @@ -4,6 +4,7 @@ */ use std::hash::Hash; +use std::time::Duration; use viaduct::{Client, ClientSettings, Request, Response}; @@ -16,6 +17,26 @@ use crate::{ const OHTTP_CHANNEL_ID: &str = "ads-client"; +/// Wait before the single retry attempt for fire-and-forget callback +/// requests. Short enough to feel responsive on mobile, long enough to +/// let a transient blip recover. +const CALLBACK_RETRY_DELAY: Duration = Duration::from_millis(500); + +/// Run `op` and, if it returns `Err`, sleep for `delay` and run it once +/// more. The second result is returned as-is. +fn retry_once(delay: Duration, mut op: F) -> Result +where + F: FnMut() -> Result, +{ + match op() { + Ok(v) => Ok(v), + Err(_) => { + std::thread::sleep(delay); + op() + } + } +} + pub struct MARSTransport { http_cache: Option, telemetry: T, @@ -38,7 +59,11 @@ impl MARSTransport { pub fn fire(&self, request: Request, ohttp: bool) -> Result<(), TransportError> { let client = Self::client_for(ohttp)?; - let response = client.send_sync(request)?; + // Callback requests (impression/click/report) are fire-and-forget on + // unreliable mobile networks. Retry once on a transient viaduct error + // (timeout, connection reset, DNS) before giving up. HTTP status + // errors are not retried — those are decided after this call returns. + let response = retry_once(CALLBACK_RETRY_DELAY, || client.send_sync(request.clone()))?; HTTPError::check(&response)?; Ok(()) } @@ -83,3 +108,46 @@ impl MARSTransport { } } } + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::Cell; + + #[test] + fn retry_once_returns_first_success_without_calling_again() { + let calls = Cell::new(0); + let result: Result<&'static str, ()> = retry_once(Duration::ZERO, || { + calls.set(calls.get() + 1); + Ok("ok") + }); + assert_eq!(result, Ok("ok")); + assert_eq!(calls.get(), 1); + } + + #[test] + fn retry_once_retries_after_failure_and_returns_success() { + let calls = Cell::new(0); + let result: Result<&'static str, &'static str> = retry_once(Duration::ZERO, || { + calls.set(calls.get() + 1); + if calls.get() == 1 { + Err("transient") + } else { + Ok("ok") + } + }); + assert_eq!(result, Ok("ok")); + assert_eq!(calls.get(), 2); + } + + #[test] + fn retry_once_gives_up_after_second_failure() { + let calls = Cell::new(0); + let result: Result<(), &'static str> = retry_once(Duration::ZERO, || { + calls.set(calls.get() + 1); + Err("still broken") + }); + assert_eq!(result, Err("still broken")); + assert_eq!(calls.get(), 2); + } +}