From eb5269791e03912507bd30574a94a6c479285668 Mon Sep 17 00:00:00 2001 From: Ionut Nechita Date: Wed, 13 May 2026 19:39:05 +0300 Subject: [PATCH] drbd: fix false positive resync throttling in drbd_rs_c_min_rate_throttle drbd_rs_c_min_rate_throttle() is intended to slow down resync when genuine application I/O is competing for the backing device. It used to detect "application I/O" by comparing the total sector count from the backing device (part_stat_read_accum) against the resync sector counter (rs_sect_ev), and throttling when the resync speed exceeds c-min-rate. That curr_events heuristic produces false positives: 1) On the receiver path, rs_sect_ev is incremented *after* the throttle check. The current resync I/O is already reflected in part_stat counters but not yet in rs_sect_ev, creating a persistent positive delta that looks like application I/O. 2) The per-cpu part_stat counters and the atomic rs_sect_ev are not read under any common lock, so transient skew between them can push the delta above 64 sectors even when no application I/O is present. When the false positive fires, the function compares the resync speed against c-min-rate (default 35840 KB/s ~ 35 MB/s). On modern hardware capable of 300+ MB/s resync the condition is almost always true, so the caller sleeps 100 ms (HZ/10) per resync request or stops issuing new requests, capping throughput at roughly c-min-rate. This was observed in production on a Distributed Cloud controller where drbd-dc-vault (100 GB) resynced at ~30 MB/s instead of the expected ~360 MB/s. Setting c-min-rate above the actual resync speed (e.g. 350 MB/s) or disabling the feature (c-min-rate 0) restored full throughput, confirming false-positive throttling as root cause. Switch the gate to ap_bio_cnt. inc_ap_bio() is called for every application bio at the top of drbd_make_request(), before any activity-log handling, and dec_ap_bio() runs on completion. That makes ap_bio_cnt the authoritative "application I/O in flight" signal, independent of part_stat update timing, per-cpu skew, and activity-log fastpath outcomes. Backport of the drbd 9.x fix to the in-tree drbd 8.4 driver. Suggested-by: Ionut Nechita Signed-off-by: Philipp Reisner [inechita: backport to drbd 8.4 - ap_bio_cnt is scalar, not array] Signed-off-by: Ionut Nechita --- drivers/block/drbd/drbd_int.h | 2 - drivers/block/drbd/drbd_main.c | 1 - drivers/block/drbd/drbd_receiver.c | 62 ++++++++++++++++-------------- drivers/block/drbd/drbd_worker.c | 3 -- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index f6d6276974eef..908475fd4dd39 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -886,8 +886,6 @@ struct drbd_device { atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ atomic_t rs_sect_ev; /* for submitted resync data rate, both */ int rs_last_sect_ev; /* counter to compare with */ - int rs_last_events; /* counter of read or write "events" (unit sectors) - * on the lower level device when we last looked. */ int c_sync_rate; /* current resync rate after syncer throttle magic */ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index b1a721dd04969..b370b173259b9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2018,7 +2018,6 @@ void drbd_device_cleanup(struct drbd_device *device) device->rs_start = device->rs_total = device->rs_failed = 0; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; for (i = 0; i < DRBD_SYNC_MARKS; i++) { device->rs_mark_left[i] = 0; diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 58b95bf4bdca6..93ca35e7a960a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2569,50 +2569,56 @@ bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sec return throttle; } +/* + * Throttle resync when application I/O is in flight on the backing + * device and the resync is running faster than c-min-rate. + * + * The previous heuristic compared the backing device's part_stat + * sectors counter against our own rs_sect_ev counter (similar to + * MD RAID is_mddev_idle()) and fired on a delta > 64 sectors. That + * comparison is racy: rs_sect_ev is bumped at submission while + * part_stat is updated on bio completion (and is per-cpu), and the + * receiver path bumps rs_sect_ev *after* the throttle check. On + * fast hardware the transient skew routinely exceeds 64 sectors + * even with no application I/O, capping resync throughput at + * c-min-rate. + * + * ap_bio_cnt is incremented unconditionally for every application + * request in drbd_make_request(), so it is the authoritative + * "is application I/O in flight" signal. + */ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) { - struct gendisk *disk = device->ldev->backing_bdev->bd_disk; unsigned long db, dt, dbdt; unsigned int c_min_rate; - int curr_events; + unsigned long rs_left; + int i; rcu_read_lock(); c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; rcu_read_unlock(); - /* feature disabled? */ if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(disk->part0, sectors) - - atomic_read(&device->rs_sect_ev); - - if (atomic_read(&device->ap_actlog_cnt) - || curr_events - device->rs_last_events > 64) { - unsigned long rs_left; - int i; - - device->rs_last_events = curr_events; + if (!atomic_read(&device->ap_bio_cnt)) + return false; - /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, - * approx. */ - i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, approx. */ + i = (device->rs_last_mark + DRBD_SYNC_MARKS - 1) % DRBD_SYNC_MARKS; - if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) - rs_left = device->ov_left; - else - rs_left = drbd_bm_total_weight(device) - device->rs_failed; + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + rs_left = device->ov_left; + else + rs_left = drbd_bm_total_weight(device) - device->rs_failed; - dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; - if (!dt) - dt++; - db = device->rs_mark_left[i] - rs_left; - dbdt = Bit2KB(db/dt); + dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db / dt); - if (dbdt > c_min_rate) - return true; - } - return false; + return dbdt > c_min_rate; } static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 0697f99fed18d..cf7f322103abb 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1657,14 +1657,11 @@ void drbd_resync_after_changed(struct drbd_device *device) void drbd_rs_controller_reset(struct drbd_peer_device *peer_device) { struct drbd_device *device = peer_device->device; - struct gendisk *disk = device->ldev->backing_bdev->bd_disk; struct fifo_buffer *plan; atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; - device->rs_last_events = - (int)part_stat_read_accum(disk->part0, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context.