Skip to content

Commit 59ee0bb

Browse files
committed
Be more careful with locking around db.db_data
Lock db_mtx in some places that access db->db_data. But in some places, add assertions that the dbuf is in a state where it will not be copied, rather than locking it. Lock db_rwlock in some places that access db->db.db_data's contents. But in some places, add assertions that should guarantee the buffer is being accessed by one thread only, rather than locking it. Closes #16626 Sponsored by: ConnectWise Signed-off-by: Alan Somers <asomers@gmail.com>
1 parent 970c0d6 commit 59ee0bb

File tree

5 files changed

+117
-16
lines changed

5 files changed

+117
-16
lines changed

include/sys/dbuf.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,22 @@ typedef struct dmu_buf_impl {
329329
dmu_buf_user_t *db_user;
330330
} dmu_buf_impl_t;
331331

332+
/*
333+
* Assert that the value of db.db_data cannot currently be changed. Either
334+
* it's locked, or it's in an immutable state.
335+
*/
336+
void assert_db_data_addr_locked(const dmu_buf_impl_t *db);
337+
/*
338+
* Assert that the provided dbuf's contents can only be accessed by the caller,
339+
* and by no other thread. Either it must be locked, or in a state where
340+
* locking is not required.
341+
*/
342+
#ifdef __linux__
343+
void assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t wr);
344+
#else
345+
void assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t wr);
346+
#endif
347+
332348
#define DBUF_HASH_MUTEX(h, idx) \
333349
(&(h)->hash_mutexes[(idx) & ((h)->hash_mutex_mask)])
334350

module/zfs/dbuf.c

Lines changed: 83 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,41 @@ static unsigned long dbuf_metadata_cache_target_bytes(void);
286286
static uint_t dbuf_cache_hiwater_pct = 10;
287287
static uint_t dbuf_cache_lowater_pct = 10;
288288

289+
void
290+
assert_db_data_addr_locked(const dmu_buf_impl_t *db)
291+
{
292+
if (db->db_level > 0)
293+
return;
294+
else if (db->db.db_object == DMU_META_DNODE_OBJECT)
295+
return;
296+
ASSERT(MUTEX_HELD(&db->db_mtx));
297+
}
298+
299+
void
300+
#ifdef __linux__
301+
assert_db_data_contents_locked(dmu_buf_impl_t *db, boolean_t writer)
302+
#else
303+
assert_db_data_contents_locked(const dmu_buf_impl_t *db, boolean_t writer)
304+
#endif
305+
{
306+
/*
307+
* db_rwlock protects indirect blocks and the data block of the meta
308+
* dnode.
309+
*/
310+
if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID)
311+
return;
312+
if (db->db_dirtycnt == 0)
313+
return;
314+
else if (db->db_level == 0)
315+
return;
316+
else if (db->db.db_object != DMU_META_DNODE_OBJECT)
317+
return;
318+
if (writer)
319+
ASSERT(RW_WRITE_HELD(&db->db_rwlock));
320+
else
321+
ASSERT(RW_LOCK_HELD(&db->db_rwlock));
322+
}
323+
289324
static int
290325
dbuf_cons(void *vdb, void *unused, int kmflag)
291326
{
@@ -1215,13 +1250,16 @@ dbuf_verify(dmu_buf_impl_t *db)
12151250
*/
12161251
if (db->db_dirtycnt == 0) {
12171252
if (db->db_level == 0) {
1218-
uint64_t *buf = db->db.db_data;
1253+
uint64_t *buf;
12191254
int i;
12201255

1256+
assert_db_data_contents_locked(db, FALSE);
1257+
buf = db->db.db_data;
12211258
for (i = 0; i < db->db.db_size >> 3; i++) {
12221259
ASSERT0(buf[i]);
12231260
}
12241261
} else {
1262+
assert_db_data_contents_locked(db, FALSE);
12251263
blkptr_t *bps = db->db.db_data;
12261264
ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
12271265
db->db.db_size);
@@ -1703,6 +1741,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
17031741
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
17041742
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
17051743
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
1744+
assert_db_data_contents_locked(db, FALSE);
17061745
memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
17071746
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
17081747
dnode_t *dn = DB_DNODE(db);
@@ -1733,6 +1772,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
17331772
} else {
17341773
dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
17351774
}
1775+
assert_db_data_contents_locked(db, FALSE);
17361776
memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
17371777
} else {
17381778
db->db_buf = NULL;
@@ -3023,6 +3063,7 @@ dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
30233063
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
30243064
/* we were freed while filling */
30253065
/* XXX dbuf_undirty? */
3066+
assert_db_data_contents_locked(db, TRUE);
30263067
memset(db->db.db_data, 0, db->db.db_size);
30273068
db->db_freed_in_flight = FALSE;
30283069
db->db_state = DB_CACHED;
@@ -3155,6 +3196,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx,
31553196
ASSERT(!arc_is_encrypted(buf));
31563197
mutex_exit(&db->db_mtx);
31573198
(void) dbuf_dirty(db, tx);
3199+
assert_db_data_contents_locked(db, TRUE);
31583200
memcpy(db->db.db_data, buf->b_data, db->db.db_size);
31593201
arc_buf_destroy(buf, db);
31603202
return;
@@ -3398,6 +3440,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
33983440
*parentp = NULL;
33993441
return (err);
34003442
}
3443+
assert_db_data_addr_locked(*parentp);
34013444
*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
34023445
(blkid & ((1ULL << epbs) - 1));
34033446
return (0);
@@ -4584,10 +4627,12 @@ dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
45844627
return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
45854628
} else {
45864629
dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
4630+
assert_db_data_addr_locked(parent_db);
45874631
int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
45884632
VERIFY3U(parent_db->db_level, ==, 1);
45894633
VERIFY3P(DB_DNODE(parent_db), ==, dn);
45904634
VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
4635+
assert_db_data_contents_locked(parent_db, FALSE);
45914636
blkptr_t *bp = parent_db->db.db_data;
45924637
return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
45934638
}
@@ -4598,12 +4643,22 @@ dbuf_lightweight_ready(zio_t *zio)
45984643
{
45994644
dbuf_dirty_record_t *dr = zio->io_private;
46004645
blkptr_t *bp = zio->io_bp;
4646+
dmu_buf_impl_t *parent_db = NULL;
46014647

46024648
if (zio->io_error != 0)
46034649
return;
46044650

46054651
dnode_t *dn = dr->dr_dnode;
46064652

4653+
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4654+
if (dr->dr_parent == NULL) {
4655+
parent_db = dn->dn_dbuf;
4656+
} else {
4657+
parent_db = dr->dr_parent->dr_dbuf;
4658+
}
4659+
4660+
assert_db_data_addr_locked(parent_db);
4661+
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46074662
blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
46084663
spa_t *spa = dmu_objset_spa(dn->dn_objset);
46094664
int64_t delta = bp_get_dsize_sync(spa, bp) -
@@ -4623,14 +4678,6 @@ dbuf_lightweight_ready(zio_t *zio)
46234678
BP_SET_FILL(bp, fill);
46244679
}
46254680

4626-
dmu_buf_impl_t *parent_db;
4627-
EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
4628-
if (dr->dr_parent == NULL) {
4629-
parent_db = dn->dn_dbuf;
4630-
} else {
4631-
parent_db = dr->dr_parent->dr_dbuf;
4632-
}
4633-
rw_enter(&parent_db->db_rwlock, RW_WRITER);
46344681
*bp_orig = *bp;
46354682
rw_exit(&parent_db->db_rwlock);
46364683
}
@@ -4664,6 +4711,7 @@ noinline static void
46644711
dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46654712
{
46664713
dnode_t *dn = dr->dr_dnode;
4714+
dmu_buf_impl_t *parent_db = NULL;
46674715
zio_t *pio;
46684716
if (dn->dn_phys->dn_nlevels == 1) {
46694717
pio = dn->dn_zio;
@@ -4682,6 +4730,11 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46824730
* See comment in dbuf_write(). This is so that zio->io_bp_orig
46834731
* will have the old BP in dbuf_lightweight_done().
46844732
*/
4733+
if (dr->dr_dnode->dn_phys->dn_nlevels != 1) {
4734+
parent_db = dr->dr_parent->dr_dbuf;
4735+
assert_db_data_addr_locked(parent_db);
4736+
rw_enter(&parent_db->db_rwlock, RW_READER);
4737+
}
46854738
dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
46864739

46874740
dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
@@ -4691,6 +4744,9 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
46914744
dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
46924745
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
46934746

4747+
if (parent_db)
4748+
rw_exit(&parent_db->db_rwlock);
4749+
46944750
zio_nowait(dr->dr_zio);
46954751
}
46964752

@@ -4847,6 +4903,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
48474903
} else {
48484904
*datap = arc_alloc_buf(os->os_spa, db, type, psize);
48494905
}
4906+
assert_db_data_contents_locked(db, FALSE);
48504907
memcpy((*datap)->b_data, db->db.db_data, psize);
48514908
}
48524909
db->db_data_pending = dr;
@@ -4953,6 +5010,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49535010

49545011
if (dn->dn_type == DMU_OT_DNODE) {
49555012
i = 0;
5013+
rw_enter(&db->db_rwlock, RW_READER);
49565014
while (i < db->db.db_size) {
49575015
dnode_phys_t *dnp =
49585016
(void *)(((char *)db->db.db_data) + i);
@@ -4978,6 +5036,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49785036
DNODE_MIN_SIZE;
49795037
}
49805038
}
5039+
rw_exit(&db->db_rwlock);
49815040
} else {
49825041
if (BP_IS_HOLE(bp)) {
49835042
fill = 0;
@@ -4986,6 +5045,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49865045
}
49875046
}
49885047
} else {
5048+
rw_enter(&db->db_rwlock, RW_READER);
49895049
blkptr_t *ibp = db->db.db_data;
49905050
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
49915051
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
@@ -4995,6 +5055,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
49955055
BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
49965056
fill += BP_GET_FILL(ibp);
49975057
}
5058+
rw_exit(&db->db_rwlock);
49985059
}
49995060
DB_DNODE_EXIT(db);
50005061

@@ -5029,6 +5090,8 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50295090
DB_DNODE_EXIT(db);
50305091
ASSERT3U(epbs, <, 31);
50315092

5093+
assert_db_data_addr_locked(db);
5094+
rw_enter(&db->db_rwlock, RW_READER);
50325095
/* Determine if all our children are holes */
50335096
for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
50345097
if (!BP_IS_HOLE(bp))
@@ -5045,10 +5108,13 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
50455108
* anybody from reading the blocks we're about to
50465109
* zero out.
50475110
*/
5048-
rw_enter(&db->db_rwlock, RW_WRITER);
5111+
if (!rw_tryupgrade(&db->db_rwlock)) {
5112+
rw_exit(&db->db_rwlock);
5113+
rw_enter(&db->db_rwlock, RW_WRITER);
5114+
}
50495115
memset(db->db.db_data, 0, db->db.db_size);
5050-
rw_exit(&db->db_rwlock);
50515116
}
5117+
rw_exit(&db->db_rwlock);
50525118
}
50535119

50545120
static void
@@ -5243,11 +5309,11 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
52435309
* avoid lock contention, only grab it when we are actually
52445310
* changing the BP.
52455311
*/
5246-
if (rw != NULL)
5312+
if (rw != NULL && !rw_tryupgrade(rw)) {
5313+
rw_exit(rw);
52475314
rw_enter(rw, RW_WRITER);
5315+
}
52485316
*bp = bp_copy;
5249-
if (rw != NULL)
5250-
rw_exit(rw);
52515317
}
52525318
}
52535319

@@ -5263,6 +5329,8 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52635329
if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
52645330
return;
52655331

5332+
assert_db_data_addr_locked(db);
5333+
rw_enter(&db->db_rwlock, RW_READER);
52665334
if (db->db_level > 0) {
52675335
blkptr_t *bp = db->db.db_data;
52685336
for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
@@ -5281,6 +5349,7 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
52815349
}
52825350
}
52835351
}
5352+
rw_exit(&db->db_rwlock);
52845353
}
52855354

52865355

module/zfs/dmu_objset.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,8 +2234,12 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
22342234
FTAG, (dmu_buf_t **)&db);
22352235
ASSERT0(error);
22362236
mutex_enter(&db->db_mtx);
2237-
data = (before) ? db->db.db_data :
2238-
dmu_objset_userquota_find_data(db, tx);
2237+
if (before) {
2238+
assert_db_data_contents_locked(db, FALSE);
2239+
data = db->db.db_data;
2240+
} else {
2241+
data = dmu_objset_userquota_find_data(db, tx);
2242+
}
22392243
have_spill = B_TRUE;
22402244
} else {
22412245
mutex_enter(&dn->dn_mtx);

module/zfs/dnode.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,11 +445,14 @@ dnode_verify(dnode_t *dn)
445445
if (dn->dn_phys->dn_type != DMU_OT_NONE)
446446
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
447447
ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
448+
#ifdef DEBUG
448449
if (dn->dn_dbuf != NULL) {
450+
assert_db_data_addr_locked(dn->dn_dbuf);
449451
ASSERT3P(dn->dn_phys, ==,
450452
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
451453
(dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
452454
}
455+
#endif
453456
if (drop_struct_lock)
454457
rw_exit(&dn->dn_struct_rwlock);
455458
}
@@ -1522,6 +1525,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
15221525
epb = db->db.db_size >> DNODE_SHIFT;
15231526

15241527
idx = object & (epb - 1);
1528+
assert_db_data_addr_locked(db);
15251529
dn_block = (dnode_phys_t *)db->db.db_data;
15261530

15271531
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
@@ -1537,6 +1541,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
15371541
dnh = &dnc->dnc_children[0];
15381542

15391543
/* Initialize dnode slot status from dnode_phys_t */
1544+
rw_enter(&db->db_rwlock, RW_READER);
15401545
for (int i = 0; i < epb; i++) {
15411546
zrl_init(&dnh[i].dnh_zrlock);
15421547

@@ -1557,6 +1562,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
15571562
skip = 0;
15581563
}
15591564
}
1565+
rw_exit(&db->db_rwlock);
15601566

15611567
dmu_buf_init_user(&dnc->dnc_dbu, NULL,
15621568
dnode_buf_evict_async, NULL);
@@ -1608,6 +1614,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
16081614
DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
16091615
dn = dnh->dnh_dnode;
16101616
} else {
1617+
assert_db_data_contents_locked(db, FALSE);
16111618
dn = dnode_create(os, dn_block + idx, db,
16121619
object, dnh);
16131620
dmu_buf_add_user_size(&db->db,
@@ -1681,6 +1688,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
16811688
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
16821689
dn = dnh->dnh_dnode;
16831690
} else {
1691+
assert_db_data_contents_locked(db, FALSE);
16841692
dn = dnode_create(os, dn_block + idx, db,
16851693
object, dnh);
16861694
dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
@@ -2564,6 +2572,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
25642572
dbuf_rele(db, FTAG);
25652573
return (error);
25662574
}
2575+
assert_db_data_addr_locked(db);
25672576
data = db->db.db_data;
25682577
rw_enter(&db->db_rwlock, RW_READER);
25692578
}

0 commit comments

Comments
 (0)