author	Dave Chinner <dchinner@redhat.com>	2010-09-24 05:59:04 -0400
committer	Alex Elder <aelder@sgi.com>	2010-10-18 16:07:56 -0400
commit	74f75a0cb7033918eb0fa4a50df25091ac75c16e (patch)
tree	3885c0b357c760152d14df03ef88839fdbf5f964
parent	69b491c214d7fd4d4df972ae5377be99ca3753db (diff)
xfs: convert buffer cache hash to rbtree
The buffer cache hash is showing typical hash scalability problems: in large scale testing the number of cached items grows far larger than the hash can efficiently handle. Hence we need to move to a self-scaling cache indexing mechanism.

I have selected rbtrees for indexing because they have O(log n) search scalability, and insert and remove cost is not excessive, even on large trees. Hence we should be able to cache large numbers of buffers without incurring the excessive cache miss search penalties that the hash is imposing on us.

To ensure we still have parallel access to the cache, we need multiple trees. Rather than hashing the buffers by disk address to select a tree, it seems more sensible to separate trees by typical access patterns. Most operations use buffers from within a single AG at a time, so rather than searching lots of different lists, separate the buffer indexes out into per-AG rbtrees. This means that searches during metadata operations have a much higher chance of hitting cache resident nodes, and that updates of the tree are less likely to disturb trees being accessed on other CPUs doing independent operations.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
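For illustration, here is a minimal userspace C sketch of the per-AG lookup-or-insert walk this scheme relies on. It is a sketch under stated assumptions, not the patch's code: a plain unbalanced binary search tree stands in for the kernel rb_node/rb_entry machinery, and names such as agtree and buf_find_or_insert are hypothetical. The real code also rebalances with rb_insert_color(), serialises the walk with pag_buf_lock, and skips stale buffers whose length does not match, none of which this sketch attempts.

/*
 * Illustrative sketch only: an unbalanced BST keyed by block offset, with
 * one tree per allocation group.  Hypothetical names; no locking, no
 * rebalancing, no stale-buffer handling.
 */
#include <stdio.h>
#include <stdlib.h>

struct buf {
	long long	offset;			/* search key (range_base) */
	struct buf	*left, *right;		/* stand-in for rb_left/rb_right */
};

struct agtree {
	struct buf	*root;			/* stand-in for pag->pag_buf_tree */
};

/* Walk the per-AG tree; on a miss, link a new buffer where the walk ended. */
static struct buf *
buf_find_or_insert(struct agtree *ag, long long offset)
{
	struct buf	**link = &ag->root;
	struct buf	*bp;

	while (*link) {
		bp = *link;
		if (offset < bp->offset)
			link = &bp->left;
		else if (offset > bp->offset)
			link = &bp->right;
		else
			return bp;		/* cache hit */
	}

	bp = calloc(1, sizeof(*bp));		/* cache miss: insert new node */
	if (!bp)
		return NULL;
	bp->offset = offset;
	*link = bp;
	return bp;
}

int
main(void)
{
	struct agtree	ags[4] = { { NULL }, { NULL }, { NULL }, { NULL } };
	int		agno = 1;		/* stand-in for xfs_daddr_to_agno() */

	struct buf *a = buf_find_or_insert(&ags[agno], 8192);
	struct buf *b = buf_find_or_insert(&ags[agno], 8192);
	printf("second lookup returned the same buffer: %s\n",
	       a == b ? "yes" : "no");
	return 0;
}

The point of remembering the link slot during the search is that a miss can insert the new buffer immediately, without a second traversal; that is the same reason _xfs_buf_find() below records parent and rbp before calling rb_link_node().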
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.c	136
-rw-r--r--	fs/xfs/linux-2.6/xfs_buf.h	8
-rw-r--r--	fs/xfs/xfs_ag.h	4
-rw-r--r--	fs/xfs/xfs_mount.c	2
4 files changed, 74 insertions, 76 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 251bcdc6352e..749d7d39d657 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -188,7 +188,7 @@ _xfs_buf_initialize(
 	atomic_set(&bp->b_hold, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_list);
-	INIT_LIST_HEAD(&bp->b_hash_list);
+	RB_CLEAR_NODE(&bp->b_rbnode);
 	init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
@@ -262,8 +262,6 @@ xfs_buf_free(
 {
 	trace_xfs_buf_free(bp, _RET_IP_);
 
-	ASSERT(list_empty(&bp->b_hash_list));
-
 	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
@@ -422,8 +420,10 @@ _xfs_buf_find(
 {
 	xfs_off_t		range_base;
 	size_t			range_length;
-	xfs_bufhash_t		*hash;
-	xfs_buf_t		*bp, *n;
+	struct xfs_perag	*pag;
+	struct rb_node		**rbp;
+	struct rb_node		*parent;
+	xfs_buf_t		*bp;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
@@ -432,14 +432,37 @@ _xfs_buf_find(
 	ASSERT(!(range_length < (1 << btp->bt_sshift)));
 	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 
-	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
-
-	spin_lock(&hash->bh_lock);
-
-	list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
-		ASSERT(btp == bp->b_target);
-		if (bp->b_file_offset == range_base &&
-		    bp->b_buffer_length == range_length) {
+	/* get tree root */
+	pag = xfs_perag_get(btp->bt_mount,
+				xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+	/* walk tree */
+	spin_lock(&pag->pag_buf_lock);
+	rbp = &pag->pag_buf_tree.rb_node;
+	parent = NULL;
+	bp = NULL;
+	while (*rbp) {
+		parent = *rbp;
+		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+		if (range_base < bp->b_file_offset)
+			rbp = &(*rbp)->rb_left;
+		else if (range_base > bp->b_file_offset)
+			rbp = &(*rbp)->rb_right;
+		else {
+			/*
+			 * found a block offset match. If the range doesn't
+			 * match, the only way this is allowed is if the buffer
+			 * in the cache is stale and the transaction that made
+			 * it stale has not yet committed. i.e. we are
+			 * reallocating a busy extent. Skip this buffer and
+			 * continue searching to the right for an exact match.
+			 */
+			if (bp->b_buffer_length != range_length) {
+				ASSERT(bp->b_flags & XBF_STALE);
+				rbp = &(*rbp)->rb_right;
+				continue;
+			}
 			atomic_inc(&bp->b_hold);
 			goto found;
 		}
@@ -449,17 +472,21 @@ _xfs_buf_find(
 	if (new_bp) {
 		_xfs_buf_initialize(new_bp, btp, range_base,
 					range_length, flags);
-		new_bp->b_hash = hash;
-		list_add(&new_bp->b_hash_list, &hash->bh_list);
+		rb_link_node(&new_bp->b_rbnode, parent, rbp);
+		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+		/* the buffer keeps the perag reference until it is freed */
+		new_bp->b_pag = pag;
+		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		XFS_STATS_INC(xb_miss_locked);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
-
-	spin_unlock(&hash->bh_lock);
 	return new_bp;
 
 found:
-	spin_unlock(&hash->bh_lock);
+	spin_unlock(&pag->pag_buf_lock);
+	xfs_perag_put(pag);
 
 	/* Attempt to get the semaphore without sleeping,
 	 * if this does not work then we need to drop the
@@ -809,27 +836,30 @@ void
 xfs_buf_rele(
 	xfs_buf_t		*bp)
 {
-	xfs_bufhash_t		*hash = bp->b_hash;
+	struct xfs_perag	*pag = bp->b_pag;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (unlikely(!hash)) {
+	if (!pag) {
 		ASSERT(!bp->b_relse);
+		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold))
 			xfs_buf_free(bp);
 		return;
 	}
 
+	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 	ASSERT(atomic_read(&bp->b_hold) > 0);
-	if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
+	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 		if (bp->b_relse) {
 			atomic_inc(&bp->b_hold);
-			spin_unlock(&hash->bh_lock);
-			(*(bp->b_relse)) (bp);
+			spin_unlock(&pag->pag_buf_lock);
+			bp->b_relse(bp);
 		} else {
 			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-			list_del_init(&bp->b_hash_list);
-			spin_unlock(&hash->bh_lock);
+			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+			spin_unlock(&pag->pag_buf_lock);
+			xfs_perag_put(pag);
 			xfs_buf_free(bp);
 		}
 	}
@@ -1429,56 +1459,24 @@ xfs_buf_iomove(
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t	*btp)
+	struct xfs_buftarg	*btp)
 {
-	xfs_bufhash_t	*hash;
-	uint		i;
+	struct xfs_perag	*pag;
+	uint			i;
 
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		hash = &btp->bt_hash[i];
-		spin_lock(&hash->bh_lock);
-		while (!list_empty(&hash->bh_list)) {
-			spin_unlock(&hash->bh_lock);
+	for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
+		pag = xfs_perag_get(btp->bt_mount, i);
+		spin_lock(&pag->pag_buf_lock);
+		while (rb_first(&pag->pag_buf_tree)) {
+			spin_unlock(&pag->pag_buf_lock);
 			delay(100);
-			spin_lock(&hash->bh_lock);
+			spin_lock(&pag->pag_buf_lock);
 		}
-		spin_unlock(&hash->bh_lock);
-	}
-}
-
-/*
- * Allocate buffer hash table for a given target.
- * For devices containing metadata (i.e. not the log/realtime devices)
- * we need to allocate a much larger hash table.
- */
-STATIC void
-xfs_alloc_bufhash(
-	xfs_buftarg_t		*btp,
-	int			external)
-{
-	unsigned int		i;
-
-	if (external) {
-		btp->bt_hash = NULL;
-		return;
-	}
-	btp->bt_hashshift = 12;	/* 4096 buckets */
-	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
-					 sizeof(xfs_bufhash_t));
-	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
-		spin_lock_init(&btp->bt_hash[i].bh_lock);
-		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+		spin_unlock(&pag->pag_buf_lock);
+		xfs_perag_put(pag);
 	}
 }
 
-STATIC void
-xfs_free_bufhash(
-	xfs_buftarg_t		*btp)
-{
-	kmem_free_large(btp->bt_hash);
-	btp->bt_hash = NULL;
-}
-
 /*
  *	buftarg list for delwrite queue processing
  */
@@ -1511,7 +1509,6 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
-	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
 	/* Unregister the buftarg first so that we don't get a
@@ -1651,7 +1648,6 @@ xfs_alloc_buftarg(
 		goto error;
 	if (xfs_alloc_delwrite_queue(btp, fsname))
 		goto error;
-	xfs_alloc_bufhash(btp, external);
 	return btp;
 
 error:
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 1f109cee136c..c79882ecb513 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -135,10 +135,6 @@ typedef struct xfs_buftarg {
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
 
-	/* per device buffer hash table */
-	uint			bt_hashshift;
-	xfs_bufhash_t		*bt_hash;
-
 	/* per device delwri queue */
 	struct task_struct	*bt_task;
 	struct list_head	bt_list;
@@ -172,8 +168,8 @@ typedef struct xfs_buf {
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	xfs_buf_flags_t		b_flags;	/* status flags */
-	struct list_head	b_hash_list;	/* hash table list */
-	xfs_bufhash_t		*b_hash;	/* hash table list start */
+	struct rb_node		b_rbnode;	/* rbtree node */
+	struct xfs_perag	*b_pag;		/* contains rbtree root */
 	xfs_buftarg_t		*b_target;	/* buffer target (device) */
 	atomic_t		b_hold;		/* reference count */
 	xfs_daddr_t		b_bn;		/* block number for I/O */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index baeec83d01f9..63c7a1a6c022 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -233,6 +233,10 @@ typedef struct xfs_perag {
 	struct mutex	pag_ici_reclaim_lock;	/* serialisation point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
+	/* buffer cache index */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
+	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 #endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 59859c343e04..cfa2fb4e7f97 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -479,6 +479,8 @@ xfs_initialize_perag(
 		rwlock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
+		spin_lock_init(&pag->pag_buf_lock);
+		pag->pag_buf_tree = RB_ROOT;
 
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;