aboutsummaryrefslogtreecommitdiffstats
path: root/fs/fs-writeback.c
diff options
context:
space:
mode:
author: Christoph Hellwig <hch@infradead.org> 2011-04-21 20:19:44 -0400
committer: Wu Fengguang <fengguang.wu@intel.com> 2011-06-07 20:25:21 -0400
commit: f758eeabeb96f878c860e8f110f94ec8820822a9 (patch)
tree: fea5a465aa0aa38c6c9263eb264acbeb7f722c02 /fs/fs-writeback.c
parent: 424b351fe1901fc909fd0ca4f21dab58f24c1aac (diff)
writeback: split inode_wb_list_lock into bdi_writeback.list_lock
Split the global inode_wb_list_lock into a per-bdi_writeback list_lock, as it's currently the most contended lock in the system for metadata heavy workloads. It won't help for single-filesystem workloads for which we'll need the I/O-less balance_dirty_pages, but at least we can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name                  con-bounces  contentions  waittime-min  waittime-max  waittime-total  acq-bounces  acquisitions  holdtime-min  holdtime-max  holdtime-total
--------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
inode_wb_list_lock:               42590        44433          0.12        147.74       144127.35       252274        886792          0.08        121.34       917211.23
------------------
inode_wb_list_lock                    2  [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
inode_wb_list_lock                   34  [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
inode_wb_list_lock                12893  [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
inode_wb_list_lock                10702  [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
------------------
inode_wb_list_lock                    2  [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
inode_wb_list_lock                   19  [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
inode_wb_list_lock                 5550  [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
inode_wb_list_lock                 8511  [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
&(&wb->list_lock)->rlock:         11383        11657          0.14        151.69        40429.51        90825        527918          0.11        145.90       556843.37
------------------------
&(&wb->list_lock)->rlock             10  [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
&(&wb->list_lock)->rlock           1493  [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
&(&wb->list_lock)->rlock           3652  [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
&(&wb->list_lock)->rlock           1412  [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
------------------------
&(&wb->list_lock)->rlock              3  [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
&(&wb->list_lock)->rlock              6  [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
&(&wb->list_lock)->rlock           2061  [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
&(&wb->list_lock)->rlock           2629  [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- fs/fs-writeback.c 97
1 files changed, 49 insertions, 48 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 664acdb2e7ef..36a30917e0dc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
181 */ 181 */
182void inode_wb_list_del(struct inode *inode) 182void inode_wb_list_del(struct inode *inode)
183{ 183{
184 spin_lock(&inode_wb_list_lock); 184 struct backing_dev_info *bdi = inode_to_bdi(inode);
185
186 spin_lock(&bdi->wb.list_lock);
185 list_del_init(&inode->i_wb_list); 187 list_del_init(&inode->i_wb_list);
186 spin_unlock(&inode_wb_list_lock); 188 spin_unlock(&bdi->wb.list_lock);
187} 189}
188 190
189
190/* 191/*
191 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 192 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
192 * furthest end of its superblock's dirty-inode list. 193 * furthest end of its superblock's dirty-inode list.
@@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode)
196 * the case then the inode must have been redirtied while it was being written 197 * the case then the inode must have been redirtied while it was being written
197 * out and we don't reset its dirtied_when. 198 * out and we don't reset its dirtied_when.
198 */ 199 */
199static void redirty_tail(struct inode *inode) 200static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
200{ 201{
201 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 202 assert_spin_locked(&wb->list_lock);
202
203 assert_spin_locked(&inode_wb_list_lock);
204 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
205 struct inode *tail; 204 struct inode *tail;
206 205
@@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode)
214/* 213/*
215 * requeue inode for re-scanning after bdi->b_io list is exhausted. 214 * requeue inode for re-scanning after bdi->b_io list is exhausted.
216 */ 215 */
217static void requeue_io(struct inode *inode) 216static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
218{ 217{
219 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 assert_spin_locked(&wb->list_lock);
220
221 assert_spin_locked(&inode_wb_list_lock);
222 list_move(&inode->i_wb_list, &wb->b_more_io); 219 list_move(&inode->i_wb_list, &wb->b_more_io);
223} 220}
224 221
@@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode)
226{ 223{
227 /* 224 /*
228 * Prevent speculative execution through 225 * Prevent speculative execution through
229 * spin_unlock(&inode_wb_list_lock); 226 * spin_unlock(&wb->list_lock);
230 */ 227 */
231 228
232 smp_mb(); 229 smp_mb();
@@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
302 */ 299 */
303static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 300static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
304{ 301{
305 assert_spin_locked(&inode_wb_list_lock); 302 assert_spin_locked(&wb->list_lock);
306 list_splice_init(&wb->b_more_io, &wb->b_io); 303 list_splice_init(&wb->b_more_io, &wb->b_io);
307 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 304 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
308} 305}
@@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
317/* 314/*
318 * Wait for writeback on an inode to complete. 315 * Wait for writeback on an inode to complete.
319 */ 316 */
320static void inode_wait_for_writeback(struct inode *inode) 317static void inode_wait_for_writeback(struct inode *inode,
318 struct bdi_writeback *wb)
321{ 319{
322 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); 320 DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
323 wait_queue_head_t *wqh; 321 wait_queue_head_t *wqh;
@@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode)
325 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 323 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
326 while (inode->i_state & I_SYNC) { 324 while (inode->i_state & I_SYNC) {
327 spin_unlock(&inode->i_lock); 325 spin_unlock(&inode->i_lock);
328 spin_unlock(&inode_wb_list_lock); 326 spin_unlock(&wb->list_lock);
329 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 327 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
330 spin_lock(&inode_wb_list_lock); 328 spin_lock(&wb->list_lock);
331 spin_lock(&inode->i_lock); 329 spin_lock(&inode->i_lock);
332 } 330 }
333} 331}
334 332
335/* 333/*
336 * Write out an inode's dirty pages. Called under inode_wb_list_lock and 334 * Write out an inode's dirty pages. Called under wb->list_lock and
337 * inode->i_lock. Either the caller has an active reference on the inode or 335 * inode->i_lock. Either the caller has an active reference on the inode or
338 * the inode has I_WILL_FREE set. 336 * the inode has I_WILL_FREE set.
339 * 337 *
@@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode)
344 * livelocks, etc. 342 * livelocks, etc.
345 */ 343 */
346static int 344static int
347writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 345writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
346 struct writeback_control *wbc)
348{ 347{
349 struct address_space *mapping = inode->i_mapping; 348 struct address_space *mapping = inode->i_mapping;
350 unsigned dirty; 349 unsigned dirty;
351 int ret; 350 int ret;
352 351
353 assert_spin_locked(&inode_wb_list_lock); 352 assert_spin_locked(&wb->list_lock);
354 assert_spin_locked(&inode->i_lock); 353 assert_spin_locked(&inode->i_lock);
355 354
356 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
@@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
368 * completed a full scan of b_io. 367 * completed a full scan of b_io.
369 */ 368 */
370 if (wbc->sync_mode != WB_SYNC_ALL) { 369 if (wbc->sync_mode != WB_SYNC_ALL) {
371 requeue_io(inode); 370 requeue_io(inode, wb);
372 return 0; 371 return 0;
373 } 372 }
374 373
375 /* 374 /*
376 * It's a data-integrity sync. We must wait. 375 * It's a data-integrity sync. We must wait.
377 */ 376 */
378 inode_wait_for_writeback(inode); 377 inode_wait_for_writeback(inode, wb);
379 } 378 }
380 379
381 BUG_ON(inode->i_state & I_SYNC); 380 BUG_ON(inode->i_state & I_SYNC);
@@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
384 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
385 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
386 spin_unlock(&inode->i_lock); 385 spin_unlock(&inode->i_lock);
387 spin_unlock(&inode_wb_list_lock); 386 spin_unlock(&wb->list_lock);
388 387
389 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
390 389
@@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
415 ret = err; 414 ret = err;
416 } 415 }
417 416
418 spin_lock(&inode_wb_list_lock); 417 spin_lock(&wb->list_lock);
419 spin_lock(&inode->i_lock); 418 spin_lock(&inode->i_lock);
420 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
421 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
@@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
438 /* 437 /*
439 * slice used up: queue for next turn 438 * slice used up: queue for next turn
440 */ 439 */
441 requeue_io(inode); 440 requeue_io(inode, wb);
442 } else { 441 } else {
443 /* 442 /*
444 * Writeback blocked by something other than 443 * Writeback blocked by something other than
@@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
447 * retrying writeback of the dirty page/inode 446 * retrying writeback of the dirty page/inode
448 * that cannot be performed immediately. 447 * that cannot be performed immediately.
449 */ 448 */
450 redirty_tail(inode); 449 redirty_tail(inode, wb);
451 } 450 }
452 } else if (inode->i_state & I_DIRTY) { 451 } else if (inode->i_state & I_DIRTY) {
453 /* 452 /*
@@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
456 * submission or metadata updates after data IO 455 * submission or metadata updates after data IO
457 * completion. 456 * completion.
458 */ 457 */
459 redirty_tail(inode); 458 redirty_tail(inode, wb);
460 } else { 459 } else {
461 /* 460 /*
462 * The inode is clean. At this point we either have 461 * The inode is clean. At this point we either have
@@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
521 * superblock, move all inodes not belonging 520 * superblock, move all inodes not belonging
522 * to it back onto the dirty list. 521 * to it back onto the dirty list.
523 */ 522 */
524 redirty_tail(inode); 523 redirty_tail(inode, wb);
525 continue; 524 continue;
526 } 525 }
527 526
@@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
541 spin_lock(&inode->i_lock); 540 spin_lock(&inode->i_lock);
542 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 541 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
543 spin_unlock(&inode->i_lock); 542 spin_unlock(&inode->i_lock);
544 requeue_io(inode); 543 requeue_io(inode, wb);
545 continue; 544 continue;
546 } 545 }
547 546
@@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
557 __iget(inode); 556 __iget(inode);
558 557
559 pages_skipped = wbc->pages_skipped; 558 pages_skipped = wbc->pages_skipped;
560 writeback_single_inode(inode, wbc); 559 writeback_single_inode(inode, wb, wbc);
561 if (wbc->pages_skipped != pages_skipped) { 560 if (wbc->pages_skipped != pages_skipped) {
562 /* 561 /*
563 * writeback is not making progress due to locked 562 * writeback is not making progress due to locked
564 * buffers. Skip this inode for now. 563 * buffers. Skip this inode for now.
565 */ 564 */
566 redirty_tail(inode); 565 redirty_tail(inode, wb);
567 } 566 }
568 spin_unlock(&inode->i_lock); 567 spin_unlock(&inode->i_lock);
569 spin_unlock(&inode_wb_list_lock); 568 spin_unlock(&wb->list_lock);
570 iput(inode); 569 iput(inode);
571 cond_resched(); 570 cond_resched();
572 spin_lock(&inode_wb_list_lock); 571 spin_lock(&wb->list_lock);
573 if (wbc->nr_to_write <= 0) { 572 if (wbc->nr_to_write <= 0) {
574 wbc->more_io = 1; 573 wbc->more_io = 1;
575 return 1; 574 return 1;
@@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
588 587
589 if (!wbc->wb_start) 588 if (!wbc->wb_start)
590 wbc->wb_start = jiffies; /* livelock avoidance */ 589 wbc->wb_start = jiffies; /* livelock avoidance */
591 spin_lock(&inode_wb_list_lock); 590 spin_lock(&wb->list_lock);
592 591
593 if (list_empty(&wb->b_io)) 592 if (list_empty(&wb->b_io))
594 queue_io(wb, wbc->older_than_this); 593 queue_io(wb, wbc->older_than_this);
@@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
598 struct super_block *sb = inode->i_sb; 597 struct super_block *sb = inode->i_sb;
599 598
600 if (!pin_sb_for_writeback(sb)) { 599 if (!pin_sb_for_writeback(sb)) {
601 requeue_io(inode); 600 requeue_io(inode, wb);
602 continue; 601 continue;
603 } 602 }
604 ret = writeback_sb_inodes(sb, wb, wbc, false); 603 ret = writeback_sb_inodes(sb, wb, wbc, false);
@@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
607 if (ret) 606 if (ret)
608 break; 607 break;
609 } 608 }
610 spin_unlock(&inode_wb_list_lock); 609 spin_unlock(&wb->list_lock);
611 /* Leave any unwritten inodes on b_io */ 610 /* Leave any unwritten inodes on b_io */
612} 611}
613 612
@@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
616{ 615{
617 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 616 WARN_ON(!rwsem_is_locked(&sb->s_umount));
618 617
619 spin_lock(&inode_wb_list_lock); 618 spin_lock(&wb->list_lock);
620 if (list_empty(&wb->b_io)) 619 if (list_empty(&wb->b_io))
621 queue_io(wb, wbc->older_than_this); 620 queue_io(wb, wbc->older_than_this);
622 writeback_sb_inodes(sb, wb, wbc, true); 621 writeback_sb_inodes(sb, wb, wbc, true);
623 spin_unlock(&inode_wb_list_lock); 622 spin_unlock(&wb->list_lock);
624} 623}
625 624
626/* 625/*
@@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb,
762 * become available for writeback. Otherwise 761 * become available for writeback. Otherwise
763 * we'll just busyloop. 762 * we'll just busyloop.
764 */ 763 */
765 spin_lock(&inode_wb_list_lock); 764 spin_lock(&wb->list_lock);
766 if (!list_empty(&wb->b_more_io)) { 765 if (!list_empty(&wb->b_more_io)) {
767 inode = wb_inode(wb->b_more_io.prev); 766 inode = wb_inode(wb->b_more_io.prev);
768 trace_wbc_writeback_wait(&wbc, wb->bdi); 767 trace_wbc_writeback_wait(&wbc, wb->bdi);
769 spin_lock(&inode->i_lock); 768 spin_lock(&inode->i_lock);
770 inode_wait_for_writeback(inode); 769 inode_wait_for_writeback(inode, wb);
771 spin_unlock(&inode->i_lock); 770 spin_unlock(&inode->i_lock);
772 } 771 }
773 spin_unlock(&inode_wb_list_lock); 772 spin_unlock(&wb->list_lock);
774 } 773 }
775 774
776 return wrote; 775 return wrote;
@@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1104 } 1103 }
1105 1104
1106 spin_unlock(&inode->i_lock); 1105 spin_unlock(&inode->i_lock);
1107 spin_lock(&inode_wb_list_lock); 1106 spin_lock(&bdi->wb.list_lock);
1108 inode->dirtied_when = jiffies; 1107 inode->dirtied_when = jiffies;
1109 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1108 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1110 spin_unlock(&inode_wb_list_lock); 1109 spin_unlock(&bdi->wb.list_lock);
1111 1110
1112 if (wakeup_bdi) 1111 if (wakeup_bdi)
1113 bdi_wakeup_thread_delayed(bdi); 1112 bdi_wakeup_thread_delayed(bdi);
@@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
1309 */ 1308 */
1310int write_inode_now(struct inode *inode, int sync) 1309int write_inode_now(struct inode *inode, int sync)
1311{ 1310{
1311 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1312 int ret; 1312 int ret;
1313 struct writeback_control wbc = { 1313 struct writeback_control wbc = {
1314 .nr_to_write = LONG_MAX, 1314 .nr_to_write = LONG_MAX,
@@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
1321 wbc.nr_to_write = 0; 1321 wbc.nr_to_write = 0;
1322 1322
1323 might_sleep(); 1323 might_sleep();
1324 spin_lock(&inode_wb_list_lock); 1324 spin_lock(&wb->list_lock);
1325 spin_lock(&inode->i_lock); 1325 spin_lock(&inode->i_lock);
1326 ret = writeback_single_inode(inode, &wbc); 1326 ret = writeback_single_inode(inode, wb, &wbc);
1327 spin_unlock(&inode->i_lock); 1327 spin_unlock(&inode->i_lock);
1328 spin_unlock(&inode_wb_list_lock); 1328 spin_unlock(&wb->list_lock);
1329 if (sync) 1329 if (sync)
1330 inode_sync_wait(inode); 1330 inode_sync_wait(inode);
1331 return ret; 1331 return ret;
@@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
1345 */ 1345 */
1346int sync_inode(struct inode *inode, struct writeback_control *wbc) 1346int sync_inode(struct inode *inode, struct writeback_control *wbc)
1347{ 1347{
1348 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1348 int ret; 1349 int ret;
1349 1350
1350 spin_lock(&inode_wb_list_lock); 1351 spin_lock(&wb->list_lock);
1351 spin_lock(&inode->i_lock); 1352 spin_lock(&inode->i_lock);
1352 ret = writeback_single_inode(inode, wbc); 1353 ret = writeback_single_inode(inode, wb, wbc);
1353 spin_unlock(&inode->i_lock); 1354 spin_unlock(&inode->i_lock);
1354 spin_unlock(&inode_wb_list_lock); 1355 spin_unlock(&wb->list_lock);
1355 return ret; 1356 return ret;
1356} 1357}
1357EXPORT_SYMBOL(sync_inode); 1358EXPORT_SYMBOL(sync_inode);