diff options
author | Christoph Hellwig <hch@infradead.org> | 2011-04-21 20:19:44 -0400 |
---|---|---|
committer | Wu Fengguang <fengguang.wu@intel.com> | 2011-06-07 20:25:21 -0400 |
commit | f758eeabeb96f878c860e8f110f94ec8820822a9 (patch) | |
tree | fea5a465aa0aa38c6c9263eb264acbeb7f722c02 /mm/backing-dev.c | |
parent | 424b351fe1901fc909fd0ca4f21dab58f24c1aac (diff) |
writeback: split inode_wb_list_lock into bdi_writeback.list_lock
Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads. It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.
Based on earlier patches from Nick Piggin and Dave Chinner.
It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram
lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
inode_wb_list_lock: 42590 44433 0.12 147.74 144127.35 252274 886792 0.08 121.34 917211.23
------------------
inode_wb_list_lock 2 [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
inode_wb_list_lock 34 [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
inode_wb_list_lock 12893 [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
inode_wb_list_lock 10702 [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
------------------
inode_wb_list_lock 2 [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
inode_wb_list_lock 19 [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
inode_wb_list_lock 5550 [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
inode_wb_list_lock 8511 [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157
2.6.39-rc3 + patch:
&(&wb->list_lock)->rlock: 11383 11657 0.14 151.69 40429.51 90825 527918 0.11 145.90 556843.37
------------------------
&(&wb->list_lock)->rlock 10 [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
&(&wb->list_lock)->rlock 1493 [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
&(&wb->list_lock)->rlock 3652 [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
&(&wb->list_lock)->rlock 1412 [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
------------------------
&(&wb->list_lock)->rlock 3 [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
&(&wb->list_lock)->rlock 6 [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
&(&wb->list_lock)->rlock 2061 [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
&(&wb->list_lock)->rlock 2629 [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Diffstat (limited to 'mm/backing-dev.c')
-rw-r--r-- | mm/backing-dev.c | 21 |
1 files changed, 17 insertions, 4 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f032e6e1e09a..5f6553ef1ba7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
@@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
@@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
628 | INIT_LIST_HEAD(&wb->b_dirty); | 639 | INIT_LIST_HEAD(&wb->b_dirty); |
629 | INIT_LIST_HEAD(&wb->b_io); | 640 | INIT_LIST_HEAD(&wb->b_io); |
630 | INIT_LIST_HEAD(&wb->b_more_io); | 641 | INIT_LIST_HEAD(&wb->b_more_io); |
642 | spin_lock_init(&wb->list_lock); | ||
631 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 643 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
632 | } | 644 | } |
633 | 645 | ||
@@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
676 | if (bdi_has_dirty_io(bdi)) { | 688 | if (bdi_has_dirty_io(bdi)) { |
677 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 689 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
678 | 690 | ||
679 | spin_lock(&inode_wb_list_lock); | 691 | bdi_lock_two(&bdi->wb, dst); |
680 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 692 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
681 | list_splice(&bdi->wb.b_io, &dst->b_io); | 693 | list_splice(&bdi->wb.b_io, &dst->b_io); |
682 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 694 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
683 | spin_unlock(&inode_wb_list_lock); | 695 | spin_unlock(&bdi->wb.list_lock); |
696 | spin_unlock(&dst->list_lock); | ||
684 | } | 697 | } |
685 | 698 | ||
686 | bdi_unregister(bdi); | 699 | bdi_unregister(bdi); |