aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jens Axboe <jens.axboe@oracle.com> 2008-07-01 03:07:34 -0400
committer Jens Axboe <jens.axboe@oracle.com> 2008-07-01 03:07:34 -0400
commit 18ce3751ccd488c78d3827e9f6bf54e6322676fb (patch)
tree 4bb83c2b963e8ebe918b79f61d1a440fb1d28f8d
parent d585d0b9d73ed999cc7b8cf3cac4a5b01abb544e (diff)
Properly notify block layer of sync writes
fsync_buffers_list() and sync_dirty_buffer() both issue async writes and then immediately wait on them. Conceptually, that makes them sync writes and we should treat them as such so that the IO schedulers can handle them appropriately.

This patch fixes a write starvation issue that Lin Ming reported, where kjournald is stuck for more than 2 minutes because of a large amount of synchronous IO in the system:

INFO: task kjournald:20558 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kjournald D ffff810010820978 6712 20558 2
ffff81022ddb1d10 0000000000000046 ffff81022e7baa10 ffffffff803ba6f2
ffff81022ecd0000 ffff8101e6dc9160 ffff81022ecd0348 000000008048b6cb
0000000000000086 ffff81022c4e8d30 0000000000000000 ffffffff80247537
Call Trace:
[<ffffffff803ba6f2>] kobject_get+0x12/0x17
[<ffffffff80247537>] getnstimeofday+0x2f/0x83
[<ffffffff8029c1ac>] sync_buffer+0x0/0x3f
[<ffffffff8066d195>] io_schedule+0x5d/0x9f
[<ffffffff8029c1e7>] sync_buffer+0x3b/0x3f
[<ffffffff8066d3f0>] __wait_on_bit+0x40/0x6f
[<ffffffff8029c1ac>] sync_buffer+0x0/0x3f
[<ffffffff8066d48b>] out_of_line_wait_on_bit+0x6c/0x78
[<ffffffff80243909>] wake_bit_function+0x0/0x23
[<ffffffff8029e3ad>] sync_dirty_buffer+0x98/0xcb
[<ffffffff8030056b>] journal_commit_transaction+0x97d/0xcb6
[<ffffffff8023a676>] lock_timer_base+0x26/0x4b
[<ffffffff8030300a>] kjournald+0xc1/0x1fb
[<ffffffff802438db>] autoremove_wake_function+0x0/0x2e
[<ffffffff80302f49>] kjournald+0x0/0x1fb
[<ffffffff802437bb>] kthread+0x47/0x74
[<ffffffff8022de51>] schedule_tail+0x28/0x5d
[<ffffffff8020cac8>] child_rip+0xa/0x12
[<ffffffff80243774>] kthread+0x0/0x74
[<ffffffff8020cabe>] child_rip+0x0/0x12

Lin Ming confirms that this patch fixes the issue. I've run tests with it for the past week and no ill effects have been observed, so I'm proposing it for inclusion into 2.6.26.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
-rw-r--r-- fs/buffer.c 13
-rw-r--r-- include/linux/fs.h 1
2 files changed, 9 insertions, 5 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index a073f3f4f013..0f51c0f7c266 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
821 * contents - it is a noop if I/O is still in 821 * contents - it is a noop if I/O is still in
822 * flight on potentially older contents. 822 * flight on potentially older contents.
823 */ 823 */
824 ll_rw_block(SWRITE, 1, &bh); 824 ll_rw_block(SWRITE_SYNC, 1, &bh);
825 brelse(bh); 825 brelse(bh);
826 spin_lock(lock); 826 spin_lock(lock);
827 } 827 }
@@ -2940,16 +2940,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2940 for (i = 0; i < nr; i++) { 2940 for (i = 0; i < nr; i++) {
2941 struct buffer_head *bh = bhs[i]; 2941 struct buffer_head *bh = bhs[i];
2942 2942
2943 if (rw == SWRITE) 2943 if (rw == SWRITE || rw == SWRITE_SYNC)
2944 lock_buffer(bh); 2944 lock_buffer(bh);
2945 else if (test_set_buffer_locked(bh)) 2945 else if (test_set_buffer_locked(bh))
2946 continue; 2946 continue;
2947 2947
2948 if (rw == WRITE || rw == SWRITE) { 2948 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
2949 if (test_clear_buffer_dirty(bh)) { 2949 if (test_clear_buffer_dirty(bh)) {
2950 bh->b_end_io = end_buffer_write_sync; 2950 bh->b_end_io = end_buffer_write_sync;
2951 get_bh(bh); 2951 get_bh(bh);
2952 submit_bh(WRITE, bh); 2952 if (rw == SWRITE_SYNC)
2953 submit_bh(WRITE_SYNC, bh);
2954 else
2955 submit_bh(WRITE, bh);
2953 continue; 2956 continue;
2954 } 2957 }
2955 } else { 2958 } else {
@@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
2978 if (test_clear_buffer_dirty(bh)) { 2981 if (test_clear_buffer_dirty(bh)) {
2979 get_bh(bh); 2982 get_bh(bh);
2980 bh->b_end_io = end_buffer_write_sync; 2983 bh->b_end_io = end_buffer_write_sync;
2981 ret = submit_bh(WRITE, bh); 2984 ret = submit_bh(WRITE_SYNC, bh);
2982 wait_on_buffer(bh); 2985 wait_on_buffer(bh);
2983 if (buffer_eopnotsupp(bh)) { 2986 if (buffer_eopnotsupp(bh)) {
2984 clear_buffer_eopnotsupp(bh); 2987 clear_buffer_eopnotsupp(bh);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7c1080826832..d8e2762ed14d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -83,6 +83,7 @@ extern int dir_notify_enable;
83#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) 83#define READ_SYNC (READ | (1 << BIO_RW_SYNC))
84#define READ_META (READ | (1 << BIO_RW_META)) 84#define READ_META (READ | (1 << BIO_RW_META))
85#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) 85#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
86#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC))
86#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) 87#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
87 88
88#define SEL_IN 1 89#define SEL_IN 1