author    Theodore Ts'o <tytso@mit.edu>  2010-11-08 13:43:33 -0500
committer Theodore Ts'o <tytso@mit.edu>  2010-11-08 13:43:33 -0500
commit    f7ad6d2e9201a6e1c9ee6530a291452eb695feb8
tree      006cdcfd487404fb61986e3030d96cb33866755d  /fs/ext4/page-io.c
parent    ce7e010aef63dc6b37a2354f7c9f5f4aedb37978
ext4: handle writeback of inodes which are being freed
The following BUG can occur when an inode is deleted while it still
has dirty pages outstanding (in this case because it was the target of
a rename). In ordered mode, we need to make sure the data pages are
written out just in case we crash before the rename (or unlink) is
committed. If the inode is being freed, then when we try to igrab the
inode we end up tripping the BUG_ON at fs/ext4/page-io.c:146.
To solve this problem, we need to keep track of the number of I/O
callbacks which are pending for an inode, and avoid destroying the
inode until they have all completed. That way we don't have to bump
the inode reference count to keep the inode from being destroyed; that
approach doesn't work anyway, because the count could already have
dropped to zero before the inode writeback started, at which point
we're not allowed to bump the count back up to 1, since the inode has
already started getting freed.
Thanks to Dave Chinner for suggesting this approach, which is also
used by XFS.
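The scheme is easiest to see stripped of the kernel plumbing. Below is a
minimal userspace sketch of the same pattern, not code from the patch:
fake_inode and the ioend_get/put/wait helpers are names invented for
illustration, and pthread primitives stand in for the kernel's atomic_t
and wait queues.

#include <pthread.h>

struct fake_inode {
	int             ioend_count;   /* pending end-of-I/O callbacks */
	pthread_mutex_t lock;
	pthread_cond_t  drained;
};

/* Set up the counter and synchronization objects. */
static void ioend_init(struct fake_inode *inode)
{
	inode->ioend_count = 0;
	pthread_mutex_init(&inode->lock, NULL);
	pthread_cond_init(&inode->drained, NULL);
}

/* Submission path: account for one more outstanding callback. */
static void ioend_get(struct fake_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	inode->ioend_count++;
	pthread_mutex_unlock(&inode->lock);
}

/* Completion path: drop the count; wake the evictor on the last one. */
static void ioend_put(struct fake_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	if (--inode->ioend_count == 0)
		pthread_cond_broadcast(&inode->drained);
	pthread_mutex_unlock(&inode->lock);
}

/* Eviction path: block until every pending callback has completed. */
static void ioend_wait(struct fake_inode *inode)
{
	pthread_mutex_lock(&inode->lock);
	while (inode->ioend_count != 0)
		pthread_cond_wait(&inode->drained, &inode->lock);
	pthread_mutex_unlock(&inode->lock);
}

One design note on the real patch below: rather than embedding a wait
queue in each inode, inodes hash onto a small fixed array of 37 wait
queues (WQ_HASH_SZ), so unrelated inodes can share a queue. That
sharing is why the completion side uses wake_up_all() and why the
waiter re-checks i_ioend_count via wait_event() rather than assuming a
wakeup means its own count hit zero.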
kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146!
Call Trace:
[<ffffffff811075b1>] ext4_bio_write_page+0x172/0x307
[<ffffffff811033a7>] mpage_da_submit_io+0x2f9/0x37b
[<ffffffff811068d7>] mpage_da_map_and_submit+0x2cc/0x2e2
[<ffffffff811069b3>] mpage_add_bh_to_extent+0xc6/0xd5
[<ffffffff81106c66>] write_cache_pages_da+0x2a4/0x3ac
[<ffffffff81107044>] ext4_da_writepages+0x2d6/0x44d
[<ffffffff81087910>] do_writepages+0x1c/0x25
[<ffffffff810810a4>] __filemap_fdatawrite_range+0x4b/0x4d
[<ffffffff810815f5>] filemap_fdatawrite_range+0xe/0x10
[<ffffffff81122a2e>] jbd2_journal_begin_ordered_truncate+0x7b/0xa2
[<ffffffff8110615d>] ext4_evict_inode+0x57/0x24c
[<ffffffff810c14a3>] evict+0x22/0x92
[<ffffffff810c1a3d>] iput+0x212/0x249
[<ffffffff810bdf16>] dentry_iput+0xa1/0xb9
[<ffffffff810bdf6b>] d_kill+0x3d/0x5d
[<ffffffff810be613>] dput+0x13a/0x147
[<ffffffff810b990d>] sys_renameat+0x1b5/0x258
[<ffffffff81145f71>] ? _atomic_dec_and_lock+0x2d/0x4c
[<ffffffff810b2950>] ? cp_new_stat+0xde/0xea
[<ffffffff810b29c1>] ? sys_newlstat+0x2d/0x38
[<ffffffff810b99c6>] sys_rename+0x16/0x18
[<ffffffff81002a2b>] system_call_fastpath+0x16/0x1b
Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Nick Bowler <nbowler@elliptictech.com>
Diffstat (limited to 'fs/ext4/page-io.c')
 fs/ext4/page-io.c | 59 ++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 34 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..a24c8cca7370 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
 
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
 
+#define WQ_HASH_SZ		37
+#define to_ioend_wq(v)	(&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
 int __init ext4_init_pageio(void)
 {
+	int i;
+
 	io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
 	if (io_page_cachep == NULL)
 		return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
 		kmem_cache_destroy(io_page_cachep);
 		return -ENOMEM;
 	}
+	for (i = 0; i < WQ_HASH_SZ; i++)
+		init_waitqueue_head(&ioend_wq[i]);
 
 	return 0;
 }
@@ -52,9 +60,17 @@ void ext4_exit_pageio(void)
 	kmem_cache_destroy(io_page_cachep);
 }
 
+void ext4_ioend_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = to_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
 void ext4_free_io_end(ext4_io_end_t *io)
 {
 	int i;
+	wait_queue_head_t *wq;
 
 	BUG_ON(!io);
 	if (io->page)
@@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io)
 		}
 	}
 	io->num_io_pages = 0;
-	iput(io->inode);
+	wq = to_ioend_wq(io->inode);
+	if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+	    waitqueue_active(wq))
+		wake_up_all(wq);
 	kmem_cache_free(io_end_cachep, io);
 }
 
@@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
 	io = kmem_cache_alloc(io_end_cachep, flags);
 	if (io) {
 		memset(io, 0, sizeof(*io));
-		io->inode = igrab(inode);
-		BUG_ON(!io->inode);
+		atomic_inc(&EXT4_I(inode)->i_ioend_count);
+		io->inode = inode;
 		INIT_WORK(&io->work, ext4_end_io_work);
 		INIT_LIST_HEAD(&io->list);
 	}
@@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error)
 	struct workqueue_struct *wq;
 	struct inode *inode;
 	unsigned long flags;
-	ext4_fsblk_t err_block;
 	int i;
 
 	BUG_ON(!io_end);
-	inode = io_end->inode;
 	bio->bi_private = NULL;
 	bio->bi_end_io = NULL;
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = 0;
-	err_block = bio->bi_sector >> (inode->i_blkbits - 9);
 	bio_put(bio);
 
-	if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-		pr_err("sb umounted, discard end_io request for inode %lu\n",
-			io_end->inode->i_ino);
-		ext4_free_io_end(io_end);
-		return;
-	}
-
-	if (error) {
-		io_end->flag |= EXT4_IO_END_ERROR;
-		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-			     "(offset %llu size %ld starting block %llu)",
-			     inode->i_ino,
-			     (unsigned long long) io_end->offset,
-			     (long) io_end->size,
-			     (unsigned long long) err_block);
-	}
-
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
@@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error)
 		if (!partial_write)
 			SetPageUptodate(page);
 	}
-
 	io_end->num_io_pages = 0;
+	inode = io_end->inode;
+
+	if (error) {
+		io_end->flag |= EXT4_IO_END_ERROR;
+		ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+			     "(offset %llu size %ld starting block %llu)",
+			     inode->i_ino,
+			     (unsigned long long) io_end->offset,
+			     (long) io_end->size,
+			     (unsigned long long)
+			     bio->bi_sector >> (inode->i_blkbits - 9));
+	}
 
 	/* Add the io_end to per-inode completed io list*/
 	spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io,
 	bio->bi_private = io->io_end = io_end;
 	bio->bi_end_io = ext4_end_bio;
 
-	io_end->inode = inode;
 	io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
 
 	io->io_bio = bio;
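Note that the waiter side of the new ext4_ioend_wait() API is not
visible above, because the diffstat is limited to fs/ext4/page-io.c;
its call sites live in the companion hunks of this patch. Roughly, the
inode teardown path (visible in the backtrace as ext4_evict_inode)
drains pending io_ends before the inode is destroyed. A sketch of that
call site, abbreviated since it lies outside this filtered view:

void ext4_evict_inode(struct inode *inode)
{
	/* Drain all pending io_end callbacks for this inode before it
	 * is torn down; pairs with the atomic_inc() in
	 * ext4_init_io_end() and the wake-up in ext4_free_io_end(). */
	ext4_ioend_wait(inode);
	...
}

The companion hunks also initialize i_ioend_count to zero when the
in-core inode is allocated.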