aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2010-11-08 13:43:33 -0500
committerTheodore Ts'o <tytso@mit.edu>2010-11-08 13:43:33 -0500
commitf7ad6d2e9201a6e1c9ee6530a291452eb695feb8 (patch)
tree006cdcfd487404fb61986e3030d96cb33866755d
parentce7e010aef63dc6b37a2354f7c9f5f4aedb37978 (diff)
ext4: handle writeback of inodes which are being freed
The following BUG can occur when an inode which is getting freed when it still has dirty pages outstanding, and it gets deleted (in this because it was the target of a rename). In ordered mode, we need to make sure the data pages are written just in case we crash before the rename (or unlink) is committed. If the inode is being freed then when we try to igrab the inode, we end up tripping the BUG_ON at fs/ext4/page-io.c:146. To solve this problem, we need to keep track of the number of io callbacks which are pending, and avoid destroying the inode until they have all been completed. That way we don't have to bump the inode count to keep the inode from being destroyed; an approach which doesn't work because the count could have already been dropped down to zero before the inode writeback has started (at which point we're not allowed to bump the count back up to 1, since it's already started getting freed). Thanks to Dave Chinner for suggesting this approach, which is also used by XFS. kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146! Call Trace: [<ffffffff811075b1>] ext4_bio_write_page+0x172/0x307 [<ffffffff811033a7>] mpage_da_submit_io+0x2f9/0x37b [<ffffffff811068d7>] mpage_da_map_and_submit+0x2cc/0x2e2 [<ffffffff811069b3>] mpage_add_bh_to_extent+0xc6/0xd5 [<ffffffff81106c66>] write_cache_pages_da+0x2a4/0x3ac [<ffffffff81107044>] ext4_da_writepages+0x2d6/0x44d [<ffffffff81087910>] do_writepages+0x1c/0x25 [<ffffffff810810a4>] __filemap_fdatawrite_range+0x4b/0x4d [<ffffffff810815f5>] filemap_fdatawrite_range+0xe/0x10 [<ffffffff81122a2e>] jbd2_journal_begin_ordered_truncate+0x7b/0xa2 [<ffffffff8110615d>] ext4_evict_inode+0x57/0x24c [<ffffffff810c14a3>] evict+0x22/0x92 [<ffffffff810c1a3d>] iput+0x212/0x249 [<ffffffff810bdf16>] dentry_iput+0xa1/0xb9 [<ffffffff810bdf6b>] d_kill+0x3d/0x5d [<ffffffff810be613>] dput+0x13a/0x147 [<ffffffff810b990d>] sys_renameat+0x1b5/0x258 [<ffffffff81145f71>] ? _atomic_dec_and_lock+0x2d/0x4c [<ffffffff810b2950>] ? cp_new_stat+0xde/0xea [<ffffffff810b29c1>] ? sys_newlstat+0x2d/0x38 [<ffffffff810b99c6>] sys_rename+0x16/0x18 [<ffffffff81002a2b>] system_call_fastpath+0x16/0x1b Reported-by: Nick Bowler <nbowler@elliptictech.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Tested-by: Nick Bowler <nbowler@elliptictech.com>
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/page-io.c59
-rw-r--r--fs/ext4/super.c2
3 files changed, 38 insertions, 25 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f82..670d1343f914 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -858,6 +858,7 @@ struct ext4_inode_info {
858 spinlock_t i_completed_io_lock; 858 spinlock_t i_completed_io_lock;
859 /* current io_end structure for async DIO write*/ 859 /* current io_end structure for async DIO write*/
860 ext4_io_end_t *cur_aio_dio; 860 ext4_io_end_t *cur_aio_dio;
861 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
861 862
862 /* 863 /*
863 * Transactions that contain inode's metadata needed to complete 864 * Transactions that contain inode's metadata needed to complete
@@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
2060/* page-io.c */ 2061/* page-io.c */
2061extern int __init ext4_init_pageio(void); 2062extern int __init ext4_init_pageio(void);
2062extern void ext4_exit_pageio(void); 2063extern void ext4_exit_pageio(void);
2064extern void ext4_ioend_wait(struct inode *);
2063extern void ext4_free_io_end(ext4_io_end_t *io); 2065extern void ext4_free_io_end(ext4_io_end_t *io);
2064extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); 2066extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
2065extern int ext4_end_io_nolock(ext4_io_end_t *io); 2067extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..a24c8cca7370 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
35int __init ext4_init_pageio(void) 39int __init ext4_init_pageio(void)
36{ 40{
41 int i;
42
37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
38 if (io_page_cachep == NULL) 44 if (io_page_cachep == NULL)
39 return -ENOMEM; 45 return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
42 kmem_cache_destroy(io_page_cachep); 48 kmem_cache_destroy(io_page_cachep);
43 return -ENOMEM; 49 return -ENOMEM;
44 } 50 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
45 53
46 return 0; 54 return 0;
47} 55}
@@ -52,9 +60,17 @@ void ext4_exit_pageio(void)
52 kmem_cache_destroy(io_page_cachep); 60 kmem_cache_destroy(io_page_cachep);
53} 61}
54 62
63void ext4_ioend_wait(struct inode *inode)
64{
65 wait_queue_head_t *wq = to_ioend_wq(inode);
66
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68}
69
55void ext4_free_io_end(ext4_io_end_t *io) 70void ext4_free_io_end(ext4_io_end_t *io)
56{ 71{
57 int i; 72 int i;
73 wait_queue_head_t *wq;
58 74
59 BUG_ON(!io); 75 BUG_ON(!io);
60 if (io->page) 76 if (io->page)
@@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io)
69 } 85 }
70 } 86 }
71 io->num_io_pages = 0; 87 io->num_io_pages = 0;
72 iput(io->inode); 88 wq = to_ioend_wq(io->inode);
89 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
90 waitqueue_active(wq))
91 wake_up_all(wq);
73 kmem_cache_free(io_end_cachep, io); 92 kmem_cache_free(io_end_cachep, io);
74} 93}
75 94
@@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
142 io = kmem_cache_alloc(io_end_cachep, flags); 161 io = kmem_cache_alloc(io_end_cachep, flags);
143 if (io) { 162 if (io) {
144 memset(io, 0, sizeof(*io)); 163 memset(io, 0, sizeof(*io));
145 io->inode = igrab(inode); 164 atomic_inc(&EXT4_I(inode)->i_ioend_count);
146 BUG_ON(!io->inode); 165 io->inode = inode;
147 INIT_WORK(&io->work, ext4_end_io_work); 166 INIT_WORK(&io->work, ext4_end_io_work);
148 INIT_LIST_HEAD(&io->list); 167 INIT_LIST_HEAD(&io->list);
149 } 168 }
@@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error)
171 struct workqueue_struct *wq; 190 struct workqueue_struct *wq;
172 struct inode *inode; 191 struct inode *inode;
173 unsigned long flags; 192 unsigned long flags;
174 ext4_fsblk_t err_block;
175 int i; 193 int i;
176 194
177 BUG_ON(!io_end); 195 BUG_ON(!io_end);
178 inode = io_end->inode;
179 bio->bi_private = NULL; 196 bio->bi_private = NULL;
180 bio->bi_end_io = NULL; 197 bio->bi_end_io = NULL;
181 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 198 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
182 error = 0; 199 error = 0;
183 err_block = bio->bi_sector >> (inode->i_blkbits - 9);
184 bio_put(bio); 200 bio_put(bio);
185 201
186 if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
187 pr_err("sb umounted, discard end_io request for inode %lu\n",
188 io_end->inode->i_ino);
189 ext4_free_io_end(io_end);
190 return;
191 }
192
193 if (error) {
194 io_end->flag |= EXT4_IO_END_ERROR;
195 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
196 "(offset %llu size %ld starting block %llu)",
197 inode->i_ino,
198 (unsigned long long) io_end->offset,
199 (long) io_end->size,
200 (unsigned long long) err_block);
201 }
202
203 for (i = 0; i < io_end->num_io_pages; i++) { 202 for (i = 0; i < io_end->num_io_pages; i++) {
204 struct page *page = io_end->pages[i]->p_page; 203 struct page *page = io_end->pages[i]->p_page;
205 struct buffer_head *bh, *head; 204 struct buffer_head *bh, *head;
@@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error)
254 if (!partial_write) 253 if (!partial_write)
255 SetPageUptodate(page); 254 SetPageUptodate(page);
256 } 255 }
257
258 io_end->num_io_pages = 0; 256 io_end->num_io_pages = 0;
257 inode = io_end->inode;
258
259 if (error) {
260 io_end->flag |= EXT4_IO_END_ERROR;
261 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
262 "(offset %llu size %ld starting block %llu)",
263 inode->i_ino,
264 (unsigned long long) io_end->offset,
265 (long) io_end->size,
266 (unsigned long long)
267 bio->bi_sector >> (inode->i_blkbits - 9));
268 }
259 269
260 /* Add the io_end to per-inode completed io list*/ 270 /* Add the io_end to per-inode completed io list*/
261 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); 271 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io,
305 bio->bi_private = io->io_end = io_end; 315 bio->bi_private = io->io_end = io_end;
306 bio->bi_end_io = ext4_end_bio; 316 bio->bi_end_io = ext4_end_bio;
307 317
308 io_end->inode = inode;
309 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 318 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
310 319
311 io->io_bio = bio; 320 io->io_bio = bio;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 04352e9729d0..45653af88953 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
828 ei->cur_aio_dio = NULL; 828 ei->cur_aio_dio = NULL;
829 ei->i_sync_tid = 0; 829 ei->i_sync_tid = 0;
830 ei->i_datasync_tid = 0; 830 ei->i_datasync_tid = 0;
831 atomic_set(&ei->i_ioend_count, 0);
831 832
832 return &ei->vfs_inode; 833 return &ei->vfs_inode;
833} 834}
834 835
835static void ext4_destroy_inode(struct inode *inode) 836static void ext4_destroy_inode(struct inode *inode)
836{ 837{
838 ext4_ioend_wait(inode);
837 if (!list_empty(&(EXT4_I(inode)->i_orphan))) { 839 if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
838 ext4_msg(inode->i_sb, KERN_ERR, 840 ext4_msg(inode->i_sb, KERN_ERR,
839 "Inode %lu (%p): orphan list check failed!", 841 "Inode %lu (%p): orphan list check failed!",