ext4: handle writeback of inodes which are being freed

The following BUG can occur when an inode is being freed while it still has dirty pages outstanding and it gets deleted (in this case, because it was the target of a rename). In ordered mode, we need to make sure the data pages are written just in case we crash before the rename (or unlink) is committed. If the inode is being freed, then when we try to igrab the inode we end up tripping the BUG_ON at fs/ext4/page-io.c:146.

To solve this problem, we need to keep track of the number of io callbacks which are pending, and avoid destroying the inode until they have all been completed. That way we don't have to bump the inode count to keep the inode from being destroyed -- an approach which doesn't work because the count could have already been dropped down to zero before the inode writeback has started (at which point we're not allowed to bump the count back up to 1, since it's already started getting freed).

Thanks to Dave Chinner for suggesting this approach, which is also used by XFS.

kernel BUG at /scratch_space/linux-2.6/fs/ext4/page-io.c:146!
Call Trace:
 [<ffffffff811075b1>] ext4_bio_write_page+0x172/0x307
 [<ffffffff811033a7>] mpage_da_submit_io+0x2f9/0x37b
 [<ffffffff811068d7>] mpage_da_map_and_submit+0x2cc/0x2e2
 [<ffffffff811069b3>] mpage_add_bh_to_extent+0xc6/0xd5
 [<ffffffff81106c66>] write_cache_pages_da+0x2a4/0x3ac
 [<ffffffff81107044>] ext4_da_writepages+0x2d6/0x44d
 [<ffffffff81087910>] do_writepages+0x1c/0x25
 [<ffffffff810810a4>] __filemap_fdatawrite_range+0x4b/0x4d
 [<ffffffff810815f5>] filemap_fdatawrite_range+0xe/0x10
 [<ffffffff81122a2e>] jbd2_journal_begin_ordered_truncate+0x7b/0xa2
 [<ffffffff8110615d>] ext4_evict_inode+0x57/0x24c
 [<ffffffff810c14a3>] evict+0x22/0x92
 [<ffffffff810c1a3d>] iput+0x212/0x249
 [<ffffffff810bdf16>] dentry_iput+0xa1/0xb9
 [<ffffffff810bdf6b>] d_kill+0x3d/0x5d
 [<ffffffff810be613>] dput+0x13a/0x147
 [<ffffffff810b990d>] sys_renameat+0x1b5/0x258
 [<ffffffff81145f71>] ? _atomic_dec_and_lock+0x2d/0x4c
 [<ffffffff810b2950>] ? cp_new_stat+0xde/0xea
 [<ffffffff810b29c1>] ? sys_newlstat+0x2d/0x38
 [<ffffffff810b99c6>] sys_rename+0x16/0x18
 [<ffffffff81002a2b>] system_call_fastpath+0x16/0x1b

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Tested-by: Nick Bowler <nbowler@elliptictech.com>
author: Theodore Ts'o <tytso@mit.edu> 2010-11-08 13:43:33 -0500
committer: Theodore Ts'o <tytso@mit.edu> 2010-11-08 13:43:33 -0500
commit: f7ad6d2e9201a6e1c9ee6530a291452eb695feb8 (patch)
tree: 006cdcfd487404fb61986e3030d96cb33866755d /fs/ext4/page-io.c
parent: ce7e010aef63dc6b37a2354f7c9f5f4aedb37978 (diff)
1 files changed, 34 insertions, 25 deletions
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..a24c8cca7370 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
 static struct kmem_cache *io_page_cachep, *io_end_cachep;
+#define WQ_HASH_SZ              37
+#define to_ioend_wq(v)  (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
 int __init ext4_init_pageio(void)
 {
+        int i;
        io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
        if (io_page_cachep == NULL)
                return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
                kmem_cache_destroy(io_page_cachep);
                return -ENOMEM;
        }
+        for (i = 0; i < WQ_HASH_SZ; i++)
+                init_waitqueue_head(&ioend_wq[i]);
        return 0;
 }
@@ -52,9 +60,17 @@ void ext4_exit_pageio(void)
        kmem_cache_destroy(io_page_cachep);
 }
+void ext4_ioend_wait(struct inode *inode)
+{
+        wait_queue_head_t *wq = to_ioend_wq(inode);
+        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
 void ext4_free_io_end(ext4_io_end_t *io)
 {
        int i;
+        wait_queue_head_t *wq;
        BUG_ON(!io);
        if (io->page)
@@ -69,7 +85,10 @@ void ext4_free_io_end(ext4_io_end_t *io)
                }
        }
        io->num_io_pages = 0;
-        iput(io->inode);
+        wq = to_ioend_wq(io->inode);
+        if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+            waitqueue_active(wq))
+                wake_up_all(wq);
        kmem_cache_free(io_end_cachep, io);
 }
@@ -142,8 +161,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
        io = kmem_cache_alloc(io_end_cachep, flags);
        if (io) {
                memset(io, 0, sizeof(*io));
-                io->inode = igrab(inode);
+                atomic_inc(&EXT4_I(inode)->i_ioend_count);
-                BUG_ON(!io->inode);
+                io->inode = inode;
                INIT_WORK(&io->work, ext4_end_io_work);
                INIT_LIST_HEAD(&io->list);
        }
@@ -171,35 +190,15 @@ static void ext4_end_bio(struct bio *bio, int error)
        struct workqueue_struct *wq;
        struct inode *inode;
        unsigned long flags;
-        ext4_fsblk_t err_block;
        int i;
        BUG_ON(!io_end);
-        inode = io_end->inode;
        bio->bi_private = NULL;
        bio->bi_end_io = NULL;
        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = 0;
-        err_block = bio->bi_sector >> (inode->i_blkbits - 9);
        bio_put(bio);
-        if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
-                pr_err("sb umounted, discard end_io request for inode %lu\n",
-                        io_end->inode->i_ino);
-                ext4_free_io_end(io_end);
-                return;
-        }
-        if (error) {
-                io_end->flag |= EXT4_IO_END_ERROR;
-                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
-                             "(offset %llu size %ld starting block %llu)",
-                             inode->i_ino,
-                             (unsigned long long) io_end->offset,
-                             (long) io_end->size,
-                             (unsigned long long) err_block);
-        }
        for (i = 0; i < io_end->num_io_pages; i++) {
                struct page *page = io_end->pages[i]->p_page;
                struct buffer_head *bh, *head;
@@ -254,8 +253,19 @@ static void ext4_end_bio(struct bio *bio, int error)
                if (!partial_write)
                        SetPageUptodate(page);
        }
        io_end->num_io_pages = 0;
+        inode = io_end->inode;
+        if (error) {
+                io_end->flag |= EXT4_IO_END_ERROR;
+                ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+                             "(offset %llu size %ld starting block %llu)",
+                             inode->i_ino,
+                             (unsigned long long) io_end->offset,
+                             (long) io_end->size,
+                             (unsigned long long)
+                             bio->bi_sector >> (inode->i_blkbits - 9));
+        }
        /* Add the io_end to per-inode completed io list*/
        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +315,6 @@ static int io_submit_init(struct ext4_io_submit *io,
        bio->bi_private = io->io_end = io_end;
        bio->bi_end_io = ext4_end_bio;
-        io_end->inode = inode;
        io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
        io->io_bio = bio;
author	Theodore Ts'o <tytso@mit.edu>	2010-11-08 13:43:33 -0500
committer	Theodore Ts'o <tytso@mit.edu>	2010-11-08 13:43:33 -0500
commit	f7ad6d2e9201a6e1c9ee6530a291452eb695feb8 (patch)
tree	006cdcfd487404fb61986e3030d96cb33866755d /fs/ext4/page-io.c
parent	ce7e010aef63dc6b37a2354f7c9f5f4aedb37978 (diff)