author		Alex Tomas <alex@clusterfs.com>		2008-07-11 19:27:31 -0400
committer	Theodore Ts'o <tytso@mit.edu>		2008-07-11 19:27:31 -0400
commit		64769240bd07f446f83660bb143bb609d8ab4910 (patch)
tree		db8bd8dda988adab6b4f6087a9b59ce504612009 /fs/ext4
parent		29a814d2ee0e43c2980f33f91c1311ec06c0aa35 (diff)
ext4: Add delayed allocation support in data=writeback mode
Updated with fixes from Mingming Cao <cmm@us.ibm.com> to unlock and
release the page from the page cache if the delalloc write_begin failed,
and to properly handle preallocated blocks. Also added a fix to clear
buffer_delay in block_write_full_page() after allocating a delayed buffer.

Updated with fixes from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
to update i_disksize properly and to add bmap support for delayed
allocation.

Updated with a fix from Valerie Clement <valerie.clement@bull.net> to
avoid filesystem corruption when the filesystem is mounted with the
delalloc option and blocksize < pagesize.

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
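The heart of the patch is extent collection at writeback time: dirty buffers
left unallocated (BH_Delay) by write_begin are merged into one logical extent
and allocated with as few ->get_block() calls as possible. Below is a minimal
userspace sketch of that merge-or-flush loop (see mpage_add_bh_to_extent() and
mpage_da_map_blocks() in the diff); the extent struct, flush_extent() and
add_block() are illustrative stand-ins, not kernel interfaces.

/*
 * Toy model of the delalloc extent collection: logically contiguous
 * blocks in the same state are merged into one extent; on a mismatch
 * the current extent is "allocated" and a new one is started.
 */
#include <stdio.h>

struct extent {
        unsigned long start;    /* first logical block */
        unsigned long len;      /* number of blocks, 0 == empty */
        int state;              /* stand-in for bh->b_state & BH_FLAGS */
};

static void flush_extent(struct extent *e)
{
        if (e->len)     /* one ->get_block() call would cover this range */
                printf("allocate %lu block(s) at logical %lu\n",
                       e->len, e->start);
}

static void add_block(struct extent *e, unsigned long logical, int state)
{
        if (e->len == 0) {                      /* first block in the extent */
                e->start = logical;
                e->len = 1;
                e->state = state;
        } else if (logical == e->start + e->len && state == e->state) {
                e->len++;                       /* contiguous, same state: merge */
        } else {
                flush_extent(e);                /* can't merge: flush, restart */
                e->start = logical;
                e->len = 1;
                e->state = state;
        }
}

int main(void)
{
        unsigned long dirty[] = { 10, 11, 12, 40, 41 };
        struct extent e = { 0, 0, 0 };
        unsigned int i;

        for (i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++)
                add_block(&e, dirty[i], 1);
        flush_extent(&e);       /* handle the last extent */
        return 0;
}

Run on the sample input this prints two extents (3 blocks at 10, 2 blocks at
40), i.e. two allocation calls instead of five, which is the point of
deferring allocation until writepages time.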
Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/ext4.h  |   1
-rw-r--r--  fs/ext4/inode.c | 699
-rw-r--r--  fs/ext4/super.c |   6
3 files changed, 700 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f65829bbe7aa..ee9576dc0ba1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,6 +536,7 @@ do { \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
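The new flag takes the next free bit in s_mount_opt; set_opt(), clear_opt()
and test_opt() reduce to plain bit operations on it. A standalone sketch with
the macros inlined (the variable name mirrors the kernel field, the program
itself is illustrative):

#include <stdio.h>

#define EXT4_MOUNT_MBALLOC	0x4000000	/* Buddy allocation support */
#define EXT4_MOUNT_DELALLOC	0x8000000	/* Delalloc support */

int main(void)
{
        unsigned long s_mount_opt = EXT4_MOUNT_MBALLOC;

        s_mount_opt |= EXT4_MOUNT_DELALLOC;     /* set_opt(..., DELALLOC) */
        printf("delalloc %s\n",
               (s_mount_opt & EXT4_MOUNT_DELALLOC) ? "on" : "off");
        s_mount_opt &= ~EXT4_MOUNT_DELALLOC;    /* clear_opt(..., DELALLOC) */
        printf("delalloc %s\n",
               (s_mount_opt & EXT4_MOUNT_DELALLOC) ? "on" : "off");
        return 0;
}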
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7b9569179fdf..2bef4f879e4b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
+#include <linux/pagevec.h>
 #include <linux/mpage.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
@@ -46,6 +47,8 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
 		new_size);
 }
 
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
+
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -1408,6 +1411,669 @@ static int ext4_journalled_write_end(struct file *file,
 }
 
 /*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+        struct inode *inode;
+        struct buffer_head lbh;                 /* extent of blocks */
+        unsigned long first_page, next_page;    /* extent of pages */
+        get_block_t *get_block;
+        struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through the extent of pages and tries to
+ * write them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated; this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+        struct address_space *mapping = mpd->inode->i_mapping;
+        struct mpage_data mpd_pp = {
+                .bio = NULL,
+                .last_block_in_bio = 0,
+                .get_block = mpd->get_block,
+                .use_writepage = 1,
+        };
+        int ret = 0, err, nr_pages, i;
+        unsigned long index, end;
+        struct pagevec pvec;
+
+        BUG_ON(mpd->next_page <= mpd->first_page);
+
+        pagevec_init(&pvec, 0);
+        index = mpd->first_page;
+        end = mpd->next_page - 1;
+
+        while (index <= end) {
+                /* XXX: optimize tail */
+                nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+
+                        index = page->index;
+                        if (index > end)
+                                break;
+                        index++;
+
+                        err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+                        /*
+                         * In the error case we have to continue because
+                         * the remaining pages are still locked.
+                         * XXX: unlock and re-dirty them?
+                         */
+                        if (ret == 0)
+                                ret = err;
+                }
+                pagevec_release(&pvec);
+        }
+        if (mpd_pp.bio)
+                mpage_bio_submit(WRITE, mpd_pp.bio);
+
+        return ret;
+}
+
1488
1489/*
1490 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
1491 *
1492 * @mpd->inode - inode to walk through
1493 * @exbh->b_blocknr - first block on a disk
1494 * @exbh->b_size - amount of space in bytes
1495 * @logical - first logical block to start assignment with
1496 *
1497 * the function goes through all passed space and put actual disk
1498 * block numbers into buffer heads, dropping BH_Delay
1499 */
1500static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
1501 struct buffer_head *exbh)
1502{
1503 struct inode *inode = mpd->inode;
1504 struct address_space *mapping = inode->i_mapping;
1505 int blocks = exbh->b_size >> inode->i_blkbits;
1506 sector_t pblock = exbh->b_blocknr, cur_logical;
1507 struct buffer_head *head, *bh;
1508 unsigned long index, end;
1509 struct pagevec pvec;
1510 int nr_pages, i;
1511
1512 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1513 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
1514 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1515
1516 pagevec_init(&pvec, 0);
1517
1518 while (index <= end) {
1519 /* XXX: optimize tail */
1520 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1521 if (nr_pages == 0)
1522 break;
1523 for (i = 0; i < nr_pages; i++) {
1524 struct page *page = pvec.pages[i];
1525
1526 index = page->index;
1527 if (index > end)
1528 break;
1529 index++;
1530
1531 BUG_ON(!PageLocked(page));
1532 BUG_ON(PageWriteback(page));
1533 BUG_ON(!page_has_buffers(page));
1534
1535 bh = page_buffers(page);
1536 head = bh;
1537
1538 /* skip blocks out of the range */
1539 do {
1540 if (cur_logical >= logical)
1541 break;
1542 cur_logical++;
1543 } while ((bh = bh->b_this_page) != head);
1544
1545 do {
1546 if (cur_logical >= logical + blocks)
1547 break;
1548
1549 if (buffer_delay(bh)) {
1550 bh->b_blocknr = pblock;
1551 clear_buffer_delay(bh);
1552 } else if (buffer_mapped(bh)) {
1553 BUG_ON(bh->b_blocknr != pblock);
1554 }
1555
1556 cur_logical++;
1557 pblock++;
1558 } while ((bh = bh->b_this_page) != head);
1559 }
1560 pagevec_release(&pvec);
1561 }
1562}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * a set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+                                             struct buffer_head *bh)
+{
+        struct block_device *bdev = inode->i_sb->s_bdev;
+        int blocks, i;
+
+        blocks = bh->b_size >> inode->i_blkbits;
+        for (i = 0; i < blocks; i++)
+                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through the given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors ->get_block() returns, so the real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        int err = 0, remain = lbh->b_size;
+        sector_t next = lbh->b_blocknr;
+        struct buffer_head new;
+
+        /*
+         * We consider only non-mapped and non-allocated blocks
+         */
+        if (buffer_mapped(lbh) && !buffer_delay(lbh))
+                return;
+
+        while (remain) {
+                new.b_state = lbh->b_state;
+                new.b_blocknr = 0;
+                new.b_size = remain;
+                err = mpd->get_block(mpd->inode, next, &new, 1);
+                if (err) {
+                        /*
+                         * Rather than implementing our own error handling
+                         * here, we just leave the remaining blocks
+                         * unallocated and try again with ->writepage()
+                         */
+                        break;
+                }
+                BUG_ON(new.b_size == 0);
+
+                if (buffer_new(&new))
+                        __unmap_underlying_blocks(mpd->inode, &new);
+
+                /*
+                 * If the blocks are marked delayed, we need to put the
+                 * actual block number in and drop the delayed bit
+                 */
+                if (buffer_delay(lbh))
+                        mpage_put_bnr_to_bhs(mpd, next, &new);
+
+                /* go for the remaining blocks */
+                next += new.b_size >> mpd->inode->i_blkbits;
+                remain -= new.b_size;
+        }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to the extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access the block's state)
+ *
+ * the function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+                                   sector_t logical, struct buffer_head *bh)
+{
+        struct buffer_head *lbh = &mpd->lbh;
+        sector_t next;
+
+        next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+        /*
+         * First block in the extent
+         */
+        if (lbh->b_size == 0) {
+                lbh->b_blocknr = logical;
+                lbh->b_size = bh->b_size;
+                lbh->b_state = bh->b_state & BH_FLAGS;
+                return;
+        }
+
+        /*
+         * Can we merge the block into our big extent?
+         */
+        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+                lbh->b_size += bh->b_size;
+                return;
+        }
+
+        /*
+         * We couldn't merge the block into our extent, so we
+         * need to flush the current extent and start a new one
+         */
+        mpage_da_map_blocks(mpd);
+
+        /*
+         * Now start a new extent
+         */
+        lbh->b_size = bh->b_size;
+        lbh->b_state = bh->b_state & BH_FLAGS;
+        lbh->b_blocknr = logical;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scans them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+                                struct writeback_control *wbc, void *data)
+{
+        struct mpage_da_data *mpd = data;
+        struct inode *inode = mpd->inode;
+        struct buffer_head *bh, *head, fake;
+        sector_t logical;
+
+        /*
+         * Can we merge this page into the current extent?
+         */
+        if (mpd->next_page != page->index) {
+                /*
+                 * Nope, we can't. So, we map the non-allocated blocks
+                 * and start IO on them using __mpage_writepage()
+                 */
+                if (mpd->next_page != mpd->first_page) {
+                        mpage_da_map_blocks(mpd);
+                        mpage_da_submit_io(mpd);
+                }
+
+                /*
+                 * Start next extent of pages ...
+                 */
+                mpd->first_page = page->index;
+
+                /*
+                 * ... and blocks
+                 */
+                mpd->lbh.b_size = 0;
+                mpd->lbh.b_state = 0;
+                mpd->lbh.b_blocknr = 0;
+        }
+
+        mpd->next_page = page->index + 1;
+        logical = (sector_t) page->index <<
+                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+        if (!page_has_buffers(page)) {
+                /*
+                 * There are no buffer heads attached yet (mmap?), so
+                 * we treat the page as full of dirty blocks
+                 */
+                bh = &fake;
+                bh->b_size = PAGE_CACHE_SIZE;
+                bh->b_state = 0;
+                set_buffer_dirty(bh);
+                set_buffer_uptodate(bh);
+                mpage_add_bh_to_extent(mpd, logical, bh);
+        } else {
+                /*
+                 * Page with regular buffer heads, just add all dirty ones
+                 */
+                head = page_buffers(page);
+                bh = head;
+                do {
+                        BUG_ON(buffer_locked(bh));
+                        if (buffer_dirty(bh))
+                                mpage_add_bh_to_extent(mpd, logical, bh);
+                        logical++;
+                } while ((bh = bh->b_this_page) != head);
+        }
+
+        return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+                               struct writeback_control *wbc,
+                               get_block_t get_block)
+{
+        struct mpage_da_data mpd;
+        int ret;
+
+        if (!get_block)
+                return generic_writepages(mapping, wbc);
+
+        mpd.wbc = wbc;
+        mpd.inode = mapping->host;
+        mpd.lbh.b_size = 0;
+        mpd.lbh.b_state = 0;
+        mpd.lbh.b_blocknr = 0;
+        mpd.first_page = 0;
+        mpd.next_page = 0;
+        mpd.get_block = get_block;
+
+        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+        /*
+         * Handle the last extent of pages
+         */
+        if (mpd.next_page != mpd.first_page) {
+                mpage_da_map_blocks(&mpd);
+                mpage_da_submit_io(&mpd);
+        }
+
+        return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * its intention is to return a mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+        int ret = 0;
+
+        BUG_ON(create == 0);
+        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+        /*
+         * first, we need to know whether the block is allocated already
+         * preallocated blocks are unmapped but should be treated
+         * the same as allocated blocks.
+         */
+        ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0);
+        if (ret == 0) {
+                /* the block isn't allocated yet, let's reserve space */
+                /* XXX: call reservation here */
+                /*
+                 * XXX: __block_prepare_write() unmaps the passed block,
+                 * is it OK?
+                 */
+                map_bh(bh_result, inode->i_sb, 0);
+                set_buffer_new(bh_result);
+                set_buffer_delay(bh_result);
+        } else if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+                ret = 0;
+        }
+
+        return ret;
+}
+
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        loff_t disksize = EXT4_I(inode)->i_disksize;
+        handle_t *handle = NULL;
+
+        if (create) {
+                handle = ext4_journal_start(inode, needed_blocks);
+                if (IS_ERR(handle)) {
+                        ret = PTR_ERR(handle);
+                        goto out;
+                }
+        }
+
+        ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, create, 0);
+        if (ret > 0) {
+                bh_result->b_size = (ret << inode->i_blkbits);
+
+                /*
+                 * Update the on-disk size along with the block allocation;
+                 * we don't use 'extend_disksize' as the size may change
+                 * within an already allocated block -bzzz
+                 */
+                disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+                if (disksize > i_size_read(inode))
+                        disksize = i_size_read(inode);
+                if (disksize > EXT4_I(inode)->i_disksize) {
+                        /*
+                         * XXX: replace with a spinlock if seen contended -bzzz
+                         */
+                        down_write(&EXT4_I(inode)->i_data_sem);
+                        if (disksize > EXT4_I(inode)->i_disksize)
+                                EXT4_I(inode)->i_disksize = disksize;
+                        up_write(&EXT4_I(inode)->i_data_sem);
+
+                        if (EXT4_I(inode)->i_disksize == disksize) {
+                                if (handle == NULL)
+                                        handle = ext4_journal_start(inode, 1);
+                                if (!IS_ERR(handle))
+                                        ext4_mark_inode_dirty(handle, inode);
+                        }
+                }
+
+                ret = 0;
+        }
+
+out:
+        if (handle && !IS_ERR(handle))
+                ext4_journal_stop(handle);
+
+        return ret;
+}
+
+/* FIXME!! only supports data=writeback mode */
+static int ext4_da_writepage(struct page *page,
+                             struct writeback_control *wbc)
+{
+        struct inode *inode = page->mapping->host;
+        handle_t *handle = NULL;
+        int ret = 0;
+        int err;
+
+        if (ext4_journal_current_handle())
+                goto out_fail;
+
+        handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out_fail;
+        }
+
+        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+                ret = nobh_writepage(page, ext4_get_block, wbc);
+        else
+                ret = block_write_full_page(page, ext4_get_block, wbc);
+
+        if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
+                EXT4_I(inode)->i_disksize = inode->i_size;
+                ext4_mark_inode_dirty(handle, inode);
+        }
+
+        err = ext4_journal_stop(handle);
+        if (!ret)
+                ret = err;
+        return ret;
+
+out_fail:
+        redirty_page_for_writepage(wbc, page);
+        unlock_page(page);
+        return ret;
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+                              struct writeback_control *wbc)
+{
+        return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned flags,
+                               struct page **pagep, void **fsdata)
+{
+        int ret;
+        struct page *page;
+        pgoff_t index;
+        unsigned from, to;
+        struct inode *inode = mapping->host;
+        handle_t *handle;
+
+        index = pos >> PAGE_CACHE_SHIFT;
+        from = pos & (PAGE_CACHE_SIZE - 1);
+        to = from + len;
+
+        /*
+         * With delayed allocation, we don't log the i_disksize update
+         * if there is delayed block allocation. But we still need
+         * to journal the i_disksize update if the write extends the
+         * end of the file into an already mapped buffer.
+         */
+        handle = ext4_journal_start(inode, 1);
+        if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
+                goto out;
+        }
+
+        page = __grab_cache_page(mapping, index);
+        if (!page)
+                return -ENOMEM;
+        *pagep = page;
+
+        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+                                ext4_da_get_block_prep);
+        if (ret < 0) {
+                unlock_page(page);
+                ext4_journal_stop(handle);
+                page_cache_release(page);
+        }
+
+out:
+        return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+        return !buffer_mapped(bh) || buffer_delay(bh);
+}
+
+static int ext4_da_write_end(struct file *file,
+                             struct address_space *mapping,
+                             loff_t pos, unsigned len, unsigned copied,
+                             struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+        int ret = 0, ret2;
+        handle_t *handle = ext4_journal_current_handle();
+        loff_t new_i_size;
+
+        /*
+         * generic_write_end() will run mark_inode_dirty() if i_size
+         * changes. So let's piggyback the i_disksize mark_inode_dirty
+         * into that.
+         */
+
+        new_i_size = pos + copied;
+        if (new_i_size > EXT4_I(inode)->i_disksize)
+                if (!walk_page_buffers(NULL, page_buffers(page),
+                                       0, len, NULL, ext4_bh_unmapped_or_delay)) {
+                        /*
+                         * Update i_disksize when extending the file
+                         * without needing block allocation
+                         */
+                        if (ext4_should_order_data(inode))
+                                ret = ext4_jbd2_file_inode(handle, inode);
+
+                        EXT4_I(inode)->i_disksize = new_i_size;
+                }
+        ret2 = generic_write_end(file, mapping, pos, len, copied,
+                                 page, fsdata);
+        copied = ret2;
+        if (ret2 < 0)
+                ret = ret2;
+        ret2 = ext4_journal_stop(handle);
+        if (!ret)
+                ret = ret2;
+
+        return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+        struct buffer_head *head, *bh;
+        unsigned int curr_off = 0;
+
+        /*
+         * Drop reserved blocks
+         */
+        BUG_ON(!PageLocked(page));
+        if (!page_has_buffers(page))
+                goto out;
+
+        head = page_buffers(page);
+        bh = head;
+        do {
+                unsigned int next_off = curr_off + bh->b_size;
+
+                /*
+                 * is this block fully invalidated?
+                 */
+                if (offset <= curr_off && buffer_delay(bh)) {
+                        clear_buffer_delay(bh);
+                        /* XXX: add real stuff here */
+                }
+                curr_off = next_off;
+                bh = bh->b_this_page;
+        } while (bh != head);
+
+out:
+        ext4_invalidatepage(page, offset);
+
+        return;
+}
+
+
+/*
  * bmap() is special. It gets used by applications such as lilo and by
  * the swapper to find the on-disk block of a specific piece of data.
  *
@@ -1427,6 +2093,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
 	journal_t *journal;
 	int err;
 
+	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+	    test_opt(inode->i_sb, DELALLOC)) {
+		/*
+		 * With delalloc we want to sync the file
+		 * so that we can make sure we allocate
+		 * blocks for the file
+		 */
+		filemap_write_and_wait(mapping);
+	}
+
 	if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
 		/*
 		 * This is a REALLY heavyweight approach, but the use of
@@ -1471,11 +2147,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	return !buffer_mapped(bh) || buffer_delay(bh);
-}
-
 /*
  * Note that we don't need to start a transaction unless we're journaling data
  * because we should have holes filled from ext4_page_mkwrite(). We even don't
@@ -1832,10 +2503,28 @@ static const struct address_space_operations ext4_journalled_aops = {
 	.releasepage		= ext4_releasepage,
 };
 
+static const struct address_space_operations ext4_da_aops = {
+	.readpage		= ext4_readpage,
+	.readpages		= ext4_readpages,
+	.writepage		= ext4_da_writepage,
+	.writepages		= ext4_da_writepages,
+	.sync_page		= block_sync_page,
+	.write_begin		= ext4_da_write_begin,
+	.write_end		= ext4_da_write_end,
+	.bmap			= ext4_bmap,
+	.invalidatepage		= ext4_da_invalidatepage,
+	.releasepage		= ext4_releasepage,
+	.direct_IO		= ext4_direct_IO,
+	.migratepage		= buffer_migrate_page,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	if (ext4_should_order_data(inode))
 		inode->i_mapping->a_ops = &ext4_ordered_aops;
+	else if (ext4_should_writeback_data(inode) &&
+		 test_opt(inode->i_sb, DELALLOC))
+		inode->i_mapping->a_ops = &ext4_da_aops;
 	else if (ext4_should_writeback_data(inode))
 		inode->i_mapping->a_ops = &ext4_writeback_aops;
 	else
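ext4_da_get_block_write() above grows i_disksize with an unlocked test
followed by a retest under i_data_sem, so the common no-growth case pays no
locking cost. Below is a rough userspace model of that check/lock/recheck
pattern, assuming a pthread mutex in place of the kernel's rw_semaphore;
the names are illustrative, not kernel interfaces.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t data_sem = PTHREAD_MUTEX_INITIALIZER;
static long long i_disksize;

static void maybe_grow_disksize(long long newsize)
{
        if (newsize <= i_disksize)              /* unlocked fast path */
                return;
        pthread_mutex_lock(&data_sem);
        if (newsize > i_disksize)               /* recheck under the lock */
                i_disksize = newsize;
        pthread_mutex_unlock(&data_sem);
}

int main(void)
{
        maybe_grow_disksize(4096);
        maybe_grow_disksize(1024);              /* no-op: it never shrinks */
        printf("i_disksize = %lld\n", i_disksize);
        return 0;
}

Because concurrent callers can only ever grow the value, a stale read on the
fast path is harmless: at worst a caller takes the lock and finds nothing to
do.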
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 629d0fa27e3a..de9d3d0eb203 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -898,7 +898,7 @@ enum {
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
 	Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
-	Opt_mballoc, Opt_nomballoc, Opt_stripe,
+	Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc,
 };
 
 static match_table_t tokens = {
@@ -957,6 +957,7 @@ static match_table_t tokens = {
 	{Opt_nomballoc, "nomballoc"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_resize, "resize"},
+	{Opt_delalloc, "delalloc"},
 	{Opt_err, NULL},
 };
 
@@ -1335,6 +1336,9 @@ set_qf_format:
 			return 0;
 		sbi->s_stripe = option;
 		break;
+	case Opt_delalloc:
+		set_opt(sbi->s_mount_opt, DELALLOC);
+		break;
 	default:
 		printk (KERN_ERR
 			"EXT4-fs: Unrecognized mount option \"%s\" "