Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 530
1 file changed, 453 insertions(+), 77 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c4da98a959ae..fffec40d5996 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -42,7 +42,6 @@ | |||
42 | #include "ext4_jbd2.h" | 42 | #include "ext4_jbd2.h" |
43 | #include "xattr.h" | 43 | #include "xattr.h" |
44 | #include "acl.h" | 44 | #include "acl.h" |
45 | #include "ext4_extents.h" | ||
46 | #include "truncate.h" | 45 | #include "truncate.h" |
47 | 46 | ||
48 | #include <trace/events/ext4.h> | 47 | #include <trace/events/ext4.h> |
@@ -121,9 +120,6 @@ void ext4_evict_inode(struct inode *inode) | |||
121 | 120 | ||
122 | trace_ext4_evict_inode(inode); | 121 | trace_ext4_evict_inode(inode); |
123 | 122 | ||
124 | mutex_lock(&inode->i_mutex); | ||
125 | ext4_flush_completed_IO(inode); | ||
126 | mutex_unlock(&inode->i_mutex); | ||
127 | ext4_ioend_wait(inode); | 123 | ext4_ioend_wait(inode); |
128 | 124 | ||
129 | if (inode->i_nlink) { | 125 | if (inode->i_nlink) { |
@@ -271,7 +267,7 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
271 | struct ext4_inode_info *ei = EXT4_I(inode); | 267 | struct ext4_inode_info *ei = EXT4_I(inode); |
272 | 268 | ||
273 | spin_lock(&ei->i_block_reservation_lock); | 269 | spin_lock(&ei->i_block_reservation_lock); |
274 | trace_ext4_da_update_reserve_space(inode, used); | 270 | trace_ext4_da_update_reserve_space(inode, used, quota_claim); |
275 | if (unlikely(used > ei->i_reserved_data_blocks)) { | 271 | if (unlikely(used > ei->i_reserved_data_blocks)) { |
276 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " | 272 | ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " |
277 | "with only %d reserved data blocks\n", | 273 | "with only %d reserved data blocks\n", |
@@ -284,7 +280,7 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
284 | /* Update per-inode reservations */ | 280 | /* Update per-inode reservations */ |
285 | ei->i_reserved_data_blocks -= used; | 281 | ei->i_reserved_data_blocks -= used; |
286 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; | 282 | ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; |
287 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | 283 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, |
288 | used + ei->i_allocated_meta_blocks); | 284 | used + ei->i_allocated_meta_blocks); |
289 | ei->i_allocated_meta_blocks = 0; | 285 | ei->i_allocated_meta_blocks = 0; |
290 | 286 | ||
@@ -294,7 +290,7 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
294 | * only when we have written all of the delayed | 290 | * only when we have written all of the delayed |
295 | * allocation blocks. | 291 | * allocation blocks. |
296 | */ | 292 | */ |
297 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | 293 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, |
298 | ei->i_reserved_meta_blocks); | 294 | ei->i_reserved_meta_blocks); |
299 | ei->i_reserved_meta_blocks = 0; | 295 | ei->i_reserved_meta_blocks = 0; |
300 | ei->i_da_metadata_calc_len = 0; | 296 | ei->i_da_metadata_calc_len = 0; |
@@ -303,14 +299,14 @@ void ext4_da_update_reserve_space(struct inode *inode, | |||
303 | 299 | ||
304 | /* Update quota subsystem for data blocks */ | 300 | /* Update quota subsystem for data blocks */ |
305 | if (quota_claim) | 301 | if (quota_claim) |
306 | dquot_claim_block(inode, used); | 302 | dquot_claim_block(inode, EXT4_C2B(sbi, used)); |
307 | else { | 303 | else { |
308 | /* | 304 | /* |
309 | * We did fallocate with an offset that is already delayed | 305 | * We did fallocate with an offset that is already delayed |
310 | * allocated. So on delayed allocated writeback we should | 306 | * allocated. So on delayed allocated writeback we should |
311 | * not re-claim the quota for fallocated blocks. | 307 | * not re-claim the quota for fallocated blocks. |
312 | */ | 308 | */ |
313 | dquot_release_reservation_block(inode, used); | 309 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); |
314 | } | 310 | } |
315 | 311 | ||
316 | /* | 312 | /* |
@@ -402,6 +398,49 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | |||
402 | } | 398 | } |
403 | 399 | ||
404 | /* | 400 | /* |
401 | * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map. | ||
402 | */ | ||
403 | static void set_buffers_da_mapped(struct inode *inode, | ||
404 | struct ext4_map_blocks *map) | ||
405 | { | ||
406 | struct address_space *mapping = inode->i_mapping; | ||
407 | struct pagevec pvec; | ||
408 | int i, nr_pages; | ||
409 | pgoff_t index, end; | ||
410 | |||
411 | index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
412 | end = (map->m_lblk + map->m_len - 1) >> | ||
413 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
414 | |||
415 | pagevec_init(&pvec, 0); | ||
416 | while (index <= end) { | ||
417 | nr_pages = pagevec_lookup(&pvec, mapping, index, | ||
418 | min(end - index + 1, | ||
419 | (pgoff_t)PAGEVEC_SIZE)); | ||
420 | if (nr_pages == 0) | ||
421 | break; | ||
422 | for (i = 0; i < nr_pages; i++) { | ||
423 | struct page *page = pvec.pages[i]; | ||
424 | struct buffer_head *bh, *head; | ||
425 | |||
426 | if (unlikely(page->mapping != mapping) || | ||
427 | !PageDirty(page)) | ||
428 | break; | ||
429 | |||
430 | if (page_has_buffers(page)) { | ||
431 | bh = head = page_buffers(page); | ||
432 | do { | ||
433 | set_buffer_da_mapped(bh); | ||
434 | bh = bh->b_this_page; | ||
435 | } while (bh != head); | ||
436 | } | ||
437 | index++; | ||
438 | } | ||
439 | pagevec_release(&pvec); | ||
440 | } | ||
441 | } | ||
442 | |||
443 | /* | ||
405 | * The ext4_map_blocks() function tries to look up the requested blocks, | 444 | * The ext4_map_blocks() function tries to look up the requested blocks, |
406 | * and returns if the blocks are already mapped. | 445 | * and returns if the blocks are already mapped. |
407 | * | 446 | * |
@@ -419,7 +458,7 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, | |||
419 | * the buffer head is mapped. | 458 | * the buffer head is mapped. |
420 | * | 459 | * |
421 | * It returns 0 if plain look up failed (blocks have not been allocated), in | 460 | * It returns 0 if plain look up failed (blocks have not been allocated), in |
422 | * that casem, buffer head is unmapped | 461 | * that case, buffer head is unmapped |
423 | * | 462 | * |
424 | * It returns the error in case of allocation failure. | 463 | * It returns the error in case of allocation failure. |
425 | */ | 464 | */ |
@@ -438,9 +477,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
438 | */ | 477 | */ |
439 | down_read((&EXT4_I(inode)->i_data_sem)); | 478 | down_read((&EXT4_I(inode)->i_data_sem)); |
440 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 479 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { |
441 | retval = ext4_ext_map_blocks(handle, inode, map, 0); | 480 | retval = ext4_ext_map_blocks(handle, inode, map, flags & |
481 | EXT4_GET_BLOCKS_KEEP_SIZE); | ||
442 | } else { | 482 | } else { |
443 | retval = ext4_ind_map_blocks(handle, inode, map, 0); | 483 | retval = ext4_ind_map_blocks(handle, inode, map, flags & |
484 | EXT4_GET_BLOCKS_KEEP_SIZE); | ||
444 | } | 485 | } |
445 | up_read((&EXT4_I(inode)->i_data_sem)); | 486 | up_read((&EXT4_I(inode)->i_data_sem)); |
446 | 487 | ||
@@ -458,7 +499,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
458 | * Returns if the blocks have already allocated | 499 | * Returns if the blocks have already allocated |
459 | * | 500 | * |
460 | * Note that if blocks have been preallocated | 501 | * Note that if blocks have been preallocated |
461 | * ext4_ext_get_block() returns th create = 0 | 502 | * ext4_ext_get_block() returns the create = 0 |
462 | * with buffer head unmapped. | 503 | * with buffer head unmapped. |
463 | */ | 504 | */ |
464 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) | 505 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) |
@@ -520,9 +561,17 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, | |||
520 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) | 561 | (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) |
521 | ext4_da_update_reserve_space(inode, retval, 1); | 562 | ext4_da_update_reserve_space(inode, retval, 1); |
522 | } | 563 | } |
523 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) | 564 | if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { |
524 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); | 565 | ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); |
525 | 566 | ||
567 | /* If we have successfully mapped the delayed allocated blocks, | ||
568 | * set the BH_Da_Mapped bit on them. Its important to do this | ||
569 | * under the protection of i_data_sem. | ||
570 | */ | ||
571 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) | ||
572 | set_buffers_da_mapped(inode, map); | ||
573 | } | ||
574 | |||
526 | up_write((&EXT4_I(inode)->i_data_sem)); | 575 | up_write((&EXT4_I(inode)->i_data_sem)); |
527 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { | 576 | if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { |
528 | int ret = check_block_validity(inode, map); | 577 | int ret = check_block_validity(inode, map); |
@@ -650,7 +699,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | |||
650 | return bh; | 699 | return bh; |
651 | if (buffer_uptodate(bh)) | 700 | if (buffer_uptodate(bh)) |
652 | return bh; | 701 | return bh; |
653 | ll_rw_block(READ_META, 1, &bh); | 702 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
654 | wait_on_buffer(bh); | 703 | wait_on_buffer(bh); |
655 | if (buffer_uptodate(bh)) | 704 | if (buffer_uptodate(bh)) |
656 | return bh; | 705 | return bh; |
@@ -912,7 +961,11 @@ static int ext4_ordered_write_end(struct file *file, | |||
912 | ext4_orphan_add(handle, inode); | 961 | ext4_orphan_add(handle, inode); |
913 | if (ret2 < 0) | 962 | if (ret2 < 0) |
914 | ret = ret2; | 963 | ret = ret2; |
964 | } else { | ||
965 | unlock_page(page); | ||
966 | page_cache_release(page); | ||
915 | } | 967 | } |
968 | |||
916 | ret2 = ext4_journal_stop(handle); | 969 | ret2 = ext4_journal_stop(handle); |
917 | if (!ret) | 970 | if (!ret) |
918 | ret = ret2; | 971 | ret = ret2; |
@@ -1040,14 +1093,14 @@ static int ext4_journalled_write_end(struct file *file, | |||
1040 | } | 1093 | } |
1041 | 1094 | ||
1042 | /* | 1095 | /* |
1043 | * Reserve a single block located at lblock | 1096 | * Reserve a single cluster located at lblock |
1044 | */ | 1097 | */ |
1045 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) | 1098 | static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) |
1046 | { | 1099 | { |
1047 | int retries = 0; | 1100 | int retries = 0; |
1048 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1101 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1049 | struct ext4_inode_info *ei = EXT4_I(inode); | 1102 | struct ext4_inode_info *ei = EXT4_I(inode); |
1050 | unsigned long md_needed; | 1103 | unsigned int md_needed; |
1051 | int ret; | 1104 | int ret; |
1052 | 1105 | ||
1053 | /* | 1106 | /* |
@@ -1057,7 +1110,8 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) | |||
1057 | */ | 1110 | */ |
1058 | repeat: | 1111 | repeat: |
1059 | spin_lock(&ei->i_block_reservation_lock); | 1112 | spin_lock(&ei->i_block_reservation_lock); |
1060 | md_needed = ext4_calc_metadata_amount(inode, lblock); | 1113 | md_needed = EXT4_NUM_B2C(sbi, |
1114 | ext4_calc_metadata_amount(inode, lblock)); | ||
1061 | trace_ext4_da_reserve_space(inode, md_needed); | 1115 | trace_ext4_da_reserve_space(inode, md_needed); |
1062 | spin_unlock(&ei->i_block_reservation_lock); | 1116 | spin_unlock(&ei->i_block_reservation_lock); |
1063 | 1117 | ||
@@ -1066,15 +1120,15 @@ repeat: | |||
1066 | * us from metadata over-estimation, though we may go over by | 1120 | * us from metadata over-estimation, though we may go over by |
1067 | * a small amount in the end. Here we just reserve for data. | 1121 | * a small amount in the end. Here we just reserve for data. |
1068 | */ | 1122 | */ |
1069 | ret = dquot_reserve_block(inode, 1); | 1123 | ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); |
1070 | if (ret) | 1124 | if (ret) |
1071 | return ret; | 1125 | return ret; |
1072 | /* | 1126 | /* |
1073 | * We do still charge estimated metadata to the sb though; | 1127 | * We do still charge estimated metadata to the sb though; |
1074 | * we cannot afford to run out of free blocks. | 1128 | * we cannot afford to run out of free blocks. |
1075 | */ | 1129 | */ |
1076 | if (ext4_claim_free_blocks(sbi, md_needed + 1, 0)) { | 1130 | if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) { |
1077 | dquot_release_reservation_block(inode, 1); | 1131 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); |
1078 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { | 1132 | if (ext4_should_retry_alloc(inode->i_sb, &retries)) { |
1079 | yield(); | 1133 | yield(); |
1080 | goto repeat; | 1134 | goto repeat; |
@@ -1121,19 +1175,21 @@ static void ext4_da_release_space(struct inode *inode, int to_free) | |||
1121 | * We can release all of the reserved metadata blocks | 1175 | * We can release all of the reserved metadata blocks |
1122 | * only when we have written all of the delayed | 1176 | * only when we have written all of the delayed |
1123 | * allocation blocks. | 1177 | * allocation blocks. |
1178 | * Note that in case of bigalloc, i_reserved_meta_blocks, | ||
1179 | * i_reserved_data_blocks, etc. refer to number of clusters. | ||
1124 | */ | 1180 | */ |
1125 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, | 1181 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, |
1126 | ei->i_reserved_meta_blocks); | 1182 | ei->i_reserved_meta_blocks); |
1127 | ei->i_reserved_meta_blocks = 0; | 1183 | ei->i_reserved_meta_blocks = 0; |
1128 | ei->i_da_metadata_calc_len = 0; | 1184 | ei->i_da_metadata_calc_len = 0; |
1129 | } | 1185 | } |
1130 | 1186 | ||
1131 | /* update fs dirty data blocks counter */ | 1187 | /* update fs dirty data blocks counter */ |
1132 | percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); | 1188 | percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); |
1133 | 1189 | ||
1134 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | 1190 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); |
1135 | 1191 | ||
1136 | dquot_release_reservation_block(inode, to_free); | 1192 | dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); |
1137 | } | 1193 | } |
1138 | 1194 | ||
1139 | static void ext4_da_page_release_reservation(struct page *page, | 1195 | static void ext4_da_page_release_reservation(struct page *page, |
@@ -1142,6 +1198,9 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1142 | int to_release = 0; | 1198 | int to_release = 0; |
1143 | struct buffer_head *head, *bh; | 1199 | struct buffer_head *head, *bh; |
1144 | unsigned int curr_off = 0; | 1200 | unsigned int curr_off = 0; |
1201 | struct inode *inode = page->mapping->host; | ||
1202 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1203 | int num_clusters; | ||
1145 | 1204 | ||
1146 | head = page_buffers(page); | 1205 | head = page_buffers(page); |
1147 | bh = head; | 1206 | bh = head; |
@@ -1151,10 +1210,24 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1151 | if ((offset <= curr_off) && (buffer_delay(bh))) { | 1210 | if ((offset <= curr_off) && (buffer_delay(bh))) { |
1152 | to_release++; | 1211 | to_release++; |
1153 | clear_buffer_delay(bh); | 1212 | clear_buffer_delay(bh); |
1213 | clear_buffer_da_mapped(bh); | ||
1154 | } | 1214 | } |
1155 | curr_off = next_off; | 1215 | curr_off = next_off; |
1156 | } while ((bh = bh->b_this_page) != head); | 1216 | } while ((bh = bh->b_this_page) != head); |
1157 | ext4_da_release_space(page->mapping->host, to_release); | 1217 | |
1218 | /* If we have released all the blocks belonging to a cluster, then we | ||
1219 | * need to release the reserved space for that cluster. */ | ||
1220 | num_clusters = EXT4_NUM_B2C(sbi, to_release); | ||
1221 | while (num_clusters > 0) { | ||
1222 | ext4_fsblk_t lblk; | ||
1223 | lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) + | ||
1224 | ((num_clusters - 1) << sbi->s_cluster_bits); | ||
1225 | if (sbi->s_cluster_ratio == 1 || | ||
1226 | !ext4_find_delalloc_cluster(inode, lblk, 1)) | ||
1227 | ext4_da_release_space(inode, 1); | ||
1228 | |||
1229 | num_clusters--; | ||
1230 | } | ||
1158 | } | 1231 | } |
1159 | 1232 | ||
1160 | /* | 1233 | /* |
@@ -1256,6 +1329,8 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
1256 | clear_buffer_delay(bh); | 1329 | clear_buffer_delay(bh); |
1257 | bh->b_blocknr = pblock; | 1330 | bh->b_blocknr = pblock; |
1258 | } | 1331 | } |
1332 | if (buffer_da_mapped(bh)) | ||
1333 | clear_buffer_da_mapped(bh); | ||
1259 | if (buffer_unwritten(bh) || | 1334 | if (buffer_unwritten(bh) || |
1260 | buffer_mapped(bh)) | 1335 | buffer_mapped(bh)) |
1261 | BUG_ON(bh->b_blocknr != pblock); | 1336 | BUG_ON(bh->b_blocknr != pblock); |
@@ -1349,12 +1424,15 @@ static void ext4_print_free_blocks(struct inode *inode) | |||
1349 | { | 1424 | { |
1350 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 1425 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
1351 | printk(KERN_CRIT "Total free blocks count %lld\n", | 1426 | printk(KERN_CRIT "Total free blocks count %lld\n", |
1352 | ext4_count_free_blocks(inode->i_sb)); | 1427 | EXT4_C2B(EXT4_SB(inode->i_sb), |
1428 | ext4_count_free_clusters(inode->i_sb))); | ||
1353 | printk(KERN_CRIT "Free/Dirty block details\n"); | 1429 | printk(KERN_CRIT "Free/Dirty block details\n"); |
1354 | printk(KERN_CRIT "free_blocks=%lld\n", | 1430 | printk(KERN_CRIT "free_blocks=%lld\n", |
1355 | (long long) percpu_counter_sum(&sbi->s_freeblocks_counter)); | 1431 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), |
1432 | percpu_counter_sum(&sbi->s_freeclusters_counter))); | ||
1356 | printk(KERN_CRIT "dirty_blocks=%lld\n", | 1433 | printk(KERN_CRIT "dirty_blocks=%lld\n", |
1357 | (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter)); | 1434 | (long long) EXT4_C2B(EXT4_SB(inode->i_sb), |
1435 | percpu_counter_sum(&sbi->s_dirtyclusters_counter))); | ||
1358 | printk(KERN_CRIT "Block reservation details\n"); | 1436 | printk(KERN_CRIT "Block reservation details\n"); |
1359 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", | 1437 | printk(KERN_CRIT "i_reserved_data_blocks=%u\n", |
1360 | EXT4_I(inode)->i_reserved_data_blocks); | 1438 | EXT4_I(inode)->i_reserved_data_blocks); |
@@ -1433,8 +1511,7 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | |||
1433 | if (err == -EAGAIN) | 1511 | if (err == -EAGAIN) |
1434 | goto submit_io; | 1512 | goto submit_io; |
1435 | 1513 | ||
1436 | if (err == -ENOSPC && | 1514 | if (err == -ENOSPC && ext4_count_free_clusters(sb)) { |
1437 | ext4_count_free_blocks(sb)) { | ||
1438 | mpd->retval = err; | 1515 | mpd->retval = err; |
1439 | goto submit_io; | 1516 | goto submit_io; |
1440 | } | 1517 | } |
@@ -1474,13 +1551,15 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd) | |||
1474 | 1551 | ||
1475 | for (i = 0; i < map.m_len; i++) | 1552 | for (i = 0; i < map.m_len; i++) |
1476 | unmap_underlying_metadata(bdev, map.m_pblk + i); | 1553 | unmap_underlying_metadata(bdev, map.m_pblk + i); |
1477 | } | ||
1478 | 1554 | ||
1479 | if (ext4_should_order_data(mpd->inode)) { | 1555 | if (ext4_should_order_data(mpd->inode)) { |
1480 | err = ext4_jbd2_file_inode(handle, mpd->inode); | 1556 | err = ext4_jbd2_file_inode(handle, mpd->inode); |
1481 | if (err) | 1557 | if (err) { |
1482 | /* This only happens if the journal is aborted */ | 1558 | /* Only if the journal is aborted */ |
1483 | return; | 1559 | mpd->retval = err; |
1560 | goto submit_io; | ||
1561 | } | ||
1562 | } | ||
1484 | } | 1563 | } |
1485 | 1564 | ||
1486 | /* | 1565 | /* |
@@ -1587,6 +1666,66 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) | |||
1587 | } | 1666 | } |
1588 | 1667 | ||
1589 | /* | 1668 | /* |
1669 | * This function is grabs code from the very beginning of | ||
1670 | * ext4_map_blocks, but assumes that the caller is from delayed write | ||
1671 | * time. This function looks up the requested blocks and sets the | ||
1672 | * buffer delay bit under the protection of i_data_sem. | ||
1673 | */ | ||
1674 | static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, | ||
1675 | struct ext4_map_blocks *map, | ||
1676 | struct buffer_head *bh) | ||
1677 | { | ||
1678 | int retval; | ||
1679 | sector_t invalid_block = ~((sector_t) 0xffff); | ||
1680 | |||
1681 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) | ||
1682 | invalid_block = ~0; | ||
1683 | |||
1684 | map->m_flags = 0; | ||
1685 | ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," | ||
1686 | "logical block %lu\n", inode->i_ino, map->m_len, | ||
1687 | (unsigned long) map->m_lblk); | ||
1688 | /* | ||
1689 | * Try to see if we can get the block without requesting a new | ||
1690 | * file system block. | ||
1691 | */ | ||
1692 | down_read((&EXT4_I(inode)->i_data_sem)); | ||
1693 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | ||
1694 | retval = ext4_ext_map_blocks(NULL, inode, map, 0); | ||
1695 | else | ||
1696 | retval = ext4_ind_map_blocks(NULL, inode, map, 0); | ||
1697 | |||
1698 | if (retval == 0) { | ||
1699 | /* | ||
1700 | * XXX: __block_prepare_write() unmaps passed block, | ||
1701 | * is it OK? | ||
1702 | */ | ||
1703 | /* If the block was allocated from previously allocated cluster, | ||
1704 | * then we dont need to reserve it again. */ | ||
1705 | if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { | ||
1706 | retval = ext4_da_reserve_space(inode, iblock); | ||
1707 | if (retval) | ||
1708 | /* not enough space to reserve */ | ||
1709 | goto out_unlock; | ||
1710 | } | ||
1711 | |||
1712 | /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served | ||
1713 | * and it should not appear on the bh->b_state. | ||
1714 | */ | ||
1715 | map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; | ||
1716 | |||
1717 | map_bh(bh, inode->i_sb, invalid_block); | ||
1718 | set_buffer_new(bh); | ||
1719 | set_buffer_delay(bh); | ||
1720 | } | ||
1721 | |||
1722 | out_unlock: | ||
1723 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
1724 | |||
1725 | return retval; | ||
1726 | } | ||
1727 | |||
1728 | /* | ||
1590 | * This is a special get_blocks_t callback which is used by | 1729 | * This is a special get_blocks_t callback which is used by |
1591 | * ext4_da_write_begin(). It will either return mapped block or | 1730 | * ext4_da_write_begin(). It will either return mapped block or |
1592 | * reserve space for a single block. | 1731 | * reserve space for a single block. |
@@ -1603,10 +1742,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
1603 | { | 1742 | { |
1604 | struct ext4_map_blocks map; | 1743 | struct ext4_map_blocks map; |
1605 | int ret = 0; | 1744 | int ret = 0; |
1606 | sector_t invalid_block = ~((sector_t) 0xffff); | ||
1607 | |||
1608 | if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) | ||
1609 | invalid_block = ~0; | ||
1610 | 1745 | ||
1611 | BUG_ON(create == 0); | 1746 | BUG_ON(create == 0); |
1612 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); | 1747 | BUG_ON(bh->b_size != inode->i_sb->s_blocksize); |
@@ -1619,25 +1754,9 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |||
1619 | * preallocated blocks are unmapped but should treated | 1754 | * preallocated blocks are unmapped but should treated |
1620 | * the same as allocated blocks. | 1755 | * the same as allocated blocks. |
1621 | */ | 1756 | */ |
1622 | ret = ext4_map_blocks(NULL, inode, &map, 0); | 1757 | ret = ext4_da_map_blocks(inode, iblock, &map, bh); |
1623 | if (ret < 0) | 1758 | if (ret <= 0) |
1624 | return ret; | 1759 | return ret; |
1625 | if (ret == 0) { | ||
1626 | if (buffer_delay(bh)) | ||
1627 | return 0; /* Not sure this could or should happen */ | ||
1628 | /* | ||
1629 | * XXX: __block_write_begin() unmaps passed block, is it OK? | ||
1630 | */ | ||
1631 | ret = ext4_da_reserve_space(inode, iblock); | ||
1632 | if (ret) | ||
1633 | /* not enough space to reserve */ | ||
1634 | return ret; | ||
1635 | |||
1636 | map_bh(bh, inode->i_sb, invalid_block); | ||
1637 | set_buffer_new(bh); | ||
1638 | set_buffer_delay(bh); | ||
1639 | return 0; | ||
1640 | } | ||
1641 | 1760 | ||
1642 | map_bh(bh, inode->i_sb, map.m_pblk); | 1761 | map_bh(bh, inode->i_sb, map.m_pblk); |
1643 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; | 1762 | bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; |
@@ -1814,8 +1933,12 @@ static int ext4_writepage(struct page *page, | |||
1814 | * We don't want to do block allocation, so redirty | 1933 | * We don't want to do block allocation, so redirty |
1815 | * the page and return. We may reach here when we do | 1934 | * the page and return. We may reach here when we do |
1816 | * a journal commit via journal_submit_inode_data_buffers. | 1935 | * a journal commit via journal_submit_inode_data_buffers. |
1817 | * We can also reach here via shrink_page_list | 1936 | * We can also reach here via shrink_page_list but it |
1937 | * should never be for direct reclaim so warn if that | ||
1938 | * happens | ||
1818 | */ | 1939 | */ |
1940 | WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == | ||
1941 | PF_MEMALLOC); | ||
1819 | goto redirty_page; | 1942 | goto redirty_page; |
1820 | } | 1943 | } |
1821 | if (commit_write) | 1944 | if (commit_write) |
@@ -2049,6 +2172,7 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2049 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2172 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2050 | pgoff_t done_index = 0; | 2173 | pgoff_t done_index = 0; |
2051 | pgoff_t end; | 2174 | pgoff_t end; |
2175 | struct blk_plug plug; | ||
2052 | 2176 | ||
2053 | trace_ext4_da_writepages(inode, wbc); | 2177 | trace_ext4_da_writepages(inode, wbc); |
2054 | 2178 | ||
@@ -2127,6 +2251,7 @@ retry: | |||
2127 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) | 2251 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2128 | tag_pages_for_writeback(mapping, index, end); | 2252 | tag_pages_for_writeback(mapping, index, end); |
2129 | 2253 | ||
2254 | blk_start_plug(&plug); | ||
2130 | while (!ret && wbc->nr_to_write > 0) { | 2255 | while (!ret && wbc->nr_to_write > 0) { |
2131 | 2256 | ||
2132 | /* | 2257 | /* |
@@ -2145,6 +2270,7 @@ retry: | |||
2145 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " | 2270 | ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " |
2146 | "%ld pages, ino %lu; err %d", __func__, | 2271 | "%ld pages, ino %lu; err %d", __func__, |
2147 | wbc->nr_to_write, inode->i_ino, ret); | 2272 | wbc->nr_to_write, inode->i_ino, ret); |
2273 | blk_finish_plug(&plug); | ||
2148 | goto out_writepages; | 2274 | goto out_writepages; |
2149 | } | 2275 | } |
2150 | 2276 | ||
@@ -2177,11 +2303,12 @@ retry: | |||
2177 | ret = 0; | 2303 | ret = 0; |
2178 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { | 2304 | } else if (ret == MPAGE_DA_EXTENT_TAIL) { |
2179 | /* | 2305 | /* |
2180 | * got one extent now try with | 2306 | * Got one extent now try with rest of the pages. |
2181 | * rest of the pages | 2307 | * If mpd.retval is set -EIO, journal is aborted. |
2308 | * So we don't need to write any more. | ||
2182 | */ | 2309 | */ |
2183 | pages_written += mpd.pages_written; | 2310 | pages_written += mpd.pages_written; |
2184 | ret = 0; | 2311 | ret = mpd.retval; |
2185 | io_done = 1; | 2312 | io_done = 1; |
2186 | } else if (wbc->nr_to_write) | 2313 | } else if (wbc->nr_to_write) |
2187 | /* | 2314 | /* |
@@ -2191,6 +2318,7 @@ retry: | |||
2191 | */ | 2318 | */ |
2192 | break; | 2319 | break; |
2193 | } | 2320 | } |
2321 | blk_finish_plug(&plug); | ||
2194 | if (!io_done && !cycled) { | 2322 | if (!io_done && !cycled) { |
2195 | cycled = 1; | 2323 | cycled = 1; |
2196 | index = 0; | 2324 | index = 0; |
@@ -2229,10 +2357,11 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2229 | * Delalloc need an accurate free block accounting. So switch | 2357 | * Delalloc need an accurate free block accounting. So switch |
2230 | * to non delalloc when we are near to error range. | 2358 | * to non delalloc when we are near to error range. |
2231 | */ | 2359 | */ |
2232 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); | 2360 | free_blocks = EXT4_C2B(sbi, |
2233 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); | 2361 | percpu_counter_read_positive(&sbi->s_freeclusters_counter)); |
2362 | dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); | ||
2234 | if (2 * free_blocks < 3 * dirty_blocks || | 2363 | if (2 * free_blocks < 3 * dirty_blocks || |
2235 | free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { | 2364 | free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) { |
2236 | /* | 2365 | /* |
2237 | * free block count is less than 150% of dirty blocks | 2366 | * free block count is less than 150% of dirty blocks |
2238 | * or free blocks is less than watermark | 2367 | * or free blocks is less than watermark |
@@ -2244,7 +2373,7 @@ static int ext4_nonda_switch(struct super_block *sb) | |||
2244 | * start pushing delalloc when 1/2 of free blocks are dirty. | 2373 | * start pushing delalloc when 1/2 of free blocks are dirty. |
2245 | */ | 2374 | */ |
2246 | if (free_blocks < 2 * dirty_blocks) | 2375 | if (free_blocks < 2 * dirty_blocks) |
2247 | writeback_inodes_sb_if_idle(sb); | 2376 | writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE); |
2248 | 2377 | ||
2249 | return 0; | 2378 | return 0; |
2250 | } | 2379 | } |
@@ -2258,6 +2387,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |||
2258 | pgoff_t index; | 2387 | pgoff_t index; |
2259 | struct inode *inode = mapping->host; | 2388 | struct inode *inode = mapping->host; |
2260 | handle_t *handle; | 2389 | handle_t *handle; |
2390 | loff_t page_len; | ||
2261 | 2391 | ||
2262 | index = pos >> PAGE_CACHE_SHIFT; | 2392 | index = pos >> PAGE_CACHE_SHIFT; |
2263 | 2393 | ||
@@ -2304,6 +2434,13 @@ retry: | |||
2304 | */ | 2434 | */ |
2305 | if (pos + len > inode->i_size) | 2435 | if (pos + len > inode->i_size) |
2306 | ext4_truncate_failed_write(inode); | 2436 | ext4_truncate_failed_write(inode); |
2437 | } else { | ||
2438 | page_len = pos & (PAGE_CACHE_SIZE - 1); | ||
2439 | if (page_len > 0) { | ||
2440 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
2441 | inode, page, pos - page_len, page_len, | ||
2442 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
2443 | } | ||
2307 | } | 2444 | } |
2308 | 2445 | ||
2309 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | 2446 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) |
@@ -2346,6 +2483,7 @@ static int ext4_da_write_end(struct file *file, | |||
2346 | loff_t new_i_size; | 2483 | loff_t new_i_size; |
2347 | unsigned long start, end; | 2484 | unsigned long start, end; |
2348 | int write_mode = (int)(unsigned long)fsdata; | 2485 | int write_mode = (int)(unsigned long)fsdata; |
2486 | loff_t page_len; | ||
2349 | 2487 | ||
2350 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { | 2488 | if (write_mode == FALL_BACK_TO_NONDELALLOC) { |
2351 | if (ext4_should_order_data(inode)) { | 2489 | if (ext4_should_order_data(inode)) { |
@@ -2394,6 +2532,16 @@ static int ext4_da_write_end(struct file *file, | |||
2394 | } | 2532 | } |
2395 | ret2 = generic_write_end(file, mapping, pos, len, copied, | 2533 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
2396 | page, fsdata); | 2534 | page, fsdata); |
2535 | |||
2536 | page_len = PAGE_CACHE_SIZE - | ||
2537 | ((pos + copied - 1) & (PAGE_CACHE_SIZE - 1)); | ||
2538 | |||
2539 | if (page_len > 0) { | ||
2540 | ret = ext4_discard_partial_page_buffers_no_lock(handle, | ||
2541 | inode, page, pos + copied - 1, page_len, | ||
2542 | EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED); | ||
2543 | } | ||
2544 | |||
2397 | copied = ret2; | 2545 | copied = ret2; |
2398 | if (ret2 < 0) | 2546 | if (ret2 < 0) |
2399 | ret = ret2; | 2547 | ret = ret2; |
@@ -2688,10 +2836,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
2688 | * but being more careful is always safe for the future change. | 2836 | * but being more careful is always safe for the future change. |
2689 | */ | 2837 | */ |
2690 | inode = io_end->inode; | 2838 | inode = io_end->inode; |
2691 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 2839 | ext4_set_io_unwritten_flag(inode, io_end); |
2692 | io_end->flag |= EXT4_IO_END_UNWRITTEN; | ||
2693 | atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); | ||
2694 | } | ||
2695 | 2840 | ||
2696 | /* Add the io_end to per-inode completed io list*/ | 2841 | /* Add the io_end to per-inode completed io list*/ |
2697 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | 2842 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); |
@@ -2857,6 +3002,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, | |||
2857 | struct inode *inode = file->f_mapping->host; | 3002 | struct inode *inode = file->f_mapping->host; |
2858 | ssize_t ret; | 3003 | ssize_t ret; |
2859 | 3004 | ||
3005 | /* | ||
3006 | * If we are doing data journalling we don't support O_DIRECT | ||
3007 | */ | ||
3008 | if (ext4_should_journal_data(inode)) | ||
3009 | return 0; | ||
3010 | |||
2860 | trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); | 3011 | trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw); |
2861 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 3012 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
2862 | ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); | 3013 | ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); |
@@ -2926,6 +3077,7 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
2926 | .bmap = ext4_bmap, | 3077 | .bmap = ext4_bmap, |
2927 | .invalidatepage = ext4_invalidatepage, | 3078 | .invalidatepage = ext4_invalidatepage, |
2928 | .releasepage = ext4_releasepage, | 3079 | .releasepage = ext4_releasepage, |
3080 | .direct_IO = ext4_direct_IO, | ||
2929 | .is_partially_uptodate = block_is_partially_uptodate, | 3081 | .is_partially_uptodate = block_is_partially_uptodate, |
2930 | .error_remove_page = generic_error_remove_page, | 3082 | .error_remove_page = generic_error_remove_page, |
2931 | }; | 3083 | }; |
@@ -2962,6 +3114,227 @@ void ext4_set_aops(struct inode *inode) | |||
2962 | inode->i_mapping->a_ops = &ext4_journalled_aops; | 3114 | inode->i_mapping->a_ops = &ext4_journalled_aops; |
2963 | } | 3115 | } |
2964 | 3116 | ||
3117 | |||
3118 | /* | ||
3119 | * ext4_discard_partial_page_buffers() | ||
3120 | * Wrapper function for ext4_discard_partial_page_buffers_no_lock. | ||
3121 | * This function finds and locks the page containing the offset | ||
3122 | * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. | ||
3123 | * Calling functions that already have the page locked should call | ||
3124 | * ext4_discard_partial_page_buffers_no_lock directly. | ||
3125 | */ | ||
3126 | int ext4_discard_partial_page_buffers(handle_t *handle, | ||
3127 | struct address_space *mapping, loff_t from, | ||
3128 | loff_t length, int flags) | ||
3129 | { | ||
3130 | struct inode *inode = mapping->host; | ||
3131 | struct page *page; | ||
3132 | int err = 0; | ||
3133 | |||
3134 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | ||
3135 | mapping_gfp_mask(mapping) & ~__GFP_FS); | ||
3136 | if (!page) | ||
3137 | return -ENOMEM; | ||
3138 | |||
3139 | err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page, | ||
3140 | from, length, flags); | ||
3141 | |||
3142 | unlock_page(page); | ||
3143 | page_cache_release(page); | ||
3144 | return err; | ||
3145 | } | ||
3146 | |||
3147 | /* | ||
3148 | * ext4_discard_partial_page_buffers_no_lock() | ||
3149 | * Zeros a page range of length 'length' starting from offset 'from'. | ||
3150 | * Buffer heads that correspond to the block aligned regions of the | ||
3151 | * zeroed range will be unmapped. Unblock aligned regions | ||
3152 | * will have the corresponding buffer head mapped if needed so that | ||
3153 | * that region of the page can be updated with the partial zero out. | ||
3154 | * | ||
3155 | * This function assumes that the page has already been locked. The | ||
3156 | * The range to be discarded must be contained with in the given page. | ||
3157 | * If the specified range exceeds the end of the page it will be shortened | ||
3158 | * to the end of the page that corresponds to 'from'. This function is | ||
3159 | * appropriate for updating a page and it buffer heads to be unmapped and | ||
3160 | * zeroed for blocks that have been either released, or are going to be | ||
3161 | * released. | ||
3162 | * | ||
3163 | * handle: The journal handle | ||
3164 | * inode: The files inode | ||
3165 | * page: A locked page that contains the offset "from" | ||
3166 | * from: The starting byte offset (from the begining of the file) | ||
3167 | * to begin discarding | ||
3168 | * len: The length of bytes to discard | ||
3169 | * flags: Optional flags that may be used: | ||
3170 | * | ||
3171 | * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED | ||
3172 | * Only zero the regions of the page whose buffer heads | ||
3173 | * have already been unmapped. This flag is appropriate | ||
3174 | * for updateing the contents of a page whose blocks may | ||
3175 | * have already been released, and we only want to zero | ||
3176 | * out the regions that correspond to those released blocks. | ||
3177 | * | ||
3178 | * Returns zero on sucess or negative on failure. | ||
3179 | */ | ||
3180 | int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, | ||
3181 | struct inode *inode, struct page *page, loff_t from, | ||
3182 | loff_t length, int flags) | ||
3183 | { | ||
3184 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | ||
3185 | unsigned int offset = from & (PAGE_CACHE_SIZE-1); | ||
3186 | unsigned int blocksize, max, pos; | ||
3187 | ext4_lblk_t iblock; | ||
3188 | struct buffer_head *bh; | ||
3189 | int err = 0; | ||
3190 | |||
3191 | blocksize = inode->i_sb->s_blocksize; | ||
3192 | max = PAGE_CACHE_SIZE - offset; | ||
3193 | |||
3194 | if (index != page->index) | ||
3195 | return -EINVAL; | ||
3196 | |||
3197 | /* | ||
3198 | * correct length if it does not fall between | ||
3199 | * 'from' and the end of the page | ||
3200 | */ | ||
3201 | if (length > max || length < 0) | ||
3202 | length = max; | ||
3203 | |||
3204 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | ||
3205 | |||
3206 | if (!page_has_buffers(page)) { | ||
3207 | /* | ||
3208 | * If the range to be discarded covers a partial block | ||
3209 | * we need to get the page buffers. This is because | ||
3210 | * partial blocks cannot be released and the page needs | ||
3211 | * to be updated with the contents of the block before | ||
3212 | * we write the zeros on top of it. | ||
3213 | */ | ||
3214 | if ((from & (blocksize - 1)) || | ||
3215 | ((from + length) & (blocksize - 1))) { | ||
3216 | create_empty_buffers(page, blocksize, 0); | ||
3217 | } else { | ||
3218 | /* | ||
3219 | * If there are no partial blocks, | ||
3220 | * there is nothing to update, | ||
3221 | * so we can return now | ||
3222 | */ | ||
3223 | return 0; | ||
3224 | } | ||
3225 | } | ||
3226 | |||
3227 | /* Find the buffer that contains "offset" */ | ||
3228 | bh = page_buffers(page); | ||
3229 | pos = blocksize; | ||
3230 | while (offset >= pos) { | ||
3231 | bh = bh->b_this_page; | ||
3232 | iblock++; | ||
3233 | pos += blocksize; | ||
3234 | } | ||
3235 | |||
3236 | pos = offset; | ||
3237 | while (pos < offset + length) { | ||
3238 | unsigned int end_of_block, range_to_discard; | ||
3239 | |||
3240 | err = 0; | ||
3241 | |||
3242 | /* The length of space left to zero and unmap */ | ||
3243 | range_to_discard = offset + length - pos; | ||
3244 | |||
3245 | /* The length of space until the end of the block */ | ||
3246 | end_of_block = blocksize - (pos & (blocksize-1)); | ||
3247 | |||
3248 | /* | ||
3249 | * Do not unmap or zero past end of block | ||
3250 | * for this buffer head | ||
3251 | */ | ||
3252 | if (range_to_discard > end_of_block) | ||
3253 | range_to_discard = end_of_block; | ||
3254 | |||
3255 | |||
3256 | /* | ||
3257 | * Skip this buffer head if we are only zeroing unampped | ||
3258 | * regions of the page | ||
3259 | */ | ||
3260 | if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED && | ||
3261 | buffer_mapped(bh)) | ||
3262 | goto next; | ||
3263 | |||
3264 | /* If the range is block aligned, unmap */ | ||
3265 | if (range_to_discard == blocksize) { | ||
3266 | clear_buffer_dirty(bh); | ||
3267 | bh->b_bdev = NULL; | ||
3268 | clear_buffer_mapped(bh); | ||
3269 | clear_buffer_req(bh); | ||
3270 | clear_buffer_new(bh); | ||
3271 | clear_buffer_delay(bh); | ||
3272 | clear_buffer_unwritten(bh); | ||
3273 | clear_buffer_uptodate(bh); | ||
3274 | zero_user(page, pos, range_to_discard); | ||
3275 | BUFFER_TRACE(bh, "Buffer discarded"); | ||
3276 | goto next; | ||
3277 | } | ||
3278 | |||
3279 | /* | ||
3280 | * If this block is not completely contained in the range | ||
3281 | * to be discarded, then it is not going to be released. Because | ||
3282 | * we need to keep this block, we need to make sure this part | ||
3283 | * of the page is uptodate before we modify it by writeing | ||
3284 | * partial zeros on it. | ||
3285 | */ | ||
3286 | if (!buffer_mapped(bh)) { | ||
3287 | /* | ||
3288 | * Buffer head must be mapped before we can read | ||
3289 | * from the block | ||
3290 | */ | ||
3291 | BUFFER_TRACE(bh, "unmapped"); | ||
3292 | ext4_get_block(inode, iblock, bh, 0); | ||
3293 | /* unmapped? It's a hole - nothing to do */ | ||
3294 | if (!buffer_mapped(bh)) { | ||
3295 | BUFFER_TRACE(bh, "still unmapped"); | ||
3296 | goto next; | ||
3297 | } | ||
3298 | } | ||
3299 | |||
3300 | /* Ok, it's mapped. Make sure it's up-to-date */ | ||
3301 | if (PageUptodate(page)) | ||
3302 | set_buffer_uptodate(bh); | ||
3303 | |||
3304 | if (!buffer_uptodate(bh)) { | ||
3305 | err = -EIO; | ||
3306 | ll_rw_block(READ, 1, &bh); | ||
3307 | wait_on_buffer(bh); | ||
3308 | /* Uhhuh. Read error. Complain and punt.*/ | ||
3309 | if (!buffer_uptodate(bh)) | ||
3310 | goto next; | ||
3311 | } | ||
3312 | |||
3313 | if (ext4_should_journal_data(inode)) { | ||
3314 | BUFFER_TRACE(bh, "get write access"); | ||
3315 | err = ext4_journal_get_write_access(handle, bh); | ||
3316 | if (err) | ||
3317 | goto next; | ||
3318 | } | ||
3319 | |||
3320 | zero_user(page, pos, range_to_discard); | ||
3321 | |||
3322 | err = 0; | ||
3323 | if (ext4_should_journal_data(inode)) { | ||
3324 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
3325 | } else | ||
3326 | mark_buffer_dirty(bh); | ||
3327 | |||
3328 | BUFFER_TRACE(bh, "Partial buffer zeroed"); | ||
3329 | next: | ||
3330 | bh = bh->b_this_page; | ||
3331 | iblock++; | ||
3332 | pos += range_to_discard; | ||
3333 | } | ||
3334 | |||
3335 | return err; | ||
3336 | } | ||
3337 | |||
2965 | /* | 3338 | /* |
2966 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' | 3339 | * ext4_block_truncate_page() zeroes out a mapping from file offset `from' |
2967 | * up to the end of the block which corresponds to `from'. | 3340 | * up to the end of the block which corresponds to `from'. |
@@ -3004,7 +3377,7 @@ int ext4_block_zero_page_range(handle_t *handle, | |||
3004 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, | 3377 | page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, |
3005 | mapping_gfp_mask(mapping) & ~__GFP_FS); | 3378 | mapping_gfp_mask(mapping) & ~__GFP_FS); |
3006 | if (!page) | 3379 | if (!page) |
3007 | return -EINVAL; | 3380 | return -ENOMEM; |
3008 | 3381 | ||
3009 | blocksize = inode->i_sb->s_blocksize; | 3382 | blocksize = inode->i_sb->s_blocksize; |
3010 | max = blocksize - (offset & (blocksize - 1)); | 3383 | max = blocksize - (offset & (blocksize - 1)); |
@@ -3073,11 +3446,8 @@ int ext4_block_zero_page_range(handle_t *handle, | |||
3073 | err = 0; | 3446 | err = 0; |
3074 | if (ext4_should_journal_data(inode)) { | 3447 | if (ext4_should_journal_data(inode)) { |
3075 | err = ext4_handle_dirty_metadata(handle, inode, bh); | 3448 | err = ext4_handle_dirty_metadata(handle, inode, bh); |
3076 | } else { | 3449 | } else |
3077 | if (ext4_should_order_data(inode) && EXT4_I(inode)->jinode) | ||
3078 | err = ext4_jbd2_file_inode(handle, inode); | ||
3079 | mark_buffer_dirty(bh); | 3450 | mark_buffer_dirty(bh); |
3080 | } | ||
3081 | 3451 | ||
3082 | unlock: | 3452 | unlock: |
3083 | unlock_page(page); | 3453 | unlock_page(page); |
@@ -3118,6 +3488,11 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
3118 | return -ENOTSUPP; | 3488 | return -ENOTSUPP; |
3119 | } | 3489 | } |
3120 | 3490 | ||
3491 | if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) { | ||
3492 | /* TODO: Add support for bigalloc file systems */ | ||
3493 | return -ENOTSUPP; | ||
3494 | } | ||
3495 | |||
3121 | return ext4_ext_punch_hole(file, offset, length); | 3496 | return ext4_ext_punch_hole(file, offset, length); |
3122 | } | 3497 | } |
3123 | 3498 | ||
@@ -3301,7 +3676,7 @@ make_io: | |||
3301 | trace_ext4_load_inode(inode); | 3676 | trace_ext4_load_inode(inode); |
3302 | get_bh(bh); | 3677 | get_bh(bh); |
3303 | bh->b_end_io = end_buffer_read_sync; | 3678 | bh->b_end_io = end_buffer_read_sync; |
3304 | submit_bh(READ_META, bh); | 3679 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
3305 | wait_on_buffer(bh); | 3680 | wait_on_buffer(bh); |
3306 | if (!buffer_uptodate(bh)) { | 3681 | if (!buffer_uptodate(bh)) { |
3307 | EXT4_ERROR_INODE_BLOCK(inode, block, | 3682 | EXT4_ERROR_INODE_BLOCK(inode, block, |
@@ -3417,7 +3792,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
3417 | inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; | 3792 | inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; |
3418 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; | 3793 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; |
3419 | } | 3794 | } |
3420 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); | 3795 | set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); |
3421 | 3796 | ||
3422 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ | 3797 | ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ |
3423 | ei->i_dir_start_lookup = 0; | 3798 | ei->i_dir_start_lookup = 0; |
@@ -4419,6 +4794,7 @@ retry_alloc: | |||
4419 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { | 4794 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { |
4420 | unlock_page(page); | 4795 | unlock_page(page); |
4421 | ret = VM_FAULT_SIGBUS; | 4796 | ret = VM_FAULT_SIGBUS; |
4797 | ext4_journal_stop(handle); | ||
4422 | goto out; | 4798 | goto out; |
4423 | } | 4799 | } |
4424 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 4800 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |