Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/vfs_addr.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 10
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/btrfs/async-thread.c | 4
-rw-r--r--  fs/btrfs/async-thread.h | 2
-rw-r--r--  fs/btrfs/backref.c | 4
-rw-r--r--  fs/btrfs/btrfs_inode.h | 14
-rw-r--r--  fs/btrfs/check-integrity.c | 9
-rw-r--r--  fs/btrfs/compression.c | 4
-rw-r--r--  fs/btrfs/compression.h | 4
-rw-r--r--  fs/btrfs/ctree.c | 62
-rw-r--r--  fs/btrfs/ctree.h | 46
-rw-r--r--  fs/btrfs/delayed-inode.c | 9
-rw-r--r--  fs/btrfs/delayed-ref.c | 22
-rw-r--r--  fs/btrfs/delayed-ref.h | 10
-rw-r--r--  fs/btrfs/dev-replace.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 570
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 476
-rw-r--r--  fs/btrfs/extent_io.c | 5
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/file-item.c | 6
-rw-r--r--  fs/btrfs/file.c | 65
-rw-r--r--  fs/btrfs/free-space-cache.c | 301
-rw-r--r--  fs/btrfs/free-space-cache.h | 9
-rw-r--r--  fs/btrfs/inode-map.c | 2
-rw-r--r--  fs/btrfs/inode.c | 146
-rw-r--r--  fs/btrfs/ioctl.c | 33
-rw-r--r--  fs/btrfs/lzo.c | 2
-rw-r--r--  fs/btrfs/math.h | 6
-rw-r--r--  fs/btrfs/props.c | 2
-rw-r--r--  fs/btrfs/qgroup.c | 348
-rw-r--r--  fs/btrfs/qgroup.h | 3
-rw-r--r--  fs/btrfs/raid56.c | 16
-rw-r--r--  fs/btrfs/relocation.c | 11
-rw-r--r--  fs/btrfs/scrub.c | 25
-rw-r--r--  fs/btrfs/send.c | 83
-rw-r--r--  fs/btrfs/super.c | 23
-rw-r--r--  fs/btrfs/sysfs.c | 2
-rw-r--r--  fs/btrfs/sysfs.h | 22
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 54
-rw-r--r--  fs/btrfs/transaction.h | 12
-rw-r--r--  fs/btrfs/tree-log.c | 382
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 140
-rw-r--r--  fs/btrfs/volumes.h | 3
-rw-r--r--  fs/btrfs/xattr.c | 53
-rw-r--r--  fs/btrfs/zlib.c | 2
-rw-r--r--  fs/ceph/addr.c | 38
-rw-r--r--  fs/ceph/caps.c | 51
-rw-r--r--  fs/ceph/dir.c | 48
-rw-r--r--  fs/ceph/mds_client.c | 61
-rw-r--r--  fs/ceph/strings.c | 1
-rw-r--r--  fs/ceph/super.c | 56
-rw-r--r--  fs/ceph/super.h | 4
-rw-r--r--  fs/ceph/xattr.c | 23
-rw-r--r--  fs/exec.c | 76
-rw-r--r--  fs/ext4/Kconfig | 17
-rw-r--r--  fs/ext4/Makefile | 4
-rw-r--r--  fs/ext4/acl.c | 5
-rw-r--r--  fs/ext4/balloc.c | 3
-rw-r--r--  fs/ext4/bitmap.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/crypto.c | 558
-rw-r--r--  fs/ext4/crypto_fname.c | 709
-rw-r--r--  fs/ext4/crypto_key.c | 165
-rw-r--r--  fs/ext4/crypto_policy.c | 194
-rw-r--r--  fs/ext4/dir.c | 81
-rw-r--r--  fs/ext4/ext4.h | 169
-rw-r--r--  fs/ext4/ext4_crypto.h | 147
-rw-r--r--  fs/ext4/extents.c | 81
-rw-r--r--  fs/ext4/extents_status.c | 2
-rw-r--r--  fs/ext4/file.c | 19
-rw-r--r--  fs/ext4/fsync.c | 1
-rw-r--r--  fs/ext4/hash.c | 1
-rw-r--r--  fs/ext4/ialloc.c | 28
-rw-r--r--  fs/ext4/inline.c | 16
-rw-r--r--  fs/ext4/inode.c | 130
-rw-r--r--  fs/ext4/ioctl.c | 86
-rw-r--r--  fs/ext4/namei.c | 637
-rw-r--r--  fs/ext4/page-io.c | 47
-rw-r--r--  fs/ext4/readpage.c | 328
-rw-r--r--  fs/ext4/super.c | 56
-rw-r--r--  fs/ext4/symlink.c | 97
-rw-r--r--  fs/ext4/xattr.c | 4
-rw-r--r--  fs/ext4/xattr.h | 3
-rw-r--r--  fs/f2fs/Kconfig | 2
-rw-r--r--  fs/f2fs/acl.c | 14
-rw-r--r--  fs/f2fs/checkpoint.c | 38
-rw-r--r--  fs/f2fs/data.c | 742
-rw-r--r--  fs/f2fs/debug.c | 22
-rw-r--r--  fs/f2fs/dir.c | 93
-rw-r--r--  fs/f2fs/f2fs.h | 174
-rw-r--r--  fs/f2fs/file.c | 64
-rw-r--r--  fs/f2fs/gc.c | 6
-rw-r--r--  fs/f2fs/inline.c | 69
-rw-r--r--  fs/f2fs/inode.c | 25
-rw-r--r--  fs/f2fs/namei.c | 81
-rw-r--r--  fs/f2fs/node.c | 18
-rw-r--r--  fs/f2fs/node.h | 1
-rw-r--r--  fs/f2fs/recovery.c | 76
-rw-r--r--  fs/f2fs/segment.c | 17
-rw-r--r--  fs/f2fs/segment.h | 3
-rw-r--r--  fs/f2fs/super.c | 40
-rw-r--r--  fs/f2fs/xattr.c | 4
-rw-r--r--  fs/fs_pin.c | 4
-rw-r--r--  fs/jffs2/xattr.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/namespace.c | 142
-rw-r--r--  fs/nfsd/Kconfig | 2
-rw-r--r--  fs/nfsd/export.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 50
-rw-r--r--  fs/nfsd/nfs4proc.c | 12
-rw-r--r--  fs/nfsd/nfs4state.c | 5
-rw-r--r--  fs/nfsd/nfs4xdr.c | 16
-rw-r--r--  fs/nfsd/nfsctl.c | 16
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/xdr4.h | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 42
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/super.c | 9
-rw-r--r--  fs/open.c | 8
-rw-r--r--  fs/pnode.c | 60
-rw-r--r--  fs/pnode.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 104
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 150
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 554
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 14
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 39
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 62
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 48
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 20
-rw-r--r--  fs/xfs/xfs_aops.c | 270
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 3
-rw-r--r--  fs/xfs/xfs_attr_list.c | 9
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 164
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 4
-rw-r--r--  fs/xfs/xfs_discard.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 2
-rw-r--r--  fs/xfs/xfs_error.h | 8
-rw-r--r--  fs/xfs/xfs_file.c | 161
-rw-r--r--  fs/xfs/xfs_filestream.c | 2
-rw-r--r--  fs/xfs/xfs_fsops.c | 20
-rw-r--r--  fs/xfs/xfs_icache.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 558
-rw-r--r--  fs/xfs/xfs_inode.h | 49
-rw-r--r--  fs/xfs/xfs_ioctl.c | 7
-rw-r--r--  fs/xfs/xfs_iomap.c | 3
-rw-r--r--  fs/xfs/xfs_iops.c | 91
-rw-r--r--  fs/xfs/xfs_iops.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 2
-rw-r--r--  fs/xfs/xfs_linux.h | 9
-rw-r--r--  fs/xfs/xfs_log_recover.c | 4
-rw-r--r--  fs/xfs/xfs_mount.c | 918
-rw-r--r--  fs/xfs/xfs_mount.h | 95
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_pnfs.c | 7
-rw-r--r--  fs/xfs/xfs_pnfs.h | 5
-rw-r--r--  fs/xfs/xfs_qm.c | 13
-rw-r--r--  fs/xfs/xfs_super.c | 132
-rw-r--r--  fs/xfs/xfs_super.h | 2
-rw-r--r--  fs/xfs/xfs_symlink.c | 58
-rw-r--r--  fs/xfs/xfs_trace.h | 29
-rw-r--r--  fs/xfs/xfs_trans.c | 234
171 files changed, 9130 insertions(+), 3632 deletions(-)
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 099c7712631c..fb9ffcb43277 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -78,7 +78,6 @@ enum p9_cache_modes {
  * @cache: cache mode of type &p9_cache_modes
  * @cachetag: the tag of the cache associated with this session
  * @fscache: session cookie associated with FS-Cache
- * @options: copy of options string given by user
  * @uname: string user name to mount hierarchy as
  * @aname: mount specifier for remote hierarchy
  * @maxdata: maximum data to be sent/recvd per protocol message
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index be35d05a4d0e..e9e04376c52c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -231,9 +231,7 @@ static int v9fs_launder_page(struct page *page)
 /**
  * v9fs_direct_IO - 9P address space operation for direct I/O
  * @iocb: target I/O control block
- * @iov: array of vectors that define I/O buffer
  * @pos: offset in file to begin the operation
- * @nr_segs: size of iovec array
  *
  * The presence of v9fs_direct_IO() in the address space ops vector
  * allowes open() O_DIRECT flags which would have failed otherwise.
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2a9dd37dc426..1ef16bd8280b 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -151,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	struct p9_flock flock;
 	struct p9_fid *fid;
-	uint8_t status;
+	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
 	unsigned char fl_type;
 
@@ -196,7 +196,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	for (;;) {
 		res = p9_client_lock_dotl(fid, &flock, &status);
 		if (res < 0)
-			break;
+			goto out_unlock;
 
 		if (status != P9_LOCK_BLOCKED)
 			break;
@@ -214,14 +214,16 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	case P9_LOCK_BLOCKED:
 		res = -EAGAIN;
 		break;
+	default:
+		WARN_ONCE(1, "unknown lock status code: %d\n", status);
+		/* fallthough */
 	case P9_LOCK_ERROR:
 	case P9_LOCK_GRACE:
 		res = -ENOLCK;
 		break;
-	default:
-		BUG();
 	}
 
+out_unlock:
 	/*
 	 * incase server returned error for lock request, revert
 	 * it locally
diff --git a/fs/Kconfig b/fs/Kconfig
index ec35851e5b71..011f43365d7b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -32,6 +32,7 @@ source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
+source "fs/f2fs/Kconfig"
 
 config FS_DAX
 	bool "Direct Access (DAX) support"
@@ -217,7 +218,6 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/f2fs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..df9932b00d08 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
 
 static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -132,7 +132,7 @@ static inline void
 __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh)
 {
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e386c29ef1f6..ec2ee477f8ba 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh);
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f55721ff9385..9de772ee0031 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 	struct ulist *roots = NULL;
 	struct ulist_iterator uiter;
 	struct ulist_node *node;
-	struct seq_list elem = {};
+	struct seq_list elem = SEQ_LIST_INIT(elem);
 	int ret = 0;
 
 	tmp = ulist_alloc(GFP_NOFS);
@@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list tree_mod_seq_elem = {};
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
-	/* Lock for counters */
+	/*
+	 * Lock for counters and all fields used to determine if the inode is in
+	 * the log or not (last_trans, last_sub_trans, last_log_commit,
+	 * logged_trans).
+	 */
 	spinlock_t lock;
 
 	/* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
+	int ret = 0;
+
+	spin_lock(&BTRFS_I(inode)->lock);
 	if (BTRFS_I(inode)->logged_trans == generation &&
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	 */
 	smp_mb();
 	if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
-		return 1;
+		ret = 1;
 	}
-	return 0;
+	spin_unlock(&BTRFS_I(inode)->lock);
+	return ret;
 }
 
 #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..ce7dec88f4b8 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 				       (unsigned long long)bio->bi_iter.bi_sector,
 				       dev_bytenr, bio->bi_bdev);
 
-		mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
-				       GFP_NOFS);
+		mapped_datav = kmalloc_array(bio->bi_vcnt,
+					     sizeof(*mapped_datav), GFP_NOFS);
 		if (!mapped_datav)
 			goto leave;
 		cur_bytenr = dev_bytenr;
@@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	if (is_vmalloc_addr(state))
-		vfree(state);
-	else
-		kfree(state);
+	kvfree(state);
 }
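
The btrfsic_unmount() hunk replaces the open-coded is_vmalloc_addr()/vfree()/kfree() branch with kvfree(), which inspects the pointer itself and frees via the right allocator. A minimal sketch of the idiom, with a hypothetical state_alloc()/state_free() pair:

	#include <linux/mm.h>      /* kvfree() */
	#include <linux/slab.h>    /* kzalloc() */
	#include <linux/vmalloc.h> /* vzalloc() */

	static void *state_alloc(size_t size)
	{
		/* try the cheap physically-contiguous allocation first */
		void *state = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!state)
			state = vzalloc(size); /* fall back for large sizes */
		return state;
	}

	static void state_free(void *state)
	{
		kvfree(state); /* handles both kmalloc and vmalloc memory */
	}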
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..ce62324c78e7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->orig_bio = bio;
 
 	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
-	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
 		goto fail1;
@@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-static struct btrfs_compress_op *btrfs_compress_op[] = {
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 };
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d181f70caae0..13a4dc0436c9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -77,7 +77,7 @@ struct btrfs_compress_op {
 			  size_t srclen, size_t destlen);
 };
 
-extern struct btrfs_compress_op btrfs_zlib_compress;
-extern struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
 
 #endif
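
Constifying btrfs_compress_op (both the extern declarations here and the pointer table in compression.c) lets the compiler place the function-pointer structures in read-only data, so they cannot be overwritten at runtime. A minimal sketch of the pattern, using hypothetical demo_* names:

	struct demo_ops {
		int (*compress)(const void *src, void *dst);
	};

	extern const struct demo_ops demo_zlib_ops;  /* struct is read-only */

	/* both the array of pointers and the structs they point to
	 * end up in .rodata */
	static const struct demo_ops * const demo_ops_table[] = {
		&demo_zlib_ops,
	};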
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d67f32e648d..0f11ebc92f02 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	if (!tree_mod_need_log(fs_info, eb))
 		return 0;
 
-	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
 	if (!tm_list)
 		return -ENOMEM;
 
@@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 
 	if (log_removal && btrfs_header_level(old_root) > 0) {
 		nritems = btrfs_header_nritems(old_root);
-		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 				  flags);
 		if (!tm_list) {
 			ret = -ENOMEM;
@@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 		return 0;
 
-	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 			  GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
@@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 		return 0;
 
 	nritems = btrfs_header_nritems(eb);
-	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
-			  GFP_NOFS);
+	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
 
@@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		clean_tree_block(trans, root, buf);
+		clean_tree_block(trans, root->fs_info, buf);
 		*last_ref = 1;
 	}
 	return 0;
@@ -1678,7 +1677,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		cur = btrfs_find_tree_block(root, blocknr);
+		cur = btrfs_find_tree_block(root->fs_info, blocknr);
 		if (cur)
 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
 		else
@@ -1943,7 +1942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
-		clean_tree_block(trans, root, mid);
+		clean_tree_block(trans, root->fs_info, mid);
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
@@ -1997,7 +1996,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			clean_tree_block(trans, root, right);
+			clean_tree_block(trans, root->fs_info, right);
 			btrfs_tree_unlock(right);
 			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
@@ -2041,7 +2040,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			BUG_ON(wret == 1);
 		}
 		if (btrfs_header_nritems(mid) == 0) {
-			clean_tree_block(trans, root, mid);
+			clean_tree_block(trans, root->fs_info, mid);
 			btrfs_tree_unlock(mid);
 			del_ptr(root, path, level + 1, pslot);
 			root_sub_used(root, mid->len);
@@ -2259,7 +2258,7 @@ static void reada_for_search(struct btrfs_root *root,
 
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = root->nodesize;
-	eb = btrfs_find_tree_block(root, search);
+	eb = btrfs_find_tree_block(root->fs_info, search);
 	if (eb) {
 		free_extent_buffer(eb);
 		return;
@@ -2319,7 +2318,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot > 0) {
 		block1 = btrfs_node_blockptr(parent, slot - 1);
 		gen = btrfs_node_ptr_generation(parent, slot - 1);
-		eb = btrfs_find_tree_block(root, block1);
+		eb = btrfs_find_tree_block(root->fs_info, block1);
 		/*
 		 * if we get -eagain from btrfs_buffer_uptodate, we
 		 * don't want to return eagain here. That will loop
@@ -2332,7 +2331,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
-		eb = btrfs_find_tree_block(root, block2);
+		eb = btrfs_find_tree_block(root->fs_info, block2);
 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block2 = 0;
 		free_extent_buffer(eb);
@@ -2450,7 +2449,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocknr = btrfs_node_blockptr(b, slot);
 	gen = btrfs_node_ptr_generation(b, slot);
 
-	tmp = btrfs_find_tree_block(root, blocknr);
+	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -3126,7 +3125,8 @@ again:
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+			   struct btrfs_path *path,
 			   struct btrfs_disk_key *key, int level)
 {
 	int i;
@@ -3137,7 +3137,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3151,7 +3151,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key)
 {
 	struct btrfs_disk_key disk_key;
@@ -3173,7 +3174,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_item_key(eb, &disk_key, slot);
 	btrfs_mark_buffer_dirty(eb);
 	if (slot == 0)
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(fs_info, path, &disk_key, 1);
 }
 
 /*
@@ -3692,7 +3693,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
 	else
-		clean_tree_block(trans, root, left);
+		clean_tree_block(trans, root->fs_info, left);
 
 	btrfs_mark_buffer_dirty(right);
 
@@ -3704,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
 		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
+			clean_tree_block(trans, root->fs_info, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
@@ -3928,10 +3929,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 	else
-		clean_tree_block(trans, root, right);
+		clean_tree_block(trans, root->fs_info, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	fixup_low_keys(root, path, &disk_key, 1);
+	fixup_low_keys(root->fs_info, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -4168,6 +4169,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret = 0;
 	int wret;
 	int split;
@@ -4271,10 +4273,10 @@ again:
 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
-	write_extent_buffer(right, root->fs_info->fsid,
+	write_extent_buffer(right, fs_info->fsid,
 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
-	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+	write_extent_buffer(right, fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
@@ -4297,7 +4299,7 @@ again:
 		path->nodes[0] = right;
 		path->slots[0] = 0;
 		if (path->slots[1] == 0)
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(fs_info, path, &disk_key, 1);
 	}
 	btrfs_mark_buffer_dirty(right);
 	return ret;
@@ -4615,7 +4617,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 
 	item = btrfs_item_nr(slot);
@@ -4716,7 +4718,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (path->slots[0] == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 
@@ -4888,7 +4890,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		fixup_low_keys(root, path, &disk_key, level + 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
 	}
 	btrfs_mark_buffer_dirty(parent);
 }
@@ -4981,7 +4983,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			btrfs_set_path_blocking(path);
-			clean_tree_block(trans, root, leaf);
+			clean_tree_block(trans, root->fs_info, leaf);
 			btrfs_del_leaf(trans, root, path, leaf);
 		}
 	} else {
@@ -4990,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 		}
 
 		/* delete the leaf if it is mostly empty */
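
Several ctree.c hunks convert kzalloc(n * size, ...) into kcalloc(n, size, ...) (check-integrity.c makes the same move with kmalloc_array()). The functional difference is overflow safety: kcalloc() returns NULL when n * size would wrap around, instead of silently allocating a short buffer. A hedged sketch of the pattern, with a hypothetical alloc_tm_list() wrapper:

	#include <linux/slab.h>

	struct tree_mod_elem;

	static struct tree_mod_elem **alloc_tm_list(int nr_items, gfp_t flags)
	{
		/*
		 * kzalloc(nr_items * sizeof(ptr), flags) can wrap for huge
		 * nr_items and hand back a too-small zeroed buffer;
		 * kcalloc() fails cleanly with NULL on overflow.
		 */
		return kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	}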
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9c89cae39ee..6f364e1d8d3d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1061,6 +1061,12 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+#define BTRFS_QGROUP_LEVEL_SHIFT	48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+	return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
 /*
  * is subvolume quota turned on?
  */
@@ -1256,6 +1262,20 @@ struct btrfs_caching_control {
 	atomic_t count;
 };
 
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	struct inode *inode;
+	unsigned long size;
+	int index;
+	int num_pages;
+	int entries;
+	int bitmaps;
+	unsigned check_crcs:1;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache {
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
 };
 
 /* delayed seq elem */
@@ -1329,6 +1352,8 @@ struct seq_list {
 	u64 seq;
 };
 
+#define SEQ_LIST_INIT(name)	{ .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
 enum btrfs_orphan_cleanup_state {
 	ORPHAN_CLEANUP_STARTED	= 1,
 	ORPHAN_CLEANUP_DONE	= 2,
@@ -1472,6 +1497,12 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
 
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
 	/* this is used during read/modify/write to make sure
 	 * no two ios are trying to mod the same stripe at the same
 	 * time
@@ -1513,6 +1544,7 @@ struct btrfs_fs_info {
 
 	spinlock_t delayed_iput_lock;
 	struct list_head delayed_iputs;
+	struct rw_semaphore delayed_iput_sem;
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
@@ -3295,6 +3327,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 }
 
 /* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
@@ -3385,6 +3420,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -3417,7 +3454,7 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root);
@@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush);
@@ -3486,7 +3524,8 @@ int btrfs_previous_item(struct btrfs_root *root,
 			 int type);
 int btrfs_previous_extent_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
@@ -4180,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
-	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+	    ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+	     !btrfs_qgroup_level(rootid)))
 		return 1;
 	return 0;
 }
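
For context on the new btrfs_qgroup_level() helper and the tightened is_fstree() check: a qgroupid packs the qgroup level into the top 16 bits of the u64 and the subvolume/object id into the low 48 bits, so an id with a nonzero level can no longer be mistaken for a subvolume tree. A small sketch of the encoding (make_qgroupid() is hypothetical):

	#include <linux/types.h>

	#define BTRFS_QGROUP_LEVEL_SHIFT 48

	static inline u64 make_qgroupid(u16 level, u64 id)
	{
		return ((u64)level << BTRFS_QGROUP_LEVEL_SHIFT) | id;
	}

	/*
	 * make_qgroupid(0, 257) tracks subvolume 257 directly;
	 * make_qgroupid(1, 100) is the level-1 group "1/100", for which
	 * btrfs_qgroup_level() returns 1 and is_fstree() now returns 0.
	 */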
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 82f0c7c95474..cde698a07d21 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1383,7 +1383,7 @@ out:
 
 
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int nr)
+				     struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_async_delayed_work *async_work;
 
@@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 			btrfs_async_run_delayed_root, NULL, NULL);
 	async_work->nr = nr;
 
-	btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 
 		seq = atomic_read(&delayed_root->items_seq);
 
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
 		if (ret)
 			return;
 
@@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 			return;
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
 }
 
 /* Will return 0 or -ENOMEM */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6d16bea94e1c..8f8ed7d20bac 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
  * existing and update must have the same bytenr
  */
 static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+			 struct btrfs_delayed_ref_node *existing,
 			 struct btrfs_delayed_ref_node *update)
 {
 	struct btrfs_delayed_ref_head *existing_ref;
 	struct btrfs_delayed_ref_head *ref;
+	int old_ref_mod;
 
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);
@@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	 * only need the lock for this case cause we could be processing it
 	 * currently, for refs we just added we know we're a-ok.
 	 */
+	old_ref_mod = existing_ref->total_ref_mod;
 	existing->ref_mod += update->ref_mod;
+	existing_ref->total_ref_mod += update->ref_mod;
+
+	/*
+	 * If we are going to from a positive ref mod to a negative or vice
+	 * versa we need to make sure to adjust pending_csums accordingly.
+	 */
+	if (existing_ref->is_data) {
+		if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+			delayed_refs->pending_csums -= existing->num_bytes;
+		if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+			delayed_refs->pending_csums += existing->num_bytes;
+	}
 	spin_unlock(&existing_ref->lock);
 }
 
@@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->is_data = is_data;
 	head_ref->ref_root = RB_ROOT;
 	head_ref->processing = 0;
+	head_ref->total_ref_mod = count_mod;
 
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
@@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
-		update_existing_head_ref(&existing->node, ref);
+		update_existing_head_ref(delayed_refs, &existing->node, ref);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
@@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		if (is_data && count_mod < 0)
+			delayed_refs->pending_csums += num_bytes;
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a764e2340d48..5eb0892396d0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head {
 	struct rb_node href_node;
 
 	struct btrfs_delayed_extent_op *extent_op;
+
+	/*
+	 * This is used to track the final ref_mod from all the refs associated
+	 * with this head ref, this is not adjusted as delayed refs are run,
+	 * this is meant to track if we need to do the csum accounting or not.
+	 */
+	int total_ref_mod;
+
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root {
 	/* total number of head nodes ready for processing */
 	unsigned long num_heads_ready;
 
+	u64 pending_csums;
+
 	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..0573848c7333 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 		srcdev = dev_replace->srcdev;
-		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
-			div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+		args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
 	btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data)
 	btrfs_dev_replace_status(fs_info, status_args);
 	progress = status_args->status.progress_1000;
 	kfree(status_args);
-	do_div(progress, 10);
+	progress = div_u64(progress, 10);
 	printk_in_rcu(KERN_INFO
 		"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 		dev_replace->srcdev->missing ? "<missing disk>" :
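
The dev-replace.c hunks move from do_div()/div64_u64() to div_u64(). do_div(x, base) is a macro that overwrites x with the quotient and returns the remainder, which is easy to misread; div_u64() is a plain function that returns the quotient and is cheaper than div64_u64() when the divisor fits in 32 bits. A hedged sketch of the two call styles (percent_done() is hypothetical):

	#include <linux/math64.h>

	static u64 percent_done(u64 done, u32 total)
	{
		u64 pct = done * 100;
		u32 rem;

		rem = do_div(pct, total);  /* pct now holds the quotient */
		(void)rem;                 /* remainder returned, often unused */

		/* equivalent, and harder to misuse: quotient is returned */
		return div_u64(done * 100, total);
	}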
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 639f2663ed3f..2ef9a4b72d06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,7 +54,7 @@
 #include <asm/cpufeature.h>
 #endif
 
-static struct extent_io_ops btree_extent_io_ops;
+static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result)
  * compute the csum for a btree block, and either verify it or write it
  * into the csum field of the block.
  */
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		offset += cur_len;
 	}
 	if (csum_size > sizeof(inline_result)) {
-		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+		result = kzalloc(csum_size, GFP_NOFS);
 		if (!result)
 			return 1;
 	} else {
@@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			printk_ratelimited(KERN_WARNING
 				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 				"level %d\n",
-				root->fs_info->sb->s_id, buf->start,
+				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
 				kfree(result);
@@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
 
 		if (memcmp(raw_disk_sb, result, csum_size))
 			ret = 1;
-
-		if (ret && btrfs_super_generation(disk_sb) < 10) {
-			printk(KERN_WARNING
-			       "BTRFS: super block crcs don't match, older mkfs detected\n");
-			ret = 0;
-		}
 	}
 
 	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
@@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
  * we only fill in the checksum field in the first page of a multi-page block
  */
 
-static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 {
 	u64 start = page_offset(page);
 	u64 found_start;
@@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	found_start = btrfs_header_bytenr(eb);
 	if (WARN_ON(found_start != start || !PageUptodate(page)))
 		return 0;
-	csum_tree_block(root, eb, 0);
+	csum_tree_block(fs_info, eb, 0);
 	return 0;
 }
 
-static int check_tree_block_fsid(struct btrfs_root *root,
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb)
 {
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
@@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 		ret = -EIO;
 		goto err;
 	}
-	if (check_tree_block_fsid(root, eb)) {
+	if (check_tree_block_fsid(root->fs_info, eb)) {
 		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
 			       eb->fs_info->sb->s_id, eb->start);
 		ret = -EIO;
@@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
 
-	ret = csum_tree_block(root, eb, 1);
+	ret = csum_tree_block(root->fs_info, eb, 1);
 	if (ret) {
 		ret = -EIO;
 		goto err;
@@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		ret = csum_dirty_buffer(root, bvec->bv_page);
+		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
 			break;
 	}
@@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 	return 0;
 }
 
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr)
 {
-	return find_extent_buffer(root->fs_info, bytenr);
+	return find_extent_buffer(fs_info, bytenr);
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -1165,11 +1160,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 }
 
-void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info,
 		      struct extent_buffer *buf)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
 	if (btrfs_header_generation(buf) ==
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
@@ -2146,6 +2140,267 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2146 } 2140 }
2147} 2141}
2148 2142
2143static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2144{
2145 mutex_init(&fs_info->scrub_lock);
2146 atomic_set(&fs_info->scrubs_running, 0);
2147 atomic_set(&fs_info->scrub_pause_req, 0);
2148 atomic_set(&fs_info->scrubs_paused, 0);
2149 atomic_set(&fs_info->scrub_cancel_req, 0);
2150 init_waitqueue_head(&fs_info->scrub_pause_wait);
2151 fs_info->scrub_workers_refcnt = 0;
2152}
2153
2154static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2155{
2156 spin_lock_init(&fs_info->balance_lock);
2157 mutex_init(&fs_info->balance_mutex);
2158 atomic_set(&fs_info->balance_running, 0);
2159 atomic_set(&fs_info->balance_pause_req, 0);
2160 atomic_set(&fs_info->balance_cancel_req, 0);
2161 fs_info->balance_ctl = NULL;
2162 init_waitqueue_head(&fs_info->balance_wait_q);
2163}
2164
2165static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
2166 struct btrfs_root *tree_root)
2167{
2168 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2169 set_nlink(fs_info->btree_inode, 1);
2170 /*
2171 * we set the i_size on the btree inode to the max possible int.
2172 * the real end of the address space is determined by all of
2173 * the devices in the system
2174 */
2175 fs_info->btree_inode->i_size = OFFSET_MAX;
2176 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2177
2178 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2179 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2180 fs_info->btree_inode->i_mapping);
2181 BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2182 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2183
2184 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2185
2186 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2187 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2188 sizeof(struct btrfs_key));
2189 set_bit(BTRFS_INODE_DUMMY,
2190 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2191 btrfs_insert_inode_hash(fs_info->btree_inode);
2192}
2193
2194static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2195{
2196 fs_info->dev_replace.lock_owner = 0;
2197 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2198 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2199 mutex_init(&fs_info->dev_replace.lock_management_lock);
2200 mutex_init(&fs_info->dev_replace.lock);
2201 init_waitqueue_head(&fs_info->replace_wait);
2202}
2203
2204static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2205{
2206 spin_lock_init(&fs_info->qgroup_lock);
2207 mutex_init(&fs_info->qgroup_ioctl_lock);
2208 fs_info->qgroup_tree = RB_ROOT;
2209 fs_info->qgroup_op_tree = RB_ROOT;
2210 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2211 fs_info->qgroup_seq = 1;
2212 fs_info->quota_enabled = 0;
2213 fs_info->pending_quota_state = 0;
2214 fs_info->qgroup_ulist = NULL;
2215 mutex_init(&fs_info->qgroup_rescan_lock);
2216}
2217
2218static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2219 struct btrfs_fs_devices *fs_devices)
2220{
2221 int max_active = fs_info->thread_pool_size;
2222 unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2223
2224 fs_info->workers =
2225 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2226 max_active, 16);
2227
2228 fs_info->delalloc_workers =
2229 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2230
2231 fs_info->flush_workers =
2232 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2233
2234 fs_info->caching_workers =
2235 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2236
2237 /*
2238 * a higher idle thresh on the submit workers makes it much more
2239 * likely that bios will be sent down in a sane order to the
2240 * devices
2241 */
2242 fs_info->submit_workers =
2243 btrfs_alloc_workqueue("submit", flags,
2244 min_t(u64, fs_devices->num_devices,
2245 max_active), 64);
2246
2247 fs_info->fixup_workers =
2248 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2249
2250 /*
2251 * endios are largely parallel and should have a very
2252 * low idle thresh
2253 */
2254 fs_info->endio_workers =
2255 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2256 fs_info->endio_meta_workers =
2257 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2258 fs_info->endio_meta_write_workers =
2259 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2260 fs_info->endio_raid56_workers =
2261 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2262 fs_info->endio_repair_workers =
2263 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2264 fs_info->rmw_workers =
2265 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2266 fs_info->endio_write_workers =
2267 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2268 fs_info->endio_freespace_worker =
2269 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2270 fs_info->delayed_workers =
2271 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2272 fs_info->readahead_workers =
2273 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2274 fs_info->qgroup_rescan_workers =
2275 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2276 fs_info->extent_workers =
2277 btrfs_alloc_workqueue("extent-refs", flags,
2278 min_t(u64, fs_devices->num_devices,
2279 max_active), 8);
2280
2281 if (!(fs_info->workers && fs_info->delalloc_workers &&
2282 fs_info->submit_workers && fs_info->flush_workers &&
2283 fs_info->endio_workers && fs_info->endio_meta_workers &&
2284 fs_info->endio_meta_write_workers &&
2285 fs_info->endio_repair_workers &&
2286 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2287 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2288 fs_info->caching_workers && fs_info->readahead_workers &&
2289 fs_info->fixup_workers && fs_info->delayed_workers &&
2290 fs_info->extent_workers &&
2291 fs_info->qgroup_rescan_workers)) {
2292 return -ENOMEM;
2293 }
2294
2295 return 0;
2296}
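Two sizing details in btrfs_init_workqueues deserve a note: the submit queue is capped at one active worker per device so bios reach each disk in order, and the trailing argument to btrfs_alloc_workqueue is the idle threshold the comments refer to (high for submit to encourage batching, low for the already-parallel endio queues). A userspace sketch of the device cap, mirroring the min_t(u64, ...) expression above:

#include <stdio.h>
#include <stdint.h>

/* sketch of how the submit queue is sized: at most one active worker
 * per device, capped by the mount-wide thread_pool_size */
static uint64_t submit_max_active(uint64_t num_devices, uint64_t max_active)
{
        return num_devices < max_active ? num_devices : max_active;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)submit_max_active(2, 8));  /* 2 */
        printf("%llu\n", (unsigned long long)submit_max_active(24, 8)); /* 8 */
        return 0;
}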
2297
2298static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2299 struct btrfs_fs_devices *fs_devices)
2300{
2301 int ret;
2302 struct btrfs_root *tree_root = fs_info->tree_root;
2303 struct btrfs_root *log_tree_root;
2304 struct btrfs_super_block *disk_super = fs_info->super_copy;
2305 u64 bytenr = btrfs_super_log_root(disk_super);
2306
2307 if (fs_devices->rw_devices == 0) {
2308 printk(KERN_WARNING "BTRFS: log replay required "
2309 "on RO media\n");
2310 return -EIO;
2311 }
2312
2313 log_tree_root = btrfs_alloc_root(fs_info);
2314 if (!log_tree_root)
2315 return -ENOMEM;
2316
2317 __setup_root(tree_root->nodesize, tree_root->sectorsize,
2318 tree_root->stripesize, log_tree_root, fs_info,
2319 BTRFS_TREE_LOG_OBJECTID);
2320
2321 log_tree_root->node = read_tree_block(tree_root, bytenr,
2322 fs_info->generation + 1);
2323 if (!log_tree_root->node ||
2324 !extent_buffer_uptodate(log_tree_root->node)) {
2325 printk(KERN_ERR "BTRFS: failed to read log tree\n");
2326 free_extent_buffer(log_tree_root->node);
2327 kfree(log_tree_root);
2328 return -EIO;
2329 }
2330 /* returns with log_tree_root freed on success */
2331 ret = btrfs_recover_log_trees(log_tree_root);
2332 if (ret) {
2333 btrfs_error(tree_root->fs_info, ret,
2334 "Failed to recover log tree");
2335 free_extent_buffer(log_tree_root->node);
2336 kfree(log_tree_root);
2337 return ret;
2338 }
2339
2340 if (fs_info->sb->s_flags & MS_RDONLY) {
2341 ret = btrfs_commit_super(tree_root);
2342 if (ret)
2343 return ret;
2344 }
2345
2346 return 0;
2347}
2348
2349static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
2350 struct btrfs_root *tree_root)
2351{
2352 struct btrfs_root *root;
2353 struct btrfs_key location;
2354 int ret;
2355
2356 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2357 location.type = BTRFS_ROOT_ITEM_KEY;
2358 location.offset = 0;
2359
2360 root = btrfs_read_tree_root(tree_root, &location);
2361 if (IS_ERR(root))
2362 return PTR_ERR(root);
2363 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2364 fs_info->extent_root = root;
2365
2366 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2367 root = btrfs_read_tree_root(tree_root, &location);
2368 if (IS_ERR(root))
2369 return PTR_ERR(root);
2370 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2371 fs_info->dev_root = root;
2372 btrfs_init_devices_late(fs_info);
2373
2374 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2375 root = btrfs_read_tree_root(tree_root, &location);
2376 if (IS_ERR(root))
2377 return PTR_ERR(root);
2378 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2379 fs_info->csum_root = root;
2380
2381 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2382 root = btrfs_read_tree_root(tree_root, &location);
2383 if (!IS_ERR(root)) {
2384 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2385 fs_info->quota_enabled = 1;
2386 fs_info->pending_quota_state = 1;
2387 fs_info->quota_root = root;
2388 }
2389
2390 location.objectid = BTRFS_UUID_TREE_OBJECTID;
2391 root = btrfs_read_tree_root(tree_root, &location);
2392 if (IS_ERR(root)) {
2393 ret = PTR_ERR(root);
2394 if (ret != -ENOENT)
2395 return ret;
2396 } else {
2397 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2398 fs_info->uuid_root = root;
2399 }
2400
2401 return 0;
2402}
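Note the three error policies in btrfs_read_roots above: the extent, dev and csum trees are mandatory and any read failure aborts the mount; the quota tree is fully optional (an error just leaves quotas off); and the UUID tree tolerates only -ENOENT, so that open_ctree can create it later. Condensed, the three shapes are:

/* mandatory tree: any error fails the mount */
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root))
        return PTR_ERR(root);

/* quota tree: best effort, success switches quotas on */
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root))
        fs_info->quota_root = root;

/* uuid tree: only "not there yet" is acceptable */
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root) && PTR_ERR(root) != -ENOENT)
        return PTR_ERR(root);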
2403
2149int open_ctree(struct super_block *sb, 2404int open_ctree(struct super_block *sb,
2150 struct btrfs_fs_devices *fs_devices, 2405 struct btrfs_fs_devices *fs_devices,
2151 char *options) 2406 char *options)
@@ -2160,21 +2415,12 @@ int open_ctree(struct super_block *sb,
2160 struct btrfs_super_block *disk_super; 2415 struct btrfs_super_block *disk_super;
2161 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 2416 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2162 struct btrfs_root *tree_root; 2417 struct btrfs_root *tree_root;
2163 struct btrfs_root *extent_root;
2164 struct btrfs_root *csum_root;
2165 struct btrfs_root *chunk_root; 2418 struct btrfs_root *chunk_root;
2166 struct btrfs_root *dev_root;
2167 struct btrfs_root *quota_root;
2168 struct btrfs_root *uuid_root;
2169 struct btrfs_root *log_tree_root;
2170 int ret; 2419 int ret;
2171 int err = -EINVAL; 2420 int err = -EINVAL;
2172 int num_backups_tried = 0; 2421 int num_backups_tried = 0;
2173 int backup_index = 0; 2422 int backup_index = 0;
2174 int max_active; 2423 int max_active;
2175 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2176 bool create_uuid_tree;
2177 bool check_uuid_tree;
2178 2424
2179 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2425 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2180 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2426 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2241,11 +2487,12 @@ int open_ctree(struct super_block *sb,
2241 spin_lock_init(&fs_info->qgroup_op_lock); 2487 spin_lock_init(&fs_info->qgroup_op_lock);
2242 spin_lock_init(&fs_info->buffer_lock); 2488 spin_lock_init(&fs_info->buffer_lock);
2243 spin_lock_init(&fs_info->unused_bgs_lock); 2489 spin_lock_init(&fs_info->unused_bgs_lock);
2244 mutex_init(&fs_info->unused_bg_unpin_mutex);
2245 rwlock_init(&fs_info->tree_mod_log_lock); 2490 rwlock_init(&fs_info->tree_mod_log_lock);
2491 mutex_init(&fs_info->unused_bg_unpin_mutex);
2246 mutex_init(&fs_info->reloc_mutex); 2492 mutex_init(&fs_info->reloc_mutex);
2247 mutex_init(&fs_info->delalloc_root_mutex); 2493 mutex_init(&fs_info->delalloc_root_mutex);
2248 seqlock_init(&fs_info->profiles_lock); 2494 seqlock_init(&fs_info->profiles_lock);
2495 init_rwsem(&fs_info->delayed_iput_sem);
2249 2496
2250 init_completion(&fs_info->kobj_unregister); 2497 init_completion(&fs_info->kobj_unregister);
2251 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2498 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2276,7 +2523,7 @@ int open_ctree(struct super_block *sb,
2276 fs_info->free_chunk_space = 0; 2523 fs_info->free_chunk_space = 0;
2277 fs_info->tree_mod_log = RB_ROOT; 2524 fs_info->tree_mod_log = RB_ROOT;
2278 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 2525 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2279 fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); 2526 fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2280 /* readahead state */ 2527 /* readahead state */
2281 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2528 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2282 spin_lock_init(&fs_info->reada_lock); 2529 spin_lock_init(&fs_info->reada_lock);
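The div64_u64() above becomes a shift because the divisor is a power of two: for unsigned arithmetic, x >> 6 is exactly x / 64, and it sidesteps the 64-bit division helper that 32-bit builds would otherwise call. A quick userspace check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t nsec_per_sec = 1000000000ULL;

        /* right shift by 6 is exact unsigned division by 64 */
        assert((nsec_per_sec >> 6) == nsec_per_sec / 64);
        return 0;
}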
@@ -2294,55 +2541,18 @@ int open_ctree(struct super_block *sb,
2294 } 2541 }
2295 btrfs_init_delayed_root(fs_info->delayed_root); 2542 btrfs_init_delayed_root(fs_info->delayed_root);
2296 2543
2297 mutex_init(&fs_info->scrub_lock); 2544 btrfs_init_scrub(fs_info);
2298 atomic_set(&fs_info->scrubs_running, 0);
2299 atomic_set(&fs_info->scrub_pause_req, 0);
2300 atomic_set(&fs_info->scrubs_paused, 0);
2301 atomic_set(&fs_info->scrub_cancel_req, 0);
2302 init_waitqueue_head(&fs_info->replace_wait);
2303 init_waitqueue_head(&fs_info->scrub_pause_wait);
2304 fs_info->scrub_workers_refcnt = 0;
2305#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2545#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2306 fs_info->check_integrity_print_mask = 0; 2546 fs_info->check_integrity_print_mask = 0;
2307#endif 2547#endif
2308 2548 btrfs_init_balance(fs_info);
2309 spin_lock_init(&fs_info->balance_lock);
2310 mutex_init(&fs_info->balance_mutex);
2311 atomic_set(&fs_info->balance_running, 0);
2312 atomic_set(&fs_info->balance_pause_req, 0);
2313 atomic_set(&fs_info->balance_cancel_req, 0);
2314 fs_info->balance_ctl = NULL;
2315 init_waitqueue_head(&fs_info->balance_wait_q);
2316 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); 2549 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2317 2550
2318 sb->s_blocksize = 4096; 2551 sb->s_blocksize = 4096;
2319 sb->s_blocksize_bits = blksize_bits(4096); 2552 sb->s_blocksize_bits = blksize_bits(4096);
2320 sb->s_bdi = &fs_info->bdi; 2553 sb->s_bdi = &fs_info->bdi;
2321 2554
2322 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 2555 btrfs_init_btree_inode(fs_info, tree_root);
2323 set_nlink(fs_info->btree_inode, 1);
2324 /*
2325 * we set the i_size on the btree inode to the max possible int.
2326 * the real end of the address space is determined by all of
2327 * the devices in the system
2328 */
2329 fs_info->btree_inode->i_size = OFFSET_MAX;
2330 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2331
2332 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2333 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2334 fs_info->btree_inode->i_mapping);
2335 BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2336 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2337
2338 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2339
2340 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2341 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2342 sizeof(struct btrfs_key));
2343 set_bit(BTRFS_INODE_DUMMY,
2344 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2345 btrfs_insert_inode_hash(fs_info->btree_inode);
2346 2556
2347 spin_lock_init(&fs_info->block_group_cache_lock); 2557 spin_lock_init(&fs_info->block_group_cache_lock);
2348 fs_info->block_group_cache_tree = RB_ROOT; 2558 fs_info->block_group_cache_tree = RB_ROOT;
@@ -2363,26 +2573,14 @@ int open_ctree(struct super_block *sb,
2363 mutex_init(&fs_info->transaction_kthread_mutex); 2573 mutex_init(&fs_info->transaction_kthread_mutex);
2364 mutex_init(&fs_info->cleaner_mutex); 2574 mutex_init(&fs_info->cleaner_mutex);
2365 mutex_init(&fs_info->volume_mutex); 2575 mutex_init(&fs_info->volume_mutex);
2576 mutex_init(&fs_info->ro_block_group_mutex);
2366 init_rwsem(&fs_info->commit_root_sem); 2577 init_rwsem(&fs_info->commit_root_sem);
2367 init_rwsem(&fs_info->cleanup_work_sem); 2578 init_rwsem(&fs_info->cleanup_work_sem);
2368 init_rwsem(&fs_info->subvol_sem); 2579 init_rwsem(&fs_info->subvol_sem);
2369 sema_init(&fs_info->uuid_tree_rescan_sem, 1); 2580 sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2370 fs_info->dev_replace.lock_owner = 0;
2371 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2372 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2373 mutex_init(&fs_info->dev_replace.lock_management_lock);
2374 mutex_init(&fs_info->dev_replace.lock);
2375 2581
2376 spin_lock_init(&fs_info->qgroup_lock); 2582 btrfs_init_dev_replace_locks(fs_info);
2377 mutex_init(&fs_info->qgroup_ioctl_lock); 2583 btrfs_init_qgroup(fs_info);
2378 fs_info->qgroup_tree = RB_ROOT;
2379 fs_info->qgroup_op_tree = RB_ROOT;
2380 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2381 fs_info->qgroup_seq = 1;
2382 fs_info->quota_enabled = 0;
2383 fs_info->pending_quota_state = 0;
2384 fs_info->qgroup_ulist = NULL;
2385 mutex_init(&fs_info->qgroup_rescan_lock);
2386 2584
2387 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2585 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2388 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 2586 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2554,75 +2752,9 @@ int open_ctree(struct super_block *sb,
2554 2752
2555 max_active = fs_info->thread_pool_size; 2753 max_active = fs_info->thread_pool_size;
2556 2754
2557 fs_info->workers = 2755 ret = btrfs_init_workqueues(fs_info, fs_devices);
2558 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, 2756 if (ret) {
2559 max_active, 16); 2757 err = ret;
2560
2561 fs_info->delalloc_workers =
2562 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2563
2564 fs_info->flush_workers =
2565 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2566
2567 fs_info->caching_workers =
2568 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2569
2570 /*
2571 * a higher idle thresh on the submit workers makes it much more
2572 * likely that bios will be sent down in a sane order to the
2573 * devices
2574 */
2575 fs_info->submit_workers =
2576 btrfs_alloc_workqueue("submit", flags,
2577 min_t(u64, fs_devices->num_devices,
2578 max_active), 64);
2579
2580 fs_info->fixup_workers =
2581 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2582
2583 /*
2584 * endios are largely parallel and should have a very
2585 * low idle thresh
2586 */
2587 fs_info->endio_workers =
2588 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2589 fs_info->endio_meta_workers =
2590 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2591 fs_info->endio_meta_write_workers =
2592 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2593 fs_info->endio_raid56_workers =
2594 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2595 fs_info->endio_repair_workers =
2596 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2597 fs_info->rmw_workers =
2598 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2599 fs_info->endio_write_workers =
2600 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2601 fs_info->endio_freespace_worker =
2602 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2603 fs_info->delayed_workers =
2604 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2605 fs_info->readahead_workers =
2606 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2607 fs_info->qgroup_rescan_workers =
2608 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2609 fs_info->extent_workers =
2610 btrfs_alloc_workqueue("extent-refs", flags,
2611 min_t(u64, fs_devices->num_devices,
2612 max_active), 8);
2613
2614 if (!(fs_info->workers && fs_info->delalloc_workers &&
2615 fs_info->submit_workers && fs_info->flush_workers &&
2616 fs_info->endio_workers && fs_info->endio_meta_workers &&
2617 fs_info->endio_meta_write_workers &&
2618 fs_info->endio_repair_workers &&
2619 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2620 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2621 fs_info->caching_workers && fs_info->readahead_workers &&
2622 fs_info->fixup_workers && fs_info->delayed_workers &&
2623 fs_info->extent_workers &&
2624 fs_info->qgroup_rescan_workers)) {
2625 err = -ENOMEM;
2626 goto fail_sb_buffer; 2758 goto fail_sb_buffer;
2627 } 2759 }
2628 2760
@@ -2688,7 +2820,7 @@ int open_ctree(struct super_block *sb,
2688 * keep the device that is marked to be the target device for the 2820 * keep the device that is marked to be the target device for the
2689 * dev_replace procedure 2821 * dev_replace procedure
2690 */ 2822 */
2691 btrfs_close_extra_devices(fs_info, fs_devices, 0); 2823 btrfs_close_extra_devices(fs_devices, 0);
2692 2824
2693 if (!fs_devices->latest_bdev) { 2825 if (!fs_devices->latest_bdev) {
2694 printk(KERN_ERR "BTRFS: failed to read devices on %s\n", 2826 printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
@@ -2714,61 +2846,9 @@ retry_root_backup:
2714 tree_root->commit_root = btrfs_root_node(tree_root); 2846 tree_root->commit_root = btrfs_root_node(tree_root);
2715 btrfs_set_root_refs(&tree_root->root_item, 1); 2847 btrfs_set_root_refs(&tree_root->root_item, 1);
2716 2848
2717 location.objectid = BTRFS_EXTENT_TREE_OBJECTID; 2849 ret = btrfs_read_roots(fs_info, tree_root);
2718 location.type = BTRFS_ROOT_ITEM_KEY; 2850 if (ret)
2719 location.offset = 0;
2720
2721 extent_root = btrfs_read_tree_root(tree_root, &location);
2722 if (IS_ERR(extent_root)) {
2723 ret = PTR_ERR(extent_root);
2724 goto recovery_tree_root;
2725 }
2726 set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
2727 fs_info->extent_root = extent_root;
2728
2729 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2730 dev_root = btrfs_read_tree_root(tree_root, &location);
2731 if (IS_ERR(dev_root)) {
2732 ret = PTR_ERR(dev_root);
2733 goto recovery_tree_root;
2734 }
2735 set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
2736 fs_info->dev_root = dev_root;
2737 btrfs_init_devices_late(fs_info);
2738
2739 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2740 csum_root = btrfs_read_tree_root(tree_root, &location);
2741 if (IS_ERR(csum_root)) {
2742 ret = PTR_ERR(csum_root);
2743 goto recovery_tree_root; 2851 goto recovery_tree_root;
2744 }
2745 set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
2746 fs_info->csum_root = csum_root;
2747
2748 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2749 quota_root = btrfs_read_tree_root(tree_root, &location);
2750 if (!IS_ERR(quota_root)) {
2751 set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
2752 fs_info->quota_enabled = 1;
2753 fs_info->pending_quota_state = 1;
2754 fs_info->quota_root = quota_root;
2755 }
2756
2757 location.objectid = BTRFS_UUID_TREE_OBJECTID;
2758 uuid_root = btrfs_read_tree_root(tree_root, &location);
2759 if (IS_ERR(uuid_root)) {
2760 ret = PTR_ERR(uuid_root);
2761 if (ret != -ENOENT)
2762 goto recovery_tree_root;
2763 create_uuid_tree = true;
2764 check_uuid_tree = false;
2765 } else {
2766 set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
2767 fs_info->uuid_root = uuid_root;
2768 create_uuid_tree = false;
2769 check_uuid_tree =
2770 generation != btrfs_super_uuid_tree_generation(disk_super);
2771 }
2772 2852
2773 fs_info->generation = generation; 2853 fs_info->generation = generation;
2774 fs_info->last_trans_committed = generation; 2854 fs_info->last_trans_committed = generation;
@@ -2792,7 +2872,7 @@ retry_root_backup:
2792 goto fail_block_groups; 2872 goto fail_block_groups;
2793 } 2873 }
2794 2874
2795 btrfs_close_extra_devices(fs_info, fs_devices, 1); 2875 btrfs_close_extra_devices(fs_devices, 1);
2796 2876
2797 ret = btrfs_sysfs_add_one(fs_info); 2877 ret = btrfs_sysfs_add_one(fs_info);
2798 if (ret) { 2878 if (ret) {
@@ -2806,7 +2886,7 @@ retry_root_backup:
2806 goto fail_sysfs; 2886 goto fail_sysfs;
2807 } 2887 }
2808 2888
2809 ret = btrfs_read_block_groups(extent_root); 2889 ret = btrfs_read_block_groups(fs_info->extent_root);
2810 if (ret) { 2890 if (ret) {
2811 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); 2891 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
2812 goto fail_sysfs; 2892 goto fail_sysfs;
@@ -2864,48 +2944,11 @@ retry_root_backup:
2864 2944
2865 /* do not make disk changes in broken FS */ 2945 /* do not make disk changes in broken FS */
2866 if (btrfs_super_log_root(disk_super) != 0) { 2946 if (btrfs_super_log_root(disk_super) != 0) {
2867 u64 bytenr = btrfs_super_log_root(disk_super); 2947 ret = btrfs_replay_log(fs_info, fs_devices);
2868
2869 if (fs_devices->rw_devices == 0) {
2870 printk(KERN_WARNING "BTRFS: log replay required "
2871 "on RO media\n");
2872 err = -EIO;
2873 goto fail_qgroup;
2874 }
2875
2876 log_tree_root = btrfs_alloc_root(fs_info);
2877 if (!log_tree_root) {
2878 err = -ENOMEM;
2879 goto fail_qgroup;
2880 }
2881
2882 __setup_root(nodesize, sectorsize, stripesize,
2883 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
2884
2885 log_tree_root->node = read_tree_block(tree_root, bytenr,
2886 generation + 1);
2887 if (!log_tree_root->node ||
2888 !extent_buffer_uptodate(log_tree_root->node)) {
2889 printk(KERN_ERR "BTRFS: failed to read log tree\n");
2890 free_extent_buffer(log_tree_root->node);
2891 kfree(log_tree_root);
2892 goto fail_qgroup;
2893 }
2894 /* returns with log_tree_root freed on success */
2895 ret = btrfs_recover_log_trees(log_tree_root);
2896 if (ret) { 2948 if (ret) {
2897 btrfs_error(tree_root->fs_info, ret, 2949 err = ret;
2898 "Failed to recover log tree");
2899 free_extent_buffer(log_tree_root->node);
2900 kfree(log_tree_root);
2901 goto fail_qgroup; 2950 goto fail_qgroup;
2902 } 2951 }
2903
2904 if (sb->s_flags & MS_RDONLY) {
2905 ret = btrfs_commit_super(tree_root);
2906 if (ret)
2907 goto fail_qgroup;
2908 }
2909 } 2952 }
2910 2953
2911 ret = btrfs_find_orphan_roots(tree_root); 2954 ret = btrfs_find_orphan_roots(tree_root);
@@ -2966,7 +3009,7 @@ retry_root_backup:
2966 3009
2967 btrfs_qgroup_rescan_resume(fs_info); 3010 btrfs_qgroup_rescan_resume(fs_info);
2968 3011
2969 if (create_uuid_tree) { 3012 if (!fs_info->uuid_root) {
2970 pr_info("BTRFS: creating UUID tree\n"); 3013 pr_info("BTRFS: creating UUID tree\n");
2971 ret = btrfs_create_uuid_tree(fs_info); 3014 ret = btrfs_create_uuid_tree(fs_info);
2972 if (ret) { 3015 if (ret) {
@@ -2975,8 +3018,9 @@ retry_root_backup:
2975 close_ctree(tree_root); 3018 close_ctree(tree_root);
2976 return ret; 3019 return ret;
2977 } 3020 }
2978 } else if (check_uuid_tree || 3021 } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
2979 btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { 3022 fs_info->generation !=
3023 btrfs_super_uuid_tree_generation(disk_super)) {
2980 pr_info("BTRFS: checking UUID tree\n"); 3024 pr_info("BTRFS: checking UUID tree\n");
2981 ret = btrfs_check_uuid_tree(fs_info); 3025 ret = btrfs_check_uuid_tree(fs_info);
2982 if (ret) { 3026 if (ret) {
@@ -3668,7 +3712,7 @@ void close_ctree(struct btrfs_root *root)
3668 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3712 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3669 ret = btrfs_commit_super(root); 3713 ret = btrfs_commit_super(root);
3670 if (ret) 3714 if (ret)
3671 btrfs_err(root->fs_info, "commit super ret %d", ret); 3715 btrfs_err(fs_info, "commit super ret %d", ret);
3672 } 3716 }
3673 3717
3674 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 3718 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
@@ -3680,10 +3724,10 @@ void close_ctree(struct btrfs_root *root)
3680 fs_info->closing = 2; 3724 fs_info->closing = 2;
3681 smp_mb(); 3725 smp_mb();
3682 3726
3683 btrfs_free_qgroup_config(root->fs_info); 3727 btrfs_free_qgroup_config(fs_info);
3684 3728
3685 if (percpu_counter_sum(&fs_info->delalloc_bytes)) { 3729 if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3686 btrfs_info(root->fs_info, "at unmount delalloc count %lld", 3730 btrfs_info(fs_info, "at unmount delalloc count %lld",
3687 percpu_counter_sum(&fs_info->delalloc_bytes)); 3731 percpu_counter_sum(&fs_info->delalloc_bytes));
3688 } 3732 }
3689 3733
@@ -3723,7 +3767,7 @@ void close_ctree(struct btrfs_root *root)
3723 3767
3724 btrfs_free_stripe_hash_table(fs_info); 3768 btrfs_free_stripe_hash_table(fs_info);
3725 3769
3726 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3770 __btrfs_free_block_rsv(root->orphan_block_rsv);
3727 root->orphan_block_rsv = NULL; 3771 root->orphan_block_rsv = NULL;
3728 3772
3729 lock_chunks(root); 3773 lock_chunks(root);
@@ -4134,7 +4178,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
4134 4178
4135 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 4179 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
4136 while (start <= end) { 4180 while (start <= end) {
4137 eb = btrfs_find_tree_block(root, start); 4181 eb = btrfs_find_tree_block(root->fs_info, start);
4138 start += root->nodesize; 4182 start += root->nodesize;
4139 if (!eb) 4183 if (!eb)
4140 continue; 4184 continue;
@@ -4285,7 +4329,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
4285 return 0; 4329 return 0;
4286} 4330}
4287 4331
4288static struct extent_io_ops btree_extent_io_ops = { 4332static const struct extent_io_ops btree_extent_io_ops = {
4289 .readpage_end_io_hook = btree_readpage_end_io_hook, 4333 .readpage_end_io_hook = btree_readpage_end_io_hook,
4290 .readpage_io_failed_hook = btree_io_failed_hook, 4334 .readpage_io_failed_hook = btree_io_failed_hook,
4291 .submit_bio_hook = btree_submit_bio_hook, 4335 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0fd236..d4cbfeeeedd4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
53 u64 bytenr); 53 u64 bytenr);
54void clean_tree_block(struct btrfs_trans_handle *trans, 54void clean_tree_block(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, struct extent_buffer *buf); 55 struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
56int open_ctree(struct super_block *sb, 56int open_ctree(struct super_block *sb,
57 struct btrfs_fs_devices *fs_devices, 57 struct btrfs_fs_devices *fs_devices,
58 char *options); 58 char *options);
@@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, int max_mirrors); 61 struct btrfs_root *root, int max_mirrors);
62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
65 u64 bytenr); 65 u64 bytenr);
66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b353ad02f03..1eef4ee01d1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2538 * list before we release it. 2538 * list before we release it.
2539 */ 2539 */
2540 if (btrfs_delayed_ref_is_head(ref)) { 2540 if (btrfs_delayed_ref_is_head(ref)) {
2541 if (locked_ref->is_data &&
2542 locked_ref->total_ref_mod < 0) {
2543 spin_lock(&delayed_refs->lock);
2544 delayed_refs->pending_csums -= ref->num_bytes;
2545 spin_unlock(&delayed_refs->lock);
2546 }
2541 btrfs_delayed_ref_unlock(locked_ref); 2547 btrfs_delayed_ref_unlock(locked_ref);
2542 locked_ref = NULL; 2548 locked_ref = NULL;
2543 } 2549 }
@@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2561 */ 2567 */
2562 spin_lock(&delayed_refs->lock); 2568 spin_lock(&delayed_refs->lock);
2563 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2569 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2564 avg = div64_u64(avg, 4); 2570 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2565 fs_info->avg_delayed_ref_runtime = avg;
2566 spin_unlock(&delayed_refs->lock); 2571 spin_unlock(&delayed_refs->lock);
2567 } 2572 }
2568 return 0; 2573 return 0;
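The hunk above keeps avg_delayed_ref_runtime as an exponential moving average, new = (3 * old + sample) / 4, with the divide-by-4 again folded into a shift. A userspace sketch of the update rule:

#include <stdio.h>
#include <stdint.h>

/* weighted average: 3 parts history, 1 part new sample, divided by 4 */
static uint64_t update_avg(uint64_t avg, uint64_t runtime)
{
        return (avg * 3 + runtime) >> 2;
}

int main(void)
{
        uint64_t avg = 1000;

        /* 1250: the average moves a quarter of the way toward the sample */
        avg = update_avg(avg, 2000);
        printf("%llu\n", (unsigned long long)avg);
        return 0;
}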
@@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2624 * We don't ever fill up leaves all the way so multiply by 2 just to be 2629 * We don't ever fill up leaves all the way so multiply by 2 just to be
2625 * closer to what we're really going to want to use. 2630 * closer to what we're really going to want to use.
2626 */ 2631 */
2627 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2632 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2633}
2634
2635/*
2636 * Takes the number of bytes to be checksummed and figures out how many leaves it
2637 * would require to store the csums for that many bytes.
2638 */
2639u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2640{
2641 u64 csum_size;
2642 u64 num_csums_per_leaf;
2643 u64 num_csums;
2644
2645 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2646 num_csums_per_leaf = div64_u64(csum_size,
2647 (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2648 num_csums = div64_u64(csum_bytes, root->sectorsize);
2649 num_csums += num_csums_per_leaf - 1;
2650 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2651 return num_csums;
2628} 2652}
2629 2653
2630int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2654int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
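btrfs_csum_bytes_to_leaves is two ceiling divisions: bytes become one checksum per sector, and the checksums are then packed into leaves. A standalone sketch with illustrative geometry, assuming 16K nodes, 4K sectors and 4-byte crc32c items (the real leaf, item and csum sizes come from the node size and the superblock, so treat these constants as examples only):

#include <stdio.h>
#include <stdint.h>

static uint64_t csum_bytes_to_leaves(uint64_t csum_bytes, uint64_t leaf_data,
                                     uint64_t item_size, uint64_t csum_size,
                                     uint64_t sectorsize)
{
        uint64_t per_leaf = (leaf_data - item_size) / csum_size;
        uint64_t num_csums = csum_bytes / sectorsize;

        /* round up: a partially filled leaf still costs a full leaf */
        return (num_csums + per_leaf - 1) / per_leaf;
}

int main(void)
{
        /* 1GiB of data with the illustrative geometry: ~65 leaves */
        printf("%llu\n", (unsigned long long)
               csum_bytes_to_leaves(1ULL << 30, 16283, 25, 4, 4096));
        return 0;
}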
@@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2632{ 2656{
2633 struct btrfs_block_rsv *global_rsv; 2657 struct btrfs_block_rsv *global_rsv;
2634 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2658 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2635 u64 num_bytes; 2659 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2660 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2661 u64 num_bytes, num_dirty_bgs_bytes;
2636 int ret = 0; 2662 int ret = 0;
2637 2663
2638 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2664 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2640 if (num_heads > 1) 2666 if (num_heads > 1)
2641 num_bytes += (num_heads - 1) * root->nodesize; 2667 num_bytes += (num_heads - 1) * root->nodesize;
2642 num_bytes <<= 1; 2668 num_bytes <<= 1;
2669 num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2670 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2671 num_dirty_bgs);
2643 global_rsv = &root->fs_info->global_block_rsv; 2672 global_rsv = &root->fs_info->global_block_rsv;
2644 2673
2645 /* 2674 /*
2646 * If we can't allocate any more chunks, let's make sure we have _lots_ of 2675 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2647 * wiggle room since running delayed refs can create more delayed refs. 2676 * wiggle room since running delayed refs can create more delayed refs.
2648 */ 2677 */
2649 if (global_rsv->space_info->full) 2678 if (global_rsv->space_info->full) {
2679 num_dirty_bgs_bytes <<= 1;
2650 num_bytes <<= 1; 2680 num_bytes <<= 1;
2681 }
2651 2682
2652 spin_lock(&global_rsv->lock); 2683 spin_lock(&global_rsv->lock);
2653 if (global_rsv->reserved <= num_bytes) 2684 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2654 ret = 1; 2685 ret = 1;
2655 spin_unlock(&global_rsv->lock); 2686 spin_unlock(&global_rsv->lock);
2656 return ret; 2687 return ret;
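With the csum and dirty-block-group terms added, the reservation estimate above has three parts: metadata for the ref heads themselves, leaves for the checksums that pending data refs may touch, and the cost of rewriting num_dirty_bgs block group items, with everything padded further once chunk allocation is exhausted. A condensed userspace sketch of the final comparison, where head_cost stands in for btrfs_calc_trans_metadata_size:

#include <stdint.h>

/* sketch: nonzero means "flush delayed refs before taking on more" */
static int need_flush(uint64_t num_heads, uint64_t csum_leaves,
                      uint64_t bg_bytes, uint64_t nodesize,
                      uint64_t head_cost, uint64_t reserved, int full)
{
        uint64_t num_bytes = head_cost;

        if (num_heads > 1)
                num_bytes += (num_heads - 1) * nodesize;
        num_bytes <<= 1;                        /* wiggle room */
        num_bytes += csum_leaves * nodesize;
        if (full) {                             /* can't grow: double up */
                bg_bytes <<= 1;
                num_bytes <<= 1;
        }
        return reserved <= num_bytes + bg_bytes;
}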
@@ -3193,7 +3224,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3193 struct inode *inode = NULL; 3224 struct inode *inode = NULL;
3194 u64 alloc_hint = 0; 3225 u64 alloc_hint = 0;
3195 int dcs = BTRFS_DC_ERROR; 3226 int dcs = BTRFS_DC_ERROR;
3196 int num_pages = 0; 3227 u64 num_pages = 0;
3197 int retries = 0; 3228 int retries = 0;
3198 int ret = 0; 3229 int ret = 0;
3199 3230
@@ -3267,7 +3298,7 @@ again:
3267 if (ret) 3298 if (ret)
3268 goto out_put; 3299 goto out_put;
3269 3300
3270 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3301 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3271 if (ret) 3302 if (ret)
3272 goto out_put; 3303 goto out_put;
3273 } 3304 }
@@ -3293,14 +3324,14 @@ again:
3293 * taking up quite a bit since it's not folded into the other space 3324 * taking up quite a bit since it's not folded into the other space
3294 * cache. 3325 * cache.
3295 */ 3326 */
3296 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3327 num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3297 if (!num_pages) 3328 if (!num_pages)
3298 num_pages = 1; 3329 num_pages = 1;
3299 3330
3300 num_pages *= 16; 3331 num_pages *= 16;
3301 num_pages *= PAGE_CACHE_SIZE; 3332 num_pages *= PAGE_CACHE_SIZE;
3302 3333
3303 ret = btrfs_check_data_free_space(inode, num_pages); 3334 ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
3304 if (ret) 3335 if (ret)
3305 goto out_put; 3336 goto out_put;
3306 3337
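The free space cache is sized at 16 pages per 256MB of block group, with a one-chunk floor. A quick userspace version of the sizing, assuming 4K pages for the example:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_BYTES 4096ULL /* illustrative; the kernel uses PAGE_CACHE_SIZE */

static uint64_t cache_bytes(uint64_t block_group_len)
{
        uint64_t num_pages = block_group_len / (256ULL * 1024 * 1024);

        if (!num_pages)
                num_pages = 1;          /* small groups still get one chunk */
        return num_pages * 16 * PAGE_SIZE_BYTES;
}

int main(void)
{
        /* a 1GiB block group reserves 256KiB of cache space */
        printf("%llu\n", (unsigned long long)cache_bytes(1ULL << 30));
        return 0;
}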
@@ -3351,16 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3351 return 0; 3382 return 0;
3352} 3383}
3353 3384
3354int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3385/*
3386 * transaction commit does final block group cache writeback during a
3387 * critical section where nothing is allowed to change the FS. This is
3388 * required in order for the cache to actually match the block group,
3389 * but can introduce a lot of latency into the commit.
3390 *
3391 * So, btrfs_start_dirty_block_groups is here to kick off block group
3392 * cache IO. There's a chance we'll have to redo some of it if the
3393 * block group changes again during the commit, but it greatly reduces
3394 * the commit latency by getting rid of the easy block groups while
3395 * we're still allowing others to join the commit.
3396 */
3397int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3355 struct btrfs_root *root) 3398 struct btrfs_root *root)
3356{ 3399{
3357 struct btrfs_block_group_cache *cache; 3400 struct btrfs_block_group_cache *cache;
3358 struct btrfs_transaction *cur_trans = trans->transaction; 3401 struct btrfs_transaction *cur_trans = trans->transaction;
3359 int ret = 0; 3402 int ret = 0;
3360 struct btrfs_path *path; 3403 int should_put;
3404 struct btrfs_path *path = NULL;
3405 LIST_HEAD(dirty);
3406 struct list_head *io = &cur_trans->io_bgs;
3407 int num_started = 0;
3408 int loops = 0;
3409
3410 spin_lock(&cur_trans->dirty_bgs_lock);
3411 if (!list_empty(&cur_trans->dirty_bgs)) {
3412 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3413 }
3414 spin_unlock(&cur_trans->dirty_bgs_lock);
3361 3415
3362 if (list_empty(&cur_trans->dirty_bgs)) 3416again:
3417 if (list_empty(&dirty)) {
3418 btrfs_free_path(path);
3363 return 0; 3419 return 0;
3420 }
3421
3422 /*
3423 * make sure all the block groups on our dirty list actually
3424 * exist
3425 */
3426 btrfs_create_pending_block_groups(trans, root);
3427
3428 if (!path) {
3429 path = btrfs_alloc_path();
3430 if (!path)
3431 return -ENOMEM;
3432 }
3433
3434 while (!list_empty(&dirty)) {
3435 cache = list_first_entry(&dirty,
3436 struct btrfs_block_group_cache,
3437 dirty_list);
3438
3439 /*
3440 * cache_write_mutex is here only to save us from balance
3441 * deleting this block group while we are writing out the
3442 * cache
3443 */
3444 mutex_lock(&trans->transaction->cache_write_mutex);
3445
3446 /*
3447 * this can happen if something re-dirties a block
3448 * group that is already under IO. Just wait for it to
3449 * finish and then do it all again
3450 */
3451 if (!list_empty(&cache->io_list)) {
3452 list_del_init(&cache->io_list);
3453 btrfs_wait_cache_io(root, trans, cache,
3454 &cache->io_ctl, path,
3455 cache->key.objectid);
3456 btrfs_put_block_group(cache);
3457 }
3458
3459
3460 /*
3461 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3462 * if it should update the cache_state. Don't delete
3463 * until after we wait.
3464 *
3465 * Since we're not running in the commit critical section
3466 * we need the dirty_bgs_lock to protect from update_block_group
3467 */
3468 spin_lock(&cur_trans->dirty_bgs_lock);
3469 list_del_init(&cache->dirty_list);
3470 spin_unlock(&cur_trans->dirty_bgs_lock);
3471
3472 should_put = 1;
3473
3474 cache_save_setup(cache, trans, path);
3475
3476 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3477 cache->io_ctl.inode = NULL;
3478 ret = btrfs_write_out_cache(root, trans, cache, path);
3479 if (ret == 0 && cache->io_ctl.inode) {
3480 num_started++;
3481 should_put = 0;
3482
3483 /*
3484 * the cache_write_mutex is protecting
3485 * the io_list
3486 */
3487 list_add_tail(&cache->io_list, io);
3488 } else {
3489 /*
3490 * if we failed to write the cache, the
3491 * generation will be bad and life goes on
3492 */
3493 ret = 0;
3494 }
3495 }
3496 if (!ret)
3497 ret = write_one_cache_group(trans, root, path, cache);
3498 mutex_unlock(&trans->transaction->cache_write_mutex);
3499
3500 /* if it's not on the io list, we need to put the block group */
3501 if (should_put)
3502 btrfs_put_block_group(cache);
3503
3504 if (ret)
3505 break;
3506 }
3507
3508 /*
3509 * go through delayed refs for all the stuff we've just kicked off
3510 * and then loop back (just once)
3511 */
3512 ret = btrfs_run_delayed_refs(trans, root, 0);
3513 if (!ret && loops == 0) {
3514 loops++;
3515 spin_lock(&cur_trans->dirty_bgs_lock);
3516 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3517 spin_unlock(&cur_trans->dirty_bgs_lock);
3518 goto again;
3519 }
3520
3521 btrfs_free_path(path);
3522 return ret;
3523}
3524
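The list handling that makes the early writeback safe is easy to miss in the body above, so here it is reduced to its skeleton (a sketch, not compilable on its own): the shared dirty_bgs list is spliced to a private list under dirty_bgs_lock, each group is written out, and the whole thing loops exactly once more to pick up groups re-dirtied by the delayed-ref run.

LIST_HEAD(dirty);
int loops = 0;

spin_lock(&cur_trans->dirty_bgs_lock);
list_splice_init(&cur_trans->dirty_bgs, &dirty);
spin_unlock(&cur_trans->dirty_bgs_lock);
again:
        while (!list_empty(&dirty)) {
                /* wait out any in-flight cache IO, write the group out,
                 * dropping its dirty_list membership under the lock */
        }
        btrfs_run_delayed_refs(trans, root, 0);
        if (loops++ == 0) {
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&cur_trans->dirty_bgs, &dirty);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                goto again;
        }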
3525int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3526 struct btrfs_root *root)
3527{
3528 struct btrfs_block_group_cache *cache;
3529 struct btrfs_transaction *cur_trans = trans->transaction;
3530 int ret = 0;
3531 int should_put;
3532 struct btrfs_path *path;
3533 struct list_head *io = &cur_trans->io_bgs;
3534 int num_started = 0;
3364 3535
3365 path = btrfs_alloc_path(); 3536 path = btrfs_alloc_path();
3366 if (!path) 3537 if (!path)
@@ -3376,16 +3547,61 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3376 cache = list_first_entry(&cur_trans->dirty_bgs, 3547 cache = list_first_entry(&cur_trans->dirty_bgs,
3377 struct btrfs_block_group_cache, 3548 struct btrfs_block_group_cache,
3378 dirty_list); 3549 dirty_list);
3550
3551 /*
3552 * this can happen if cache_save_setup re-dirties a block
3553 * group that is already under IO. Just wait for it to
3554 * finish and then do it all again
3555 */
3556 if (!list_empty(&cache->io_list)) {
3557 list_del_init(&cache->io_list);
3558 btrfs_wait_cache_io(root, trans, cache,
3559 &cache->io_ctl, path,
3560 cache->key.objectid);
3561 btrfs_put_block_group(cache);
3562 }
3563
3564 /*
3565 * don't remove from the dirty list until after we've waited
3566 * on any pending IO
3567 */
3379 list_del_init(&cache->dirty_list); 3568 list_del_init(&cache->dirty_list);
3380 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3569 should_put = 1;
3381 cache_save_setup(cache, trans, path); 3570
3571 cache_save_setup(cache, trans, path);
3572
3382 if (!ret) 3573 if (!ret)
3383 ret = btrfs_run_delayed_refs(trans, root, 3574 ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3384 (unsigned long) -1); 3575
3385 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) 3576 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3386 btrfs_write_out_cache(root, trans, cache, path); 3577 cache->io_ctl.inode = NULL;
3578 ret = btrfs_write_out_cache(root, trans, cache, path);
3579 if (ret == 0 && cache->io_ctl.inode) {
3580 num_started++;
3581 should_put = 0;
3582 list_add_tail(&cache->io_list, io);
3583 } else {
3584 /*
3585 * if we failed to write the cache, the
3586 * generation will be bad and life goes on
3587 */
3588 ret = 0;
3589 }
3590 }
3387 if (!ret) 3591 if (!ret)
3388 ret = write_one_cache_group(trans, root, path, cache); 3592 ret = write_one_cache_group(trans, root, path, cache);
3593
3594 /* if it's not on the io list, we need to put the block group */
3595 if (should_put)
3596 btrfs_put_block_group(cache);
3597 }
3598
3599 while (!list_empty(io)) {
3600 cache = list_first_entry(io, struct btrfs_block_group_cache,
3601 io_list);
3602 list_del_init(&cache->io_list);
3603 btrfs_wait_cache_io(root, trans, cache,
3604 &cache->io_ctl, path, cache->key.objectid);
3389 btrfs_put_block_group(cache); 3605 btrfs_put_block_group(cache);
3390 } 3606 }
3391 3607
@@ -3635,19 +3851,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3635 * This will check the space that the inode allocates from to make sure we have 3851 * This will check the space that the inode allocates from to make sure we have
3636 * enough space for bytes. 3852 * enough space for bytes.
3637 */ 3853 */
3638int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3854int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3639{ 3855{
3640 struct btrfs_space_info *data_sinfo; 3856 struct btrfs_space_info *data_sinfo;
3641 struct btrfs_root *root = BTRFS_I(inode)->root; 3857 struct btrfs_root *root = BTRFS_I(inode)->root;
3642 struct btrfs_fs_info *fs_info = root->fs_info; 3858 struct btrfs_fs_info *fs_info = root->fs_info;
3643 u64 used; 3859 u64 used;
3644 int ret = 0, committed = 0, alloc_chunk = 1; 3860 int ret = 0;
3861 int need_commit = 2;
3862 int have_pinned_space;
3645 3863
3646 /* make sure bytes are sectorsize aligned */ 3864 /* make sure bytes are sectorsize aligned */
3647 bytes = ALIGN(bytes, root->sectorsize); 3865 bytes = ALIGN(bytes, root->sectorsize);
3648 3866
3649 if (btrfs_is_free_space_inode(inode)) { 3867 if (btrfs_is_free_space_inode(inode)) {
3650 committed = 1; 3868 need_commit = 0;
3651 ASSERT(current->journal_info); 3869 ASSERT(current->journal_info);
3652 } 3870 }
3653 3871
@@ -3669,7 +3887,7 @@ again:
3669 * if we don't have enough free bytes in this space then we need 3887 * if we don't have enough free bytes in this space then we need
3670 * to alloc a new chunk. 3888 * to alloc a new chunk.
3671 */ 3889 */
3672 if (!data_sinfo->full && alloc_chunk) { 3890 if (!data_sinfo->full) {
3673 u64 alloc_target; 3891 u64 alloc_target;
3674 3892
3675 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3893 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3697,8 +3915,10 @@ alloc:
3697 if (ret < 0) { 3915 if (ret < 0) {
3698 if (ret != -ENOSPC) 3916 if (ret != -ENOSPC)
3699 return ret; 3917 return ret;
3700 else 3918 else {
3919 have_pinned_space = 1;
3701 goto commit_trans; 3920 goto commit_trans;
3921 }
3702 } 3922 }
3703 3923
3704 if (!data_sinfo) 3924 if (!data_sinfo)
@@ -3709,26 +3929,39 @@ alloc:
3709 3929
3710 /* 3930 /*
3711 * If we don't have enough pinned space to deal with this 3931 * If we don't have enough pinned space to deal with this
3712 * allocation don't bother committing the transaction. 3932 * allocation, and no chunk was removed in the current transaction,
3933 * don't bother committing the transaction.
3713 */ 3934 */
3714 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3935 have_pinned_space = percpu_counter_compare(
3715 bytes) < 0) 3936 &data_sinfo->total_bytes_pinned,
3716 committed = 1; 3937 used + bytes - data_sinfo->total_bytes);
3717 spin_unlock(&data_sinfo->lock); 3938 spin_unlock(&data_sinfo->lock);
3718 3939
3719 /* commit the current transaction and try again */ 3940 /* commit the current transaction and try again */
3720commit_trans: 3941commit_trans:
3721 if (!committed && 3942 if (need_commit &&
3722 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3943 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3723 committed = 1; 3944 need_commit--;
3724 3945
3725 trans = btrfs_join_transaction(root); 3946 trans = btrfs_join_transaction(root);
3726 if (IS_ERR(trans)) 3947 if (IS_ERR(trans))
3727 return PTR_ERR(trans); 3948 return PTR_ERR(trans);
3728 ret = btrfs_commit_transaction(trans, root); 3949 if (have_pinned_space >= 0 ||
3729 if (ret) 3950 trans->transaction->have_free_bgs ||
3730 return ret; 3951 need_commit > 0) {
3731 goto again; 3952 ret = btrfs_commit_transaction(trans, root);
3953 if (ret)
3954 return ret;
3955 /*
3956 * make sure that all running delayed iput are
3957 * done
3958 */
3959 down_write(&root->fs_info->delayed_iput_sem);
3960 up_write(&root->fs_info->delayed_iput_sem);
3961 goto again;
3962 } else {
3963 btrfs_end_transaction(trans, root);
3964 }
3732 } 3965 }
3733 3966
3734 trace_btrfs_space_reservation(root->fs_info, 3967 trace_btrfs_space_reservation(root->fs_info,
@@ -3736,12 +3969,16 @@ commit_trans:
3736 data_sinfo->flags, bytes, 1); 3969 data_sinfo->flags, bytes, 1);
3737 return -ENOSPC; 3970 return -ENOSPC;
3738 } 3971 }
3972 ret = btrfs_qgroup_reserve(root, write_bytes);
3973 if (ret)
3974 goto out;
3739 data_sinfo->bytes_may_use += bytes; 3975 data_sinfo->bytes_may_use += bytes;
3740 trace_btrfs_space_reservation(root->fs_info, "space_info", 3976 trace_btrfs_space_reservation(root->fs_info, "space_info",
3741 data_sinfo->flags, bytes, 1); 3977 data_sinfo->flags, bytes, 1);
3978out:
3742 spin_unlock(&data_sinfo->lock); 3979 spin_unlock(&data_sinfo->lock);
3743 3980
3744 return 0; 3981 return ret;
3745} 3982}
3746 3983
3747/* 3984/*
@@ -4298,8 +4535,13 @@ out:
4298static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4535static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4299 struct btrfs_fs_info *fs_info, u64 used) 4536 struct btrfs_fs_info *fs_info, u64 used)
4300{ 4537{
4301 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4538 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4302 !btrfs_fs_closing(fs_info) && 4539
4540 /* If we're just plain full then async reclaim just slows us down. */
4541 if (space_info->bytes_used >= thresh)
4542 return 0;
4543
4544 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4303 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4545 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4304} 4546}
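need_do_async_reclaim still fires at 98% of the space_info, but now bails out when bytes_used alone is already past the threshold: if the space is genuinely full rather than merely over-reserved, flushing cannot free anything and only adds latency. Assuming div_factor_fine(total, 98) is total * 98 / 100, a userspace check of the two cases:

#include <stdio.h>
#include <stdint.h>

/* mirrors btrfs's div_factor_fine(): num * factor / 100 */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

static int need_async_reclaim(uint64_t total, uint64_t used,
                              uint64_t bytes_used)
{
        uint64_t thresh = div_factor_fine(total, 98);

        if (bytes_used >= thresh)       /* plain full: reclaim can't help */
                return 0;
        return used >= thresh;          /* over-committed: try flushing */
}

int main(void)
{
        printf("%d\n", need_async_reclaim(100, 99, 50)); /* 1: reservations high */
        printf("%d\n", need_async_reclaim(100, 99, 99)); /* 0: actually full */
        return 0;
}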
4305 4547
@@ -4354,10 +4596,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4354 if (!btrfs_need_do_async_reclaim(space_info, fs_info, 4596 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4355 flush_state)) 4597 flush_state))
4356 return; 4598 return;
4357 } while (flush_state <= COMMIT_TRANS); 4599 } while (flush_state < COMMIT_TRANS);
4358
4359 if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
4360 queue_work(system_unbound_wq, work);
4361} 4600}
4362 4601
4363void btrfs_init_async_reclaim_work(struct work_struct *work) 4602void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4700,6 +4939,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4700 kfree(rsv); 4939 kfree(rsv);
4701} 4940}
4702 4941
4942void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
4943{
4944 kfree(rsv);
4945}
4946
4703int btrfs_block_rsv_add(struct btrfs_root *root, 4947int btrfs_block_rsv_add(struct btrfs_root *root,
4704 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4948 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4705 enum btrfs_reserve_flush_enum flush) 4949 enum btrfs_reserve_flush_enum flush)
@@ -4812,10 +5056,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4812 5056
4813 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 5057 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4814 csum_size * 2; 5058 csum_size * 2;
4815 num_bytes += div64_u64(data_used + meta_used, 50); 5059 num_bytes += div_u64(data_used + meta_used, 50);
4816 5060
4817 if (num_bytes * 3 > meta_used) 5061 if (num_bytes * 3 > meta_used)
4818 num_bytes = div64_u64(meta_used, 3); 5062 num_bytes = div_u64(meta_used, 3);
4819 5063
4820 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); 5064 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
4821} 5065}
@@ -4998,8 +5242,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4998 u64 qgroup_reserved) 5242 u64 qgroup_reserved)
4999{ 5243{
5000 btrfs_block_rsv_release(root, rsv, (u64)-1); 5244 btrfs_block_rsv_release(root, rsv, (u64)-1);
5001 if (qgroup_reserved)
5002 btrfs_qgroup_free(root, qgroup_reserved);
5003} 5245}
5004 5246
5005/** 5247/**
@@ -5066,30 +5308,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5066 int reserve) 5308 int reserve)
5067{ 5309{
5068 struct btrfs_root *root = BTRFS_I(inode)->root; 5310 struct btrfs_root *root = BTRFS_I(inode)->root;
5069 u64 csum_size; 5311 u64 old_csums, num_csums;
5070 int num_csums_per_leaf;
5071 int num_csums;
5072 int old_csums;
5073 5312
5074 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5313 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5075 BTRFS_I(inode)->csum_bytes == 0) 5314 BTRFS_I(inode)->csum_bytes == 0)
5076 return 0; 5315 return 0;
5077 5316
5078 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5317 old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5079 if (reserve) 5318 if (reserve)
5080 BTRFS_I(inode)->csum_bytes += num_bytes; 5319 BTRFS_I(inode)->csum_bytes += num_bytes;
5081 else 5320 else
5082 BTRFS_I(inode)->csum_bytes -= num_bytes; 5321 BTRFS_I(inode)->csum_bytes -= num_bytes;
5083 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5322 num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5084 num_csums_per_leaf = (int)div64_u64(csum_size,
5085 sizeof(struct btrfs_csum_item) +
5086 sizeof(struct btrfs_disk_key));
5087 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5088 num_csums = num_csums + num_csums_per_leaf - 1;
5089 num_csums = num_csums / num_csums_per_leaf;
5090
5091 old_csums = old_csums + num_csums_per_leaf - 1;
5092 old_csums = old_csums / num_csums_per_leaf;
5093 5323
5094 /* No change, no need to reserve more */ 5324 /* No change, no need to reserve more */
5095 if (old_csums == num_csums) 5325 if (old_csums == num_csums)
@@ -5163,8 +5393,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5163 spin_unlock(&BTRFS_I(inode)->lock); 5393 spin_unlock(&BTRFS_I(inode)->lock);
5164 5394
5165 if (root->fs_info->quota_enabled) { 5395 if (root->fs_info->quota_enabled) {
5166 ret = btrfs_qgroup_reserve(root, num_bytes + 5396 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
5167 nr_extents * root->nodesize);
5168 if (ret) 5397 if (ret)
5169 goto out_fail; 5398 goto out_fail;
5170 } 5399 }
@@ -5172,8 +5401,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5172 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5401 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5173 if (unlikely(ret)) { 5402 if (unlikely(ret)) {
5174 if (root->fs_info->quota_enabled) 5403 if (root->fs_info->quota_enabled)
5175 btrfs_qgroup_free(root, num_bytes + 5404 btrfs_qgroup_free(root, nr_extents * root->nodesize);
5176 nr_extents * root->nodesize);
5177 goto out_fail; 5405 goto out_fail;
5178 } 5406 }
5179 5407
@@ -5290,10 +5518,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5290 5518
5291 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5519 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5292 btrfs_ino(inode), to_free, 0); 5520 btrfs_ino(inode), to_free, 0);
5293 if (root->fs_info->quota_enabled) {
5294 btrfs_qgroup_free(root, num_bytes +
5295 dropped * root->nodesize);
5296 }
5297 5521
5298 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5522 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5299 to_free); 5523 to_free);
@@ -5318,7 +5542,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5318{ 5542{
5319 int ret; 5543 int ret;
5320 5544
5321 ret = btrfs_check_data_free_space(inode, num_bytes); 5545 ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
5322 if (ret) 5546 if (ret)
5323 return ret; 5547 return ret;
5324 5548
@@ -5390,14 +5614,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
5390 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5614 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5391 cache_block_group(cache, 1); 5615 cache_block_group(cache, 1);
5392 5616
5393 spin_lock(&trans->transaction->dirty_bgs_lock);
5394 if (list_empty(&cache->dirty_list)) {
5395 list_add_tail(&cache->dirty_list,
5396 &trans->transaction->dirty_bgs);
5397 btrfs_get_block_group(cache);
5398 }
5399 spin_unlock(&trans->transaction->dirty_bgs_lock);
5400
5401 byte_in_group = bytenr - cache->key.objectid; 5617 byte_in_group = bytenr - cache->key.objectid;
5402 WARN_ON(byte_in_group > cache->key.offset); 5618 WARN_ON(byte_in_group > cache->key.offset);
5403 5619
@@ -5446,6 +5662,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
5446 spin_unlock(&info->unused_bgs_lock); 5662 spin_unlock(&info->unused_bgs_lock);
5447 } 5663 }
5448 } 5664 }
5665
5666 spin_lock(&trans->transaction->dirty_bgs_lock);
5667 if (list_empty(&cache->dirty_list)) {
5668 list_add_tail(&cache->dirty_list,
5669 &trans->transaction->dirty_bgs);
5670 trans->transaction->num_dirty_bgs++;
5671 btrfs_get_block_group(cache);
5672 }
5673 spin_unlock(&trans->transaction->dirty_bgs_lock);
5674
5449 btrfs_put_block_group(cache); 5675 btrfs_put_block_group(cache);
5450 total -= num_bytes; 5676 total -= num_bytes;
5451 bytenr += num_bytes; 5677 bytenr += num_bytes;
@@ -6956,15 +7182,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6956 return -ENOSPC; 7182 return -ENOSPC;
6957 } 7183 }
6958 7184
6959 if (btrfs_test_opt(root, DISCARD))
6960 ret = btrfs_discard_extent(root, start, len, NULL);
6961
6962 if (pin) 7185 if (pin)
6963 pin_down_extent(root, cache, start, len, 1); 7186 pin_down_extent(root, cache, start, len, 1);
6964 else { 7187 else {
7188 if (btrfs_test_opt(root, DISCARD))
7189 ret = btrfs_discard_extent(root, start, len, NULL);
6965 btrfs_add_free_space(cache, start, len); 7190 btrfs_add_free_space(cache, start, len);
6966 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 7191 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
6967 } 7192 }
7193
6968 btrfs_put_block_group(cache); 7194 btrfs_put_block_group(cache);
6969 7195
6970 trace_btrfs_reserved_extent_free(root, start, len); 7196 trace_btrfs_reserved_extent_free(root, start, len);
@@ -7095,9 +7321,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7095 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7321 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7096 ins, size); 7322 ins, size);
7097 if (ret) { 7323 if (ret) {
7324 btrfs_free_path(path);
7098 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7325 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7099 root->nodesize); 7326 root->nodesize);
7100 btrfs_free_path(path);
7101 return ret; 7327 return ret;
7102 } 7328 }
7103 7329
@@ -7217,7 +7443,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7217 btrfs_set_header_generation(buf, trans->transid); 7443 btrfs_set_header_generation(buf, trans->transid);
7218 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 7444 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7219 btrfs_tree_lock(buf); 7445 btrfs_tree_lock(buf);
7220 clean_tree_block(trans, root, buf); 7446 clean_tree_block(trans, root->fs_info, buf);
7221 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 7447 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7222 7448
7223 btrfs_set_lock_blocking(buf); 7449 btrfs_set_lock_blocking(buf);
@@ -7815,7 +8041,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7815 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8041 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7816 blocksize = root->nodesize; 8042 blocksize = root->nodesize;
7817 8043
7818 next = btrfs_find_tree_block(root, bytenr); 8044 next = btrfs_find_tree_block(root->fs_info, bytenr);
7819 if (!next) { 8045 if (!next) {
7820 next = btrfs_find_create_tree_block(root, bytenr); 8046 next = btrfs_find_create_tree_block(root, bytenr);
7821 if (!next) 8047 if (!next)
@@ -8016,7 +8242,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8016 btrfs_set_lock_blocking(eb); 8242 btrfs_set_lock_blocking(eb);
8017 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8243 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8018 } 8244 }
8019 clean_tree_block(trans, root, eb); 8245 clean_tree_block(trans, root->fs_info, eb);
8020 } 8246 }
8021 8247
8022 if (eb == root->node) { 8248 if (eb == root->node) {
@@ -8533,10 +8759,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8533 8759
8534 BUG_ON(cache->ro); 8760 BUG_ON(cache->ro);
8535 8761
8762again:
8536 trans = btrfs_join_transaction(root); 8763 trans = btrfs_join_transaction(root);
8537 if (IS_ERR(trans)) 8764 if (IS_ERR(trans))
8538 return PTR_ERR(trans); 8765 return PTR_ERR(trans);
8539 8766
8767 /*
8768 * we're not allowed to set block groups readonly after the dirty
8769 * block groups cache has started writing. If it already started,
8770 * back off and let this transaction commit
8771 */
8772 mutex_lock(&root->fs_info->ro_block_group_mutex);
8773 if (trans->transaction->dirty_bg_run) {
8774 u64 transid = trans->transid;
8775
8776 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8777 btrfs_end_transaction(trans, root);
8778
8779 ret = btrfs_wait_for_commit(root, transid);
8780 if (ret)
8781 return ret;
8782 goto again;
8783 }
8784
8785
8540 ret = set_block_group_ro(cache, 0); 8786 ret = set_block_group_ro(cache, 0);
8541 if (!ret) 8787 if (!ret)
8542 goto out; 8788 goto out;
@@ -8551,6 +8797,7 @@ out:
8551 alloc_flags = update_block_group_flags(root, cache->flags); 8797 alloc_flags = update_block_group_flags(root, cache->flags);
8552 check_system_chunk(trans, root, alloc_flags); 8798 check_system_chunk(trans, root, alloc_flags);
8553 } 8799 }
8800 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8554 8801
8555 btrfs_end_transaction(trans, root); 8802 btrfs_end_transaction(trans, root);
8556 return ret; 8803 return ret;
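
The again: loop added here is an optimistic back-off: if the running transaction has already begun writing out dirty block group caches (dirty_bg_run), it is too late to flip a block group read-only in it, so the code drops the mutex, ends its handle, waits for that transaction to commit, and retries with a fresh one. A compilable sketch of that shape, with trivial stand-ins for the transaction calls:

#include <errno.h>
#include <pthread.h>

struct txn { int dirty_bg_run; long id; };

static pthread_mutex_t ro_lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn current_txn;

/* trivial stand-ins so the sketch is self-contained */
static struct txn *join_transaction(void) { return &current_txn; }
static int end_transaction(struct txn *t) { (void)t; return 0; }
static int wait_for_commit(long id) { (void)id; return 0; }

static int set_block_group_readonly(void)
{
	for (;;) {
		struct txn *t = join_transaction();

		pthread_mutex_lock(&ro_lock);
		if (t->dirty_bg_run) {
			/* cache writeback already started: too late for this
			 * transaction, wait it out and try the next one */
			long id = t->id;

			pthread_mutex_unlock(&ro_lock);
			end_transaction(t);
			if (wait_for_commit(id))
				return -EIO;
			continue;
		}
		/* ... safe to flip the block group read-only here ... */
		pthread_mutex_unlock(&ro_lock);
		return end_transaction(t);
	}
}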
@@ -8720,7 +8967,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8720 min_free <<= 1; 8967 min_free <<= 1;
8721 } else if (index == BTRFS_RAID_RAID0) { 8968 } else if (index == BTRFS_RAID_RAID0) {
8722 dev_min = fs_devices->rw_devices; 8969 dev_min = fs_devices->rw_devices;
8723 do_div(min_free, dev_min); 8970 min_free = div64_u64(min_free, dev_min);
8724 } 8971 }
8725 8972
8726 /* We need to do this so that we can look at pending chunks */ 8973 /* We need to do this so that we can look at pending chunks */
@@ -8992,6 +9239,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8992 INIT_LIST_HEAD(&cache->bg_list); 9239 INIT_LIST_HEAD(&cache->bg_list);
8993 INIT_LIST_HEAD(&cache->ro_list); 9240 INIT_LIST_HEAD(&cache->ro_list);
8994 INIT_LIST_HEAD(&cache->dirty_list); 9241 INIT_LIST_HEAD(&cache->dirty_list);
9242 INIT_LIST_HEAD(&cache->io_list);
8995 btrfs_init_free_space_ctl(cache); 9243 btrfs_init_free_space_ctl(cache);
8996 atomic_set(&cache->trimming, 0); 9244 atomic_set(&cache->trimming, 0);
8997 9245
@@ -9355,7 +9603,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9355 goto out; 9603 goto out;
9356 } 9604 }
9357 9605
9606 /*
9607 * get the inode first so any iput calls done for the io_list
9608 * aren't the final iput (no unlinks allowed now)
9609 */
9358 inode = lookup_free_space_inode(tree_root, block_group, path); 9610 inode = lookup_free_space_inode(tree_root, block_group, path);
9611
9612 mutex_lock(&trans->transaction->cache_write_mutex);
9613 /*
 9614 * make sure our free space cache IO is done before removing
 9615 * the free space inode
9616 */
9617 spin_lock(&trans->transaction->dirty_bgs_lock);
9618 if (!list_empty(&block_group->io_list)) {
9619 list_del_init(&block_group->io_list);
9620
9621 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9622
9623 spin_unlock(&trans->transaction->dirty_bgs_lock);
9624 btrfs_wait_cache_io(root, trans, block_group,
9625 &block_group->io_ctl, path,
9626 block_group->key.objectid);
9627 btrfs_put_block_group(block_group);
9628 spin_lock(&trans->transaction->dirty_bgs_lock);
9629 }
9630
9631 if (!list_empty(&block_group->dirty_list)) {
9632 list_del_init(&block_group->dirty_list);
9633 btrfs_put_block_group(block_group);
9634 }
9635 spin_unlock(&trans->transaction->dirty_bgs_lock);
9636 mutex_unlock(&trans->transaction->cache_write_mutex);
9637
9359 if (!IS_ERR(inode)) { 9638 if (!IS_ERR(inode)) {
9360 ret = btrfs_orphan_add(trans, inode); 9639 ret = btrfs_orphan_add(trans, inode);
9361 if (ret) { 9640 if (ret) {
@@ -9448,18 +9727,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9448 9727
9449 spin_lock(&trans->transaction->dirty_bgs_lock); 9728 spin_lock(&trans->transaction->dirty_bgs_lock);
9450 if (!list_empty(&block_group->dirty_list)) { 9729 if (!list_empty(&block_group->dirty_list)) {
9451 list_del_init(&block_group->dirty_list); 9730 WARN_ON(1);
9452 btrfs_put_block_group(block_group); 9731 }
9732 if (!list_empty(&block_group->io_list)) {
9733 WARN_ON(1);
9453 } 9734 }
9454 spin_unlock(&trans->transaction->dirty_bgs_lock); 9735 spin_unlock(&trans->transaction->dirty_bgs_lock);
9455
9456 btrfs_remove_free_space_cache(block_group); 9736 btrfs_remove_free_space_cache(block_group);
9457 9737
9458 spin_lock(&block_group->space_info->lock); 9738 spin_lock(&block_group->space_info->lock);
9459 list_del_init(&block_group->ro_list); 9739 list_del_init(&block_group->ro_list);
9740
9741 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
9742 WARN_ON(block_group->space_info->total_bytes
9743 < block_group->key.offset);
9744 WARN_ON(block_group->space_info->bytes_readonly
9745 < block_group->key.offset);
9746 WARN_ON(block_group->space_info->disk_total
9747 < block_group->key.offset * factor);
9748 }
9460 block_group->space_info->total_bytes -= block_group->key.offset; 9749 block_group->space_info->total_bytes -= block_group->key.offset;
9461 block_group->space_info->bytes_readonly -= block_group->key.offset; 9750 block_group->space_info->bytes_readonly -= block_group->key.offset;
9462 block_group->space_info->disk_total -= block_group->key.offset * factor; 9751 block_group->space_info->disk_total -= block_group->key.offset * factor;
9752
9463 spin_unlock(&block_group->space_info->lock); 9753 spin_unlock(&block_group->space_info->lock);
9464 9754
9465 memcpy(&key, &block_group->key, sizeof(key)); 9755 memcpy(&key, &block_group->key, sizeof(key));
@@ -9647,8 +9937,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9647 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 9937 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9648 9938
9649 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9939 /* Reset pinned so btrfs_put_block_group doesn't complain */
9940 spin_lock(&space_info->lock);
9941 spin_lock(&block_group->lock);
9942
9943 space_info->bytes_pinned -= block_group->pinned;
9944 space_info->bytes_readonly += block_group->pinned;
9945 percpu_counter_add(&space_info->total_bytes_pinned,
9946 -block_group->pinned);
9650 block_group->pinned = 0; 9947 block_group->pinned = 0;
9651 9948
9949 spin_unlock(&block_group->lock);
9950 spin_unlock(&space_info->lock);
9951
9652 /* 9952 /*
9653 * Btrfs_remove_chunk will abort the transaction if things go 9953 * Btrfs_remove_chunk will abort the transaction if things go
9654 * horribly wrong. 9954 * horribly wrong.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d688cfe5d496..782f3bc4651d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4514 } 4514 }
4515 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4515 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4516 em_len, flags); 4516 em_len, flags);
4517 if (ret) 4517 if (ret) {
4518 if (ret == 1)
4519 ret = 0;
4518 goto out_free; 4520 goto out_free;
4521 }
4519 } 4522 }
4520out_free: 4523out_free:
4521 free_extent_map(em); 4524 free_extent_map(em);
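
fiemap_fill_next_extent() returns 1 once the caller's extent buffer is full, which is a stop signal rather than a failure; the hunk above maps that 1 to 0 so userspace no longer sees a spurious error. A small model of the consuming loop:

/* fill_next() models fiemap_fill_next_extent(): it returns 1, not an
 * error, once the user-supplied extent buffer is full */
static int fill_next(int *slots_left)
{
	if (*slots_left == 0)
		return 1;
	(*slots_left)--;
	return 0;
}

static int walk_extents(int extents, int slots)
{
	int ret = 0;

	while (extents--) {
		ret = fill_next(&slots);
		if (ret) {
			if (ret == 1)
				ret = 0;   /* a full buffer ends the walk cleanly */
			break;
		}
	}
	return ret;
}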
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 695b0ccfb755..c668f36898d3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -97,7 +97,7 @@ struct extent_io_tree {
97 u64 dirty_bytes; 97 u64 dirty_bytes;
98 int track_uptodate; 98 int track_uptodate;
99 spinlock_t lock; 99 spinlock_t lock;
100 struct extent_io_ops *ops; 100 const struct extent_io_ops *ops;
101}; 101};
102 102
103struct extent_state { 103struct extent_state {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..58ece6558430 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
185 nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; 185 nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
186 if (!dst) { 186 if (!dst) {
187 if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { 187 if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
188 btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, 188 btrfs_bio->csum_allocated = kmalloc_array(nblocks,
189 GFP_NOFS); 189 csum_size, GFP_NOFS);
190 if (!btrfs_bio->csum_allocated) { 190 if (!btrfs_bio->csum_allocated) {
191 btrfs_free_path(path); 191 btrfs_free_path(path);
192 return -ENOMEM; 192 return -ENOMEM;
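
The switch to kmalloc_array() here (and kcalloc() elsewhere in this series) is about overflow: kmalloc(nblocks * csum_size, ...) silently wraps if the product overflows, yielding an undersized buffer. The userspace equivalent of the check those helpers perform:

#include <stdint.h>
#include <stdlib.h>

/* overflow-checked array allocation, the idea behind kmalloc_array() */
static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)
		return NULL;            /* n * size would wrap around */
	return malloc(n * size);
}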
@@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
553 btrfs_truncate_item(root, path, new_size, 0); 553 btrfs_truncate_item(root, path, new_size, 0);
554 554
555 key->offset = end_byte; 555 key->offset = end_byte;
556 btrfs_set_item_key_safe(root, path, key); 556 btrfs_set_item_key_safe(root->fs_info, path, key);
557 } else { 557 } else {
558 BUG(); 558 BUG();
559 } 559 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index faa7d390841b..467620a3b1f9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
273 defrag = rb_entry(node, struct inode_defrag, rb_node); 273 defrag = rb_entry(node, struct inode_defrag, rb_node);
274 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 274 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
275 275
276 if (need_resched()) { 276 cond_resched_lock(&fs_info->defrag_inodes_lock);
277 spin_unlock(&fs_info->defrag_inodes_lock);
278 cond_resched();
279 spin_lock(&fs_info->defrag_inodes_lock);
280 }
281 277
282 node = rb_first(&fs_info->defrag_inodes); 278 node = rb_first(&fs_info->defrag_inodes);
283 } 279 }
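
cond_resched_lock() collapses the hand-rolled need_resched()/unlock/resched/relock dance into a single helper that only cycles the lock when a reschedule is actually pending. What each old call site spelled out, modeled with pthreads:

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int should_yield(void) { return 0; }  /* stand-in for need_resched() */

/* what the old code repeated by hand at every call site */
static void cond_resched_lock_model(pthread_mutex_t *l)
{
	if (should_yield()) {
		pthread_mutex_unlock(l);
		sched_yield();          /* give the scheduler a chance */
		pthread_mutex_lock(l);
	}
}

static void drain(void)
{
	pthread_mutex_lock(&tree_lock);
	for (int i = 0; i < 128; i++) {
		/* ... free one cached item ... */
		cond_resched_lock_model(&tree_lock);  /* one call, five lines gone */
	}
	pthread_mutex_unlock(&tree_lock);
}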
@@ -868,7 +864,7 @@ next_slot:
868 864
869 memcpy(&new_key, &key, sizeof(new_key)); 865 memcpy(&new_key, &key, sizeof(new_key));
870 new_key.offset = end; 866 new_key.offset = end;
871 btrfs_set_item_key_safe(root, path, &new_key); 867 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
872 868
873 extent_offset += end - key.offset; 869 extent_offset += end - key.offset;
874 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 870 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1126,7 +1122,7 @@ again:
1126 ino, bytenr, orig_offset, 1122 ino, bytenr, orig_offset,
1127 &other_start, &other_end)) { 1123 &other_start, &other_end)) {
1128 new_key.offset = end; 1124 new_key.offset = end;
1129 btrfs_set_item_key_safe(root, path, &new_key); 1125 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1130 fi = btrfs_item_ptr(leaf, path->slots[0], 1126 fi = btrfs_item_ptr(leaf, path->slots[0],
1131 struct btrfs_file_extent_item); 1127 struct btrfs_file_extent_item);
1132 btrfs_set_file_extent_generation(leaf, fi, 1128 btrfs_set_file_extent_generation(leaf, fi,
@@ -1160,7 +1156,7 @@ again:
1160 trans->transid); 1156 trans->transid);
1161 path->slots[0]++; 1157 path->slots[0]++;
1162 new_key.offset = start; 1158 new_key.offset = start;
1163 btrfs_set_item_key_safe(root, path, &new_key); 1159 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1164 1160
1165 fi = btrfs_item_ptr(leaf, path->slots[0], 1161 fi = btrfs_item_ptr(leaf, path->slots[0],
1166 struct btrfs_file_extent_item); 1162 struct btrfs_file_extent_item);
@@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1485 PAGE_CACHE_SIZE / (sizeof(struct page *))); 1481 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1482 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1487 nrptrs = max(nrptrs, 8); 1483 nrptrs = max(nrptrs, 8);
1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1484 pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1489 if (!pages) 1485 if (!pages)
1490 return -ENOMEM; 1486 return -ENOMEM;
1491 1487
@@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1514 } 1510 }
1515 1511
1516 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1512 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1517 ret = btrfs_check_data_free_space(inode, reserve_bytes); 1513 ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
1518 if (ret == -ENOSPC && 1514 if (ret == -ENOSPC &&
1519 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1515 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1520 BTRFS_INODE_PREALLOC))) { 1516 BTRFS_INODE_PREALLOC))) {
@@ -1635,8 +1631,8 @@ again:
1635 btrfs_end_write_no_snapshoting(root); 1631 btrfs_end_write_no_snapshoting(root);
1636 1632
1637 if (only_release_metadata && copied > 0) { 1633 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1634 lockstart = round_down(pos, root->sectorsize);
1639 u64 lockend = lockstart + 1635 lockend = lockstart +
1640 (dirty_pages << PAGE_CACHE_SHIFT) - 1; 1636 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1641 1637
1642 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1638 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -1809,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1809 * otherwise subsequent syncs to a file that's been synced in this 1805 * otherwise subsequent syncs to a file that's been synced in this
1810 * transaction will appear to have already occurred. 1806 * transaction will appear to have already occurred.
1811 */ 1807 */
1808 spin_lock(&BTRFS_I(inode)->lock);
1812 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1809 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1810 spin_unlock(&BTRFS_I(inode)->lock);
1813 if (num_written > 0) { 1811 if (num_written > 0) {
1814 err = generic_write_sync(file, pos, num_written); 1812 err = generic_write_sync(file, pos, num_written);
1815 if (err < 0) 1813 if (err < 0)
@@ -2162,7 +2160,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2162 u64 num_bytes; 2160 u64 num_bytes;
2163 2161
2164 key.offset = offset; 2162 key.offset = offset;
2165 btrfs_set_item_key_safe(root, path, &key); 2163 btrfs_set_item_key_safe(root->fs_info, path, &key);
2166 fi = btrfs_item_ptr(leaf, path->slots[0], 2164 fi = btrfs_item_ptr(leaf, path->slots[0],
2167 struct btrfs_file_extent_item); 2165 struct btrfs_file_extent_item);
2168 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2166 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2545,7 +2543,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2545{ 2543{
2546 struct inode *inode = file_inode(file); 2544 struct inode *inode = file_inode(file);
2547 struct extent_state *cached_state = NULL; 2545 struct extent_state *cached_state = NULL;
2548 struct btrfs_root *root = BTRFS_I(inode)->root;
2549 u64 cur_offset; 2546 u64 cur_offset;
2550 u64 last_byte; 2547 u64 last_byte;
2551 u64 alloc_start; 2548 u64 alloc_start;
@@ -2570,14 +2567,9 @@ static long btrfs_fallocate(struct file *file, int mode,
2570 * Make sure we have enough space before we do the 2567 * Make sure we have enough space before we do the
2571 * allocation. 2568 * allocation.
2572 */ 2569 */
2573 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2570 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
2574 if (ret) 2571 if (ret)
2575 return ret; 2572 return ret;
2576 if (root->fs_info->quota_enabled) {
2577 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2578 if (ret)
2579 goto out_reserve_fail;
2580 }
2581 2573
2582 mutex_lock(&inode->i_mutex); 2574 mutex_lock(&inode->i_mutex);
2583 ret = inode_newsize_ok(inode, alloc_end); 2575 ret = inode_newsize_ok(inode, alloc_end);
@@ -2667,23 +2659,35 @@ static long btrfs_fallocate(struct file *file, int mode,
2667 1 << inode->i_blkbits, 2659 1 << inode->i_blkbits,
2668 offset + len, 2660 offset + len,
2669 &alloc_hint); 2661 &alloc_hint);
2670
2671 if (ret < 0) {
2672 free_extent_map(em);
2673 break;
2674 }
2675 } else if (actual_end > inode->i_size && 2662 } else if (actual_end > inode->i_size &&
2676 !(mode & FALLOC_FL_KEEP_SIZE)) { 2663 !(mode & FALLOC_FL_KEEP_SIZE)) {
2664 struct btrfs_trans_handle *trans;
2665 struct btrfs_root *root = BTRFS_I(inode)->root;
2666
2677 /* 2667 /*
2678 * We didn't need to allocate any more space, but we 2668 * We didn't need to allocate any more space, but we
2679 * still extended the size of the file so we need to 2669 * still extended the size of the file so we need to
2680 * update i_size. 2670 * update i_size and the inode item.
2681 */ 2671 */
2682 inode->i_ctime = CURRENT_TIME; 2672 trans = btrfs_start_transaction(root, 1);
2683 i_size_write(inode, actual_end); 2673 if (IS_ERR(trans)) {
2684 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2674 ret = PTR_ERR(trans);
2675 } else {
2676 inode->i_ctime = CURRENT_TIME;
2677 i_size_write(inode, actual_end);
2678 btrfs_ordered_update_i_size(inode, actual_end,
2679 NULL);
2680 ret = btrfs_update_inode(trans, root, inode);
2681 if (ret)
2682 btrfs_end_transaction(trans, root);
2683 else
2684 ret = btrfs_end_transaction(trans,
2685 root);
2686 }
2685 } 2687 }
2686 free_extent_map(em); 2688 free_extent_map(em);
2689 if (ret < 0)
2690 break;
2687 2691
2688 cur_offset = last_byte; 2692 cur_offset = last_byte;
2689 if (cur_offset >= alloc_end) { 2693 if (cur_offset >= alloc_end) {
@@ -2695,9 +2699,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2695 &cached_state, GFP_NOFS); 2699 &cached_state, GFP_NOFS);
2696out: 2700out:
2697 mutex_unlock(&inode->i_mutex); 2701 mutex_unlock(&inode->i_mutex);
2698 if (root->fs_info->quota_enabled)
2699 btrfs_qgroup_free(root, alloc_end - alloc_start);
2700out_reserve_fail:
2701 /* Let go of our reservation. */ 2702 /* Let go of our reservation. */
2702 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2703 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2703 return ret; 2704 return ret;
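
Note the error handling around the new transaction in the fallocate hunk: btrfs_end_transaction() must run on both paths, but when btrfs_update_inode() has already failed, the end call's return value must not clobber that first error. The pattern, reduced to a sketch with stand-in calls:

/* stand-ins so the sketch is self-contained */
static int update_inode(void) { return 0; }
static int end_transaction(void) { return 0; }

static int extend_isize(void)
{
	int ret = update_inode();

	if (ret)
		end_transaction();        /* best effort; the first error wins */
	else
		ret = end_transaction();  /* its failure is now the result */
	return ret;
}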
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..253cb74b0e27 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
85 } 85 }
86 86
87 mapping_set_gfp_mask(inode->i_mapping, 87 mapping_set_gfp_mask(inode->i_mapping,
88 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 88 mapping_gfp_mask(inode->i_mapping) &
89 ~(GFP_NOFS & ~__GFP_HIGHMEM));
89 90
90 return inode; 91 return inode;
91} 92}
@@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
170 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 171 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
171 key.offset = offset; 172 key.offset = offset;
172 key.type = 0; 173 key.type = 0;
173
174 ret = btrfs_insert_empty_item(trans, root, path, &key, 174 ret = btrfs_insert_empty_item(trans, root, path, &key,
175 sizeof(struct btrfs_free_space_header)); 175 sizeof(struct btrfs_free_space_header));
176 if (ret < 0) { 176 if (ret < 0) {
177 btrfs_release_path(path); 177 btrfs_release_path(path);
178 return ret; 178 return ret;
179 } 179 }
180
180 leaf = path->nodes[0]; 181 leaf = path->nodes[0];
181 header = btrfs_item_ptr(leaf, path->slots[0], 182 header = btrfs_item_ptr(leaf, path->slots[0],
182 struct btrfs_free_space_header); 183 struct btrfs_free_space_header);
@@ -225,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
225 226
226int btrfs_truncate_free_space_cache(struct btrfs_root *root, 227int btrfs_truncate_free_space_cache(struct btrfs_root *root,
227 struct btrfs_trans_handle *trans, 228 struct btrfs_trans_handle *trans,
229 struct btrfs_block_group_cache *block_group,
228 struct inode *inode) 230 struct inode *inode)
229{ 231{
230 int ret = 0; 232 int ret = 0;
233 struct btrfs_path *path = btrfs_alloc_path();
234
235 if (!path) {
236 ret = -ENOMEM;
237 goto fail;
238 }
239
240 if (block_group) {
241 mutex_lock(&trans->transaction->cache_write_mutex);
242 if (!list_empty(&block_group->io_list)) {
243 list_del_init(&block_group->io_list);
244
245 btrfs_wait_cache_io(root, trans, block_group,
246 &block_group->io_ctl, path,
247 block_group->key.objectid);
248 btrfs_put_block_group(block_group);
249 }
250
251 /*
 252 * now that we've truncated the cache away, it's no longer
 253 * set up or written
254 */
255 spin_lock(&block_group->lock);
256 block_group->disk_cache_state = BTRFS_DC_CLEAR;
257 spin_unlock(&block_group->lock);
258 }
259 btrfs_free_path(path);
231 260
232 btrfs_i_size_write(inode, 0); 261 btrfs_i_size_write(inode, 0);
233 truncate_pagecache(inode, 0); 262 truncate_pagecache(inode, 0);
@@ -235,15 +264,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
235 /* 264 /*
236 * We don't need an orphan item because truncating the free space cache 265 * We don't need an orphan item because truncating the free space cache
237 * will never be split across transactions. 266 * will never be split across transactions.
267 * We don't need to check for -EAGAIN because we're a free space
268 * cache inode
238 */ 269 */
239 ret = btrfs_truncate_inode_items(trans, root, inode, 270 ret = btrfs_truncate_inode_items(trans, root, inode,
240 0, BTRFS_EXTENT_DATA_KEY); 271 0, BTRFS_EXTENT_DATA_KEY);
241 if (ret) { 272 if (ret) {
273 mutex_unlock(&trans->transaction->cache_write_mutex);
242 btrfs_abort_transaction(trans, root, ret); 274 btrfs_abort_transaction(trans, root, ret);
243 return ret; 275 return ret;
244 } 276 }
245 277
246 ret = btrfs_update_inode(trans, root, inode); 278 ret = btrfs_update_inode(trans, root, inode);
279
280 if (block_group)
281 mutex_unlock(&trans->transaction->cache_write_mutex);
282
283fail:
247 if (ret) 284 if (ret)
248 btrfs_abort_transaction(trans, root, ret); 285 btrfs_abort_transaction(trans, root, ret);
249 286
@@ -269,18 +306,7 @@ static int readahead_cache(struct inode *inode)
269 return 0; 306 return 0;
270} 307}
271 308
272struct io_ctl { 309static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
273 void *cur, *orig;
274 struct page *page;
275 struct page **pages;
276 struct btrfs_root *root;
277 unsigned long size;
278 int index;
279 int num_pages;
280 unsigned check_crcs:1;
281};
282
283static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
284 struct btrfs_root *root, int write) 310 struct btrfs_root *root, int write)
285{ 311{
286 int num_pages; 312 int num_pages;
@@ -296,45 +322,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
296 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) 322 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
297 return -ENOSPC; 323 return -ENOSPC;
298 324
299 memset(io_ctl, 0, sizeof(struct io_ctl)); 325 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
300 326
301 io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 327 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
302 if (!io_ctl->pages) 328 if (!io_ctl->pages)
303 return -ENOMEM; 329 return -ENOMEM;
304 330
305 io_ctl->num_pages = num_pages; 331 io_ctl->num_pages = num_pages;
306 io_ctl->root = root; 332 io_ctl->root = root;
307 io_ctl->check_crcs = check_crcs; 333 io_ctl->check_crcs = check_crcs;
334 io_ctl->inode = inode;
308 335
309 return 0; 336 return 0;
310} 337}
311 338
312static void io_ctl_free(struct io_ctl *io_ctl) 339static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
313{ 340{
314 kfree(io_ctl->pages); 341 kfree(io_ctl->pages);
342 io_ctl->pages = NULL;
315} 343}
316 344
317static void io_ctl_unmap_page(struct io_ctl *io_ctl) 345static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
318{ 346{
319 if (io_ctl->cur) { 347 if (io_ctl->cur) {
320 kunmap(io_ctl->page);
321 io_ctl->cur = NULL; 348 io_ctl->cur = NULL;
322 io_ctl->orig = NULL; 349 io_ctl->orig = NULL;
323 } 350 }
324} 351}
325 352
326static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 353static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
327{ 354{
328 ASSERT(io_ctl->index < io_ctl->num_pages); 355 ASSERT(io_ctl->index < io_ctl->num_pages);
329 io_ctl->page = io_ctl->pages[io_ctl->index++]; 356 io_ctl->page = io_ctl->pages[io_ctl->index++];
330 io_ctl->cur = kmap(io_ctl->page); 357 io_ctl->cur = page_address(io_ctl->page);
331 io_ctl->orig = io_ctl->cur; 358 io_ctl->orig = io_ctl->cur;
332 io_ctl->size = PAGE_CACHE_SIZE; 359 io_ctl->size = PAGE_CACHE_SIZE;
333 if (clear) 360 if (clear)
334 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); 361 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
335} 362}
336 363
337static void io_ctl_drop_pages(struct io_ctl *io_ctl) 364static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
338{ 365{
339 int i; 366 int i;
340 367
@@ -349,7 +376,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
349 } 376 }
350} 377}
351 378
352static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, 379static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
353 int uptodate) 380 int uptodate)
354{ 381{
355 struct page *page; 382 struct page *page;
@@ -383,7 +410,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
383 return 0; 410 return 0;
384} 411}
385 412
386static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 413static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
387{ 414{
388 __le64 *val; 415 __le64 *val;
389 416
@@ -406,7 +433,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
406 io_ctl->cur += sizeof(u64); 433 io_ctl->cur += sizeof(u64);
407} 434}
408 435
409static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) 436static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
410{ 437{
411 __le64 *gen; 438 __le64 *gen;
412 439
@@ -435,7 +462,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
435 return 0; 462 return 0;
436} 463}
437 464
438static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) 465static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
439{ 466{
440 u32 *tmp; 467 u32 *tmp;
441 u32 crc = ~(u32)0; 468 u32 crc = ~(u32)0;
@@ -453,13 +480,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
453 PAGE_CACHE_SIZE - offset); 480 PAGE_CACHE_SIZE - offset);
454 btrfs_csum_final(crc, (char *)&crc); 481 btrfs_csum_final(crc, (char *)&crc);
455 io_ctl_unmap_page(io_ctl); 482 io_ctl_unmap_page(io_ctl);
456 tmp = kmap(io_ctl->pages[0]); 483 tmp = page_address(io_ctl->pages[0]);
457 tmp += index; 484 tmp += index;
458 *tmp = crc; 485 *tmp = crc;
459 kunmap(io_ctl->pages[0]);
460} 486}
461 487
462static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) 488static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
463{ 489{
464 u32 *tmp, val; 490 u32 *tmp, val;
465 u32 crc = ~(u32)0; 491 u32 crc = ~(u32)0;
@@ -473,10 +499,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
473 if (index == 0) 499 if (index == 0)
474 offset = sizeof(u32) * io_ctl->num_pages; 500 offset = sizeof(u32) * io_ctl->num_pages;
475 501
476 tmp = kmap(io_ctl->pages[0]); 502 tmp = page_address(io_ctl->pages[0]);
477 tmp += index; 503 tmp += index;
478 val = *tmp; 504 val = *tmp;
479 kunmap(io_ctl->pages[0]);
480 505
481 io_ctl_map_page(io_ctl, 0); 506 io_ctl_map_page(io_ctl, 0);
482 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 507 crc = btrfs_csum_data(io_ctl->orig + offset, crc,
@@ -492,7 +517,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
492 return 0; 517 return 0;
493} 518}
494 519
495static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, 520static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
496 void *bitmap) 521 void *bitmap)
497{ 522{
498 struct btrfs_free_space_entry *entry; 523 struct btrfs_free_space_entry *entry;
@@ -522,7 +547,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
522 return 0; 547 return 0;
523} 548}
524 549
525static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) 550static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
526{ 551{
527 if (!io_ctl->cur) 552 if (!io_ctl->cur)
528 return -ENOSPC; 553 return -ENOSPC;
@@ -545,7 +570,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
545 return 0; 570 return 0;
546} 571}
547 572
548static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) 573static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
549{ 574{
550 /* 575 /*
551 * If we're not on the boundary we know we've modified the page and we 576 * If we're not on the boundary we know we've modified the page and we
@@ -562,7 +587,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
562 } 587 }
563} 588}
564 589
565static int io_ctl_read_entry(struct io_ctl *io_ctl, 590static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
566 struct btrfs_free_space *entry, u8 *type) 591 struct btrfs_free_space *entry, u8 *type)
567{ 592{
568 struct btrfs_free_space_entry *e; 593 struct btrfs_free_space_entry *e;
@@ -589,7 +614,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
589 return 0; 614 return 0;
590} 615}
591 616
592static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 617static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
593 struct btrfs_free_space *entry) 618 struct btrfs_free_space *entry)
594{ 619{
595 int ret; 620 int ret;
@@ -648,7 +673,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
648{ 673{
649 struct btrfs_free_space_header *header; 674 struct btrfs_free_space_header *header;
650 struct extent_buffer *leaf; 675 struct extent_buffer *leaf;
651 struct io_ctl io_ctl; 676 struct btrfs_io_ctl io_ctl;
652 struct btrfs_key key; 677 struct btrfs_key key;
653 struct btrfs_free_space *e, *n; 678 struct btrfs_free_space *e, *n;
654 LIST_HEAD(bitmaps); 679 LIST_HEAD(bitmaps);
@@ -877,7 +902,7 @@ out:
877} 902}
878 903
879static noinline_for_stack 904static noinline_for_stack
880int write_cache_extent_entries(struct io_ctl *io_ctl, 905int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
881 struct btrfs_free_space_ctl *ctl, 906 struct btrfs_free_space_ctl *ctl,
882 struct btrfs_block_group_cache *block_group, 907 struct btrfs_block_group_cache *block_group,
883 int *entries, int *bitmaps, 908 int *entries, int *bitmaps,
@@ -885,6 +910,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
885{ 910{
886 int ret; 911 int ret;
887 struct btrfs_free_cluster *cluster = NULL; 912 struct btrfs_free_cluster *cluster = NULL;
913 struct btrfs_free_cluster *cluster_locked = NULL;
888 struct rb_node *node = rb_first(&ctl->free_space_offset); 914 struct rb_node *node = rb_first(&ctl->free_space_offset);
889 struct btrfs_trim_range *trim_entry; 915 struct btrfs_trim_range *trim_entry;
890 916
@@ -896,6 +922,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
896 } 922 }
897 923
898 if (!node && cluster) { 924 if (!node && cluster) {
925 cluster_locked = cluster;
926 spin_lock(&cluster_locked->lock);
899 node = rb_first(&cluster->root); 927 node = rb_first(&cluster->root);
900 cluster = NULL; 928 cluster = NULL;
901 } 929 }
@@ -919,9 +947,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
919 node = rb_next(node); 947 node = rb_next(node);
920 if (!node && cluster) { 948 if (!node && cluster) {
921 node = rb_first(&cluster->root); 949 node = rb_first(&cluster->root);
950 cluster_locked = cluster;
951 spin_lock(&cluster_locked->lock);
922 cluster = NULL; 952 cluster = NULL;
923 } 953 }
924 } 954 }
955 if (cluster_locked) {
956 spin_unlock(&cluster_locked->lock);
957 cluster_locked = NULL;
958 }
925 959
926 /* 960 /*
927 * Make sure we don't miss any range that was removed from our rbtree 961 * Make sure we don't miss any range that was removed from our rbtree
@@ -939,6 +973,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
939 973
940 return 0; 974 return 0;
941fail: 975fail:
976 if (cluster_locked)
977 spin_unlock(&cluster_locked->lock);
942 return -ENOSPC; 978 return -ENOSPC;
943} 979}
944 980
@@ -1000,7 +1036,7 @@ fail:
1000static noinline_for_stack int 1036static noinline_for_stack int
1001write_pinned_extent_entries(struct btrfs_root *root, 1037write_pinned_extent_entries(struct btrfs_root *root,
1002 struct btrfs_block_group_cache *block_group, 1038 struct btrfs_block_group_cache *block_group,
1003 struct io_ctl *io_ctl, 1039 struct btrfs_io_ctl *io_ctl,
1004 int *entries) 1040 int *entries)
1005{ 1041{
1006 u64 start, extent_start, extent_end, len; 1042 u64 start, extent_start, extent_end, len;
@@ -1050,7 +1086,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
1050} 1086}
1051 1087
1052static noinline_for_stack int 1088static noinline_for_stack int
1053write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) 1089write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1054{ 1090{
1055 struct list_head *pos, *n; 1091 struct list_head *pos, *n;
1056 int ret; 1092 int ret;
@@ -1084,7 +1120,7 @@ static int flush_dirty_cache(struct inode *inode)
1084 1120
1085static void noinline_for_stack 1121static void noinline_for_stack
1086cleanup_write_cache_enospc(struct inode *inode, 1122cleanup_write_cache_enospc(struct inode *inode,
1087 struct io_ctl *io_ctl, 1123 struct btrfs_io_ctl *io_ctl,
1088 struct extent_state **cached_state, 1124 struct extent_state **cached_state,
1089 struct list_head *bitmap_list) 1125 struct list_head *bitmap_list)
1090{ 1126{
@@ -1101,6 +1137,70 @@ cleanup_write_cache_enospc(struct inode *inode,
1101 GFP_NOFS); 1137 GFP_NOFS);
1102} 1138}
1103 1139
1140int btrfs_wait_cache_io(struct btrfs_root *root,
1141 struct btrfs_trans_handle *trans,
1142 struct btrfs_block_group_cache *block_group,
1143 struct btrfs_io_ctl *io_ctl,
1144 struct btrfs_path *path, u64 offset)
1145{
1146 int ret;
1147 struct inode *inode = io_ctl->inode;
1148
1149 if (!inode)
1150 return 0;
1151
1152 root = root->fs_info->tree_root;
1153
1154 /* Flush the dirty pages in the cache file. */
1155 ret = flush_dirty_cache(inode);
1156 if (ret)
1157 goto out;
1158
1159 /* Update the cache item to tell everyone this cache file is valid. */
1160 ret = update_cache_item(trans, root, inode, path, offset,
1161 io_ctl->entries, io_ctl->bitmaps);
1162out:
1163 io_ctl_free(io_ctl);
1164 if (ret) {
1165 invalidate_inode_pages2(inode->i_mapping);
1166 BTRFS_I(inode)->generation = 0;
1167 if (block_group) {
1168#ifdef DEBUG
1169 btrfs_err(root->fs_info,
1170 "failed to write free space cache for block group %llu",
1171 block_group->key.objectid);
1172#endif
1173 }
1174 }
1175 btrfs_update_inode(trans, root, inode);
1176
1177 if (block_group) {
1178 /* the dirty list is protected by the dirty_bgs_lock */
1179 spin_lock(&trans->transaction->dirty_bgs_lock);
1180
1181 /* the disk_cache_state is protected by the block group lock */
1182 spin_lock(&block_group->lock);
1183
1184 /*
1185 * only mark this as written if we didn't get put back on
1186 * the dirty list while waiting for IO. Otherwise our
1187 * cache state won't be right, and we won't get written again
1188 */
1189 if (!ret && list_empty(&block_group->dirty_list))
1190 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1191 else if (ret)
1192 block_group->disk_cache_state = BTRFS_DC_ERROR;
1193
1194 spin_unlock(&block_group->lock);
1195 spin_unlock(&trans->transaction->dirty_bgs_lock);
1196 io_ctl->inode = NULL;
1197 iput(inode);
1198 }
1199
1200 return ret;
1201
1202}
1203
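
btrfs_wait_cache_io() is the second half of a submit/wait split: __btrfs_write_out_cache() now returns as soon as the cache pages are under writeback, so a caller can kick off several cache writes and only then wait on each. The same two-phase shape in plain POSIX AIO, purely as an illustration (link with -lrt on older glibc):

#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[4096];
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };
	int ok, fd = open("cache.bin", O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);

	if (aio_write(&cb)) {            /* phase 1: submit, return at once */
		close(fd);
		return 1;
	}
	/* ... submit more writes, do unrelated work ... */

	aio_suspend(list, 1, NULL);      /* phase 2: wait for completion */
	ok = (aio_error(&cb) == 0 && aio_return(&cb) == (ssize_t)sizeof(buf));
	close(fd);
	return ok ? 0 : 1;
}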
1104/** 1204/**
1105 * __btrfs_write_out_cache - write out cached info to an inode 1205 * __btrfs_write_out_cache - write out cached info to an inode
1106 * @root - the root the inode belongs to 1206 * @root - the root the inode belongs to
@@ -1117,20 +1217,22 @@ cleanup_write_cache_enospc(struct inode *inode,
1117static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1217static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1118 struct btrfs_free_space_ctl *ctl, 1218 struct btrfs_free_space_ctl *ctl,
1119 struct btrfs_block_group_cache *block_group, 1219 struct btrfs_block_group_cache *block_group,
1220 struct btrfs_io_ctl *io_ctl,
1120 struct btrfs_trans_handle *trans, 1221 struct btrfs_trans_handle *trans,
1121 struct btrfs_path *path, u64 offset) 1222 struct btrfs_path *path, u64 offset)
1122{ 1223{
1123 struct extent_state *cached_state = NULL; 1224 struct extent_state *cached_state = NULL;
1124 struct io_ctl io_ctl;
1125 LIST_HEAD(bitmap_list); 1225 LIST_HEAD(bitmap_list);
1126 int entries = 0; 1226 int entries = 0;
1127 int bitmaps = 0; 1227 int bitmaps = 0;
1128 int ret; 1228 int ret;
1229 int must_iput = 0;
1129 1230
1130 if (!i_size_read(inode)) 1231 if (!i_size_read(inode))
1131 return -1; 1232 return -1;
1132 1233
1133 ret = io_ctl_init(&io_ctl, inode, root, 1); 1234 WARN_ON(io_ctl->pages);
1235 ret = io_ctl_init(io_ctl, inode, root, 1);
1134 if (ret) 1236 if (ret)
1135 return -1; 1237 return -1;
1136 1238
@@ -1143,24 +1245,27 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1143 up_write(&block_group->data_rwsem); 1245 up_write(&block_group->data_rwsem);
1144 BTRFS_I(inode)->generation = 0; 1246 BTRFS_I(inode)->generation = 0;
1145 ret = 0; 1247 ret = 0;
1248 must_iput = 1;
1146 goto out; 1249 goto out;
1147 } 1250 }
1148 spin_unlock(&block_group->lock); 1251 spin_unlock(&block_group->lock);
1149 } 1252 }
1150 1253
1151 /* Lock all pages first so we can lock the extent safely. */ 1254 /* Lock all pages first so we can lock the extent safely. */
1152 io_ctl_prepare_pages(&io_ctl, inode, 0); 1255 io_ctl_prepare_pages(io_ctl, inode, 0);
1153 1256
1154 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1257 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1155 0, &cached_state); 1258 0, &cached_state);
1156 1259
1157 io_ctl_set_generation(&io_ctl, trans->transid); 1260 io_ctl_set_generation(io_ctl, trans->transid);
1158 1261
1159 mutex_lock(&ctl->cache_writeout_mutex); 1262 mutex_lock(&ctl->cache_writeout_mutex);
1160 /* Write out the extent entries in the free space cache */ 1263 /* Write out the extent entries in the free space cache */
1161 ret = write_cache_extent_entries(&io_ctl, ctl, 1264 spin_lock(&ctl->tree_lock);
1265 ret = write_cache_extent_entries(io_ctl, ctl,
1162 block_group, &entries, &bitmaps, 1266 block_group, &entries, &bitmaps,
1163 &bitmap_list); 1267 &bitmap_list);
1268 spin_unlock(&ctl->tree_lock);
1164 if (ret) { 1269 if (ret) {
1165 mutex_unlock(&ctl->cache_writeout_mutex); 1270 mutex_unlock(&ctl->cache_writeout_mutex);
1166 goto out_nospc; 1271 goto out_nospc;
@@ -1170,8 +1275,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1170 * Some spaces that are freed in the current transaction are pinned, 1275 * Some spaces that are freed in the current transaction are pinned,
1171 * they will be added into free space cache after the transaction is 1276 * they will be added into free space cache after the transaction is
1172 * committed, we shouldn't lose them. 1277 * committed, we shouldn't lose them.
1278 *
1279 * If this changes while we are working, we'll get added back to
1280 * the dirty list and redo it. No locking is needed
1173 */ 1281 */
1174 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1282 ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
1175 if (ret) { 1283 if (ret) {
1176 mutex_unlock(&ctl->cache_writeout_mutex); 1284 mutex_unlock(&ctl->cache_writeout_mutex);
1177 goto out_nospc; 1285 goto out_nospc;
@@ -1182,16 +1290,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1182 * locked while doing it because a concurrent trim can be manipulating 1290 * locked while doing it because a concurrent trim can be manipulating
1183 * or freeing the bitmap. 1291 * or freeing the bitmap.
1184 */ 1292 */
1185 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1293 spin_lock(&ctl->tree_lock);
1294 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1295 spin_unlock(&ctl->tree_lock);
1186 mutex_unlock(&ctl->cache_writeout_mutex); 1296 mutex_unlock(&ctl->cache_writeout_mutex);
1187 if (ret) 1297 if (ret)
1188 goto out_nospc; 1298 goto out_nospc;
1189 1299
1190 /* Zero out the rest of the pages just to make sure */ 1300 /* Zero out the rest of the pages just to make sure */
1191 io_ctl_zero_remaining_pages(&io_ctl); 1301 io_ctl_zero_remaining_pages(io_ctl);
1192 1302
1193 /* Everything is written out, now we dirty the pages in the file. */ 1303 /* Everything is written out, now we dirty the pages in the file. */
1194 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1304 ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1195 0, i_size_read(inode), &cached_state); 1305 0, i_size_read(inode), &cached_state);
1196 if (ret) 1306 if (ret)
1197 goto out_nospc; 1307 goto out_nospc;
@@ -1202,30 +1312,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1202 * Release the pages and unlock the extent, we will flush 1312 * Release the pages and unlock the extent, we will flush
1203 * them out later 1313 * them out later
1204 */ 1314 */
1205 io_ctl_drop_pages(&io_ctl); 1315 io_ctl_drop_pages(io_ctl);
1206 1316
1207 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1317 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1208 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1318 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1209 1319
1210 /* Flush the dirty pages in the cache file. */ 1320 /*
1211 ret = flush_dirty_cache(inode); 1321 * at this point the pages are under IO and we're happy.
1322 * The caller is responsible for waiting on them and updating
1323 * the cache and the inode
1324 */
1325 io_ctl->entries = entries;
1326 io_ctl->bitmaps = bitmaps;
1327
1328 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1212 if (ret) 1329 if (ret)
1213 goto out; 1330 goto out;
1214 1331
1215 /* Update the cache item to tell everyone this cache file is valid. */ 1332 return 0;
1216 ret = update_cache_item(trans, root, inode, path, offset, 1333
1217 entries, bitmaps);
1218out: 1334out:
1219 io_ctl_free(&io_ctl); 1335 io_ctl->inode = NULL;
1336 io_ctl_free(io_ctl);
1220 if (ret) { 1337 if (ret) {
1221 invalidate_inode_pages2(inode->i_mapping); 1338 invalidate_inode_pages2(inode->i_mapping);
1222 BTRFS_I(inode)->generation = 0; 1339 BTRFS_I(inode)->generation = 0;
1223 } 1340 }
1224 btrfs_update_inode(trans, root, inode); 1341 btrfs_update_inode(trans, root, inode);
1342 if (must_iput)
1343 iput(inode);
1225 return ret; 1344 return ret;
1226 1345
1227out_nospc: 1346out_nospc:
1228 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); 1347 cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
1229 1348
1230 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1349 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1231 up_write(&block_group->data_rwsem); 1350 up_write(&block_group->data_rwsem);
@@ -1241,7 +1360,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1360 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1242 struct inode *inode; 1361 struct inode *inode;
1243 int ret = 0; 1362 int ret = 0;
1244 enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
1245 1363
1246 root = root->fs_info->tree_root; 1364 root = root->fs_info->tree_root;
1247 1365
@@ -1250,34 +1368,34 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1250 spin_unlock(&block_group->lock); 1368 spin_unlock(&block_group->lock);
1251 return 0; 1369 return 0;
1252 } 1370 }
1253
1254 if (block_group->delalloc_bytes) {
1255 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1256 spin_unlock(&block_group->lock);
1257 return 0;
1258 }
1259 spin_unlock(&block_group->lock); 1371 spin_unlock(&block_group->lock);
1260 1372
1261 inode = lookup_free_space_inode(root, block_group, path); 1373 inode = lookup_free_space_inode(root, block_group, path);
1262 if (IS_ERR(inode)) 1374 if (IS_ERR(inode))
1263 return 0; 1375 return 0;
1264 1376
1265 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1377 ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
1378 &block_group->io_ctl, trans,
1266 path, block_group->key.objectid); 1379 path, block_group->key.objectid);
1267 if (ret) { 1380 if (ret) {
1268 dcs = BTRFS_DC_ERROR;
1269 ret = 0;
1270#ifdef DEBUG 1381#ifdef DEBUG
1271 btrfs_err(root->fs_info, 1382 btrfs_err(root->fs_info,
1272 "failed to write free space cache for block group %llu", 1383 "failed to write free space cache for block group %llu",
1273 block_group->key.objectid); 1384 block_group->key.objectid);
1274#endif 1385#endif
1386 spin_lock(&block_group->lock);
1387 block_group->disk_cache_state = BTRFS_DC_ERROR;
1388 spin_unlock(&block_group->lock);
1389
1390 block_group->io_ctl.inode = NULL;
1391 iput(inode);
1275 } 1392 }
1276 1393
1277 spin_lock(&block_group->lock); 1394 /*
1278 block_group->disk_cache_state = dcs; 1395 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1279 spin_unlock(&block_group->lock); 1396 * to wait for IO and put the inode
1280 iput(inode); 1397 */
1398
1281 return ret; 1399 return ret;
1282} 1400}
1283 1401
@@ -1298,11 +1416,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1298 u64 offset) 1416 u64 offset)
1299{ 1417{
1300 u64 bitmap_start; 1418 u64 bitmap_start;
1301 u64 bytes_per_bitmap; 1419 u32 bytes_per_bitmap;
1302 1420
1303 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1421 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1304 bitmap_start = offset - ctl->start; 1422 bitmap_start = offset - ctl->start;
1305 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1423 bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
1306 bitmap_start *= bytes_per_bitmap; 1424 bitmap_start *= bytes_per_bitmap;
1307 bitmap_start += ctl->start; 1425 bitmap_start += ctl->start;
1308 1426
@@ -1521,10 +1639,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1521 u64 bitmap_bytes; 1639 u64 bitmap_bytes;
1522 u64 extent_bytes; 1640 u64 extent_bytes;
1523 u64 size = block_group->key.offset; 1641 u64 size = block_group->key.offset;
1524 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1642 u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1525 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1643 u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
1526 1644
1527 max_bitmaps = max(max_bitmaps, 1); 1645 max_bitmaps = max_t(u32, max_bitmaps, 1);
1528 1646
1529 ASSERT(ctl->total_bitmaps <= max_bitmaps); 1647 ASSERT(ctl->total_bitmaps <= max_bitmaps);
1530 1648
@@ -1537,7 +1655,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1537 max_bytes = MAX_CACHE_BYTES_PER_GIG; 1655 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1538 else 1656 else
1539 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1657 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1540 div64_u64(size, 1024 * 1024 * 1024); 1658 div_u64(size, 1024 * 1024 * 1024);
1541 1659
1542 /* 1660 /*
1543 * we want to account for 1 more bitmap than what we have so we can make 1661 * we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1670,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1552 } 1670 }
1553 1671
1554 /* 1672 /*
1555 * we want the extent entry threshold to always be at most 1/2 the maxw 1673 * we want the extent entry threshold to always be at most 1/2 the max
1556 * bytes we can have, or whatever is less than that. 1674 * bytes we can have, or whatever is less than that.
1557 */ 1675 */
1558 extent_bytes = max_bytes - bitmap_bytes; 1676 extent_bytes = max_bytes - bitmap_bytes;
1559 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1677 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1560 1678
1561 ctl->extents_thresh = 1679 ctl->extents_thresh =
1562 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1680 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1563} 1681}
1564 1682
1565static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1683static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
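
The div_u64()/div64_u64() conversions in this hunk are 32-bit portability work: a bare / on a u64 drags in a libgcc call on 32-bit kernels, and div_u64() (u32 divisor) is cheaper than div64_u64() (u64 divisor) when the divisor is known to fit in 32 bits, which BITS_PER_BITMAP * ctl->unit now does. A userspace model of the helpers' semantics:

#include <stdint.h>
#include <stdio.h>

/* semantics-only stand-ins for the kernel helpers */
static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;      /* 64/32: cheap on 32-bit CPUs */
}

static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;      /* full 64/64 division */
}

int main(void)
{
	uint64_t size = 3ULL << 30;              /* a 3 GiB block group */
	uint32_t bytes_per_bg = 32768 * 4096;    /* BITS_PER_BITMAP * unit */
	uint64_t max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);

	printf("%llu bitmaps\n", (unsigned long long)max_bitmaps);
	return 0;
}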
@@ -1673,7 +1791,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1673 */ 1791 */
1674 if (*bytes >= align) { 1792 if (*bytes >= align) {
1675 tmp = entry->offset - ctl->start + align - 1; 1793 tmp = entry->offset - ctl->start + align - 1;
1676 do_div(tmp, align); 1794 tmp = div64_u64(tmp, align);
1677 tmp = tmp * align + ctl->start; 1795 tmp = tmp * align + ctl->start;
1678 align_off = tmp - entry->offset; 1796 align_off = tmp - entry->offset;
1679 } else { 1797 } else {
@@ -2402,11 +2520,8 @@ static void __btrfs_remove_free_space_cache_locked(
2402 } else { 2520 } else {
2403 free_bitmap(ctl, info); 2521 free_bitmap(ctl, info);
2404 } 2522 }
2405 if (need_resched()) { 2523
2406 spin_unlock(&ctl->tree_lock); 2524 cond_resched_lock(&ctl->tree_lock);
2407 cond_resched();
2408 spin_lock(&ctl->tree_lock);
2409 }
2410 } 2525 }
2411} 2526}
2412 2527
@@ -2431,11 +2546,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2431 2546
2432 WARN_ON(cluster->block_group != block_group); 2547 WARN_ON(cluster->block_group != block_group);
2433 __btrfs_return_cluster_to_free_space(block_group, cluster); 2548 __btrfs_return_cluster_to_free_space(block_group, cluster);
2434 if (need_resched()) { 2549
2435 spin_unlock(&ctl->tree_lock); 2550 cond_resched_lock(&ctl->tree_lock);
2436 cond_resched();
2437 spin_lock(&ctl->tree_lock);
2438 }
2439 } 2551 }
2440 __btrfs_remove_free_space_cache_locked(ctl); 2552 __btrfs_remove_free_space_cache_locked(ctl);
2441 spin_unlock(&ctl->tree_lock); 2553 spin_unlock(&ctl->tree_lock);
@@ -3346,11 +3458,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
3346{ 3458{
3347 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3459 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3348 int ret; 3460 int ret;
3461 struct btrfs_io_ctl io_ctl;
3349 3462
3350 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3463 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3351 return 0; 3464 return 0;
3352 3465
3353 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 3466 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
3467 trans, path, 0) ||
3468 btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
3354 if (ret) { 3469 if (ret) {
3355 btrfs_delalloc_release_metadata(inode, inode->i_size); 3470 btrfs_delalloc_release_metadata(inode, inode->i_size);
3356#ifdef DEBUG 3471#ifdef DEBUG
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238a0aed..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
48 struct btrfs_free_space *info); 48 struct btrfs_free_space *info);
49}; 49};
50 50
51struct btrfs_io_ctl;
52
51struct inode *lookup_free_space_inode(struct btrfs_root *root, 53struct inode *lookup_free_space_inode(struct btrfs_root *root,
52 struct btrfs_block_group_cache 54 struct btrfs_block_group_cache
53 *block_group, struct btrfs_path *path); 55 *block_group, struct btrfs_path *path);
@@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
60 struct btrfs_block_rsv *rsv); 62 struct btrfs_block_rsv *rsv);
61int btrfs_truncate_free_space_cache(struct btrfs_root *root, 63int btrfs_truncate_free_space_cache(struct btrfs_root *root,
62 struct btrfs_trans_handle *trans, 64 struct btrfs_trans_handle *trans,
65 struct btrfs_block_group_cache *block_group,
63 struct inode *inode); 66 struct inode *inode);
64int load_free_space_cache(struct btrfs_fs_info *fs_info, 67int load_free_space_cache(struct btrfs_fs_info *fs_info,
65 struct btrfs_block_group_cache *block_group); 68 struct btrfs_block_group_cache *block_group);
69int btrfs_wait_cache_io(struct btrfs_root *root,
70 struct btrfs_trans_handle *trans,
71 struct btrfs_block_group_cache *block_group,
72 struct btrfs_io_ctl *io_ctl,
73 struct btrfs_path *path, u64 offset);
66int btrfs_write_out_cache(struct btrfs_root *root, 74int btrfs_write_out_cache(struct btrfs_root *root,
67 struct btrfs_trans_handle *trans, 75 struct btrfs_trans_handle *trans,
68 struct btrfs_block_group_cache *block_group, 76 struct btrfs_block_group_cache *block_group,
69 struct btrfs_path *path); 77 struct btrfs_path *path);
70
71struct inode *lookup_free_ino_inode(struct btrfs_root *root, 78struct inode *lookup_free_ino_inode(struct btrfs_root *root,
72 struct btrfs_path *path); 79 struct btrfs_path *path);
73int create_free_ino_inode(struct btrfs_root *root, 80int create_free_ino_inode(struct btrfs_root *root,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..f6a596d5a637 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ again:
456 } 456 }
457 457
458 if (i_size_read(inode) > 0) { 458 if (i_size_read(inode) > 0) {
459 ret = btrfs_truncate_free_space_cache(root, trans, inode); 459 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
460 if (ret) { 460 if (ret) {
461 if (ret != -ENOSPC) 461 if (ret != -ENOSPC)
462 btrfs_abort_transaction(trans, root, ret); 462 btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43192e10cc43..56f00a25c003 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -59,6 +59,7 @@
59#include "backref.h" 59#include "backref.h"
60#include "hash.h" 60#include "hash.h"
61#include "props.h" 61#include "props.h"
62#include "qgroup.h"
62 63
63struct btrfs_iget_args { 64struct btrfs_iget_args {
64 struct btrfs_key *location; 65 struct btrfs_key *location;
@@ -470,7 +471,7 @@ again:
470 */ 471 */
471 if (inode_need_compress(inode)) { 472 if (inode_need_compress(inode)) {
472 WARN_ON(pages); 473 WARN_ON(pages);
473 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 474 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
474 if (!pages) { 475 if (!pages) {
475 /* just bail out to the uncompressed code */ 476 /* just bail out to the uncompressed code */
476 goto cont; 477 goto cont;
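This kzalloc-to-kcalloc conversion (and the matching ones in the raid56.c and scrub.c hunks further down) is about overflow safety: kcalloc(n, size, flags) performs the multiplication itself and returns NULL on overflow, instead of silently allocating a short buffer when the caller's n * size wraps. A userspace sketch using calloc as a stand-in for kcalloc (modern libcs check the multiplication the same way):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t n = (size_t)-1 / 4;          /* big enough that n * 8 wraps */

        /* calloc, like kcalloc, refuses when n * size overflows ... */
        void *p = calloc(n, sizeof(void *));
        printf("calloc: %p\n", p);          /* NULL on overflow */

        /* ... while a hand-written multiply silently wraps around */
        size_t bytes = n * sizeof(void *);
        printf("wrapped size: %zu\n", bytes);

        free(p);
        return 0;
}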
@@ -752,7 +753,6 @@ retry:
752 } 753 }
753 goto out_free; 754 goto out_free;
754 } 755 }
755
756 /* 756 /*
757 * here we're doing allocation and writeback of the 757 * here we're doing allocation and writeback of the
758 * compressed pages 758 * compressed pages
@@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
3110 if (empty) 3110 if (empty)
3111 return; 3111 return;
3112 3112
3113 down_read(&fs_info->delayed_iput_sem);
3114
3113 spin_lock(&fs_info->delayed_iput_lock); 3115 spin_lock(&fs_info->delayed_iput_lock);
3114 list_splice_init(&fs_info->delayed_iputs, &list); 3116 list_splice_init(&fs_info->delayed_iputs, &list);
3115 spin_unlock(&fs_info->delayed_iput_lock); 3117 spin_unlock(&fs_info->delayed_iput_lock);
@@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
3120 iput(delayed->inode); 3122 iput(delayed->inode);
3121 kfree(delayed); 3123 kfree(delayed);
3122 } 3124 }
3125
3126 up_read(&root->fs_info->delayed_iput_sem);
3123} 3127}
3124 3128
3125/* 3129/*
@@ -4162,6 +4166,21 @@ out:
4162 return err; 4166 return err;
4163} 4167}
4164 4168
4169static int truncate_space_check(struct btrfs_trans_handle *trans,
4170 struct btrfs_root *root,
4171 u64 bytes_deleted)
4172{
4173 int ret;
4174
4175 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4176 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4177 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4178 if (!ret)
4179 trans->bytes_reserved += bytes_deleted;
4180 return ret;
4181
4182}
4183
4165/* 4184/*
4166 * this can truncate away extent items, csum items and directory items. 4185 * this can truncate away extent items, csum items and directory items.
4167 * It starts at a high offset and removes keys until it can't find 4186 * It starts at a high offset and removes keys until it can't find
@@ -4197,9 +4216,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4197 int ret; 4216 int ret;
4198 int err = 0; 4217 int err = 0;
4199 u64 ino = btrfs_ino(inode); 4218 u64 ino = btrfs_ino(inode);
4219 u64 bytes_deleted = 0;
4220 bool be_nice = 0;
4221 bool should_throttle = 0;
4222 bool should_end = 0;
4200 4223
4201 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4224 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4202 4225
4226 /*
4227 * for non-free space inodes and ref cows, we want to back off from
4228 * time to time
4229 */
4230 if (!btrfs_is_free_space_inode(inode) &&
4231 test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4232 be_nice = 1;
4233
4203 path = btrfs_alloc_path(); 4234 path = btrfs_alloc_path();
4204 if (!path) 4235 if (!path)
4205 return -ENOMEM; 4236 return -ENOMEM;
@@ -4229,6 +4260,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4229 key.type = (u8)-1; 4260 key.type = (u8)-1;
4230 4261
4231search_again: 4262search_again:
4263 /*
4264 * with a 16K leaf size and 128MB extents, you can actually queue
 4265 * up a huge file in a single leaf. Whenever bytes_deleted is > 0,
 4266 * it is usually already huge by the time we get here
4267 */
4268 if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4269 if (btrfs_should_end_transaction(trans, root)) {
4270 err = -EAGAIN;
4271 goto error;
4272 }
4273 }
4274
4275
4232 path->leave_spinning = 1; 4276 path->leave_spinning = 1;
4233 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4277 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4234 if (ret < 0) { 4278 if (ret < 0) {
@@ -4371,22 +4415,39 @@ delete:
4371 } else { 4415 } else {
4372 break; 4416 break;
4373 } 4417 }
4418 should_throttle = 0;
4419
4374 if (found_extent && 4420 if (found_extent &&
4375 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4421 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4376 root == root->fs_info->tree_root)) { 4422 root == root->fs_info->tree_root)) {
4377 btrfs_set_path_blocking(path); 4423 btrfs_set_path_blocking(path);
4424 bytes_deleted += extent_num_bytes;
4378 ret = btrfs_free_extent(trans, root, extent_start, 4425 ret = btrfs_free_extent(trans, root, extent_start,
4379 extent_num_bytes, 0, 4426 extent_num_bytes, 0,
4380 btrfs_header_owner(leaf), 4427 btrfs_header_owner(leaf),
4381 ino, extent_offset, 0); 4428 ino, extent_offset, 0);
4382 BUG_ON(ret); 4429 BUG_ON(ret);
4430 if (btrfs_should_throttle_delayed_refs(trans, root))
4431 btrfs_async_run_delayed_refs(root,
4432 trans->delayed_ref_updates * 2, 0);
4433 if (be_nice) {
4434 if (truncate_space_check(trans, root,
4435 extent_num_bytes)) {
4436 should_end = 1;
4437 }
4438 if (btrfs_should_throttle_delayed_refs(trans,
4439 root)) {
4440 should_throttle = 1;
4441 }
4442 }
4383 } 4443 }
4384 4444
4385 if (found_type == BTRFS_INODE_ITEM_KEY) 4445 if (found_type == BTRFS_INODE_ITEM_KEY)
4386 break; 4446 break;
4387 4447
4388 if (path->slots[0] == 0 || 4448 if (path->slots[0] == 0 ||
4389 path->slots[0] != pending_del_slot) { 4449 path->slots[0] != pending_del_slot ||
4450 should_throttle || should_end) {
4390 if (pending_del_nr) { 4451 if (pending_del_nr) {
4391 ret = btrfs_del_items(trans, root, path, 4452 ret = btrfs_del_items(trans, root, path,
4392 pending_del_slot, 4453 pending_del_slot,
@@ -4399,6 +4460,23 @@ delete:
4399 pending_del_nr = 0; 4460 pending_del_nr = 0;
4400 } 4461 }
4401 btrfs_release_path(path); 4462 btrfs_release_path(path);
4463 if (should_throttle) {
4464 unsigned long updates = trans->delayed_ref_updates;
4465 if (updates) {
4466 trans->delayed_ref_updates = 0;
4467 ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4468 if (ret && !err)
4469 err = ret;
4470 }
4471 }
4472 /*
4473 * if we failed to refill our space rsv, bail out
4474 * and let the transaction restart
4475 */
4476 if (should_end) {
4477 err = -EAGAIN;
4478 goto error;
4479 }
4402 goto search_again; 4480 goto search_again;
4403 } else { 4481 } else {
4404 path->slots[0]--; 4482 path->slots[0]--;
@@ -4415,7 +4493,18 @@ error:
4415 if (last_size != (u64)-1 && 4493 if (last_size != (u64)-1 &&
4416 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4494 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4417 btrfs_ordered_update_i_size(inode, last_size, NULL); 4495 btrfs_ordered_update_i_size(inode, last_size, NULL);
4496
4418 btrfs_free_path(path); 4497 btrfs_free_path(path);
4498
4499 if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4500 unsigned long updates = trans->delayed_ref_updates;
4501 if (updates) {
4502 trans->delayed_ref_updates = 0;
4503 ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4504 if (ret && !err)
4505 err = ret;
4506 }
4507 }
4419 return err; 4508 return err;
4420} 4509}
4421 4510
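Taken together, the btrfs_truncate_inode_items() hunks above implement a cooperative-truncate pattern: track how much the loop has freed (bytes_deleted) and, at each batch boundary, either flush the accumulated delayed refs (should_throttle) or bail out with -EAGAIN so the caller can restart the transaction (should_end, set when the space reservation cannot be refilled). A stripped-down sketch of that control flow (stub predicates and made-up batch sizes, not btrfs code):

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-ins for the kernel-side checks */
static bool should_end_transaction(void) { return false; }
static bool delayed_refs_piling_up(void) { return true; }
static int flush_delayed_refs(void) { return 0; }
static bool refill_space_rsv(void) { return true; }

static int truncate_batches(unsigned long long to_free)
{
        unsigned long long bytes_deleted = 0;

        while (bytes_deleted < to_free) {
                /* back off if this transaction has already freed a lot */
                if (bytes_deleted > 32ULL * 1024 * 1024 &&
                    should_end_transaction())
                        return -11;     /* -EAGAIN: caller restarts the transaction */

                /* ... delete one batch of items ... */
                bytes_deleted += 16 * 1024;

                /* should_end: refill the reservation or stop the loop */
                if (!refill_space_rsv())
                        return -11;

                /* should_throttle: flush delayed refs, then keep going */
                if (delayed_refs_piling_up() && flush_delayed_refs())
                        return -5;      /* some other error */
        }
        return 0;
}

int main(void)
{
        printf("ret = %d\n", truncate_batches(64 * 1024));
        return 0;
}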
@@ -4924,6 +5013,7 @@ void btrfs_evict_inode(struct inode *inode)
4924 struct btrfs_trans_handle *trans; 5013 struct btrfs_trans_handle *trans;
4925 struct btrfs_root *root = BTRFS_I(inode)->root; 5014 struct btrfs_root *root = BTRFS_I(inode)->root;
4926 struct btrfs_block_rsv *rsv, *global_rsv; 5015 struct btrfs_block_rsv *rsv, *global_rsv;
5016 int steal_from_global = 0;
4927 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 5017 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4928 int ret; 5018 int ret;
4929 5019
@@ -4991,9 +5081,20 @@ void btrfs_evict_inode(struct inode *inode)
4991 * hard as possible to get this to work. 5081 * hard as possible to get this to work.
4992 */ 5082 */
4993 if (ret) 5083 if (ret)
4994 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 5084 steal_from_global++;
5085 else
5086 steal_from_global = 0;
5087 ret = 0;
4995 5088
4996 if (ret) { 5089 /*
5090 * steal_from_global == 0: we reserved stuff, hooray!
5091 * steal_from_global == 1: we didn't reserve stuff, boo!
5092 * steal_from_global == 2: we've committed, still not a lot of
5093 * room but maybe we'll have room in the global reserve this
5094 * time.
5095 * steal_from_global == 3: abandon all hope!
5096 */
5097 if (steal_from_global > 2) {
4997 btrfs_warn(root->fs_info, 5098 btrfs_warn(root->fs_info,
4998 "Could not get space for a delete, will truncate on mount %d", 5099 "Could not get space for a delete, will truncate on mount %d",
4999 ret); 5100 ret);
@@ -5009,10 +5110,40 @@ void btrfs_evict_inode(struct inode *inode)
5009 goto no_delete; 5110 goto no_delete;
5010 } 5111 }
5011 5112
5113 /*
 5114 * We can't just steal from the global reserve, we need to make
5115 * sure there is room to do it, if not we need to commit and try
5116 * again.
5117 */
5118 if (steal_from_global) {
5119 if (!btrfs_check_space_for_delayed_refs(trans, root))
5120 ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5121 min_size);
5122 else
5123 ret = -ENOSPC;
5124 }
5125
5126 /*
5127 * Couldn't steal from the global reserve, we have too much
5128 * pending stuff built up, commit the transaction and try it
5129 * again.
5130 */
5131 if (ret) {
5132 ret = btrfs_commit_transaction(trans, root);
5133 if (ret) {
5134 btrfs_orphan_del(NULL, inode);
5135 btrfs_free_block_rsv(root, rsv);
5136 goto no_delete;
5137 }
5138 continue;
5139 } else {
5140 steal_from_global = 0;
5141 }
5142
5012 trans->block_rsv = rsv; 5143 trans->block_rsv = rsv;
5013 5144
5014 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 5145 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5015 if (ret != -ENOSPC) 5146 if (ret != -ENOSPC && ret != -EAGAIN)
5016 break; 5147 break;
5017 5148
5018 trans->block_rsv = &root->fs_info->trans_block_rsv; 5149 trans->block_rsv = &root->fs_info->trans_block_rsv;
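The steal_from_global counter introduced above is a small retry state machine, exactly as the comment enumerates: 0 means the normal reservation succeeded, 1 means try migrating space out of the global reserve, 2 means commit the transaction first and retry, and anything above 2 gives up and falls back to truncating on the next mount. A compact sketch of the escalation (illustrative stubs, outside the kernel):

#include <stdio.h>

static int reserve(void)            { return -1; } /* pretend reservation fails */
static int steal_from_reserve(void) { return -1; } /* pretend stealing fails too */
static int commit_transaction(void) { return 0; }

int main(void)
{
        int steal_from_global = 0;

        for (;;) {
                if (reserve() == 0)
                        steal_from_global = 0;  /* reserved: no stealing needed */
                else
                        steal_from_global++;    /* escalate one step per retry */

                if (steal_from_global > 2) {
                        printf("give up, truncate on next mount\n");
                        return 0;
                }

                if (steal_from_global && steal_from_reserve() != 0) {
                        /* too much pending work: commit and try once more */
                        if (commit_transaction() != 0)
                                return 1;
                        continue;
                }

                printf("got the space, do the truncate\n");
                return 0;
        }
}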
@@ -8581,7 +8712,7 @@ static int btrfs_truncate(struct inode *inode)
8581 ret = btrfs_truncate_inode_items(trans, root, inode, 8712 ret = btrfs_truncate_inode_items(trans, root, inode,
8582 inode->i_size, 8713 inode->i_size,
8583 BTRFS_EXTENT_DATA_KEY); 8714 BTRFS_EXTENT_DATA_KEY);
8584 if (ret != -ENOSPC) { 8715 if (ret != -ENOSPC && ret != -EAGAIN) {
8585 err = ret; 8716 err = ret;
8586 break; 8717 break;
8587 } 8718 }
@@ -9451,6 +9582,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9451 btrfs_end_transaction(trans, root); 9582 btrfs_end_transaction(trans, root);
9452 break; 9583 break;
9453 } 9584 }
9585
9454 btrfs_drop_extent_cache(inode, cur_offset, 9586 btrfs_drop_extent_cache(inode, cur_offset,
9455 cur_offset + ins.offset -1, 0); 9587 cur_offset + ins.offset -1, 0);
9456 9588
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74609b931ba5..ca5d968f4c37 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir,
456 if (ret) 456 if (ret)
457 return ret; 457 return ret;
458 458
459 /*
 460 * Don't create a subvolume whose level is not zero. Otherwise qgroup
 461 * will be screwed up since it assumes the subvolume qgroup's level is 0.
462 */
463 if (btrfs_qgroup_level(objectid))
464 return -ENOSPC;
465
459 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 466 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
460 /* 467 /*
461 * The same as the snapshot creation, please see the comment 468 * The same as the snapshot creation, please see the comment
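The btrfs_qgroup_level() check above relies on the qgroupid encoding: the top 16 bits of the 64-bit id carry the qgroup level and the low 48 bits the object id, so a subvolume qgroup must be level 0 (the same >> 48 shift appears in the inherit check later in this series). Roughly:

#include <stdint.h>
#include <stdio.h>

#define QGROUP_LEVEL_SHIFT 48   /* mirrors BTRFS_QGROUP_LEVEL_SHIFT */

static uint64_t qgroup_level(uint64_t qgroupid)
{
        return qgroupid >> QGROUP_LEVEL_SHIFT;                  /* high 16 bits */
}

static uint64_t qgroup_subvid(uint64_t qgroupid)
{
        return qgroupid & ((1ULL << QGROUP_LEVEL_SHIFT) - 1);   /* low 48 bits */
}

int main(void)
{
        uint64_t id = (1ULL << QGROUP_LEVEL_SHIFT) | 256;       /* qgroup "1/256" */
        printf("level %llu, id %llu\n",
               (unsigned long long)qgroup_level(id),
               (unsigned long long)qgroup_subvid(id));
        return 0;
}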
@@ -1564,7 +1571,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1564 goto out_free; 1571 goto out_free;
1565 } 1572 }
1566 1573
1567 do_div(new_size, root->sectorsize); 1574 new_size = div_u64(new_size, root->sectorsize);
1568 new_size *= root->sectorsize; 1575 new_size *= root->sectorsize;
1569 1576
1570 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", 1577 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
@@ -2897,6 +2904,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
2897 if (src == dst) 2904 if (src == dst)
2898 return -EINVAL; 2905 return -EINVAL;
2899 2906
2907 if (len == 0)
2908 return 0;
2909
2900 btrfs_double_lock(src, loff, dst, dst_loff, len); 2910 btrfs_double_lock(src, loff, dst, dst_loff, len);
2901 2911
2902 ret = extent_same_check_offsets(src, loff, len); 2912 ret = extent_same_check_offsets(src, loff, len);
@@ -3039,7 +3049,7 @@ out:
3039static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3049static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3040 u64 disko) 3050 u64 disko)
3041{ 3051{
3042 struct seq_list tree_mod_seq_elem = {}; 3052 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
3043 struct ulist *roots; 3053 struct ulist *roots;
3044 struct ulist_iterator uiter; 3054 struct ulist_iterator uiter;
3045 struct ulist_node *root_node = NULL; 3055 struct ulist_node *root_node = NULL;
@@ -3202,6 +3212,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3202 key.offset = off; 3212 key.offset = off;
3203 3213
3204 while (1) { 3214 while (1) {
3215 u64 next_key_min_offset = key.offset + 1;
3216
3205 /* 3217 /*
3206 * note the key will change type as we walk through the 3218 * note the key will change type as we walk through the
3207 * tree. 3219 * tree.
@@ -3282,7 +3294,7 @@ process_slot:
3282 } else if (key.offset >= off + len) { 3294 } else if (key.offset >= off + len) {
3283 break; 3295 break;
3284 } 3296 }
3285 3297 next_key_min_offset = key.offset + datal;
3286 size = btrfs_item_size_nr(leaf, slot); 3298 size = btrfs_item_size_nr(leaf, slot);
3287 read_extent_buffer(leaf, buf, 3299 read_extent_buffer(leaf, buf,
3288 btrfs_item_ptr_offset(leaf, slot), 3300 btrfs_item_ptr_offset(leaf, slot),
@@ -3497,7 +3509,7 @@ process_slot:
3497 break; 3509 break;
3498 } 3510 }
3499 btrfs_release_path(path); 3511 btrfs_release_path(path);
3500 key.offset++; 3512 key.offset = next_key_min_offset;
3501 } 3513 }
3502 ret = 0; 3514 ret = 0;
3503 3515
@@ -3626,6 +3638,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3626 if (off + len == src->i_size) 3638 if (off + len == src->i_size)
3627 len = ALIGN(src->i_size, bs) - off; 3639 len = ALIGN(src->i_size, bs) - off;
3628 3640
3641 if (len == 0) {
3642 ret = 0;
3643 goto out_unlock;
3644 }
3645
3629 /* verify the end result is block aligned */ 3646 /* verify the end result is block aligned */
3630 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || 3647 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
3631 !IS_ALIGNED(destoff, bs)) 3648 !IS_ALIGNED(destoff, bs))
@@ -4624,6 +4641,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4624 sa->src, sa->dst); 4641 sa->src, sa->dst);
4625 } 4642 }
4626 4643
4644 /* update qgroup status and info */
4645 err = btrfs_run_qgroups(trans, root->fs_info);
4646 if (err < 0)
 4647 btrfs_error(root->fs_info, err,
4648 "failed to update qgroup status and info\n");
4627 err = btrfs_end_transaction(trans, root); 4649 err = btrfs_end_transaction(trans, root);
4628 if (err && !ret) 4650 if (err && !ret)
4629 ret = err; 4651 ret = err;
@@ -4669,8 +4691,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
4669 4691
4670 /* FIXME: check if the IDs really exist */ 4692 /* FIXME: check if the IDs really exist */
4671 if (sa->create) { 4693 if (sa->create) {
4672 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, 4694 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
4673 NULL);
4674 } else { 4695 } else {
4675 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); 4696 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
4676 } 4697 }
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 617553cdb7d3..a2f051347731 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -434,7 +434,7 @@ out:
434 return ret; 434 return ret;
435} 435}
436 436
437struct btrfs_compress_op btrfs_lzo_compress = { 437const struct btrfs_compress_op btrfs_lzo_compress = {
438 .alloc_workspace = lzo_alloc_workspace, 438 .alloc_workspace = lzo_alloc_workspace,
439 .free_workspace = lzo_free_workspace, 439 .free_workspace = lzo_free_workspace,
440 .compress_pages = lzo_compress_pages, 440 .compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
28 if (factor == 10) 28 if (factor == 10)
29 return num; 29 return num;
30 num *= factor; 30 num *= factor;
31 do_div(num, 10); 31 return div_u64(num, 10);
32 return num;
33} 32}
34 33
35static inline u64 div_factor_fine(u64 num, int factor) 34static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
37 if (factor == 100) 36 if (factor == 100)
38 return num; 37 return num;
39 num *= factor; 38 num *= factor;
40 do_div(num, 100); 39 return div_u64(num, 100);
41 return num;
42} 40}
43 41
44#endif 42#endif
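These conversions (and the similar ones in ioctl.c, scrub.c and super.c) hinge on the helpers' differing calling conventions: do_div(n, base) is a macro that divides n in place and evaluates to the remainder, whereas div_u64() simply returns the quotient, which is what lets the two-statement bodies collapse into a single return. Userspace stand-ins showing the two shapes (the statement-expression macro is a GCC/clang extension, as in the kernel):

#include <stdint.h>
#include <stdio.h>

/* stand-in with div_u64's shape: returns the quotient */
static uint64_t div_u64(uint64_t n, uint32_t base)
{
        return n / base;
}

/* stand-in with do_div's shape: divides in place, yields the remainder */
#define do_div(n, base) \
        ({ uint32_t __rem = (n) % (base); (n) /= (base); __rem; })

int main(void)
{
        uint64_t num = 1234;

        uint32_t rem = do_div(num, 10);   /* num is now 123, rem is 4 */
        printf("do_div: num=%llu rem=%u\n", (unsigned long long)num, rem);

        printf("div_u64: %llu\n", (unsigned long long)div_u64(1234, 10));
        return 0;
}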
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 129b1dd28527..dca137b04095 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode)
425 425
426 return NULL; 426 return NULL;
427} 427}
428
429
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 058c79eecbfb..3d6546581bb9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -644,9 +644,8 @@ out:
644} 644}
645 645
646static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 646static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
647 struct btrfs_root *root, u64 qgroupid, 647 struct btrfs_root *root,
648 u64 flags, u64 max_rfer, u64 max_excl, 648 struct btrfs_qgroup *qgroup)
649 u64 rsv_rfer, u64 rsv_excl)
650{ 649{
651 struct btrfs_path *path; 650 struct btrfs_path *path;
652 struct btrfs_key key; 651 struct btrfs_key key;
@@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
657 656
658 key.objectid = 0; 657 key.objectid = 0;
659 key.type = BTRFS_QGROUP_LIMIT_KEY; 658 key.type = BTRFS_QGROUP_LIMIT_KEY;
660 key.offset = qgroupid; 659 key.offset = qgroup->qgroupid;
661 660
662 path = btrfs_alloc_path(); 661 path = btrfs_alloc_path();
663 if (!path) 662 if (!path)
@@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
673 l = path->nodes[0]; 672 l = path->nodes[0];
674 slot = path->slots[0]; 673 slot = path->slots[0];
675 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 674 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
676 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); 675 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
677 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); 676 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
678 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); 677 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
679 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); 678 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
680 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); 679 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
681 680
682 btrfs_mark_buffer_dirty(l); 681 btrfs_mark_buffer_dirty(l);
683 682
@@ -967,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
967 fs_info->pending_quota_state = 0; 966 fs_info->pending_quota_state = 0;
968 quota_root = fs_info->quota_root; 967 quota_root = fs_info->quota_root;
969 fs_info->quota_root = NULL; 968 fs_info->quota_root = NULL;
969 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
970 spin_unlock(&fs_info->qgroup_lock); 970 spin_unlock(&fs_info->qgroup_lock);
971 971
972 btrfs_free_qgroup_config(fs_info); 972 btrfs_free_qgroup_config(fs_info);
@@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
982 list_del(&quota_root->dirty_list); 982 list_del(&quota_root->dirty_list);
983 983
984 btrfs_tree_lock(quota_root->node); 984 btrfs_tree_lock(quota_root->node);
985 clean_tree_block(trans, tree_root, quota_root->node); 985 clean_tree_block(trans, tree_root->fs_info, quota_root->node);
986 btrfs_tree_unlock(quota_root->node); 986 btrfs_tree_unlock(quota_root->node);
987 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); 987 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
988 988
@@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1001 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1001 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1002} 1002}
1003 1003
1004/*
 1005 * The easy accounting: if we are adding/removing the only ref for an extent
 1006 * then this qgroup and all of the parent qgroups get their reference and
1007 * exclusive counts adjusted.
1008 *
1009 * Caller should hold fs_info->qgroup_lock.
1010 */
1011static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1012 struct ulist *tmp, u64 ref_root,
1013 u64 num_bytes, int sign)
1014{
1015 struct btrfs_qgroup *qgroup;
1016 struct btrfs_qgroup_list *glist;
1017 struct ulist_node *unode;
1018 struct ulist_iterator uiter;
1019 int ret = 0;
1020
1021 qgroup = find_qgroup_rb(fs_info, ref_root);
1022 if (!qgroup)
1023 goto out;
1024
1025 qgroup->rfer += sign * num_bytes;
1026 qgroup->rfer_cmpr += sign * num_bytes;
1027
1028 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1029 qgroup->excl += sign * num_bytes;
1030 qgroup->excl_cmpr += sign * num_bytes;
1031 if (sign > 0)
1032 qgroup->reserved -= num_bytes;
1033
1034 qgroup_dirty(fs_info, qgroup);
1035
1036 /* Get all of the parent groups that contain this qgroup */
1037 list_for_each_entry(glist, &qgroup->groups, next_group) {
1038 ret = ulist_add(tmp, glist->group->qgroupid,
1039 ptr_to_u64(glist->group), GFP_ATOMIC);
1040 if (ret < 0)
1041 goto out;
1042 }
1043
1044 /* Iterate all of the parents and adjust their reference counts */
1045 ULIST_ITER_INIT(&uiter);
1046 while ((unode = ulist_next(tmp, &uiter))) {
1047 qgroup = u64_to_ptr(unode->aux);
1048 qgroup->rfer += sign * num_bytes;
1049 qgroup->rfer_cmpr += sign * num_bytes;
1050 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1051 qgroup->excl += sign * num_bytes;
1052 if (sign > 0)
1053 qgroup->reserved -= num_bytes;
1054 qgroup->excl_cmpr += sign * num_bytes;
1055 qgroup_dirty(fs_info, qgroup);
1056
1057 /* Add any parents of the parents */
1058 list_for_each_entry(glist, &qgroup->groups, next_group) {
1059 ret = ulist_add(tmp, glist->group->qgroupid,
1060 ptr_to_u64(glist->group), GFP_ATOMIC);
1061 if (ret < 0)
1062 goto out;
1063 }
1064 }
1065 ret = 0;
1066out:
1067 return ret;
1068}
1069
1070
1071/*
1072 * Quick path for updating qgroup with only excl refs.
1073 *
 1074 * In that case, just updating all parents will be enough.
 1075 * Otherwise we need to do a full rescan.
1076 * Caller should also hold fs_info->qgroup_lock.
1077 *
 1078 * Return 0 for a quick update; return >0 when a full rescan is needed
 1079 * (the INCONSISTENT flag gets set).
1080 * Return < 0 for other error.
1081 */
1082static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1083 struct ulist *tmp, u64 src, u64 dst,
1084 int sign)
1085{
1086 struct btrfs_qgroup *qgroup;
1087 int ret = 1;
1088 int err = 0;
1089
1090 qgroup = find_qgroup_rb(fs_info, src);
1091 if (!qgroup)
1092 goto out;
1093 if (qgroup->excl == qgroup->rfer) {
1094 ret = 0;
1095 err = __qgroup_excl_accounting(fs_info, tmp, dst,
1096 qgroup->excl, sign);
1097 if (err < 0) {
1098 ret = err;
1099 goto out;
1100 }
1101 }
1102out:
1103 if (ret)
1104 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1105 return ret;
1106}
1107
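The fast path added above only applies when the source qgroup's numbers are all-exclusive (excl == rfer): in that case attaching or detaching it simply adds or subtracts its exclusive bytes from every ancestor, and anything else falls back to marking the status inconsistent so a rescan fixes things up. The decision, in miniature (stub types, not the kernel structures):

#include <stdio.h>

struct qgroup { unsigned long long rfer, excl; };

/* returns 0 if the parents were fixed up inline, 1 if a rescan is needed */
static int quick_update(const struct qgroup *src,
                        unsigned long long *parent_rfer,
                        unsigned long long *parent_excl, int sign)
{
        if (src->excl != src->rfer)
                return 1;       /* shared extents involved: mark inconsistent */

        /* all bytes are exclusive: propagate them to every ancestor */
        *parent_rfer += sign * src->excl;
        *parent_excl += sign * src->excl;
        return 0;
}

int main(void)
{
        struct qgroup leaf = { .rfer = 4096, .excl = 4096 };
        unsigned long long prfer = 0, pexcl = 0;

        int rescan = quick_update(&leaf, &prfer, &pexcl, +1);
        printf("rescan=%d parent rfer=%llu excl=%llu\n", rescan, prfer, pexcl);
        return 0;
}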
1004int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 1108int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1005 struct btrfs_fs_info *fs_info, u64 src, u64 dst) 1109 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1006{ 1110{
@@ -1008,8 +1112,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1008 struct btrfs_qgroup *parent; 1112 struct btrfs_qgroup *parent;
1009 struct btrfs_qgroup *member; 1113 struct btrfs_qgroup *member;
1010 struct btrfs_qgroup_list *list; 1114 struct btrfs_qgroup_list *list;
1115 struct ulist *tmp;
1011 int ret = 0; 1116 int ret = 0;
1012 1117
 1118 /* Check the level of src and dst first */
 1119 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 1120 return -EINVAL;
 1121
 1122 tmp = ulist_alloc(GFP_NOFS);
 1123 if (!tmp)
 1124 return -ENOMEM;
1125
1013 mutex_lock(&fs_info->qgroup_ioctl_lock); 1126 mutex_lock(&fs_info->qgroup_ioctl_lock);
1014 quota_root = fs_info->quota_root; 1127 quota_root = fs_info->quota_root;
1015 if (!quota_root) { 1128 if (!quota_root) {
@@ -1043,23 +1156,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1043 1156
1044 spin_lock(&fs_info->qgroup_lock); 1157 spin_lock(&fs_info->qgroup_lock);
1045 ret = add_relation_rb(quota_root->fs_info, src, dst); 1158 ret = add_relation_rb(quota_root->fs_info, src, dst);
1159 if (ret < 0) {
1160 spin_unlock(&fs_info->qgroup_lock);
1161 goto out;
1162 }
1163 ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1046 spin_unlock(&fs_info->qgroup_lock); 1164 spin_unlock(&fs_info->qgroup_lock);
1047out: 1165out:
1048 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1166 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1167 ulist_free(tmp);
1049 return ret; 1168 return ret;
1050} 1169}
1051 1170
1052int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 1171int __del_qgroup_relation(struct btrfs_trans_handle *trans,
1053 struct btrfs_fs_info *fs_info, u64 src, u64 dst) 1172 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1054{ 1173{
1055 struct btrfs_root *quota_root; 1174 struct btrfs_root *quota_root;
1056 struct btrfs_qgroup *parent; 1175 struct btrfs_qgroup *parent;
1057 struct btrfs_qgroup *member; 1176 struct btrfs_qgroup *member;
1058 struct btrfs_qgroup_list *list; 1177 struct btrfs_qgroup_list *list;
1178 struct ulist *tmp;
1059 int ret = 0; 1179 int ret = 0;
1060 int err; 1180 int err;
1061 1181
1062 mutex_lock(&fs_info->qgroup_ioctl_lock); 1182 tmp = ulist_alloc(GFP_NOFS);
1183 if (!tmp)
1184 return -ENOMEM;
1185
1063 quota_root = fs_info->quota_root; 1186 quota_root = fs_info->quota_root;
1064 if (!quota_root) { 1187 if (!quota_root) {
1065 ret = -EINVAL; 1188 ret = -EINVAL;
@@ -1088,14 +1211,27 @@ exist:
1088 1211
1089 spin_lock(&fs_info->qgroup_lock); 1212 spin_lock(&fs_info->qgroup_lock);
1090 del_relation_rb(fs_info, src, dst); 1213 del_relation_rb(fs_info, src, dst);
1214 ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1091 spin_unlock(&fs_info->qgroup_lock); 1215 spin_unlock(&fs_info->qgroup_lock);
1092out: 1216out:
1217 ulist_free(tmp);
1218 return ret;
1219}
1220
1221int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
1222 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1223{
1224 int ret = 0;
1225
1226 mutex_lock(&fs_info->qgroup_ioctl_lock);
1227 ret = __del_qgroup_relation(trans, fs_info, src, dst);
1093 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1228 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1229
1094 return ret; 1230 return ret;
1095} 1231}
1096 1232
1097int btrfs_create_qgroup(struct btrfs_trans_handle *trans, 1233int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
1098 struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) 1234 struct btrfs_fs_info *fs_info, u64 qgroupid)
1099{ 1235{
1100 struct btrfs_root *quota_root; 1236 struct btrfs_root *quota_root;
1101 struct btrfs_qgroup *qgroup; 1237 struct btrfs_qgroup *qgroup;
@@ -1133,6 +1269,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1133{ 1269{
1134 struct btrfs_root *quota_root; 1270 struct btrfs_root *quota_root;
1135 struct btrfs_qgroup *qgroup; 1271 struct btrfs_qgroup *qgroup;
1272 struct btrfs_qgroup_list *list;
1136 int ret = 0; 1273 int ret = 0;
1137 1274
1138 mutex_lock(&fs_info->qgroup_ioctl_lock); 1275 mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1147,15 +1284,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1147 ret = -ENOENT; 1284 ret = -ENOENT;
1148 goto out; 1285 goto out;
1149 } else { 1286 } else {
1150 /* check if there are no relations to this qgroup */ 1287 /* check if there are no children of this qgroup */
1151 if (!list_empty(&qgroup->groups) || 1288 if (!list_empty(&qgroup->members)) {
1152 !list_empty(&qgroup->members)) {
1153 ret = -EBUSY; 1289 ret = -EBUSY;
1154 goto out; 1290 goto out;
1155 } 1291 }
1156 } 1292 }
1157 ret = del_qgroup_item(trans, quota_root, qgroupid); 1293 ret = del_qgroup_item(trans, quota_root, qgroupid);
1158 1294
1295 while (!list_empty(&qgroup->groups)) {
1296 list = list_first_entry(&qgroup->groups,
1297 struct btrfs_qgroup_list, next_group);
1298 ret = __del_qgroup_relation(trans, fs_info,
1299 qgroupid,
1300 list->group->qgroupid);
1301 if (ret)
1302 goto out;
1303 }
1304
1159 spin_lock(&fs_info->qgroup_lock); 1305 spin_lock(&fs_info->qgroup_lock);
1160 del_qgroup_rb(quota_root->fs_info, qgroupid); 1306 del_qgroup_rb(quota_root->fs_info, qgroupid);
1161 spin_unlock(&fs_info->qgroup_lock); 1307 spin_unlock(&fs_info->qgroup_lock);
@@ -1184,23 +1330,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1184 ret = -ENOENT; 1330 ret = -ENOENT;
1185 goto out; 1331 goto out;
1186 } 1332 }
1187 ret = update_qgroup_limit_item(trans, quota_root, qgroupid, 1333
1188 limit->flags, limit->max_rfer, 1334 spin_lock(&fs_info->qgroup_lock);
1189 limit->max_excl, limit->rsv_rfer, 1335 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
1190 limit->rsv_excl); 1336 qgroup->max_rfer = limit->max_rfer;
1337 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
1338 qgroup->max_excl = limit->max_excl;
1339 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
1340 qgroup->rsv_rfer = limit->rsv_rfer;
1341 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
1342 qgroup->rsv_excl = limit->rsv_excl;
1343 qgroup->lim_flags |= limit->flags;
1344
1345 spin_unlock(&fs_info->qgroup_lock);
1346
1347 ret = update_qgroup_limit_item(trans, quota_root, qgroup);
1191 if (ret) { 1348 if (ret) {
1192 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1349 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1193 btrfs_info(fs_info, "unable to update quota limit for %llu", 1350 btrfs_info(fs_info, "unable to update quota limit for %llu",
1194 qgroupid); 1351 qgroupid);
1195 } 1352 }
1196 1353
1197 spin_lock(&fs_info->qgroup_lock);
1198 qgroup->lim_flags = limit->flags;
1199 qgroup->max_rfer = limit->max_rfer;
1200 qgroup->max_excl = limit->max_excl;
1201 qgroup->rsv_rfer = limit->rsv_rfer;
1202 qgroup->rsv_excl = limit->rsv_excl;
1203 spin_unlock(&fs_info->qgroup_lock);
1204out: 1354out:
1205 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1355 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1206 return ret; 1356 return ret;
@@ -1256,14 +1406,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
1256 return -1; 1406 return -1;
1257 if (oper1->bytenr > oper2->bytenr) 1407 if (oper1->bytenr > oper2->bytenr)
1258 return 1; 1408 return 1;
1259 if (oper1->seq < oper2->seq)
1260 return -1;
1261 if (oper1->seq > oper2->seq)
1262 return 1;
1263 if (oper1->ref_root < oper2->ref_root) 1409 if (oper1->ref_root < oper2->ref_root)
1264 return -1; 1410 return -1;
1265 if (oper1->ref_root > oper2->ref_root) 1411 if (oper1->ref_root > oper2->ref_root)
1266 return 1; 1412 return 1;
1413 if (oper1->seq < oper2->seq)
1414 return -1;
1415 if (oper1->seq > oper2->seq)
1416 return 1;
1267 if (oper1->type < oper2->type) 1417 if (oper1->type < oper2->type)
1268 return -1; 1418 return -1;
1269 if (oper1->type > oper2->type) 1419 if (oper1->type > oper2->type)
@@ -1372,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1372 return 0; 1522 return 0;
1373} 1523}
1374 1524
1375/*
1376 * The easy accounting, if we are adding/removing the only ref for an extent
1377 * then this qgroup and all of the parent qgroups get their refrence and
1378 * exclusive counts adjusted.
1379 */
1380static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1525static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1381 struct btrfs_qgroup_operation *oper) 1526 struct btrfs_qgroup_operation *oper)
1382{ 1527{
1383 struct btrfs_qgroup *qgroup;
1384 struct ulist *tmp; 1528 struct ulist *tmp;
1385 struct btrfs_qgroup_list *glist;
1386 struct ulist_node *unode;
1387 struct ulist_iterator uiter;
1388 int sign = 0; 1529 int sign = 0;
1389 int ret = 0; 1530 int ret = 0;
1390 1531
@@ -1395,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1395 spin_lock(&fs_info->qgroup_lock); 1536 spin_lock(&fs_info->qgroup_lock);
1396 if (!fs_info->quota_root) 1537 if (!fs_info->quota_root)
1397 goto out; 1538 goto out;
1398 qgroup = find_qgroup_rb(fs_info, oper->ref_root); 1539
1399 if (!qgroup)
1400 goto out;
1401 switch (oper->type) { 1540 switch (oper->type) {
1402 case BTRFS_QGROUP_OPER_ADD_EXCL: 1541 case BTRFS_QGROUP_OPER_ADD_EXCL:
1403 sign = 1; 1542 sign = 1;
@@ -1408,43 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1408 default: 1547 default:
1409 ASSERT(0); 1548 ASSERT(0);
1410 } 1549 }
1411 qgroup->rfer += sign * oper->num_bytes; 1550 ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
1412 qgroup->rfer_cmpr += sign * oper->num_bytes; 1551 oper->num_bytes, sign);
1413
1414 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1415 qgroup->excl += sign * oper->num_bytes;
1416 qgroup->excl_cmpr += sign * oper->num_bytes;
1417
1418 qgroup_dirty(fs_info, qgroup);
1419
1420 /* Get all of the parent groups that contain this qgroup */
1421 list_for_each_entry(glist, &qgroup->groups, next_group) {
1422 ret = ulist_add(tmp, glist->group->qgroupid,
1423 ptr_to_u64(glist->group), GFP_ATOMIC);
1424 if (ret < 0)
1425 goto out;
1426 }
1427
1428 /* Iterate all of the parents and adjust their reference counts */
1429 ULIST_ITER_INIT(&uiter);
1430 while ((unode = ulist_next(tmp, &uiter))) {
1431 qgroup = u64_to_ptr(unode->aux);
1432 qgroup->rfer += sign * oper->num_bytes;
1433 qgroup->rfer_cmpr += sign * oper->num_bytes;
1434 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1435 qgroup->excl += sign * oper->num_bytes;
1436 qgroup->excl_cmpr += sign * oper->num_bytes;
1437 qgroup_dirty(fs_info, qgroup);
1438
1439 /* Add any parents of the parents */
1440 list_for_each_entry(glist, &qgroup->groups, next_group) {
1441 ret = ulist_add(tmp, glist->group->qgroupid,
1442 ptr_to_u64(glist->group), GFP_ATOMIC);
1443 if (ret < 0)
1444 goto out;
1445 }
1446 }
1447 ret = 0;
1448out: 1552out:
1449 spin_unlock(&fs_info->qgroup_lock); 1553 spin_unlock(&fs_info->qgroup_lock);
1450 ulist_free(tmp); 1554 ulist_free(tmp);
@@ -1845,7 +1949,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1845 struct ulist *roots = NULL; 1949 struct ulist *roots = NULL;
1846 struct ulist *qgroups, *tmp; 1950 struct ulist *qgroups, *tmp;
1847 struct btrfs_qgroup *qgroup; 1951 struct btrfs_qgroup *qgroup;
1848 struct seq_list elem = {}; 1952 struct seq_list elem = SEQ_LIST_INIT(elem);
1849 u64 seq; 1953 u64 seq;
1850 int old_roots = 0; 1954 int old_roots = 0;
1851 int new_roots = 0; 1955 int new_roots = 0;
@@ -1967,7 +2071,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1967 int err; 2071 int err;
1968 struct btrfs_qgroup *qg; 2072 struct btrfs_qgroup *qg;
1969 u64 root_obj = 0; 2073 u64 root_obj = 0;
1970 struct seq_list elem = {}; 2074 struct seq_list elem = SEQ_LIST_INIT(elem);
1971 2075
1972 parents = ulist_alloc(GFP_NOFS); 2076 parents = ulist_alloc(GFP_NOFS);
1973 if (!parents) 2077 if (!parents)
@@ -2156,6 +2260,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
2156 if (ret) 2260 if (ret)
2157 fs_info->qgroup_flags |= 2261 fs_info->qgroup_flags |=
2158 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2262 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2263 ret = update_qgroup_limit_item(trans, quota_root, qgroup);
2264 if (ret)
2265 fs_info->qgroup_flags |=
2266 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2159 spin_lock(&fs_info->qgroup_lock); 2267 spin_lock(&fs_info->qgroup_lock);
2160 } 2268 }
2161 if (fs_info->quota_enabled) 2269 if (fs_info->quota_enabled)
@@ -2219,6 +2327,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2219 ret = -EINVAL; 2327 ret = -EINVAL;
2220 goto out; 2328 goto out;
2221 } 2329 }
2330
2331 if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
2332 ret = -EINVAL;
2333 goto out;
2334 }
2222 ++i_qgroups; 2335 ++i_qgroups;
2223 } 2336 }
2224 } 2337 }
@@ -2230,17 +2343,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2230 if (ret) 2343 if (ret)
2231 goto out; 2344 goto out;
2232 2345
2233 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2234 ret = update_qgroup_limit_item(trans, quota_root, objectid,
2235 inherit->lim.flags,
2236 inherit->lim.max_rfer,
2237 inherit->lim.max_excl,
2238 inherit->lim.rsv_rfer,
2239 inherit->lim.rsv_excl);
2240 if (ret)
2241 goto out;
2242 }
2243
2244 if (srcid) { 2346 if (srcid) {
2245 struct btrfs_root *srcroot; 2347 struct btrfs_root *srcroot;
2246 struct btrfs_key srckey; 2348 struct btrfs_key srckey;
@@ -2286,6 +2388,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2286 goto unlock; 2388 goto unlock;
2287 } 2389 }
2288 2390
2391 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2392 dstgroup->lim_flags = inherit->lim.flags;
2393 dstgroup->max_rfer = inherit->lim.max_rfer;
2394 dstgroup->max_excl = inherit->lim.max_excl;
2395 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2396 dstgroup->rsv_excl = inherit->lim.rsv_excl;
2397
2398 ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
2399 if (ret) {
2400 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2401 btrfs_info(fs_info, "unable to update quota limit for %llu",
2402 dstgroup->qgroupid);
2403 goto unlock;
2404 }
2405 }
2406
2289 if (srcid) { 2407 if (srcid) {
2290 srcgroup = find_qgroup_rb(fs_info, srcid); 2408 srcgroup = find_qgroup_rb(fs_info, srcid);
2291 if (!srcgroup) 2409 if (!srcgroup)
@@ -2302,6 +2420,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2302 dstgroup->excl_cmpr = level_size; 2420 dstgroup->excl_cmpr = level_size;
2303 srcgroup->excl = level_size; 2421 srcgroup->excl = level_size;
2304 srcgroup->excl_cmpr = level_size; 2422 srcgroup->excl_cmpr = level_size;
2423
2424 /* inherit the limit info */
2425 dstgroup->lim_flags = srcgroup->lim_flags;
2426 dstgroup->max_rfer = srcgroup->max_rfer;
2427 dstgroup->max_excl = srcgroup->max_excl;
2428 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2429 dstgroup->rsv_excl = srcgroup->rsv_excl;
2430
2305 qgroup_dirty(fs_info, dstgroup); 2431 qgroup_dirty(fs_info, dstgroup);
2306 qgroup_dirty(fs_info, srcgroup); 2432 qgroup_dirty(fs_info, srcgroup);
2307 } 2433 }
@@ -2358,12 +2484,6 @@ out:
2358 return ret; 2484 return ret;
2359} 2485}
2360 2486
2361/*
2362 * reserve some space for a qgroup and all its parents. The reservation takes
2363 * place with start_transaction or dealloc_reserve, similar to ENOSPC
2364 * accounting. If not enough space is available, EDQUOT is returned.
2365 * We assume that the requested space is new for all qgroups.
2366 */
2367int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) 2487int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
2368{ 2488{
2369 struct btrfs_root *quota_root; 2489 struct btrfs_root *quota_root;
@@ -2513,7 +2633,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2513 2633
2514/* 2634/*
2515 * returns < 0 on error, 0 when more leafs are to be scanned. 2635 * returns < 0 on error, 0 when more leafs are to be scanned.
2516 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 2636 * returns 1 when done.
2517 */ 2637 */
2518static int 2638static int
2519qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2639qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
@@ -2522,7 +2642,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2522{ 2642{
2523 struct btrfs_key found; 2643 struct btrfs_key found;
2524 struct ulist *roots = NULL; 2644 struct ulist *roots = NULL;
2525 struct seq_list tree_mod_seq_elem = {}; 2645 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
2526 u64 num_bytes; 2646 u64 num_bytes;
2527 u64 seq; 2647 u64 seq;
2528 int new_roots; 2648 int new_roots;
@@ -2618,6 +2738,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2618 struct ulist *tmp = NULL, *qgroups = NULL; 2738 struct ulist *tmp = NULL, *qgroups = NULL;
2619 struct extent_buffer *scratch_leaf = NULL; 2739 struct extent_buffer *scratch_leaf = NULL;
2620 int err = -ENOMEM; 2740 int err = -ENOMEM;
2741 int ret = 0;
2621 2742
2622 path = btrfs_alloc_path(); 2743 path = btrfs_alloc_path();
2623 if (!path) 2744 if (!path)
@@ -2660,7 +2781,7 @@ out:
2660 mutex_lock(&fs_info->qgroup_rescan_lock); 2781 mutex_lock(&fs_info->qgroup_rescan_lock);
2661 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2782 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2662 2783
2663 if (err == 2 && 2784 if (err > 0 &&
2664 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 2785 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
2665 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2786 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2666 } else if (err < 0) { 2787 } else if (err < 0) {
@@ -2668,13 +2789,33 @@ out:
2668 } 2789 }
2669 mutex_unlock(&fs_info->qgroup_rescan_lock); 2790 mutex_unlock(&fs_info->qgroup_rescan_lock);
2670 2791
2792 /*
 2793 * only update status, since the previous part has already updated the
2794 * qgroup info.
2795 */
2796 trans = btrfs_start_transaction(fs_info->quota_root, 1);
2797 if (IS_ERR(trans)) {
2798 err = PTR_ERR(trans);
2799 btrfs_err(fs_info,
2800 "fail to start transaction for status update: %d\n",
2801 err);
2802 goto done;
2803 }
2804 ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
2805 if (ret < 0) {
2806 err = ret;
2807 btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
2808 }
2809 btrfs_end_transaction(trans, fs_info->quota_root);
2810
2671 if (err >= 0) { 2811 if (err >= 0) {
2672 btrfs_info(fs_info, "qgroup scan completed%s", 2812 btrfs_info(fs_info, "qgroup scan completed%s",
2673 err == 2 ? " (inconsistency flag cleared)" : ""); 2813 err > 0 ? " (inconsistency flag cleared)" : "");
2674 } else { 2814 } else {
2675 btrfs_err(fs_info, "qgroup scan failed with %d", err); 2815 btrfs_err(fs_info, "qgroup scan failed with %d", err);
2676 } 2816 }
2677 2817
2818done:
2678 complete_all(&fs_info->qgroup_rescan_completion); 2819 complete_all(&fs_info->qgroup_rescan_completion);
2679} 2820}
2680 2821
@@ -2709,7 +2850,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2709 mutex_unlock(&fs_info->qgroup_rescan_lock); 2850 mutex_unlock(&fs_info->qgroup_rescan_lock);
2710 goto err; 2851 goto err;
2711 } 2852 }
2712
2713 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2853 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2714 } 2854 }
2715 2855
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 18cc68ca3090..c5242aa9a4b2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
70int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 70int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
71 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 71 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
72int btrfs_create_qgroup(struct btrfs_trans_handle *trans, 72int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
73 struct btrfs_fs_info *fs_info, u64 qgroupid, 73 struct btrfs_fs_info *fs_info, u64 qgroupid);
74 char *name);
75int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, 74int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
76 struct btrfs_fs_info *fs_info, u64 qgroupid); 75 struct btrfs_fs_info *fs_info, u64 qgroupid);
77int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, 76int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..fa72068bd256 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
237 } 237 }
238 238
239 x = cmpxchg(&info->stripe_hash_table, NULL, table); 239 x = cmpxchg(&info->stripe_hash_table, NULL, table);
240 if (x) { 240 if (x)
241 if (is_vmalloc_addr(x)) 241 kvfree(x);
242 vfree(x);
243 else
244 kfree(x);
245 }
246 return 0; 242 return 0;
247} 243}
248 244
@@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
453 if (!info->stripe_hash_table) 449 if (!info->stripe_hash_table)
454 return; 450 return;
455 btrfs_clear_rbio_cache(info); 451 btrfs_clear_rbio_cache(info);
456 if (is_vmalloc_addr(info->stripe_hash_table)) 452 kvfree(info->stripe_hash_table);
457 vfree(info->stripe_hash_table);
458 else
459 kfree(info->stripe_hash_table);
460 info->stripe_hash_table = NULL; 453 info->stripe_hash_table = NULL;
461} 454}
462 455
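kvfree() subsumes the open-coded is_vmalloc_addr() dance removed in the two hunks above: it inspects the pointer itself and routes to vfree() or kfree() as appropriate, making it the natural release for "try kmalloc, fall back to vmalloc" allocations like the stripe hash table. The pairing, sketched in kernel idiom (not the surrounding btrfs code itself):

/* allocate: try kmalloc first, quietly fall back to vmalloc */
table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
if (!table)
        table = vzalloc(table_size);
if (!table)
        return -ENOMEM;

/* ... use table ... */

/* release: kvfree() picks vfree() or kfree() from the address itself */
kvfree(table);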
@@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1807 int err; 1800 int err;
1808 int i; 1801 int i;
1809 1802
1810 pointers = kzalloc(rbio->real_stripes * sizeof(void *), 1803 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1811 GFP_NOFS);
1812 if (!pointers) { 1804 if (!pointers) {
1813 err = -ENOMEM; 1805 err = -ENOMEM;
1814 goto cleanup_io; 1806 goto cleanup_io;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..74b24b01d574 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
3027 mutex_lock(&inode->i_mutex); 3027 mutex_lock(&inode->i_mutex);
3028 3028
3029 ret = btrfs_check_data_free_space(inode, cluster->end + 3029 ret = btrfs_check_data_free_space(inode, cluster->end +
3030 1 - cluster->start); 3030 1 - cluster->start, 0);
3031 if (ret) 3031 if (ret)
3032 goto out; 3032 goto out;
3033 3033
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
3430} 3430}
3431 3431
3432static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3432static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3433 struct inode *inode, u64 ino) 3433 struct btrfs_block_group_cache *block_group,
3434 struct inode *inode,
3435 u64 ino)
3434{ 3436{
3435 struct btrfs_key key; 3437 struct btrfs_key key;
3436 struct btrfs_root *root = fs_info->tree_root; 3438 struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ truncate:
3463 goto out; 3465 goto out;
3464 } 3466 }
3465 3467
3466 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3468 ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3467 3469
3468 btrfs_end_transaction(trans, root); 3470 btrfs_end_transaction(trans, root);
3469 btrfs_btree_balance_dirty(root); 3471 btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
3509 */ 3511 */
3510 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3512 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3511 ret = delete_block_group_cache(rc->extent_root->fs_info, 3513 ret = delete_block_group_cache(rc->extent_root->fs_info,
3514 rc->block_group,
3512 NULL, ref_objectid); 3515 NULL, ref_objectid);
3513 if (ret != -ENOENT) 3516 if (ret != -ENOENT)
3514 return ret; 3517 return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4223 btrfs_free_path(path); 4226 btrfs_free_path(path);
4224 4227
4225 if (!IS_ERR(inode)) 4228 if (!IS_ERR(inode))
4226 ret = delete_block_group_cache(fs_info, inode, 0); 4229 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4227 else 4230 else
4228 ret = PTR_ERR(inode); 4231 ret = PTR_ERR(inode);
4229 4232
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec57687c9a4d..ab5811545a98 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
964 * the statistics. 964 * the statistics.
965 */ 965 */
966 966
967 sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * 967 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
968 sizeof(*sblocks_for_recheck), 968 sizeof(*sblocks_for_recheck), GFP_NOFS);
969 GFP_NOFS);
970 if (!sblocks_for_recheck) { 969 if (!sblocks_for_recheck) {
971 spin_lock(&sctx->stat_lock); 970 spin_lock(&sctx->stat_lock);
972 sctx->stat.malloc_errors++; 971 sctx->stat.malloc_errors++;
@@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2319 unsigned long *bitmap, 2318 unsigned long *bitmap,
2320 u64 start, u64 len) 2319 u64 start, u64 len)
2321{ 2320{
2322 int offset; 2321 u32 offset;
2323 int nsectors; 2322 int nsectors;
2324 int sectorsize = sparity->sctx->dev_root->sectorsize; 2323 int sectorsize = sparity->sctx->dev_root->sectorsize;
2325 2324
@@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2329 } 2328 }
2330 2329
2331 start -= sparity->logic_start; 2330 start -= sparity->logic_start;
2332 offset = (int)do_div(start, sparity->stripe_len); 2331 start = div_u64_rem(start, sparity->stripe_len, &offset);
2333 offset /= sectorsize; 2332 offset /= sectorsize;
2334 nsectors = (int)len / sectorsize; 2333 nsectors = (int)len / sectorsize;
2335 2334
@@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
2612 int j = 0; 2611 int j = 0;
2613 u64 stripe_nr; 2612 u64 stripe_nr;
2614 u64 last_offset; 2613 u64 last_offset;
2615 int stripe_index; 2614 u32 stripe_index;
2616 int rot; 2615 u32 rot;
2617 2616
2618 last_offset = (physical - map->stripes[num].physical) * 2617 last_offset = (physical - map->stripes[num].physical) *
2619 nr_data_stripes(map); 2618 nr_data_stripes(map);
@@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num,
2624 for (i = 0; i < nr_data_stripes(map); i++) { 2623 for (i = 0; i < nr_data_stripes(map); i++) {
2625 *offset = last_offset + i * map->stripe_len; 2624 *offset = last_offset + i * map->stripe_len;
2626 2625
2627 stripe_nr = *offset; 2626 stripe_nr = div_u64(*offset, map->stripe_len);
2628 do_div(stripe_nr, map->stripe_len); 2627 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2629 do_div(stripe_nr, nr_data_stripes(map));
2630 2628
2631 /* Work out the disk rotation on this stripe-set */ 2629 /* Work out the disk rotation on this stripe-set */
2632 rot = do_div(stripe_nr, map->num_stripes); 2630 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2633 /* calculate which stripe this data locates */ 2631 /* calculate which stripe this data locates */
2634 rot += i; 2632 rot += i;
2635 stripe_index = rot % map->num_stripes; 2633 stripe_index = rot % map->num_stripes;
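div_u64_rem(), used above, returns the quotient and hands the remainder back through a u32 pointer, which is what lets rot (the disk rotation within the stripe set) fall out of the same division that advances stripe_nr. A userspace stand-in with the same shape:

#include <stdint.h>
#include <stdio.h>

/* stand-in with div_u64_rem's shape: quotient returned, remainder by pointer */
static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor, uint32_t *rem)
{
        *rem = dividend % divisor;
        return dividend / divisor;
}

int main(void)
{
        uint32_t rot;
        uint64_t stripe_nr = div_u64_rem(17, 5, &rot);

        /* 17 = 3 * 5 + 2: stripe_nr advances, rot gives the rotation */
        printf("stripe_nr=%llu rot=%u\n", (unsigned long long)stripe_nr, rot);
        return 0;
}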
@@ -2995,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2995 int extent_mirror_num; 2993 int extent_mirror_num;
2996 int stop_loop = 0; 2994 int stop_loop = 0;
2997 2995
2998 nstripes = length;
2999 physical = map->stripes[num].physical; 2996 physical = map->stripes[num].physical;
3000 offset = 0; 2997 offset = 0;
3001 do_div(nstripes, map->stripe_len); 2998 nstripes = div_u64(length, map->stripe_len);
3002 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2999 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3003 offset = map->stripe_len * num; 3000 offset = map->stripe_len * num;
3004 increment = map->stripe_len * map->num_stripes; 3001 increment = map->stripe_len * map->num_stripes;
@@ -3563,7 +3560,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3563 int is_dev_replace) 3560 int is_dev_replace)
3564{ 3561{
3565 int ret = 0; 3562 int ret = 0;
3566 int flags = WQ_FREEZABLE | WQ_UNBOUND; 3563 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3567 int max_active = fs_info->thread_pool_size; 3564 int max_active = fs_info->thread_pool_size;
3568 3565
3569 if (fs_info->scrub_workers_refcnt == 0) { 3566 if (fs_info->scrub_workers_refcnt == 0) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d6033f540cc7..a1216f9b4917 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3067 return NULL; 3067 return NULL;
3068} 3068}
3069 3069
3070static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3071 u64 ino, u64 gen, u64 *ancestor_ino)
3072{
3073 int ret = 0;
3074 u64 parent_inode = 0;
3075 u64 parent_gen = 0;
3076 u64 start_ino = ino;
3077
3078 *ancestor_ino = 0;
3079 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3080 fs_path_reset(name);
3081
3082 if (is_waiting_for_rm(sctx, ino))
3083 break;
3084 if (is_waiting_for_move(sctx, ino)) {
3085 if (*ancestor_ino == 0)
3086 *ancestor_ino = ino;
3087 ret = get_first_ref(sctx->parent_root, ino,
3088 &parent_inode, &parent_gen, name);
3089 } else {
3090 ret = __get_cur_name_and_parent(sctx, ino, gen,
3091 &parent_inode,
3092 &parent_gen, name);
3093 if (ret > 0) {
3094 ret = 0;
3095 break;
3096 }
3097 }
3098 if (ret < 0)
3099 break;
3100 if (parent_inode == start_ino) {
3101 ret = 1;
3102 if (*ancestor_ino == 0)
3103 *ancestor_ino = ino;
3104 break;
3105 }
3106 ino = parent_inode;
3107 gen = parent_gen;
3108 }
3109 return ret;
3110}
3111
3112static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3070static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3113{ 3071{
3114 struct fs_path *from_path = NULL; 3072 struct fs_path *from_path = NULL;
@@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3120 struct waiting_dir_move *dm = NULL; 3078 struct waiting_dir_move *dm = NULL;
3121 u64 rmdir_ino = 0; 3079 u64 rmdir_ino = 0;
3122 int ret; 3080 int ret;
3123 u64 ancestor = 0;
3124 3081
3125 name = fs_path_alloc(); 3082 name = fs_path_alloc();
3126 from_path = fs_path_alloc(); 3083 from_path = fs_path_alloc();
@@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3152 goto out; 3109 goto out;
3153 3110
3154 sctx->send_progress = sctx->cur_ino + 1; 3111 sctx->send_progress = sctx->cur_ino + 1;
3155 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3156 if (ret) {
3157 LIST_HEAD(deleted_refs);
3158 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3159 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3160 &pm->update_refs, &deleted_refs,
3161 pm->is_orphan);
3162 if (ret < 0)
3163 goto out;
3164 if (rmdir_ino) {
3165 dm = get_waiting_dir_move(sctx, pm->ino);
3166 ASSERT(dm);
3167 dm->rmdir_ino = rmdir_ino;
3168 }
3169 goto out;
3170 }
3171 fs_path_reset(name); 3112 fs_path_reset(name);
3172 to_path = name; 3113 to_path = name;
3173 name = NULL; 3114 name = NULL;
@@ -3610,10 +3551,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3610 if (ret < 0) 3551 if (ret < 0)
3611 goto out; 3552 goto out;
3612 if (ret) { 3553 if (ret) {
3554 struct name_cache_entry *nce;
3555
3613 ret = orphanize_inode(sctx, ow_inode, ow_gen, 3556 ret = orphanize_inode(sctx, ow_inode, ow_gen,
3614 cur->full_path); 3557 cur->full_path);
3615 if (ret < 0) 3558 if (ret < 0)
3616 goto out; 3559 goto out;
3560 /*
3561 * Make sure we clear our orphanized inode's
3562 * name from the name cache. This is because the
3563 * inode ow_inode might be an ancestor of some
3564 * other inode that will be orphanized as well
3565 * later and has an inode number greater than
3566 * sctx->send_progress. We need to prevent
3567 * future name lookups from using the old name
3568 * and get instead the orphan name.
3569 */
3570 nce = name_cache_search(sctx, ow_inode, ow_gen);
3571 if (nce) {
3572 name_cache_delete(sctx, nce);
3573 kfree(nce);
3574 }
3617 } else { 3575 } else {
3618 ret = send_unlink(sctx, cur->full_path); 3576 ret = send_unlink(sctx, cur->full_path);
3619 if (ret < 0) 3577 if (ret < 0)
@@ -5852,19 +5810,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 			ret = PTR_ERR(clone_root);
 			goto out;
 		}
-		clone_sources_to_rollback = i + 1;
 		spin_lock(&clone_root->root_item_lock);
-		clone_root->send_in_progress++;
-		if (!btrfs_root_readonly(clone_root)) {
+		if (!btrfs_root_readonly(clone_root) ||
+		    btrfs_root_dead(clone_root)) {
 			spin_unlock(&clone_root->root_item_lock);
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = -EPERM;
 			goto out;
 		}
+		clone_root->send_in_progress++;
 		spin_unlock(&clone_root->root_item_lock);
 		srcu_read_unlock(&fs_info->subvol_srcu, index);

 		sctx->clone_roots[i].root = clone_root;
+		clone_sources_to_rollback = i + 1;
 	}
 	vfree(clone_sources_tmp);
 	clone_sources_tmp = NULL;
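The reordering in this hunk follows a common locking idiom: perform every check that can fail while holding the lock, and only take the reference once all checks pass, so the error path has nothing to unwind. A self-contained pthread sketch of the idiom; the struct and field names are illustrative, not btrfs code.

	#include <pthread.h>
	#include <errno.h>

	struct root {
		pthread_spinlock_t lock;	/* assumed initialized */
		int readonly;
		int dead;
		int send_in_progress;
	};

	static int start_send(struct root *r)
	{
		pthread_spin_lock(&r->lock);
		if (!r->readonly || r->dead) {
			pthread_spin_unlock(&r->lock);
			return -EPERM;		/* nothing to roll back */
		}
		r->send_in_progress++;		/* accounted only on success */
		pthread_spin_unlock(&r->lock);
		return 0;
	}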
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 05fef198ff94..f2c9f9db3b19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -901,6 +901,15 @@ find_root:
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);

+	if (!(sb->s_flags & MS_RDONLY)) {
+		int ret;
+		down_read(&fs_info->cleanup_work_sem);
+		ret = btrfs_orphan_cleanup(new_root);
+		up_read(&fs_info->cleanup_work_sem);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
 	location.objectid = dir_id;
@@ -1714,7 +1723,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 		avail_space = device->total_bytes - device->bytes_used;

 		/* align with stripe_len */
-		do_div(avail_space, BTRFS_STRIPE_LEN);
+		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
 		avail_space *= BTRFS_STRIPE_LEN;

 		/*
@@ -1908,6 +1917,17 @@ static struct file_system_type btrfs_fs_type = {
 };
 MODULE_ALIAS_FS("btrfs");

+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The control file's private_data is used to hold the
+	 * transaction when it is started and is used to keep
+	 * track of whether a transaction is already in progress.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
 /*
  * used by btrfsctl to scan devices when no FS is mounted
  */
@@ -2009,6 +2029,7 @@ static const struct super_operations btrfs_super_ops = {
 };

 static const struct file_operations btrfs_ctl_fops = {
+	.open = btrfs_control_open,
 	.unlocked_ioctl	 = btrfs_control_ioctl,
 	.compat_ioctl = btrfs_control_ioctl,
 	.owner	 = THIS_MODULE,
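For context on the new .open hook: the misc-device layer can preload file->private_data, and the control device's ioctls use that field for transaction state, so resetting it at open time keeps the two from clashing. A minimal sketch of the convention with hypothetical demo_* names, not the btrfs code.

	#include <linux/fs.h>
	#include <linux/module.h>

	static int demo_open(struct inode *inode, struct file *file)
	{
		/* claim private_data at open so later ioctls can rely on
		 * NULL meaning "no transaction started yet" */
		file->private_data = NULL;
		return 0;
	}

	static long demo_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
	{
		return file->private_data ? -EBUSY : 0;
	}

	static const struct file_operations demo_fops = {
		.owner		= THIS_MODULE,
		.open		= demo_open,
		.unlocked_ioctl	= demo_ioctl,
	};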
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 94edb0a2a026..e8a4c86d274d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
 static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];

-static u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[3] = {
 	[FEAT_COMPAT]    = BTRFS_FEATURE_COMPAT_SUPP,
 	[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
 	[FEAT_INCOMPAT]  = BTRFS_FEATURE_INCOMPAT_SUPP,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f7dd298b3cf6..3a4bbed723fd 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
 	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)

 /* convert from attribute */
-#define to_btrfs_feature_attr(a) \
-			container_of(a, struct btrfs_feature_attr, kobj_attr)
-#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
-#define attr_to_btrfs_feature_attr(a) \
-			to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+	return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+	return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
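The macro-to-static-inline conversion above buys type checking: a container_of macro accepts any pointer silently, while the inline function rejects a mismatched argument at compile time. A self-contained illustration with made-up inner/outer types, not the btrfs structures:

	#include <stddef.h>

	struct inner { int x; };
	struct outer { int y; struct inner in; };

	#define to_outer_macro(p) \
		((struct outer *)((char *)(p) - offsetof(struct outer, in)))

	static inline struct outer *to_outer(struct inner *p)
	{
		return (struct outer *)((char *)p - offsetof(struct outer, in));
	}

	/* to_outer_macro(&o.y) compiles even though &o.y is an int *;
	 * to_outer(&o.y) is rejected with a type error. */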
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 73f299ebdabb..c32a7ba76bca 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	init_dummy_trans(&trans);

 	test_msg("Qgroup basic add\n");
-	ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 5);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;
@@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root)
 	test_msg("Qgroup multiple refs test\n");

 	/* We have 5 created already from the previous test */
-	ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 256);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8be4278e25e8..5628e25250c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,7 +35,7 @@

 #define BTRFS_ROOT_TRANS_TAG 0

-static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 	[TRANS_STATE_RUNNING]		= 0U,
 	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
 					   __TRANS_START),
@@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		if (transaction->delayed_refs.pending_csums)
+			printk(KERN_ERR "pending csums is %llu\n",
+			       transaction->delayed_refs.pending_csums);
 		while (!list_empty(&transaction->pending_chunks)) {
 			struct extent_map *em;

@@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
 		 */
 		ASSERT(!waitqueue_active(&state->wq));
 		free_extent_state(state);
-		if (need_resched()) {
-			spin_unlock(&tree->lock);
-			cond_resched();
-			spin_lock(&tree->lock);
-		}
+
+		cond_resched_lock(&tree->lock);
 	}
 	spin_unlock(&tree->lock);
 }
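cond_resched_lock() used above bundles the removed open-coded sequence: drop the spinlock, yield if a reschedule is pending, and reacquire the lock. Roughly, as a simplified sketch of its semantics rather than the kernel's exact implementation (the real helper also yields when spin_needbreak() reports lock contention):

	static inline int cond_resched_lock_sketch(spinlock_t *lock)
	{
		/* simplified; see the kernel's cond_resched_lock() */
		if (need_resched()) {
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
			return 1;	/* the lock was dropped */
		}
		return 0;
	}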
@@ -222,10 +222,12 @@ loop:
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->have_free_bgs = 0;
 	cur_trans->start_time = get_seconds();
+	cur_trans->dirty_bg_run = 0;

 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
+	cur_trans->delayed_refs.pending_csums = 0;
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
@@ -250,6 +252,9 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	INIT_LIST_HEAD(&cur_trans->io_bgs);
+	mutex_init(&cur_trans->cache_write_mutex);
+	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
@@ -721,7 +726,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 	updates = trans->delayed_ref_updates;
 	trans->delayed_ref_updates = 0;
 	if (updates) {
-		err = btrfs_run_delayed_refs(trans, root, updates);
+		err = btrfs_run_delayed_refs(trans, root, updates * 2);
 		if (err) /* Error code will also eval true */
 			return err;
 	}
@@ -1057,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+	struct list_head *io_bgs = &trans->transaction->io_bgs;
 	struct list_head *next;
 	struct extent_buffer *eb;
 	int ret;
@@ -1110,7 +1116,7 @@ again:
 		return ret;
 	}

-	while (!list_empty(dirty_bgs)) {
+	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		if (ret)
 			return ret;
@@ -1810,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}

+	if (!cur_trans->dirty_bg_run) {
+		int run_it = 0;
+
+		/* this mutex is also taken before trying to set
+		 * block groups readonly.  We need to make sure
+		 * that nobody has set a block group readonly
+		 * after extents from that block group have been
+		 * allocated for cache files.  btrfs_set_block_group_ro
+		 * will wait for the transaction to commit if it
+		 * finds dirty_bg_run = 1
+		 *
+		 * The dirty_bg_run flag is also used to make sure only
+		 * one process starts all the block group IO.  It wouldn't
+		 * hurt to have more than one go through, but there's no
+		 * real advantage to it either.
+		 */
+		mutex_lock(&root->fs_info->ro_block_group_mutex);
+		if (!cur_trans->dirty_bg_run) {
+			run_it = 1;
+			cur_trans->dirty_bg_run = 1;
+		}
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+		if (run_it)
+			ret = btrfs_start_dirty_block_groups(trans, root);
+	}
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
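The dirty_bg_run logic above is a run-once guard: an unlocked test avoids the mutex in the common case, a locked retest elects exactly one winner, and the actual work runs outside the critical section. A self-contained pthread sketch of the same shape, with illustrative names:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
	static bool work_started;

	static void run_once(void (*work)(void))
	{
		bool run_it = false;

		if (!work_started) {		/* cheap unlocked check */
			pthread_mutex_lock(&guard);
			if (!work_started) {	/* recheck under the lock */
				work_started = true;
				run_it = true;
			}
			pthread_mutex_unlock(&guard);
		}
		if (run_it)
			work();			/* only one caller gets here */
	}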
@@ -2003,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,

 	assert_qgroups_uptodate(trans);
 	ASSERT(list_empty(&cur_trans->dirty_bgs));
+	ASSERT(list_empty(&cur_trans->io_bgs));
 	update_super_roots(root);

 	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..0b24755596ba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,9 +64,19 @@ struct btrfs_transaction {
 	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
+	struct list_head io_bgs;
+	u64 num_dirty_bgs;
+
+	/*
+	 * we need to make sure block group deletion doesn't race with
+	 * free space cache writeout.  This mutex keeps them from stomping
+	 * on each other
+	 */
+	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
+	int dirty_bg_run;
 };

 #define __TRANS_FREEZABLE	(1U << 0)
@@ -136,9 +146,11 @@ struct btrfs_pending_snapshot {
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+	spin_unlock(&BTRFS_I(inode)->lock);
 }

 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
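Wrapping the three assignments in BTRFS_I(inode)->lock makes them a single consistent snapshot for readers that take the same lock, instead of three independently torn updates. The idiom in a self-contained pthread sketch; the struct here is illustrative, not btrfs_inode:

	#include <pthread.h>

	struct trans_marks {
		pthread_spinlock_t lock;	/* assumed initialized */
		unsigned long long last_trans;
		unsigned long long last_sub_trans;
		unsigned long long last_log_commit;
	};

	static void set_marks(struct trans_marks *m, unsigned long long trans,
			      unsigned long long sub, unsigned long long commit)
	{
		pthread_spin_lock(&m->lock);
		m->last_trans = trans;		/* all three move together... */
		m->last_sub_trans = sub;
		m->last_log_commit = commit;
		pthread_spin_unlock(&m->lock);	/* ...or not at all */
	}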
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c5b8ba37f88e..a089b5944efc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:

 	if (btrfs_inode_generation(eb, src_item) == 0) {
 		struct extent_buffer *dst_eb = path->nodes[0];
+		const u64 ino_size = btrfs_inode_size(eb, src_item);

+		/*
+		 * For regular files an ino_size == 0 is used only when
+		 * logging that an inode exists, as part of a directory
+		 * fsync, and the inode wasn't fsynced before. In this
+		 * case don't set the size of the inode in the fs/subvol
+		 * tree, otherwise we would be throwing valid data away.
+		 */
 		if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
-		    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+		    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+		    ino_size != 0) {
 			struct btrfs_map_token token;
-			u64 ino_size = btrfs_inode_size(eb, src_item);

 			btrfs_init_map_token(&token);
 			btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -1951,6 +1959,104 @@ out:
 	return ret;
 }

+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_root *log,
+			       struct btrfs_path *path,
+			       const u64 ino)
+{
+	struct btrfs_key search_key;
+	struct btrfs_path *log_path;
+	int i;
+	int nritems;
+	int ret;
+
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_XATTR_ITEM_KEY;
+	search_key.offset = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+process_leaf:
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_key key;
+		struct btrfs_dir_item *di;
+		struct btrfs_dir_item *log_di;
+		u32 total_size;
+		u32 cur;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+		total_size = btrfs_item_size_nr(path->nodes[0], i);
+		cur = 0;
+		while (cur < total_size) {
+			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+			u32 this_len = sizeof(*di) + name_len + data_len;
+			char *name;
+
+			name = kmalloc(name_len, GFP_NOFS);
+			if (!name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			read_extent_buffer(path->nodes[0], name,
+					   (unsigned long)(di + 1), name_len);
+
+			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+						    name, name_len, 0);
+			btrfs_release_path(log_path);
+			if (!log_di) {
+				/* Doesn't exist in log tree, so delete it. */
+				btrfs_release_path(path);
+				di = btrfs_lookup_xattr(trans, root, path, ino,
+							name, name_len, -1);
+				kfree(name);
+				if (IS_ERR(di)) {
+					ret = PTR_ERR(di);
+					goto out;
+				}
+				ASSERT(di);
+				ret = btrfs_delete_one_dir_name(trans, root,
+								path, di);
+				if (ret)
+					goto out;
+				btrfs_release_path(path);
+				search_key = key;
+				goto again;
+			}
+			kfree(name);
+			if (IS_ERR(log_di)) {
+				ret = PTR_ERR(log_di);
+				goto out;
+			}
+			cur += this_len;
+			di = (struct btrfs_dir_item *)((char *)di + this_len);
+		}
+	}
+	ret = btrfs_next_leaf(root, path);
+	if (ret > 0)
+		ret = 0;
+	else if (ret == 0)
+		goto process_leaf;
+out:
+	btrfs_free_path(log_path);
+	btrfs_release_path(path);
+	return ret;
+}
+
+
 /*
  * deletion replay happens before we copy any new directory items
  * out of the log or out of backreferences from inodes. It
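Conceptually, replay_xattr_deletes() above computes a set difference: every xattr present on the inode in the fs tree but absent from the log tree is deleted, so replay converges on exactly the logged xattr set. A userspace sketch of that shape, with illustrative types — the real code walks btree leaves rather than arrays:

	#include <stdbool.h>
	#include <string.h>

	struct xattr { const char *name; };

	static bool in_log(const struct xattr *log, int nlog, const char *name)
	{
		for (int i = 0; i < nlog; i++)
			if (!strcmp(log[i].name, name))
				return true;
		return false;
	}

	/* compact fs[] in place, keeping only entries the log also has;
	 * returns how many survive (the rest are the "deletes to replay") */
	static int drop_missing(struct xattr *fs, int nfs,
				const struct xattr *log, int nlog)
	{
		int kept = 0;

		for (int i = 0; i < nfs; i++)
			if (in_log(log, nlog, fs[i].name))
				fs[kept++] = fs[i];
		return kept;
	}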
@@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,

 			inode_item = btrfs_item_ptr(eb, i,
 					    struct btrfs_inode_item);
+			ret = replay_xattr_deletes(wc->trans, root, log,
+						   path, key.objectid);
+			if (ret)
+				break;
 			mode = btrfs_inode_mode(eb, inode_item);
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
@@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, root, next);
+			clean_tree_block(trans, root->fs_info,
+					next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, root, next);
+			clean_tree_block(trans, root->fs_info,
+					next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, log, next);
+			clean_tree_block(trans, log->fs_info, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
 			  struct btrfs_path *dst_path, int key_type,
+			  struct btrfs_log_ctx *ctx,
 			  u64 min_offset, u64 *last_offset_ret)
 {
 	struct btrfs_key min_key;
@@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	src = path->nodes[0];
 	nritems = btrfs_header_nritems(src);
 	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_dir_item *di;
+
 		btrfs_item_key_to_cpu(src, &min_key, i);

 		if (min_key.objectid != ino || min_key.type != key_type)
@@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			err = ret;
 			goto done;
 		}
+
+		/*
+		 * We must make sure that when we log a directory entry,
+		 * the corresponding inode, after log replay, has a
+		 * matching link count. For example:
+		 *
+		 * touch foo
+		 * mkdir mydir
+		 * sync
+		 * ln foo mydir/bar
+		 * xfs_io -c "fsync" mydir
+		 * <crash>
+		 * <mount fs and log replay>
+		 *
+		 * Would result in an fsync log that when replayed, our
+		 * file inode would have a link count of 1, but we get
+		 * two directory entries pointing to the same inode.
+		 * After removing one of the names, it would not be
+		 * possible to remove the other name, which always
+		 * resulted in stale file handle errors, and would not
+		 * be possible to rmdir the parent directory, since
+		 * its i_size could never decrement to the value
+		 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+		 */
+		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(src, di, &tmp);
+		if (ctx &&
+		    (btrfs_dir_transid(src, di) == trans->transid ||
+		     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+		    tmp.type != BTRFS_ROOT_ITEM_KEY)
+			ctx->log_new_dentries = true;
 	}
 	path->slots[0] = nritems;

@@ -3175,7 +3321,8 @@ done:
 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
-			  struct btrfs_path *dst_path)
+			  struct btrfs_path *dst_path,
+			  struct btrfs_log_ctx *ctx)
 {
 	u64 min_key;
 	u64 max_key;
@@ -3187,7 +3334,7 @@ again:
 	max_key = 0;
 	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
-				    dst_path, key_type, min_key,
+				    dst_path, key_type, ctx, min_key,
 				    &max_key);
 		if (ret)
 			return ret;
@@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 	if (ret < 0) {
 		return ret;
 	} else if (ret > 0) {
-		*size_ret = i_size_read(inode);
+		*size_ret = 0;
 	} else {
 		struct btrfs_inode_item *item;

@@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode)) {
 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

-		if (inode_only == LOG_INODE_EXISTS) {
-			max_key_type = BTRFS_INODE_EXTREF_KEY;
-			max_key.type = max_key_type;
-		}
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
 		if (inode_only == LOG_INODE_EXISTS) {
@@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags)) {
 			if (inode_only == LOG_INODE_EXISTS) {
-				max_key.type = BTRFS_INODE_EXTREF_KEY;
+				max_key.type = BTRFS_XATTR_ITEM_KEY;
 				ret = drop_objectid_items(trans, log, path, ino,
 							  max_key.type);
 			} else {
@@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 					  &BTRFS_I(inode)->runtime_flags);
 			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 				  &BTRFS_I(inode)->runtime_flags);
-			ret = btrfs_truncate_inode_items(trans, log,
-							 inode, 0, 0);
+			while(1) {
+				ret = btrfs_truncate_inode_items(trans,
+						 log, inode, 0, 0);
+				if (ret != -EAGAIN)
+					break;
+			}
 		}
-	} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+	} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 			    &BTRFS_I(inode)->runtime_flags) ||
 		   inode_only == LOG_INODE_EXISTS) {
-		if (inode_only == LOG_INODE_ALL) {
-			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-				  &BTRFS_I(inode)->runtime_flags);
+		if (inode_only == LOG_INODE_ALL)
 			fast_search = true;
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
-		} else {
-			max_key.type = BTRFS_INODE_EXTREF_KEY;
-		}
 		ret = drop_objectid_items(trans, log, path, ino,
 					  max_key.type);
 	} else {
@@ -4277,15 +4421,18 @@ log_extents:
 	}

 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path,
+					    ctx);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
 		}
 	}

+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+	spin_unlock(&BTRFS_I(inode)->lock);
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
@@ -4372,6 +4519,181 @@ out:
 	return ret;
 }

+struct btrfs_dir_list {
+	u64 ino;
+	struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *           CPU0                                        CPU1
+ *           ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem as well,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *start_inode,
+				struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_path *path;
+	LIST_HEAD(dir_list);
+	struct btrfs_dir_list *dir_elem;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+	if (!dir_elem) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+	dir_elem->ino = btrfs_ino(start_inode);
+	list_add_tail(&dir_elem->list, &dir_list);
+
+	while (!list_empty(&dir_list)) {
+		struct extent_buffer *leaf;
+		struct btrfs_key min_key;
+		int nritems;
+		int i;
+
+		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+					    list);
+		if (ret)
+			goto next_dir_inode;
+
+		min_key.objectid = dir_elem->ino;
+		min_key.type = BTRFS_DIR_ITEM_KEY;
+		min_key.offset = 0;
+again:
+		btrfs_release_path(path);
+		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+		if (ret < 0) {
+			goto next_dir_inode;
+		} else if (ret > 0) {
+			ret = 0;
+			goto next_dir_inode;
+		}
+
+process_leaf:
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+			struct btrfs_key di_key;
+			struct inode *di_inode;
+			struct btrfs_dir_list *new_dir_elem;
+			int log_mode = LOG_INODE_EXISTS;
+			int type;
+
+			btrfs_item_key_to_cpu(leaf, &min_key, i);
+			if (min_key.objectid != dir_elem->ino ||
+			    min_key.type != BTRFS_DIR_ITEM_KEY)
+				goto next_dir_inode;
+
+			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			type = btrfs_dir_type(leaf, di);
+			if (btrfs_dir_transid(leaf, di) < trans->transid &&
+			    type != BTRFS_FT_DIR)
+				continue;
+			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+				continue;
+
+			di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+					      root, NULL);
+			if (IS_ERR(di_inode)) {
+				ret = PTR_ERR(di_inode);
+				goto next_dir_inode;
+			}
+
+			if (btrfs_inode_in_log(di_inode, trans->transid)) {
+				iput(di_inode);
+				continue;
+			}
+
+			ctx->log_new_dentries = false;
+			if (type == BTRFS_FT_DIR)
+				log_mode = LOG_INODE_ALL;
+			btrfs_release_path(path);
+			ret = btrfs_log_inode(trans, root, di_inode,
+					      log_mode, 0, LLONG_MAX, ctx);
+			iput(di_inode);
+			if (ret)
+				goto next_dir_inode;
+			if (ctx->log_new_dentries) {
+				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+						       GFP_NOFS);
+				if (!new_dir_elem) {
+					ret = -ENOMEM;
+					goto next_dir_inode;
+				}
+				new_dir_elem->ino = di_key.objectid;
+				list_add_tail(&new_dir_elem->list, &dir_list);
+			}
+			break;
+		}
+		if (i == nritems) {
+			ret = btrfs_next_leaf(log, path);
+			if (ret < 0) {
+				goto next_dir_inode;
+			} else if (ret > 0) {
+				ret = 0;
+				goto next_dir_inode;
+			}
+			goto process_leaf;
+		}
+		if (min_key.offset < (u64)-1) {
+			min_key.offset++;
+			goto again;
+		}
+next_dir_inode:
+		list_del(&dir_elem->list);
+		kfree(dir_elem);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log. A minimal inode and backref
@@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	const struct dentry * const first_parent = parent;
 	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
 				 last_committed);
+	bool log_dentries = false;
+	struct inode *orig_inode = inode;

 	sb = inode->i_sb;

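log_new_dir_dentries(), added earlier in this file and wired up here through log_dentries/orig_inode, is at heart a FIFO worklist walk: start from one directory inode and enqueue any subdirectory whose logging flags more new dentries. A minimal userspace sketch of that traversal skeleton, with an illustrative callback-based shape and allocation checks elided; none of these are btrfs structures:

	#include <stdlib.h>

	struct work_item {
		unsigned long long ino;
		struct work_item *next;
	};

	/* visit() logs one directory; returns > 0 and fills *child when it
	 * discovers a subdirectory that must be walked too */
	static void walk_dirs(unsigned long long start_ino,
			      int (*visit)(unsigned long long ino,
					   unsigned long long *child))
	{
		struct work_item *head, *tail, *it;

		head = tail = malloc(sizeof(*head));	/* checks elided */
		head->ino = start_ino;
		head->next = NULL;

		while (head) {
			unsigned long long child;

			if (visit(head->ino, &child) > 0) {
				it = malloc(sizeof(*it));
				it->ino = child;
				it->next = NULL;
				tail->next = it;	/* enqueue at tail: FIFO */
				tail = it;
			}
			it = head;			/* dequeue from head */
			head = head->next;
			free(it);
		}
	}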
@@ -4449,6 +4773,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_trans;
 	}

+	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+		log_dentries = true;
+
 	while (1) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
@@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		dput(old_parent);
 		old_parent = parent;
 	}
-	ret = 0;
+	if (log_dentries)
+		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+	else
+		ret = 0;
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
 	int io_err;
+	bool log_new_dentries;
 	struct list_head list;
 };

@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
 	ctx->io_err = 0;
+	ctx->log_new_dentries = false;
 	INIT_LIST_HEAD(&ctx->list);
 }

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..8bcd2a007517 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,8 +366,8 @@ loop_lock:
 		btrfsic_submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched())
-			cond_resched();
+
+		cond_resched();

 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -400,8 +400,7 @@ loop_lock:
 			 * against it before looping
 			 */
 			last_waited = ioc->last_waited;
-			if (need_resched())
-				cond_resched();
+			cond_resched();
 			continue;
 		}
 		spin_lock(&device->io_lock);
@@ -609,8 +608,7 @@ error:
 	return ERR_PTR(-ENOMEM);
 }

-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
-			       struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
 	struct btrfs_device *latest_dev = NULL;
@@ -1136,11 +1134,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-again:
+
 	max_hole_start = search_start;
 	max_hole_size = 0;
-	hole_size = 0;

+again:
 	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
 		ret = -ENOSPC;
 		goto out;
@@ -1233,21 +1231,23 @@ next:
 	 * allocated dev extents, and when shrinking the device,
 	 * search_end may be smaller than search_start.
 	 */
-	if (search_end > search_start)
+	if (search_end > search_start) {
 		hole_size = search_end - search_start;

-	if (hole_size > max_hole_size) {
-		max_hole_start = search_start;
-		max_hole_size = hole_size;
-	}
+		if (contains_pending_extent(trans, device, &search_start,
+					    hole_size)) {
+			btrfs_release_path(path);
+			goto again;
+		}

-	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
-		btrfs_release_path(path);
-		goto again;
+		if (hole_size > max_hole_size) {
+			max_hole_start = search_start;
+			max_hole_size = hole_size;
+		}
 	}

 	/* See above. */
-	if (hole_size < num_bytes)
+	if (max_hole_size < num_bytes)
 		ret = -ENOSPC;
 	else
 		ret = 0;
@@ -2487,8 +2487,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 }

 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    u64 chunk_tree, u64 chunk_objectid,
+			    struct btrfs_root *root, u64 chunk_objectid,
 			    u64 chunk_offset)
 {
 	int ret;
@@ -2580,7 +2579,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	struct map_lookup *map;
 	u64 dev_extent_len = 0;
 	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	u64 chunk_tree = root->fs_info->chunk_root->objectid;
 	int i, ret = 0;

 	/* Just in case */
@@ -2634,8 +2632,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 			}
 		}
 	}
-	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
-			       chunk_offset);
+	ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -2664,8 +2661,8 @@ out:
 }

 static int btrfs_relocate_chunk(struct btrfs_root *root,
-				u64 chunk_tree, u64 chunk_objectid,
+				u64 chunk_objectid,
 				u64 chunk_offset)
 {
 	struct btrfs_root *extent_root;
 	struct btrfs_trans_handle *trans;
@@ -2707,7 +2704,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u64 chunk_tree = chunk_root->root_key.objectid;
 	u64 chunk_type;
 	bool retried = false;
 	int failed = 0;
@@ -2744,7 +2740,7 @@ again:
 		btrfs_release_path(path);

 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
-			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+			ret = btrfs_relocate_chunk(chunk_root,
 						   found_key.objectid,
 						   found_key.offset);
 			if (ret == -ENOSPC)
@@ -3022,7 +3018,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,

 	stripe_offset = btrfs_stripe_offset(leaf, stripe);
 	stripe_length = btrfs_chunk_length(leaf, chunk);
-	do_div(stripe_length, factor);
+	stripe_length = div_u64(stripe_length, factor);

 	if (stripe_offset < bargs->pend &&
 	    stripe_offset + stripe_length > bargs->pstart)
@@ -3255,7 +3251,6 @@ again:
 		}

 		ret = btrfs_relocate_chunk(chunk_root,
-					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
@@ -3957,7 +3952,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	u64 length;
-	u64 chunk_tree;
 	u64 chunk_objectid;
 	u64 chunk_offset;
 	int ret;
@@ -4027,13 +4021,11 @@ again:
 			break;
 		}

-		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);

-		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
-					   chunk_offset);
+		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)
@@ -4131,7 +4123,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }

-static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
 		.dev_stripes	= 1,
@@ -4289,7 +4281,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);

-	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
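The kzalloc-to-kcalloc switch above (and the one later in btrfs_rmap_block()) is about the multiplication: kcalloc(n, size, flags) fails cleanly when n * size would overflow, whereas the open-coded sizeof(*p) * n can wrap and under-allocate. A userspace analogue of the check calloc and kcalloc perform internally:

	#include <stdlib.h>
	#include <stdint.h>

	static void *alloc_array(size_t n, size_t size)
	{
		/* reject n * size overflow instead of wrapping */
		if (size && n > SIZE_MAX / size)
			return NULL;
		return calloc(n, size);	/* zeroed, like kcalloc */
	}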
@@ -4400,8 +4392,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
 		u64 mask = (1ULL << 24) - 1;
-		stripe_size = max_chunk_size;
-		do_div(stripe_size, data_stripes);
+
+		stripe_size = div_u64(max_chunk_size, data_stripes);

 		/* bump the answer up to a 16MB boundary */
 		stripe_size = (stripe_size + mask) & ~mask;
@@ -4413,10 +4405,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		stripe_size = devices_info[ndevs-1].max_avail;
 	}

-	do_div(stripe_size, dev_stripes);
+	stripe_size = div_u64(stripe_size, dev_stripes);

 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, raid_stripe_len);
+	stripe_size = div_u64(stripe_size, raid_stripe_len);
 	stripe_size *= raid_stripe_len;

 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
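All the do_div conversions in this file follow one rule worth spelling out: do_div(n, base) divides n in place and returns the remainder, while div_u64()/div64_u64() return the quotient (and div_u64_rem() hands the remainder back through a pointer), which is why each call site becomes an assignment. A kernel-style sketch of the equivalence; the helper is hypothetical, for illustration only:

	#include <linux/math64.h>

	static void div_examples(u64 bytes, u32 stripe_len)
	{
		u64 a = bytes, b;
		u32 rem_a, rem_b;

		/* old style: a becomes the quotient, remainder returned */
		rem_a = do_div(a, stripe_len);

		/* new style: quotient returned, remainder via pointer */
		b = div_u64_rem(bytes, stripe_len, &rem_b);

		/* here a == b and rem_a == rem_b */
	}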
@@ -4954,7 +4946,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
 	u64 stripe_len;
-	int stripe_index;
+	u32 stripe_index;
 	int i;
 	int ret = 0;
 	int num_stripes;
@@ -4995,7 +4987,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, stripe_len);
+	stripe_nr = div64_u64(stripe_nr, stripe_len);

 	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
@@ -5011,7 +5003,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		/* allow a write of a full stripe, but make sure we don't
 		 * allow straddling of stripes
 		 */
-		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+				full_stripe_len);
 		raid56_full_stripe_start *= full_stripe_len;
 	}

@@ -5136,7 +5129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		stripe_index = 0;
 		stripe_nr_orig = stripe_nr;
 		stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
-		do_div(stripe_nr_end, map->stripe_len);
+		stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
 		stripe_end_offset = stripe_nr_end * map->stripe_len -
 				    (offset + *length);

@@ -5144,7 +5137,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5170,9 +5164,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		}

 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-		int factor = map->num_stripes / map->sub_stripes;
+		u32 factor = map->num_stripes / map->sub_stripes;

-		stripe_index = do_div(stripe_nr, factor);
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;

 		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5198,8 +5192,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			   ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 			    mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = raid56_full_stripe_start;
-			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+			stripe_nr = div_u64(raid56_full_stripe_start,
+					stripe_len * nr_data_stripes(map));

 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5209,32 +5203,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
-			u64 tmp;
-
 			/*
 			 * Mirror #0 or #1 means the original data block.
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			stripe_nr = div_u64_rem(stripe_nr,
+					nr_data_stripes(map), &stripe_index);
 			if (mirror_num > 1)
 				stripe_index = nr_data_stripes(map) +
 						mirror_num - 2;

 			/* We distribute the parity blocks across stripes */
-			tmp = stripe_nr + stripe_index;
-			stripe_index = do_div(tmp, map->num_stripes);
+			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+					&stripe_index);
 			if (!(rw & (REQ_WRITE | REQ_DISCARD |
 			    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this do_div call, stripe_nr is the number of stripes
-		 * on this device we have to walk to find the data, and
-		 * stripe_index is the number of our device in the stripe array
+		 * after this, stripe_nr is the number of stripes on this
+		 * device we have to walk to find the data, and stripe_index is
+		 * the number of our device in the stripe array
 		 */
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
@@ -5261,7 +5255,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 	    mirror_num > 1)) {
 		u64 tmp;
-		int i, rot;
+		unsigned rot;

 		bbio->raid_map = (u64 *)((void *)bbio->stripes +
 				 sizeof(struct btrfs_bio_stripe) *
@@ -5269,8 +5263,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 sizeof(int) * tgtdev_indexes);

 		/* Work out the disk rotation on this stripe-set */
-		tmp = stripe_nr;
-		rot = do_div(tmp, num_stripes);
+		div_u64_rem(stripe_nr, num_stripes, &rot);

 		/* Fill in the logical address of each stripe */
 		tmp = stripe_nr * nr_data_stripes(map);
@@ -5285,8 +5278,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	}

 	if (rw & REQ_DISCARD) {
-		int factor = 0;
-		int sub_stripes = 0;
+		u32 factor = 0;
+		u32 sub_stripes = 0;
 		u64 stripes_per_dev = 0;
 		u32 remaining_stripes = 0;
 		u32 last_stripe = 0;
@@ -5437,9 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			}
 		}
 		if (found) {
-			u64 length = map->stripe_len;
-
-			if (physical_of_found + length <=
+			if (physical_of_found + map->stripe_len <=
 			    dev_replace->cursor_left) {
 				struct btrfs_bio_stripe *tgtdev_stripe =
 					bbio->stripes + num_stripes;
@@ -5535,15 +5526,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	rmap_len = map->stripe_len;

 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
-		do_div(length, map->num_stripes / map->sub_stripes);
+		length = div_u64(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-		do_div(length, map->num_stripes);
+		length = div_u64(length, map->num_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		do_div(length, nr_data_stripes(map));
+		length = div_u64(length, nr_data_stripes(map));
 		rmap_len = map->stripe_len * nr_data_stripes(map);
 	}

-	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */

 	for (i = 0; i < map->num_stripes; i++) {
@@ -5554,11 +5545,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5554 continue; 5545 continue;
5555 5546
5556 stripe_nr = physical - map->stripes[i].physical; 5547 stripe_nr = physical - map->stripes[i].physical;
5557 do_div(stripe_nr, map->stripe_len); 5548 stripe_nr = div_u64(stripe_nr, map->stripe_len);
5558 5549
5559 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5550 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5560 stripe_nr = stripe_nr * map->num_stripes + i; 5551 stripe_nr = stripe_nr * map->num_stripes + i;
5561 do_div(stripe_nr, map->sub_stripes); 5552 stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5562 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5553 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5563 stripe_nr = stripe_nr * map->num_stripes + i; 5554 stripe_nr = stripe_nr * map->num_stripes + i;
5564 } /* else if RAID[56], multiply by nr_data_stripes(). 5555 } /* else if RAID[56], multiply by nr_data_stripes().
@@ -5835,8 +5826,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5835 u64 length = 0; 5826 u64 length = 0;
5836 u64 map_length; 5827 u64 map_length;
5837 int ret; 5828 int ret;
5838 int dev_nr = 0; 5829 int dev_nr;
5839 int total_devs = 1; 5830 int total_devs;
5840 struct btrfs_bio *bbio = NULL; 5831 struct btrfs_bio *bbio = NULL;
5841 5832
5842 length = bio->bi_iter.bi_size; 5833 length = bio->bi_iter.bi_size;
@@ -5877,11 +5868,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5877 BUG(); 5868 BUG();
5878 } 5869 }
5879 5870
5880 while (dev_nr < total_devs) { 5871 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
5881 dev = bbio->stripes[dev_nr].dev; 5872 dev = bbio->stripes[dev_nr].dev;
5882 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5873 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5883 bbio_error(bbio, first_bio, logical); 5874 bbio_error(bbio, first_bio, logical);
5884 dev_nr++;
5885 continue; 5875 continue;
5886 } 5876 }
5887 5877
@@ -5894,7 +5884,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5894 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5884 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5895 dev_nr, rw, async_submit); 5885 dev_nr, rw, async_submit);
5896 BUG_ON(ret); 5886 BUG_ON(ret);
5897 dev_nr++;
5898 continue; 5887 continue;
5899 } 5888 }
5900 5889
@@ -5909,7 +5898,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5909 submit_stripe_bio(root, bbio, bio, 5898 submit_stripe_bio(root, bbio, bio,
5910 bbio->stripes[dev_nr].physical, dev_nr, rw, 5899 bbio->stripes[dev_nr].physical, dev_nr, rw,
5911 async_submit); 5900 async_submit);
5912 dev_nr++;
5913 } 5901 }
5914 btrfs_bio_counter_dec(root->fs_info); 5902 btrfs_bio_counter_dec(root->fs_info);
5915 return 0; 5903 return 0;
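
The do_div() conversions in this file trade an awkward idiom for an explicit one: do_div(n, base) divides its u64 argument in place and returns the remainder, which forced callers to copy the dividend into a temporary whenever both results were needed, while div_u64_rem() returns the quotient and stores the remainder through a pointer, and div_u64() returns just the quotient. A standalone sketch of the semantics (the helper bodies below are stand-ins with the assumed math64.h behavior, not the kernel implementation):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the kernel's math64.h helpers. */
    static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor,
                                uint32_t *remainder)
    {
            *remainder = (uint32_t)(dividend % divisor);
            return dividend / divisor;
    }

    static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
    {
            uint32_t rem;
            return div_u64_rem(dividend, divisor, &rem);
    }

    int main(void)
    {
            uint64_t stripe_nr = 1000003;
            uint32_t num_stripes = 6, stripe_index;

            /* Old: stripe_index = do_div(stripe_nr, num_stripes) divided
             * stripe_nr in place and returned the remainder.  New form: */
            stripe_nr = div_u64_rem(stripe_nr, num_stripes, &stripe_index);
            assert(stripe_nr == 166667 && stripe_index == 1);

            /* Quotient-only callers, as in btrfs_rmap_block() above: */
            printf("%llu\n", (unsigned long long)div_u64(1024, 3)); /* 341 */
            return 0;
    }

The same series also tightens types (unsigned rot, u32 factor/sub_stripes) now that the values no longer pass through the do_div() macro.
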
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 83069dec6898..ebc31331a837 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
421int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 421int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
422 struct btrfs_fs_devices **fs_devices_ret); 422 struct btrfs_fs_devices **fs_devices_ret);
423int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 423int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
424void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 424void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
425 struct btrfs_fs_devices *fs_devices, int step);
426int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 425int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
427 char *device_path, 426 char *device_path,
428 struct btrfs_device **device); 427 struct btrfs_device **device);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 883b93623bc5..45ea704be030 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
364/* 364/*
365 * Check if the attribute is in a supported namespace. 365 * Check if the attribute is in a supported namespace.
366 * 366 *
367 * This applied after the check for the synthetic attributes in the system 367 * This is applied after the check for the synthetic attributes in the system
368 * namespace. 368 * namespace.
369 */ 369 */
370static bool btrfs_is_valid_xattr(const char *name) 370static int btrfs_is_valid_xattr(const char *name)
371{ 371{
372 return !strncmp(name, XATTR_SECURITY_PREFIX, 372 int len = strlen(name);
373 XATTR_SECURITY_PREFIX_LEN) || 373 int prefixlen = 0;
374 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 374
375 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 375 if (!strncmp(name, XATTR_SECURITY_PREFIX,
376 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || 376 XATTR_SECURITY_PREFIX_LEN))
377 !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); 377 prefixlen = XATTR_SECURITY_PREFIX_LEN;
378 else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
379 prefixlen = XATTR_SYSTEM_PREFIX_LEN;
380 else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
381 prefixlen = XATTR_TRUSTED_PREFIX_LEN;
382 else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
383 prefixlen = XATTR_USER_PREFIX_LEN;
384 else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
385 prefixlen = XATTR_BTRFS_PREFIX_LEN;
386 else
387 return -EOPNOTSUPP;
388
389 /*
 390 * The name cannot consist of just the prefix.
391 */
392 if (len <= prefixlen)
393 return -EINVAL;
394
395 return 0;
378} 396}
379 397
380ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, 398ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
381 void *buffer, size_t size) 399 void *buffer, size_t size)
382{ 400{
401 int ret;
402
383 /* 403 /*
384 * If this is a request for a synthetic attribute in the system.* 404 * If this is a request for a synthetic attribute in the system.*
385 * namespace use the generic infrastructure to resolve a handler 405 * namespace use the generic infrastructure to resolve a handler
@@ -388,8 +408,9 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
388 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 408 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
389 return generic_getxattr(dentry, name, buffer, size); 409 return generic_getxattr(dentry, name, buffer, size);
390 410
391 if (!btrfs_is_valid_xattr(name)) 411 ret = btrfs_is_valid_xattr(name);
392 return -EOPNOTSUPP; 412 if (ret)
413 return ret;
393 return __btrfs_getxattr(dentry->d_inode, name, buffer, size); 414 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
394} 415}
395 416
@@ -397,6 +418,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
397 size_t size, int flags) 418 size_t size, int flags)
398{ 419{
399 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; 420 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
421 int ret;
400 422
401 /* 423 /*
402 * The permission on security.* and system.* is not checked 424 * The permission on security.* and system.* is not checked
@@ -413,8 +435,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
413 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 435 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
414 return generic_setxattr(dentry, name, value, size, flags); 436 return generic_setxattr(dentry, name, value, size, flags);
415 437
416 if (!btrfs_is_valid_xattr(name)) 438 ret = btrfs_is_valid_xattr(name);
417 return -EOPNOTSUPP; 439 if (ret)
440 return ret;
418 441
419 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) 442 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
420 return btrfs_set_prop(dentry->d_inode, name, 443 return btrfs_set_prop(dentry->d_inode, name,
@@ -430,6 +453,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
430int btrfs_removexattr(struct dentry *dentry, const char *name) 453int btrfs_removexattr(struct dentry *dentry, const char *name)
431{ 454{
432 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; 455 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
456 int ret;
433 457
434 /* 458 /*
435 * The permission on security.* and system.* is not checked 459 * The permission on security.* and system.* is not checked
@@ -446,8 +470,9 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
446 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 470 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
447 return generic_removexattr(dentry, name); 471 return generic_removexattr(dentry, name);
448 472
449 if (!btrfs_is_valid_xattr(name)) 473 ret = btrfs_is_valid_xattr(name);
450 return -EOPNOTSUPP; 474 if (ret)
475 return ret;
451 476
452 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) 477 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
453 return btrfs_set_prop(dentry->d_inode, name, 478 return btrfs_set_prop(dentry->d_inode, name,
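
The reworked btrfs_is_valid_xattr() separates two failures the old boolean collapsed: a name in an unknown namespace still produces -EOPNOTSUPP, while a name that is nothing more than a supported prefix (such as "user." with no suffix) now produces -EINVAL, and getxattr/setxattr/removexattr propagate whichever error comes back. A standalone sketch of the resulting contract (the prefix strings mirror the XATTR_*_PREFIX macros; the function name here is illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    static int is_valid_xattr_name(const char *name)
    {
            static const char *prefixes[] = {
                    "security.", "system.", "trusted.", "user.", "btrfs."
            };
            size_t i;

            for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
                    size_t plen = strlen(prefixes[i]);

                    if (!strncmp(name, prefixes[i], plen))
                            /* a bare prefix is not a valid name */
                            return strlen(name) > plen ? 0 : -EINVAL;
            }
            return -EOPNOTSUPP;     /* unsupported namespace */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   is_valid_xattr_name("user.comment"), /* 0 */
                   is_valid_xattr_name("user."),        /* -EINVAL */
                   is_valid_xattr_name("foo.bar"));     /* -EOPNOTSUPP */
            return 0;
    }
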
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index fb22fd8d8fb8..82990b8f872b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -403,7 +403,7 @@ next:
403 return ret; 403 return ret;
404} 404}
405 405
406struct btrfs_compress_op btrfs_zlib_compress = { 406const struct btrfs_compress_op btrfs_zlib_compress = {
407 .alloc_workspace = zlib_alloc_workspace, 407 .alloc_workspace = zlib_alloc_workspace,
408 .free_workspace = zlib_free_workspace, 408 .free_workspace = zlib_free_workspace,
409 .compress_pages = zlib_compress_pages, 409 .compress_pages = zlib_compress_pages,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 155ab9c0246b..e162bcd105ee 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1146 inode, page, (int)pos, (int)len); 1146 inode, page, (int)pos, (int)len);
1147 1147
1148 r = ceph_update_writeable_page(file, pos, len, page); 1148 r = ceph_update_writeable_page(file, pos, len, page);
1149 if (r < 0)
1150 page_cache_release(page);
1151 else
1152 *pagep = page;
1149 } while (r == -EAGAIN); 1153 } while (r == -EAGAIN);
1150 1154
1151 return r; 1155 return r;
@@ -1534,19 +1538,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1534 1538
1535 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1539 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
1536 1540
1537 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, 1541 {
1538 "inline_version", &inline_version, 1542 __le64 xattr_buf = cpu_to_le64(inline_version);
1539 sizeof(inline_version), 1543 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
1540 CEPH_OSD_CMPXATTR_OP_GT, 1544 "inline_version", &xattr_buf,
1541 CEPH_OSD_CMPXATTR_MODE_U64); 1545 sizeof(xattr_buf),
1542 if (err) 1546 CEPH_OSD_CMPXATTR_OP_GT,
1543 goto out_put; 1547 CEPH_OSD_CMPXATTR_MODE_U64);
1544 1548 if (err)
1545 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, 1549 goto out_put;
1546 "inline_version", &inline_version, 1550 }
1547 sizeof(inline_version), 0, 0); 1551
1548 if (err) 1552 {
1549 goto out_put; 1553 char xattr_buf[32];
1554 int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
1555 "%llu", inline_version);
1556 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
1557 "inline_version",
1558 xattr_buf, xattr_len, 0, 0);
1559 if (err)
1560 goto out_put;
1561 }
1550 1562
1551 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1563 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
1552 err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1564 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
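
This hunk stops handing the raw in-memory u64 to both xattr ops. The CMPXATTR op runs in CEPH_OSD_CMPXATTR_MODE_U64, so it now compares against an explicit little-endian encoding; the SETXATTR op stores the version as a decimal string, presumably to match how the attribute is represented on the OSD side. A standalone sketch of the two encodings (cpu_to_le64() spelled out by hand):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t inline_version = 7;
            unsigned char cmp_buf[8];
            char set_buf[32];
            int i, set_len;

            /* CMPXATTR, mode U64: fixed-width little-endian integer. */
            for (i = 0; i < 8; i++)
                    cmp_buf[i] = (inline_version >> (8 * i)) & 0xff;

            /* SETXATTR: human-readable decimal, length from snprintf(). */
            set_len = snprintf(set_buf, sizeof(set_buf), "%" PRIu64,
                               inline_version);

            printf("cmp: %zu bytes, set: \"%s\" (%d bytes)\n",
                   sizeof(cmp_buf), set_buf, set_len);
            return 0;
    }
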
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8172775428a0..11631c4c7d14 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode)
896 return ret; 896 return ret;
897} 897}
898 898
899static void drop_inode_snap_realm(struct ceph_inode_info *ci)
900{
901 struct ceph_snap_realm *realm = ci->i_snap_realm;
902 spin_lock(&realm->inodes_with_caps_lock);
903 list_del_init(&ci->i_snap_realm_item);
904 ci->i_snap_realm_counter++;
905 ci->i_snap_realm = NULL;
906 spin_unlock(&realm->inodes_with_caps_lock);
907 ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
908 realm);
909}
910
899/* 911/*
900 * Remove a cap. Take steps to deal with a racing iterate_session_caps. 912 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
901 * 913 *
@@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
946 if (removed) 958 if (removed)
947 ceph_put_cap(mdsc, cap); 959 ceph_put_cap(mdsc, cap);
948 960
 949 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 961 /* when a reconnect is denied, we remove session caps forcibly;
 950 struct ceph_snap_realm *realm = ci->i_snap_realm; 962 * i_wr_ref can be non-zero. If there are ongoing writes,
951 spin_lock(&realm->inodes_with_caps_lock); 963 * keep i_snap_realm.
952 list_del_init(&ci->i_snap_realm_item); 964 */
953 ci->i_snap_realm_counter++; 965 if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
954 ci->i_snap_realm = NULL; 966 drop_inode_snap_realm(ci);
955 spin_unlock(&realm->inodes_with_caps_lock); 967
956 ceph_put_snap_realm(mdsc, realm);
957 }
958 if (!__ceph_is_any_real_caps(ci)) 968 if (!__ceph_is_any_real_caps(ci))
959 __cap_delay_cancel(mdsc, ci); 969 __cap_delay_cancel(mdsc, ci);
960} 970}
@@ -1394,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1394 int was = ci->i_dirty_caps; 1404 int was = ci->i_dirty_caps;
1395 int dirty = 0; 1405 int dirty = 0;
1396 1406
1407 if (!ci->i_auth_cap) {
1408 pr_warn("__mark_dirty_caps %p %llx mask %s, "
1409 "but no auth cap (session was closed?)\n",
1410 inode, ceph_ino(inode), ceph_cap_string(mask));
1411 return 0;
1412 }
1413
1397 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, 1414 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1398 ceph_cap_string(mask), ceph_cap_string(was), 1415 ceph_cap_string(mask), ceph_cap_string(was),
1399 ceph_cap_string(was | mask)); 1416 ceph_cap_string(was | mask));
@@ -1404,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1404 ci->i_snap_realm->cached_context); 1421 ci->i_snap_realm->cached_context);
1405 dout(" inode %p now dirty snapc %p auth cap %p\n", 1422 dout(" inode %p now dirty snapc %p auth cap %p\n",
1406 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1423 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1407 WARN_ON(!ci->i_auth_cap);
1408 BUG_ON(!list_empty(&ci->i_dirty_item)); 1424 BUG_ON(!list_empty(&ci->i_dirty_item));
1409 spin_lock(&mdsc->cap_dirty_lock); 1425 spin_lock(&mdsc->cap_dirty_lock);
1410 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1426 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1545,7 +1561,19 @@ retry_locked:
1545 if (!mdsc->stopping && inode->i_nlink > 0) { 1561 if (!mdsc->stopping && inode->i_nlink > 0) {
1546 if (want) { 1562 if (want) {
1547 retain |= CEPH_CAP_ANY; /* be greedy */ 1563 retain |= CEPH_CAP_ANY; /* be greedy */
1564 } else if (S_ISDIR(inode->i_mode) &&
1565 (issued & CEPH_CAP_FILE_SHARED) &&
1566 __ceph_dir_is_complete(ci)) {
1567 /*
1568 * If a directory is complete, we want to keep
 1569 * the exclusive cap, so that the MDS does not end up
1570 * revoking the shared cap on every create/unlink
1571 * operation.
1572 */
1573 want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1574 retain |= want;
1548 } else { 1575 } else {
1576
1549 retain |= CEPH_CAP_ANY_SHARED; 1577 retain |= CEPH_CAP_ANY_SHARED;
1550 /* 1578 /*
1551 * keep RD only if we didn't have the file open RW, 1579 * keep RD only if we didn't have the file open RW,
@@ -2309,6 +2337,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2309 wake = 1; 2337 wake = 1;
2310 } 2338 }
2311 } 2339 }
2340 /* see comment in __ceph_remove_cap() */
2341 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
2342 drop_inode_snap_realm(ci);
2312 } 2343 }
2313 spin_unlock(&ci->i_ceph_lock); 2344 spin_unlock(&ci->i_ceph_lock);
2314 2345
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 83e9976f7189..e729b79812b4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
281 /* can we use the dcache? */ 281 /* can we use the dcache? */
282 spin_lock(&ci->i_ceph_lock); 282 spin_lock(&ci->i_ceph_lock);
283 if ((ctx->pos == 2 || fi->dentry) && 283 if ((ctx->pos == 2 || fi->dentry) &&
284 ceph_test_mount_opt(fsc, DCACHE) &&
284 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
285 ceph_snap(inode) != CEPH_SNAPDIR && 286 ceph_snap(inode) != CEPH_SNAPDIR &&
286 __ceph_dir_is_complete_ordered(ci) && 287 __ceph_dir_is_complete_ordered(ci) &&
@@ -336,16 +337,23 @@ more:
336 ceph_mdsc_put_request(req); 337 ceph_mdsc_put_request(req);
337 return err; 338 return err;
338 } 339 }
339 req->r_inode = inode;
340 ihold(inode);
341 req->r_dentry = dget(file->f_path.dentry);
342 /* hints to request -> mds selection code */ 340 /* hints to request -> mds selection code */
343 req->r_direct_mode = USE_AUTH_MDS; 341 req->r_direct_mode = USE_AUTH_MDS;
344 req->r_direct_hash = ceph_frag_value(frag); 342 req->r_direct_hash = ceph_frag_value(frag);
345 req->r_direct_is_hash = true; 343 req->r_direct_is_hash = true;
346 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 344 if (fi->last_name) {
345 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
346 if (!req->r_path2) {
347 ceph_mdsc_put_request(req);
348 return -ENOMEM;
349 }
350 }
347 req->r_readdir_offset = fi->next_offset; 351 req->r_readdir_offset = fi->next_offset;
348 req->r_args.readdir.frag = cpu_to_le32(frag); 352 req->r_args.readdir.frag = cpu_to_le32(frag);
353
354 req->r_inode = inode;
355 ihold(inode);
356 req->r_dentry = dget(file->f_path.dentry);
349 err = ceph_mdsc_do_request(mdsc, NULL, req); 357 err = ceph_mdsc_do_request(mdsc, NULL, req);
350 if (err < 0) { 358 if (err < 0) {
351 ceph_mdsc_put_request(req); 359 ceph_mdsc_put_request(req);
@@ -629,6 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
629 fsc->mount_options->snapdir_name, 637 fsc->mount_options->snapdir_name,
630 dentry->d_name.len) && 638 dentry->d_name.len) &&
631 !is_root_ceph_dentry(dir, dentry) && 639 !is_root_ceph_dentry(dir, dentry) &&
640 ceph_test_mount_opt(fsc, DCACHE) &&
632 __ceph_dir_is_complete(ci) && 641 __ceph_dir_is_complete(ci) &&
633 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 642 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
634 spin_unlock(&ci->i_ceph_lock); 643 spin_unlock(&ci->i_ceph_lock);
@@ -755,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
755 err = PTR_ERR(req); 764 err = PTR_ERR(req);
756 goto out; 765 goto out;
757 } 766 }
758 req->r_dentry = dget(dentry);
759 req->r_num_caps = 2;
760 req->r_path2 = kstrdup(dest, GFP_NOFS); 767 req->r_path2 = kstrdup(dest, GFP_NOFS);
768 if (!req->r_path2) {
769 err = -ENOMEM;
770 ceph_mdsc_put_request(req);
771 goto out;
772 }
761 req->r_locked_dir = dir; 773 req->r_locked_dir = dir;
774 req->r_dentry = dget(dentry);
775 req->r_num_caps = 2;
762 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 776 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
763 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 777 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
764 err = ceph_mdsc_do_request(mdsc, dir, req); 778 err = ceph_mdsc_do_request(mdsc, dir, req);
@@ -933,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
933 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 947 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
934 struct ceph_mds_client *mdsc = fsc->mdsc; 948 struct ceph_mds_client *mdsc = fsc->mdsc;
935 struct ceph_mds_request *req; 949 struct ceph_mds_request *req;
950 int op = CEPH_MDS_OP_RENAME;
936 int err; 951 int err;
937 952
938 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 953 if (ceph_snap(old_dir) != ceph_snap(new_dir))
939 return -EXDEV; 954 return -EXDEV;
940 if (ceph_snap(old_dir) != CEPH_NOSNAP || 955 if (ceph_snap(old_dir) != CEPH_NOSNAP) {
941 ceph_snap(new_dir) != CEPH_NOSNAP) 956 if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
942 return -EROFS; 957 op = CEPH_MDS_OP_RENAMESNAP;
958 else
959 return -EROFS;
960 }
943 dout("rename dir %p dentry %p to dir %p dentry %p\n", 961 dout("rename dir %p dentry %p to dir %p dentry %p\n",
944 old_dir, old_dentry, new_dir, new_dentry); 962 old_dir, old_dentry, new_dir, new_dentry);
945 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 963 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
946 if (IS_ERR(req)) 964 if (IS_ERR(req))
947 return PTR_ERR(req); 965 return PTR_ERR(req);
948 ihold(old_dir); 966 ihold(old_dir);
@@ -1240,11 +1258,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1240 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1258 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1241 inode, req->r_tid, last_tid); 1259 inode, req->r_tid, last_tid);
1242 if (req->r_timeout) { 1260 if (req->r_timeout) {
1243 ret = wait_for_completion_timeout( 1261 unsigned long time_left = wait_for_completion_timeout(
1244 &req->r_safe_completion, req->r_timeout); 1262 &req->r_safe_completion,
1245 if (ret > 0) 1263 req->r_timeout);
1264 if (time_left > 0)
1246 ret = 0; 1265 ret = 0;
1247 else if (ret == 0) 1266 else
1248 ret = -EIO; /* timed out */ 1267 ret = -EIO; /* timed out */
1249 } else { 1268 } else {
1250 wait_for_completion(&req->r_safe_completion); 1269 wait_for_completion(&req->r_safe_completion);
@@ -1372,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = {
1372 .getattr = ceph_getattr, 1391 .getattr = ceph_getattr,
1373 .mkdir = ceph_mkdir, 1392 .mkdir = ceph_mkdir,
1374 .rmdir = ceph_unlink, 1393 .rmdir = ceph_unlink,
1394 .rename = ceph_rename,
1375}; 1395};
1376 1396
1377const struct dentry_operations ceph_dentry_ops = { 1397const struct dentry_operations ceph_dentry_ops = {
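
Several hunks in this file, and matching ones in super.c and xattr.c further down, add the same hardening: every kstrdup() feeding an MDS request is checked, the half-built request is released with ceph_mdsc_put_request() on failure, and the reference-taking assignments (r_inode with ihold(), r_dentry with dget()) are moved after the allocations that can fail. A toy standalone version of the shape (the request type and helpers are stand-ins, not the ceph API):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct request {
            char *path2;
    };

    static struct request *create_request(void)
    {
            return calloc(1, sizeof(struct request));
    }

    static void put_request(struct request *req)
    {
            if (req) {
                    free(req->path2);
                    free(req);
            }
    }

    int main(void)
    {
            struct request *req = create_request();

            if (!req)
                    return ENOMEM;

            req->path2 = strdup("last_name");  /* kstrdup() in the hunks */
            if (!req->path2) {
                    put_request(req);  /* drop the half-built request */
                    return ENOMEM;
            }

            /* Reference-taking fields are filled only past this point. */
            puts("request fully built");
            put_request(req);
            return 0;
    }
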
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 71c073f38e54..0a2eb32ffe43 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
1021 spin_unlock(&session->s_cap_lock); 1021 spin_unlock(&session->s_cap_lock);
1022} 1022}
1023 1023
1024static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1025 struct ceph_mds_session *session)
1026{
1027 struct ceph_mds_request *req;
1028 struct rb_node *p;
1029
1030 dout("cleanup_session_requests mds%d\n", session->s_mds);
1031 mutex_lock(&mdsc->mutex);
1032 while (!list_empty(&session->s_unsafe)) {
1033 req = list_first_entry(&session->s_unsafe,
1034 struct ceph_mds_request, r_unsafe_item);
1035 list_del_init(&req->r_unsafe_item);
1036 pr_info(" dropping unsafe request %llu\n", req->r_tid);
1037 __unregister_request(mdsc, req);
1038 }
1039 /* zero r_attempts, so kick_requests() will re-send requests */
1040 p = rb_first(&mdsc->request_tree);
1041 while (p) {
1042 req = rb_entry(p, struct ceph_mds_request, r_node);
1043 p = rb_next(p);
1044 if (req->r_session &&
1045 req->r_session->s_mds == session->s_mds)
1046 req->r_attempts = 0;
1047 }
1048 mutex_unlock(&mdsc->mutex);
1049}
1050
1024/* 1051/*
1025 * Helper to safely iterate over all caps associated with a session, with 1052 * Helper to safely iterate over all caps associated with a session, with
1026 * special care taken to handle a racing __ceph_remove_cap(). 1053 * special care taken to handle a racing __ceph_remove_cap().
@@ -1098,7 +1125,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1098 cap, ci, &ci->vfs_inode); 1125 cap, ci, &ci->vfs_inode);
1099 spin_lock(&ci->i_ceph_lock); 1126 spin_lock(&ci->i_ceph_lock);
1100 __ceph_remove_cap(cap, false); 1127 __ceph_remove_cap(cap, false);
1101 if (!__ceph_is_any_real_caps(ci)) { 1128 if (!ci->i_auth_cap) {
1102 struct ceph_mds_client *mdsc = 1129 struct ceph_mds_client *mdsc =
1103 ceph_sb_to_client(inode->i_sb)->mdsc; 1130 ceph_sb_to_client(inode->i_sb)->mdsc;
1104 1131
@@ -1120,13 +1147,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1120 mdsc->num_cap_flushing--; 1147 mdsc->num_cap_flushing--;
1121 drop = 1; 1148 drop = 1;
1122 } 1149 }
1123 if (drop && ci->i_wrbuffer_ref) {
1124 pr_info(" dropping dirty data for %p %lld\n",
1125 inode, ceph_ino(inode));
1126 ci->i_wrbuffer_ref = 0;
1127 ci->i_wrbuffer_ref_head = 0;
1128 drop++;
1129 }
1130 spin_unlock(&mdsc->cap_dirty_lock); 1150 spin_unlock(&mdsc->cap_dirty_lock);
1131 } 1151 }
1132 spin_unlock(&ci->i_ceph_lock); 1152 spin_unlock(&ci->i_ceph_lock);
@@ -1853,7 +1873,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1853 */ 1873 */
1854static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, 1874static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1855 struct ceph_mds_request *req, 1875 struct ceph_mds_request *req,
1856 int mds) 1876 int mds, bool drop_cap_releases)
1857{ 1877{
1858 struct ceph_msg *msg; 1878 struct ceph_msg *msg;
1859 struct ceph_mds_request_head *head; 1879 struct ceph_mds_request_head *head;
@@ -1937,6 +1957,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1937 releases += ceph_encode_inode_release(&p, 1957 releases += ceph_encode_inode_release(&p,
1938 req->r_old_dentry->d_inode, 1958 req->r_old_dentry->d_inode,
1939 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); 1959 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1960
1961 if (drop_cap_releases) {
1962 releases = 0;
1963 p = msg->front.iov_base + req->r_request_release_offset;
1964 }
1965
1940 head->num_releases = cpu_to_le16(releases); 1966 head->num_releases = cpu_to_le16(releases);
1941 1967
1942 /* time stamp */ 1968 /* time stamp */
@@ -1989,7 +2015,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
1989 */ 2015 */
1990static int __prepare_send_request(struct ceph_mds_client *mdsc, 2016static int __prepare_send_request(struct ceph_mds_client *mdsc,
1991 struct ceph_mds_request *req, 2017 struct ceph_mds_request *req,
1992 int mds) 2018 int mds, bool drop_cap_releases)
1993{ 2019{
1994 struct ceph_mds_request_head *rhead; 2020 struct ceph_mds_request_head *rhead;
1995 struct ceph_msg *msg; 2021 struct ceph_msg *msg;
@@ -2048,7 +2074,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
2048 ceph_msg_put(req->r_request); 2074 ceph_msg_put(req->r_request);
2049 req->r_request = NULL; 2075 req->r_request = NULL;
2050 } 2076 }
2051 msg = create_request_message(mdsc, req, mds); 2077 msg = create_request_message(mdsc, req, mds, drop_cap_releases);
2052 if (IS_ERR(msg)) { 2078 if (IS_ERR(msg)) {
2053 req->r_err = PTR_ERR(msg); 2079 req->r_err = PTR_ERR(msg);
2054 complete_request(mdsc, req); 2080 complete_request(mdsc, req);
@@ -2132,7 +2158,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
2132 if (req->r_request_started == 0) /* note request start time */ 2158 if (req->r_request_started == 0) /* note request start time */
2133 req->r_request_started = jiffies; 2159 req->r_request_started = jiffies;
2134 2160
2135 err = __prepare_send_request(mdsc, req, mds); 2161 err = __prepare_send_request(mdsc, req, mds, false);
2136 if (!err) { 2162 if (!err) {
2137 ceph_msg_get(req->r_request); 2163 ceph_msg_get(req->r_request);
2138 ceph_con_send(&session->s_con, req->r_request); 2164 ceph_con_send(&session->s_con, req->r_request);
@@ -2590,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session,
2590 case CEPH_SESSION_CLOSE: 2616 case CEPH_SESSION_CLOSE:
2591 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2617 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2592 pr_info("mds%d reconnect denied\n", session->s_mds); 2618 pr_info("mds%d reconnect denied\n", session->s_mds);
2619 cleanup_session_requests(mdsc, session);
2593 remove_session_caps(session); 2620 remove_session_caps(session);
2594 wake = 2; /* for good measure */ 2621 wake = 2; /* for good measure */
2595 wake_up_all(&mdsc->session_close_wq); 2622 wake_up_all(&mdsc->session_close_wq);
@@ -2658,7 +2685,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2658 2685
2659 mutex_lock(&mdsc->mutex); 2686 mutex_lock(&mdsc->mutex);
2660 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { 2687 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2661 err = __prepare_send_request(mdsc, req, session->s_mds); 2688 err = __prepare_send_request(mdsc, req, session->s_mds, true);
2662 if (!err) { 2689 if (!err) {
2663 ceph_msg_get(req->r_request); 2690 ceph_msg_get(req->r_request);
2664 ceph_con_send(&session->s_con, req->r_request); 2691 ceph_con_send(&session->s_con, req->r_request);
@@ -2679,7 +2706,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2679 continue; /* only old requests */ 2706 continue; /* only old requests */
2680 if (req->r_session && 2707 if (req->r_session &&
2681 req->r_session->s_mds == session->s_mds) { 2708 req->r_session->s_mds == session->s_mds) {
2682 err = __prepare_send_request(mdsc, req, session->s_mds); 2709 err = __prepare_send_request(mdsc, req,
2710 session->s_mds, true);
2683 if (!err) { 2711 if (!err) {
2684 ceph_msg_get(req->r_request); 2712 ceph_msg_get(req->r_request);
2685 ceph_con_send(&session->s_con, req->r_request); 2713 ceph_con_send(&session->s_con, req->r_request);
@@ -2864,7 +2892,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2864 spin_unlock(&session->s_cap_lock); 2892 spin_unlock(&session->s_cap_lock);
2865 2893
2866 /* trim unused caps to reduce MDS's cache rejoin time */ 2894 /* trim unused caps to reduce MDS's cache rejoin time */
2867 shrink_dcache_parent(mdsc->fsc->sb->s_root); 2895 if (mdsc->fsc->sb->s_root)
2896 shrink_dcache_parent(mdsc->fsc->sb->s_root);
2868 2897
2869 ceph_con_close(&session->s_con); 2898 ceph_con_close(&session->s_con);
2870 ceph_con_open(&session->s_con, 2899 ceph_con_open(&session->s_con,
@@ -3133,7 +3162,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
3133 di->lease_renew_from && 3162 di->lease_renew_from &&
3134 di->lease_renew_after == 0) { 3163 di->lease_renew_after == 0) {
3135 unsigned long duration = 3164 unsigned long duration =
3136 le32_to_cpu(h->duration_ms) * HZ / 1000; 3165 msecs_to_jiffies(le32_to_cpu(h->duration_ms));
3137 3166
3138 di->lease_seq = seq; 3167 di->lease_seq = seq;
3139 dentry->d_time = di->lease_renew_from + duration; 3168 dentry->d_time = di->lease_renew_from + duration;
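
The final hunk replaces the open-coded "duration_ms * HZ / 1000" with msecs_to_jiffies(). Beyond readability, the helper sidesteps 32-bit overflow of the intermediate product (in the kernel it also rounds up and saturates at MAX_JIFFY_OFFSET, which this sketch omits). A standalone illustration with HZ pinned to 1000:

    #include <stdint.h>
    #include <stdio.h>

    #define HZ 1000U

    int main(void)
    {
            uint32_t ms = 5000000;  /* ~83 minutes of lease duration */

            /* 5e9 does not fit in 32 bits, so this silently wraps. */
            uint32_t open_coded = ms * HZ / 1000;
            /* What a correct conversion computes. */
            uint64_t widened = (uint64_t)ms * HZ / 1000;

            printf("open-coded: %u, widened: %llu\n",
                   open_coded, (unsigned long long)widened);
            return 0;
    }
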
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 51cc23e48111..89e6bc321df3 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op)
75 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 75 case CEPH_MDS_OP_LSSNAP: return "lssnap";
76 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 76 case CEPH_MDS_OP_MKSNAP: return "mksnap";
77 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 77 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
78 case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
78 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; 79 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
79 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; 80 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
80 } 81 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a63997b8bcff..e463ebd69a9c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -345,6 +345,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
345 fsopt->rsize = CEPH_RSIZE_DEFAULT; 345 fsopt->rsize = CEPH_RSIZE_DEFAULT;
346 fsopt->rasize = CEPH_RASIZE_DEFAULT; 346 fsopt->rasize = CEPH_RASIZE_DEFAULT;
347 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 347 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
348 if (!fsopt->snapdir_name) {
349 err = -ENOMEM;
350 goto out;
351 }
352
348 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 353 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
349 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 354 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
350 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 355 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
@@ -406,31 +411,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
406{ 411{
407 struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); 412 struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
408 struct ceph_mount_options *fsopt = fsc->mount_options; 413 struct ceph_mount_options *fsopt = fsc->mount_options;
409 struct ceph_options *opt = fsc->client->options; 414 size_t pos;
410 415 int ret;
411 if (opt->flags & CEPH_OPT_FSID) 416
412 seq_printf(m, ",fsid=%pU", &opt->fsid); 417 /* a comma between MNT/MS and client options */
413 if (opt->flags & CEPH_OPT_NOSHARE) 418 seq_putc(m, ',');
414 seq_puts(m, ",noshare"); 419 pos = m->count;
415 if (opt->flags & CEPH_OPT_NOCRC) 420
416 seq_puts(m, ",nocrc"); 421 ret = ceph_print_client_options(m, fsc->client);
417 if (opt->flags & CEPH_OPT_NOMSGAUTH) 422 if (ret)
418 seq_puts(m, ",nocephx_require_signatures"); 423 return ret;
419 if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) 424
420 seq_puts(m, ",notcp_nodelay"); 425 /* retract our comma if no client options */
421 426 if (m->count == pos)
422 if (opt->name) 427 m->count--;
423 seq_printf(m, ",name=%s", opt->name);
424 if (opt->key)
425 seq_puts(m, ",secret=<hidden>");
426
427 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
428 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
429 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
430 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
431 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
432 seq_printf(m, ",osdkeepalivetimeout=%d",
433 opt->osd_keepalive_timeout);
434 428
435 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 429 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
436 seq_puts(m, ",dirstat"); 430 seq_puts(m, ",dirstat");
@@ -438,14 +432,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
438 seq_puts(m, ",norbytes"); 432 seq_puts(m, ",norbytes");
439 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 433 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
440 seq_puts(m, ",noasyncreaddir"); 434 seq_puts(m, ",noasyncreaddir");
441 if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) 435 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
442 seq_puts(m, ",dcache");
443 else
444 seq_puts(m, ",nodcache"); 436 seq_puts(m, ",nodcache");
445 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) 437 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
446 seq_puts(m, ",fsc"); 438 seq_puts(m, ",fsc");
447 else
448 seq_puts(m, ",nofsc");
449 439
450#ifdef CONFIG_CEPH_FS_POSIX_ACL 440#ifdef CONFIG_CEPH_FS_POSIX_ACL
451 if (fsopt->sb_flags & MS_POSIXACL) 441 if (fsopt->sb_flags & MS_POSIXACL)
@@ -477,6 +467,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
477 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); 467 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
478 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 468 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
479 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); 469 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
470
480 return 0; 471 return 0;
481} 472}
482 473
@@ -730,6 +721,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
730 if (IS_ERR(req)) 721 if (IS_ERR(req))
731 return ERR_CAST(req); 722 return ERR_CAST(req);
732 req->r_path1 = kstrdup(path, GFP_NOFS); 723 req->r_path1 = kstrdup(path, GFP_NOFS);
724 if (!req->r_path1) {
725 root = ERR_PTR(-ENOMEM);
726 goto out;
727 }
728
733 req->r_ino1.ino = CEPH_INO_ROOT; 729 req->r_ino1.ino = CEPH_INO_ROOT;
734 req->r_ino1.snap = CEPH_NOSNAP; 730 req->r_ino1.snap = CEPH_NOSNAP;
735 req->r_started = started; 731 req->r_started = started;
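
The show_options rewrite hands the libceph options to ceph_print_client_options() and deals with the separator optimistically: emit a comma, remember m->count, and retract the comma if the helper printed nothing. A standalone analogue of that trick (the seq structure and helpers are simplified stand-ins for seq_file):

    #include <stdio.h>
    #include <string.h>

    struct seq {
            char buf[128];
            size_t count;
    };

    static void seq_putc(struct seq *m, char c)
    {
            m->buf[m->count++] = c;
    }

    static void seq_puts(struct seq *m, const char *s)
    {
            memcpy(m->buf + m->count, s, strlen(s));
            m->count += strlen(s);
    }

    /* Stand-in for ceph_print_client_options(); may print nothing. */
    static void print_client_options(struct seq *m, int have_opts)
    {
            if (have_opts)
                    seq_puts(m, "name=admin");
    }

    int main(void)
    {
            struct seq m = { .count = 0 };
            size_t pos;

            seq_putc(&m, ',');              /* optimistic separator */
            pos = m.count;
            print_client_options(&m, 0);    /* nothing printed this time */
            if (m.count == pos)
                    m.count--;              /* retract the comma */

            m.buf[m.count] = '\0';
            printf("[%s]\n", m.buf);        /* prints "[]" */
            return 0;
    }
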
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 04c8124ed30e..fa20e1318939 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
36#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ 36#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
38 38
39#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 39#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
40 CEPH_MOUNT_OPT_DCACHE)
40 41
41#define ceph_set_mount_opt(fsc, opt) \ 42#define ceph_set_mount_opt(fsc, opt) \
42 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -881,7 +882,6 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
881 882
882/* file.c */ 883/* file.c */
883extern const struct file_operations ceph_file_fops; 884extern const struct file_operations ceph_file_fops;
884extern const struct address_space_operations ceph_aops;
885 885
886extern int ceph_open(struct inode *inode, struct file *file); 886extern int ceph_open(struct inode *inode, struct file *file);
887extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 887extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5a492caf34cb..5c4c9c256931 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
877 err = PTR_ERR(req); 877 err = PTR_ERR(req);
878 goto out; 878 goto out;
879 } 879 }
880 req->r_inode = inode; 880
881 ihold(inode);
882 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
883 req->r_num_caps = 1;
884 req->r_args.setxattr.flags = cpu_to_le32(flags); 881 req->r_args.setxattr.flags = cpu_to_le32(flags);
885 req->r_path2 = kstrdup(name, GFP_NOFS); 882 req->r_path2 = kstrdup(name, GFP_NOFS);
883 if (!req->r_path2) {
884 ceph_mdsc_put_request(req);
885 err = -ENOMEM;
886 goto out;
887 }
886 888
887 req->r_pagelist = pagelist; 889 req->r_pagelist = pagelist;
888 pagelist = NULL; 890 pagelist = NULL;
889 891
892 req->r_inode = inode;
893 ihold(inode);
894 req->r_num_caps = 1;
895 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
896
890 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 897 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
891 err = ceph_mdsc_do_request(mdsc, NULL, req); 898 err = ceph_mdsc_do_request(mdsc, NULL, req);
892 ceph_mdsc_put_request(req); 899 ceph_mdsc_put_request(req);
@@ -1019,12 +1026,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1019 USE_AUTH_MDS); 1026 USE_AUTH_MDS);
1020 if (IS_ERR(req)) 1027 if (IS_ERR(req))
1021 return PTR_ERR(req); 1028 return PTR_ERR(req);
1029 req->r_path2 = kstrdup(name, GFP_NOFS);
1030 if (!req->r_path2)
1031 return -ENOMEM;
1032
1022 req->r_inode = inode; 1033 req->r_inode = inode;
1023 ihold(inode); 1034 ihold(inode);
1024 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
1025 req->r_num_caps = 1; 1035 req->r_num_caps = 1;
1026 req->r_path2 = kstrdup(name, GFP_NOFS); 1036 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
1027
1028 err = ceph_mdsc_do_request(mdsc, NULL, req); 1037 err = ceph_mdsc_do_request(mdsc, NULL, req);
1029 ceph_mdsc_put_request(req); 1038 ceph_mdsc_put_request(req);
1030 return err; 1039 return err;
diff --git a/fs/exec.c b/fs/exec.c
index 02bfd980a40c..49a1c61433b7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1275,6 +1275,53 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1275 spin_unlock(&p->fs->lock); 1275 spin_unlock(&p->fs->lock);
1276} 1276}
1277 1277
1278static void bprm_fill_uid(struct linux_binprm *bprm)
1279{
1280 struct inode *inode;
1281 unsigned int mode;
1282 kuid_t uid;
1283 kgid_t gid;
1284
1285 /* clear any previous set[ug]id data from a previous binary */
1286 bprm->cred->euid = current_euid();
1287 bprm->cred->egid = current_egid();
1288
1289 if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
1290 return;
1291
1292 if (task_no_new_privs(current))
1293 return;
1294
1295 inode = file_inode(bprm->file);
1296 mode = READ_ONCE(inode->i_mode);
1297 if (!(mode & (S_ISUID|S_ISGID)))
1298 return;
1299
1300 /* Be careful if suid/sgid is set */
1301 mutex_lock(&inode->i_mutex);
1302
1303 /* reload atomically mode/uid/gid now that lock held */
1304 mode = inode->i_mode;
1305 uid = inode->i_uid;
1306 gid = inode->i_gid;
1307 mutex_unlock(&inode->i_mutex);
1308
1309 /* We ignore suid/sgid if there are no mappings for them in the ns */
1310 if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
1311 !kgid_has_mapping(bprm->cred->user_ns, gid))
1312 return;
1313
1314 if (mode & S_ISUID) {
1315 bprm->per_clear |= PER_CLEAR_ON_SETID;
1316 bprm->cred->euid = uid;
1317 }
1318
1319 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1320 bprm->per_clear |= PER_CLEAR_ON_SETID;
1321 bprm->cred->egid = gid;
1322 }
1323}
1324
1278/* 1325/*
1279 * Fill the binprm structure from the inode. 1326 * Fill the binprm structure from the inode.
1280 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 1327 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
@@ -1283,36 +1330,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1283 */ 1330 */
1284int prepare_binprm(struct linux_binprm *bprm) 1331int prepare_binprm(struct linux_binprm *bprm)
1285{ 1332{
1286 struct inode *inode = file_inode(bprm->file);
1287 umode_t mode = inode->i_mode;
1288 int retval; 1333 int retval;
1289 1334
1290 1335 bprm_fill_uid(bprm);
1291 /* clear any previous set[ug]id data from a previous binary */
1292 bprm->cred->euid = current_euid();
1293 bprm->cred->egid = current_egid();
1294
1295 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1296 !task_no_new_privs(current) &&
1297 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1298 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1299 /* Set-uid? */
1300 if (mode & S_ISUID) {
1301 bprm->per_clear |= PER_CLEAR_ON_SETID;
1302 bprm->cred->euid = inode->i_uid;
1303 }
1304
1305 /* Set-gid? */
1306 /*
1307 * If setgid is set but no group execute bit then this
1308 * is a candidate for mandatory locking, not a setgid
1309 * executable.
1310 */
1311 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1312 bprm->per_clear |= PER_CLEAR_ON_SETID;
1313 bprm->cred->egid = inode->i_gid;
1314 }
1315 }
1316 1336
1317 /* fill in binprm security blob */ 1337 /* fill in binprm security blob */
1318 retval = security_bprm_set_creds(bprm); 1338 retval = security_bprm_set_creds(bprm);
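
This refactor is not purely cosmetic: the removed code sampled i_mode, i_uid and i_gid with no locking, so a concurrent chmod()/chown() could be observed half-applied. bprm_fill_uid() keeps a lockless READ_ONCE() fast path for the common non-setid case, then re-reads all three fields as one snapshot under i_mutex before honoring the bits (the MNT_NOSUID, no_new_privs and id-mapping bail-outs are unchanged). The setgid test still requires S_IXGRP, since setgid without group-execute marks a mandatory-locking candidate rather than a setgid executable. A compressed standalone sketch of the check-then-recheck shape, with a pthread mutex standing in for i_mutex (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    #define MY_ISUID 04000
    #define MY_ISGID 02000
    #define MY_IXGRP 00010

    struct inode_like {
            pthread_mutex_t lock;
            unsigned int mode, uid, gid;
    };

    static void fill_uid(struct inode_like *inode,
                         unsigned int *euid, unsigned int *egid)
    {
            /* Lockless fast path: most binaries are not setid at all. */
            unsigned int mode = __atomic_load_n(&inode->mode,
                                                __ATOMIC_RELAXED);
            unsigned int uid, gid;

            if (!(mode & (MY_ISUID | MY_ISGID)))
                    return;

            /* Slow path: reload mode/uid/gid as one consistent snapshot. */
            pthread_mutex_lock(&inode->lock);
            mode = inode->mode;
            uid = inode->uid;
            gid = inode->gid;
            pthread_mutex_unlock(&inode->lock);

            if (mode & MY_ISUID)
                    *euid = uid;
            if ((mode & (MY_ISGID | MY_IXGRP)) == (MY_ISGID | MY_IXGRP))
                    *egid = gid;
    }

    int main(void)
    {
            struct inode_like inode = {
                    PTHREAD_MUTEX_INITIALIZER, 04755, 0, 0
            };
            unsigned int euid = 1000, egid = 1000;

            fill_uid(&inode, &euid, &egid);
            printf("euid=%u egid=%u\n", euid, egid);  /* euid becomes 0 */
            return 0;
    }
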
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index efea5d5c44ce..18228c201f7f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -64,6 +64,23 @@ config EXT4_FS_SECURITY
64 If you are not using a security module that requires using 64 If you are not using a security module that requires using
65 extended attributes for file security labels, say N. 65 extended attributes for file security labels, say N.
66 66
67config EXT4_FS_ENCRYPTION
68 bool "Ext4 Encryption"
69 depends on EXT4_FS
70 select CRYPTO_AES
71 select CRYPTO_CBC
72 select CRYPTO_ECB
73 select CRYPTO_XTS
74 select CRYPTO_CTS
75 select CRYPTO_SHA256
76 select KEYS
77 select ENCRYPTED_KEYS
78 help
79 Enable encryption of ext4 files and directories. This
80 feature is similar to ecryptfs, but it is more memory
81 efficient since it avoids caching the encrypted and
82 decrypted pages in the page cache.
83
67config EXT4_DEBUG 84config EXT4_DEBUG
68 bool "EXT4 debugging support" 85 bool "EXT4 debugging support"
69 depends on EXT4_FS 86 depends on EXT4_FS
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 0310fec2ee3d..75285ea9aa05 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ 10 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
11 xattr_trusted.o inline.o 11 xattr_trusted.o inline.o readpage.o
12 12
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o 14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
15ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \
16 crypto_key.o crypto_fname.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d40c8dbbb0d6..69b1e73026a5 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -4,11 +4,6 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include "ext4_jbd2.h" 7#include "ext4_jbd2.h"
13#include "ext4.h" 8#include "ext4.h"
14#include "xattr.h" 9#include "xattr.h"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 83a6f497c4e0..955bf49a7945 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -14,7 +14,6 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/quotaops.h> 17#include <linux/quotaops.h>
19#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
20#include "ext4.h" 19#include "ext4.h"
@@ -641,8 +640,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
641 * fail EDQUOT for metadata, but we do account for it. 640 * fail EDQUOT for metadata, but we do account for it.
642 */ 641 */
643 if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { 642 if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
644 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
645 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
646 dquot_alloc_block_nofail(inode, 643 dquot_alloc_block_nofail(inode,
647 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); 644 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
648 } 645 }
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index b610779a958c..4a606afb171f 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include "ext4.h" 11#include "ext4.h"
13 12
14unsigned int ext4_count_free(char *bitmap, unsigned int numchars) 13unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 41eb9dcfac7e..3522340c7a99 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,7 +16,6 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/mutex.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "ext4.h" 20#include "ext4.h"
22 21
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..8ff15273ab0c
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,558 @@
1/*
2 * linux/fs/ext4/crypto.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption functions for ext4
7 *
8 * Written by Michael Halcrow, 2014.
9 *
10 * Filename encryption additions
11 * Uday Savagaonkar, 2014
12 * Encryption policy handling additions
13 * Ildar Muslukhov, 2014
14 *
15 * This has not yet undergone a rigorous security audit.
16 *
17 * The usage of AES-XTS should conform to recommendations in NIST
18 * Special Publication 800-38E and IEEE P1619/D16.
19 */
20
21#include <crypto/hash.h>
22#include <crypto/sha.h>
23#include <keys/user-type.h>
24#include <keys/encrypted-type.h>
25#include <linux/crypto.h>
26#include <linux/ecryptfs.h>
27#include <linux/gfp.h>
28#include <linux/kernel.h>
29#include <linux/key.h>
30#include <linux/list.h>
31#include <linux/mempool.h>
32#include <linux/module.h>
33#include <linux/mutex.h>
34#include <linux/random.h>
35#include <linux/scatterlist.h>
36#include <linux/spinlock_types.h>
37
38#include "ext4_extents.h"
39#include "xattr.h"
40
41/* Encryption added and removed here! (L: */
42
43static unsigned int num_prealloc_crypto_pages = 32;
44static unsigned int num_prealloc_crypto_ctxs = 128;
45
46module_param(num_prealloc_crypto_pages, uint, 0444);
47MODULE_PARM_DESC(num_prealloc_crypto_pages,
48 "Number of crypto pages to preallocate");
49module_param(num_prealloc_crypto_ctxs, uint, 0444);
50MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
51 "Number of crypto contexts to preallocate");
52
53static mempool_t *ext4_bounce_page_pool;
54
55static LIST_HEAD(ext4_free_crypto_ctxs);
56static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
57
58/**
59 * ext4_release_crypto_ctx() - Releases an encryption context
60 * @ctx: The encryption context to release.
61 *
62 * If the encryption context was allocated from the pre-allocated pool, returns
63 * it to that pool. Else, frees it.
64 *
65 * If there's a bounce page in the context, this frees that.
66 */
67void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
68{
69 unsigned long flags;
70
71 if (ctx->bounce_page) {
72 if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL)
73 __free_page(ctx->bounce_page);
74 else
75 mempool_free(ctx->bounce_page, ext4_bounce_page_pool);
76 ctx->bounce_page = NULL;
77 }
78 ctx->control_page = NULL;
79 if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
80 if (ctx->tfm)
81 crypto_free_tfm(ctx->tfm);
82 kfree(ctx);
83 } else {
84 spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
85 list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
86 spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
87 }
88}
89
90/**
91 * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context
92 * @mask: The allocation mask.
93 *
94 * Return: An allocated and initialized encryption context on success. An error
95 * value or NULL otherwise.
96 */
97static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask)
98{
99 struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx),
100 mask);
101
102 if (!ctx)
103 return ERR_PTR(-ENOMEM);
104 return ctx;
105}
106
107/**
108 * ext4_get_crypto_ctx() - Gets an encryption context
109 * @inode: The inode for which we are doing the crypto
110 *
111 * Allocates and initializes an encryption context.
112 *
113 * Return: An allocated and initialized encryption context on success; error
114 * value or NULL otherwise.
115 */
116struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
117{
118 struct ext4_crypto_ctx *ctx = NULL;
119 int res = 0;
120 unsigned long flags;
121 struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key;
122
123 if (!ext4_read_workqueue)
124 ext4_init_crypto();
125
126 /*
127 * We first try getting the ctx from a free list because in
128 * the common case the ctx will have an allocated and
129 * initialized crypto tfm, so it's probably a worthwhile
130 * optimization. For the bounce page, we first try getting it
131 * from the kernel allocator because that's just about as fast
132 * as getting it from a list and because a cache of free pages
133 * should generally be a "last resort" option for a filesystem
134 * to be able to do its job.
135 */
136 spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
137 ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
138 struct ext4_crypto_ctx, free_list);
139 if (ctx)
140 list_del(&ctx->free_list);
141 spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
142 if (!ctx) {
143 ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS);
144 if (IS_ERR(ctx)) {
145 res = PTR_ERR(ctx);
146 goto out;
147 }
148 ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
149 } else {
150 ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
151 }
152
153 /* Allocate a new Crypto API context if we don't already have
154 * one or if it isn't the right mode. */
155 BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID);
156 if (ctx->tfm && (ctx->mode != key->mode)) {
157 crypto_free_tfm(ctx->tfm);
158 ctx->tfm = NULL;
159 ctx->mode = EXT4_ENCRYPTION_MODE_INVALID;
160 }
161 if (!ctx->tfm) {
162 switch (key->mode) {
163 case EXT4_ENCRYPTION_MODE_AES_256_XTS:
164 ctx->tfm = crypto_ablkcipher_tfm(
165 crypto_alloc_ablkcipher("xts(aes)", 0, 0));
166 break;
167 case EXT4_ENCRYPTION_MODE_AES_256_GCM:
168 /* TODO(mhalcrow): AEAD w/ gcm(aes);
169 * crypto_aead_setauthsize() */
170 ctx->tfm = ERR_PTR(-ENOTSUPP);
171 break;
172 default:
173 BUG();
174 }
175 if (IS_ERR_OR_NULL(ctx->tfm)) {
176 res = PTR_ERR(ctx->tfm);
177 ctx->tfm = NULL;
178 goto out;
179 }
180 ctx->mode = key->mode;
181 }
182 BUG_ON(key->size != ext4_encryption_key_size(key->mode));
183
184 /* There shouldn't be a bounce page attached to the crypto
185 * context at this point. */
186 BUG_ON(ctx->bounce_page);
187
188out:
189 if (res) {
190 if (!IS_ERR_OR_NULL(ctx))
191 ext4_release_crypto_ctx(ctx);
192 ctx = ERR_PTR(res);
193 }
194 return ctx;
195}
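
Taken together, ext4_get_crypto_ctx() and ext4_release_crypto_ctx() form a two-tier allocator: the spinlock-protected free list is tried first, because a recycled context usually still carries a usable tfm, with kzalloc() as the fallback, and EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL records which tier a context came from so that release can return it to the right place. A minimal standalone model of the flag-tracked scheme (locking omitted for brevity):

    #include <stdlib.h>

    #define REQUIRES_FREE 0x1

    struct ctx {
            int flags;
            struct ctx *next;       /* free-list linkage */
    };

    static struct ctx *free_list;

    static struct ctx *get_ctx(void)
    {
            struct ctx *c = free_list;

            if (c) {
                    free_list = c->next;
                    c->flags &= ~REQUIRES_FREE;     /* from the pool */
            } else {
                    c = calloc(1, sizeof(*c));
                    if (c)
                            c->flags |= REQUIRES_FREE;  /* heap fallback */
            }
            return c;
    }

    static void release_ctx(struct ctx *c)
    {
            if (c->flags & REQUIRES_FREE) {
                    free(c);                /* heap-born: really free it */
            } else {
                    c->next = free_list;    /* pool-born: recycle */
                    free_list = c;
            }
    }

    int main(void)
    {
            struct ctx *c = get_ctx();      /* heap: pool starts empty */

            if (!c)
                    return 1;
            release_ctx(c);                 /* freed, flag was set */
            return 0;
    }
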
196
197struct workqueue_struct *ext4_read_workqueue;
198static DEFINE_MUTEX(crypto_init);
199
200/**
201 * ext4_exit_crypto() - Shutdown the ext4 encryption system
202 */
203void ext4_exit_crypto(void)
204{
205 struct ext4_crypto_ctx *pos, *n;
206
207 list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) {
208 if (pos->bounce_page) {
209 if (pos->flags &
210 EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) {
211 __free_page(pos->bounce_page);
212 } else {
213 mempool_free(pos->bounce_page,
214 ext4_bounce_page_pool);
215 }
216 }
217 if (pos->tfm)
218 crypto_free_tfm(pos->tfm);
219 kfree(pos);
220 }
221 INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
222 if (ext4_bounce_page_pool)
223 mempool_destroy(ext4_bounce_page_pool);
224 ext4_bounce_page_pool = NULL;
225 if (ext4_read_workqueue)
226 destroy_workqueue(ext4_read_workqueue);
227 ext4_read_workqueue = NULL;
228}
229
230/**
231 * ext4_init_crypto() - Set up for ext4 encryption.
232 *
233 * We only call this when we start accessing encrypted files, since it
234 * results in memory getting allocated that wouldn't otherwise be used.
235 *
236 * Return: Zero on success, non-zero otherwise.
237 */
238int ext4_init_crypto(void)
239{
240 int i, res;
241
242 mutex_lock(&crypto_init);
243 if (ext4_read_workqueue)
244 goto already_initialized;
245 ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
246 if (!ext4_read_workqueue) {
247 res = -ENOMEM;
248 goto fail;
249 }
250
251 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
252 struct ext4_crypto_ctx *ctx;
253
254 ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL);
255 if (IS_ERR(ctx)) {
256 res = PTR_ERR(ctx);
257 goto fail;
258 }
259 list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
260 }
261
262 ext4_bounce_page_pool =
263 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
264 if (!ext4_bounce_page_pool) {
265 res = -ENOMEM;
266 goto fail;
267 }
268already_initialized:
269 mutex_unlock(&crypto_init);
270 return 0;
271fail:
272 ext4_exit_crypto();
273 mutex_unlock(&crypto_init);
274 return res;
275}
276
277void ext4_restore_control_page(struct page *data_page)
278{
279 struct ext4_crypto_ctx *ctx =
280 (struct ext4_crypto_ctx *)page_private(data_page);
281
282 set_page_private(data_page, (unsigned long)NULL);
283 ClearPagePrivate(data_page);
284 unlock_page(data_page);
285 ext4_release_crypto_ctx(ctx);
286}
287
288/**
289 * ext4_crypt_complete() - The completion callback for page encryption
290 * @req: The asynchronous encryption request context
291 * @res: The result of the encryption operation
292 */
293static void ext4_crypt_complete(struct crypto_async_request *req, int res)
294{
295 struct ext4_completion_result *ecr = req->data;
296
297 if (res == -EINPROGRESS)
298 return;
299 ecr->res = res;
300 complete(&ecr->completion);
301}
302
303typedef enum {
304 EXT4_DECRYPT = 0,
305 EXT4_ENCRYPT,
306} ext4_direction_t;
307
308static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
309 struct inode *inode,
310 ext4_direction_t rw,
311 pgoff_t index,
312 struct page *src_page,
313 struct page *dest_page)
314
315{
316 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
317 struct ablkcipher_request *req = NULL;
318 DECLARE_EXT4_COMPLETION_RESULT(ecr);
319 struct scatterlist dst, src;
320 struct ext4_inode_info *ei = EXT4_I(inode);
321 struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm);
322 int res = 0;
323
324 BUG_ON(!ctx->tfm);
325 BUG_ON(ctx->mode != ei->i_encryption_key.mode);
326
327 if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) {
328 printk_ratelimited(KERN_ERR
329 "%s: unsupported crypto algorithm: %d\n",
330 __func__, ctx->mode);
331 return -ENOTSUPP;
332 }
333
334 crypto_ablkcipher_clear_flags(atfm, ~0);
335 crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
336
337 res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw,
338 ei->i_encryption_key.size);
339 if (res) {
340 printk_ratelimited(KERN_ERR
341 "%s: crypto_ablkcipher_setkey() failed\n",
342 __func__);
343 return res;
344 }
345 req = ablkcipher_request_alloc(atfm, GFP_NOFS);
346 if (!req) {
347 printk_ratelimited(KERN_ERR
348 "%s: ablkcipher_request_alloc() failed\n",
349 __func__);
350 return -ENOMEM;
351 }
352 ablkcipher_request_set_callback(
353 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
354 ext4_crypt_complete, &ecr);
355
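/* The XTS tweak is simply the page index, zero-padded out to
 * EXT4_XTS_TWEAK_SIZE, so identical plaintext pages at different file
 * offsets produce different ciphertext under the same key. */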
356 BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
357 memcpy(xts_tweak, &index, sizeof(index));
358 memset(&xts_tweak[sizeof(index)], 0,
359 EXT4_XTS_TWEAK_SIZE - sizeof(index));
360
361 sg_init_table(&dst, 1);
362 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
363 sg_init_table(&src, 1);
364 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
365 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
366 xts_tweak);
367 if (rw == EXT4_DECRYPT)
368 res = crypto_ablkcipher_decrypt(req);
369 else
370 res = crypto_ablkcipher_encrypt(req);
371 if (res == -EINPROGRESS || res == -EBUSY) {
372 BUG_ON(req->base.data != &ecr);
373 wait_for_completion(&ecr.completion);
374 res = ecr.res;
375 }
376 ablkcipher_request_free(req);
377 if (res) {
378 printk_ratelimited(
379 KERN_ERR
380 "%s: crypto_ablkcipher_encrypt/decrypt() returned %d\n",
381 __func__, res);
382 return res;
383 }
384 return 0;
385}
386
387/**
388 * ext4_encrypt() - Encrypts a page
389 * @inode: The inode for which the encryption should take place
390 * @plaintext_page: The page to encrypt. Must be locked.
391 *
392 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
393 * encryption context.
394 *
395 * Called on the page write path. The caller must call
396 * ext4_restore_control_page() on the returned ciphertext page to
397 * release the bounce buffer and the encryption context.
398 *
399 * Return: An allocated page with the encrypted content on success. Else, an
400 * ERR_PTR()-encoded error value.
401 */
402struct page *ext4_encrypt(struct inode *inode,
403 struct page *plaintext_page)
404{
405 struct ext4_crypto_ctx *ctx;
406 struct page *ciphertext_page = NULL;
407 int err;
408
409 BUG_ON(!PageLocked(plaintext_page));
410
411 ctx = ext4_get_crypto_ctx(inode);
412 if (IS_ERR(ctx))
413 return (struct page *) ctx;
414
415 /* The encryption operation will require a bounce page. */
416 ciphertext_page = alloc_page(GFP_NOFS);
417 if (!ciphertext_page) {
418 /* This is a potential bottleneck, but at least we'll have
419 * forward progress. */
420 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
421 GFP_NOFS);
422 if (WARN_ON_ONCE(!ciphertext_page)) {
423 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
424 GFP_NOFS | __GFP_WAIT);
425 }
426 ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
427 } else {
428 ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
429 }
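/* The flag set above records which allocator the bounce page came
 * from, so that ext4_release_crypto_ctx() can free it back to the
 * right place. */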
430 ctx->bounce_page = ciphertext_page;
431 ctx->control_page = plaintext_page;
432 err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
433 plaintext_page, ciphertext_page);
434 if (err) {
435 ext4_release_crypto_ctx(ctx);
436 return ERR_PTR(err);
437 }
438 SetPagePrivate(ciphertext_page);
439 set_page_private(ciphertext_page, (unsigned long)ctx);
440 lock_page(ciphertext_page);
441 return ciphertext_page;
442}
443
444/**
445 * ext4_decrypt() - Decrypts a page in-place
446 * @ctx: The encryption context.
447 * @page: The page to decrypt. Must be locked.
448 *
449 * Decrypts page in-place using the ctx encryption context.
450 *
451 * Called from the read completion callback.
452 *
453 * Return: Zero on success, non-zero otherwise.
454 */
455int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
456{
457 BUG_ON(!PageLocked(page));
458
459 return ext4_page_crypto(ctx, page->mapping->host,
460 EXT4_DECRYPT, page->index, page, page);
461}
462
463/*
464 * Convenience function which takes care of allocating and
465 * deallocating the encryption context
466 */
467int ext4_decrypt_one(struct inode *inode, struct page *page)
468{
469 int ret;
470
471 struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
472
473 if (IS_ERR(ctx))
474 return PTR_ERR(ctx);
475 ret = ext4_decrypt(ctx, page);
476 ext4_release_crypto_ctx(ctx);
477 return ret;
478}
479
480int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
481{
482 struct ext4_crypto_ctx *ctx;
483 struct page *ciphertext_page = NULL;
484 struct bio *bio;
485 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
486 ext4_fsblk_t pblk = ext4_ext_pblock(ex);
487 unsigned int len = ext4_ext_get_actual_len(ex);
488 int err = 0;
489
490 BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
491
492 ctx = ext4_get_crypto_ctx(inode);
493 if (IS_ERR(ctx))
494 return PTR_ERR(ctx);
495
496 ciphertext_page = alloc_page(GFP_NOFS);
497 if (!ciphertext_page) {
498 /* This is a potential bottleneck, but at least we'll have
499 * forward progress. */
500 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
501 GFP_NOFS);
502 if (WARN_ON_ONCE(!ciphertext_page)) {
503 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
504 GFP_NOFS | __GFP_WAIT);
505 }
506 ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
507 } else {
508 ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
509 }
510 ctx->bounce_page = ciphertext_page;
511
512 while (len--) {
513 err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
514 ZERO_PAGE(0), ciphertext_page);
515 if (err)
516 goto errout;
517
518 bio = bio_alloc(GFP_KERNEL, 1);
519 if (!bio) {
520 err = -ENOMEM;
521 goto errout;
522 }
523 bio->bi_bdev = inode->i_sb->s_bdev;
524 bio->bi_iter.bi_sector = pblk;
525 err = bio_add_page(bio, ciphertext_page,
526 inode->i_sb->s_blocksize, 0);
527 if (err != inode->i_sb->s_blocksize) {
528 bio_put(bio); err = -EIO; goto errout;
529 }
530 err = submit_bio_wait(WRITE, bio);
531 bio_put(bio);
532 if (err)
533 goto errout;
534 lblk++; pblk++;
535 }
536errout:
537 ext4_release_crypto_ctx(ctx);
538 return err;
539}
540
541bool ext4_valid_contents_enc_mode(uint32_t mode)
542{
543 return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
544}
545
546/**
547 * ext4_validate_encryption_key_size() - Validate the encryption key size
548 * @mode: The key mode.
549 * @size: The key size to validate.
550 *
551 * Return: The validated key size for @mode. Zero if invalid.
552 */
553uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
554{
555 if (size == ext4_encryption_key_size(mode))
556 return size;
557 return 0;
558}
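/*
 * Minimal usage sketch (illustrative only, not part of this file): a
 * caller validating a key fetched from the keyring might do
 *
 *	if (!ext4_validate_encryption_key_size(mode, key->size))
 *		return -EINVAL;
 *
 * where "mode" and "key" are hypothetical locals holding an
 * EXT4_ENCRYPTION_MODE_* value and an ext4_encryption_key.
 */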
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
new file mode 100644
index 000000000000..ca2f5948c1ac
--- /dev/null
+++ b/fs/ext4/crypto_fname.c
@@ -0,0 +1,709 @@
1/*
2 * linux/fs/ext4/crypto_fname.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains functions for filename crypto management in ext4
7 *
8 * Written by Uday Savagaonkar, 2014.
9 *
10 * This has not yet undergone a rigorous security audit.
11 *
12 */
13
14#include <crypto/hash.h>
15#include <crypto/sha.h>
16#include <keys/encrypted-type.h>
17#include <keys/user-type.h>
18#include <linux/crypto.h>
19#include <linux/gfp.h>
20#include <linux/kernel.h>
21#include <linux/key.h>
23#include <linux/list.h>
24#include <linux/mempool.h>
25#include <linux/random.h>
26#include <linux/scatterlist.h>
27#include <linux/spinlock_types.h>
28
29#include "ext4.h"
30#include "ext4_crypto.h"
31#include "xattr.h"
32
33/**
34 * ext4_dir_crypt_complete() - The completion callback for filename crypto requests
35 */
36static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
37{
38 struct ext4_completion_result *ecr = req->data;
39
40 if (res == -EINPROGRESS)
41 return;
42 ecr->res = res;
43 complete(&ecr->completion);
44}
45
46bool ext4_valid_filenames_enc_mode(uint32_t mode)
47{
48 return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
49}
50
51/**
52 * ext4_fname_encrypt() - Encrypt a filename
53 *
54 * This function encrypts the input filename and returns the length of the
55 * ciphertext. Errors are returned as negative numbers. We trust the caller
56 * to have allocated sufficient memory for the oname string.
57 */
58static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
59 const struct qstr *iname,
60 struct ext4_str *oname)
61{
62 u32 ciphertext_len;
63 struct ablkcipher_request *req = NULL;
64 DECLARE_EXT4_COMPLETION_RESULT(ecr);
65 struct crypto_ablkcipher *tfm = ctx->ctfm;
66 int res = 0;
67 char iv[EXT4_CRYPTO_BLOCK_SIZE];
68 struct scatterlist sg[1];
69 char *workbuf;
70
71 if (iname->len <= 0 || iname->len > ctx->lim)
72 return -EIO;
73
74 ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
75 EXT4_CRYPTO_BLOCK_SIZE : iname->len;
76 ciphertext_len = (ciphertext_len > ctx->lim)
77 ? ctx->lim : ciphertext_len;
78
79 /* Allocate request */
80 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
81 if (!req) {
82 printk_ratelimited(
83 KERN_ERR "%s: ablkcipher_request_alloc() failed\n", __func__);
84 return -ENOMEM;
85 }
86 ablkcipher_request_set_callback(req,
87 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
88 ext4_dir_crypt_complete, &ecr);
89
90 /* Map the workpage */
91 workbuf = kmap(ctx->workpage);
92
93 /* Copy the input */
94 memcpy(workbuf, iname->name, iname->len);
95 if (iname->len < ciphertext_len)
96 memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
97
98 /* Initialize IV */
99 memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
100
101 /* Create encryption request */
102 sg_init_table(sg, 1);
103 sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
104 ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv);
105 res = crypto_ablkcipher_encrypt(req);
106 if (res == -EINPROGRESS || res == -EBUSY) {
107 BUG_ON(req->base.data != &ecr);
108 wait_for_completion(&ecr.completion);
109 res = ecr.res;
110 }
111 if (res >= 0) {
112 /* Copy the result to output */
113 memcpy(oname->name, workbuf, ciphertext_len);
114 res = ciphertext_len;
115 }
116 kunmap(ctx->workpage);
117 ablkcipher_request_free(req);
118 if (res < 0) {
119 printk_ratelimited(
120 KERN_ERR "%s: Error (error code %d)\n", __func__, res);
121 }
122 oname->len = ciphertext_len;
123 return res;
124}
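/*
 * Length example (illustrative): cts(cbc(aes)) is length-preserving for
 * inputs of at least one AES block, so a 5-byte name is zero-padded to
 * EXT4_CRYPTO_BLOCK_SIZE and yields a 16-byte ciphertext, while a
 * 20-byte name yields a 20-byte ciphertext (always capped at ctx->lim).
 */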
125
126/*
127 * ext4_fname_decrypt() - Decrypt a filename
128 * This function decrypts the input filename and returns
129 * the length of the plaintext.
130 * Errors are returned as negative numbers.
131 * We trust the caller to have allocated sufficient memory for the oname string.
132 */
133static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
134 const struct ext4_str *iname,
135 struct ext4_str *oname)
136{
138 struct ablkcipher_request *req = NULL;
139 DECLARE_EXT4_COMPLETION_RESULT(ecr);
140 struct scatterlist sg[1];
141 struct crypto_ablkcipher *tfm = ctx->ctfm;
142 int res = 0;
143 char iv[EXT4_CRYPTO_BLOCK_SIZE];
144 char *workbuf;
145
146 if (iname->len <= 0 || iname->len > ctx->lim)
147 return -EIO;
148
152
153 /* Allocate request */
154 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
155 if (!req) {
156 printk_ratelimited(
157 KERN_ERR "%s: ablkcipher_request_alloc() failed\n", __func__);
158 return -ENOMEM;
159 }
160 ablkcipher_request_set_callback(req,
161 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
162 ext4_dir_crypt_complete, &ecr);
163
164 /* Map the workpage */
165 workbuf = kmap(ctx->workpage);
166
167 /* Copy the input */
168 memcpy(workbuf, iname->name, iname->len);
169
170 /* Initialize IV */
171 memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
172
173 /* Create decryption request */
174 sg_init_table(sg, 1);
175 sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
176 ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv);
177 res = crypto_ablkcipher_decrypt(req);
178 if (res == -EINPROGRESS || res == -EBUSY) {
179 BUG_ON(req->base.data != &ecr);
180 wait_for_completion(&ecr.completion);
181 res = ecr.res;
182 }
183 if (res >= 0) {
184 /* Copy the result to output */
185 memcpy(oname->name, workbuf, iname->len);
186 res = iname->len;
187 }
188 kunmap(ctx->workpage);
189 ablkcipher_request_free(req);
190 if (res < 0) {
191 printk_ratelimited(
192 KERN_ERR "%s: Error in ext4_fname_decrypt (error code %d)\n",
193 __func__, res);
194 return res;
195 }
196
197 oname->len = strnlen(oname->name, iname->len);
198 return oname->len;
199}
200
201/**
202 * ext4_fname_encode_digest() - Encode a digest as a printable string
203 *
204 * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
205 * The encoded string is roughly 4/3 times the size of the input string.
206 */
207int ext4_fname_encode_digest(char *dst, char *src, u32 len)
208{
209 static const char *lookup_table =
210 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+";
211 u32 current_chunk, num_chunks, i;
212 char tmp_buf[3];
213 u32 c0, c1, c2, c3;
214
215 current_chunk = 0;
216 num_chunks = len/3;
217 for (i = 0; i < num_chunks; i++) {
218 c0 = src[3*i] & 0x3f;
219 c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f;
220 c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f;
221 c3 = (src[3*i+2]>>2) & 0x3f;
222 dst[4*i] = lookup_table[c0];
223 dst[4*i+1] = lookup_table[c1];
224 dst[4*i+2] = lookup_table[c2];
225 dst[4*i+3] = lookup_table[c3];
226 }
227 if (i*3 < len) {
228 memset(tmp_buf, 0, 3);
229 memcpy(tmp_buf, &src[3*i], len-3*i);
230 c0 = tmp_buf[0] & 0x3f;
231 c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f;
232 c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f;
233 c3 = (tmp_buf[2]>>2) & 0x3f;
234 dst[4*i] = lookup_table[c0];
235 dst[4*i+1] = lookup_table[c1];
236 dst[4*i+2] = lookup_table[c2];
237 dst[4*i+3] = lookup_table[c3];
238 i++;
239 }
240 return (i * 4);
241}
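/*
 * Worked example (illustrative): a 16-byte input splits into five full
 * 3-byte chunks plus one leftover byte; each chunk emits 4 characters,
 * so the encoded length is 5*4 + 4 = 24 characters.
 */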
242
243/**
244 * ext4_fname_hash() - Compute and encode the hash of a filename
245 *
246 * This function computes the hash of the input filename, and sets the output
247 * buffer to the *encoded* digest. It returns the length of the digest as its
248 * return value. Errors are returned as negative numbers. We trust the caller
249 * to allocate sufficient memory to oname string.
250 */
251static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx,
252 const struct ext4_str *iname,
253 struct ext4_str *oname)
254{
255 struct scatterlist sg;
256 struct hash_desc desc = {
257 .tfm = (struct crypto_hash *)ctx->htfm,
258 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
259 };
260 int res = 0;
261
262 if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
263 res = ext4_fname_encode_digest(oname->name, iname->name,
264 iname->len);
265 oname->len = res;
266 return res;
267 }
268
269 sg_init_one(&sg, iname->name, iname->len);
270 res = crypto_hash_init(&desc);
271 if (res) {
272 printk(KERN_ERR
273 "%s: Error initializing crypto hash; res = [%d]\n",
274 __func__, res);
275 goto out;
276 }
277 res = crypto_hash_update(&desc, &sg, iname->len);
278 if (res) {
279 printk(KERN_ERR
280 "%s: Error updating crypto hash; res = [%d]\n",
281 __func__, res);
282 goto out;
283 }
284 res = crypto_hash_final(&desc,
285 &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]);
286 if (res) {
287 printk(KERN_ERR
288 "%s: Error finalizing crypto hash; res = [%d]\n",
289 __func__, res);
290 goto out;
291 }
292 /* Encode the digest as a printable string--this will increase the
293 * size of the digest */
294 oname->name[0] = 'I';
295 res = ext4_fname_encode_digest(oname->name+1,
296 &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE],
297 EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1;
298 oname->len = res;
299out:
300 return res;
301}
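/*
 * Worked example (illustrative, assuming EXT4_FNAME_CRYPTO_DIGEST_SIZE
 * is 32 for SHA-256): a long name hashes to a 32-byte digest, which
 * ext4_fname_encode_digest() expands to 44 characters; with the leading
 * 'I' marker the presented name is 45 characters long.
 */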
302
303/**
304 * ext4_free_fname_crypto_ctx() - Free a filename crypto context
305 *
306 * Frees up a crypto context.
307 */
308void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx)
309{
310 if (ctx == NULL || IS_ERR(ctx))
311 return;
312
313 if (ctx->ctfm && !IS_ERR(ctx->ctfm))
314 crypto_free_ablkcipher(ctx->ctfm);
315 if (ctx->htfm && !IS_ERR(ctx->htfm))
316 crypto_free_hash(ctx->htfm);
317 if (ctx->workpage && !IS_ERR(ctx->workpage))
318 __free_page(ctx->workpage);
319 kfree(ctx);
320}
321
322/**
323 * ext4_put_fname_crypto_ctx() - Release a filename crypto context
324 *
325 * Returns the crypto context to a free list. If the free list is above a
326 * threshold, completely frees up the context and returns the memory.
327 *
328 * TODO: Currently we directly free the crypto context. Eventually we should
329 * add code to return it to the free list. Such an approach will increase
330 * the efficiency of directory lookups.
331 */
332void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx)
333{
334 if (*ctx == NULL || IS_ERR(*ctx))
335 return;
336 ext4_free_fname_crypto_ctx(*ctx);
337 *ctx = NULL;
338}
339
340/**
341 * ext4_search_fname_crypto_ctx() - Look up a context for @key (stub; returns NULL)
342 */
343static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx(
344 const struct ext4_encryption_key *key)
345{
346 return NULL;
347}
348
349/**
350 * ext4_alloc_fname_crypto_ctx() - Allocate a filename crypto context
351 */
352struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx(
353 const struct ext4_encryption_key *key)
354{
355 struct ext4_fname_crypto_ctx *ctx;
356
357 ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS);
358 if (ctx == NULL)
359 return ERR_PTR(-ENOMEM);
360 if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) {
361 /* This will automatically set the key mode to invalid,
362 * since the enum value of EXT4_ENCRYPTION_MODE_INVALID is zero */
363 memset(&ctx->key, 0, sizeof(ctx->key));
364 } else {
365 memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key));
366 }
367 ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode)
368 ? 0 : 1;
369 ctx->ctfm_key_is_ready = 0;
370 ctx->ctfm = NULL;
371 ctx->htfm = NULL;
372 ctx->workpage = NULL;
373 return ctx;
374}
375
376/**
377 * ext4_get_fname_crypto_ctx() - Get a filename crypto context for an inode
378 *
379 * Allocates a free crypto context and initializes it to hold
380 * the crypto material for the inode.
381 *
382 * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise.
383 */
384struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(
385 struct inode *inode, u32 max_ciphertext_len)
386{
387 struct ext4_fname_crypto_ctx *ctx;
388 struct ext4_inode_info *ei = EXT4_I(inode);
389 int res;
390
391 /* Check if the crypto policy is set on the inode */
392 res = ext4_encrypted_inode(inode);
393 if (res == 0)
394 return NULL;
395
396 if (!ext4_has_encryption_key(inode))
397 ext4_generate_encryption_key(inode);
398
399 /* Get a crypto context based on the key.
400 * A new context is allocated if no context matches the requested key.
401 */
402 ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key));
403 if (ctx == NULL)
404 ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key));
405 if (IS_ERR(ctx))
406 return ctx;
407
408 if (ctx->has_valid_key) {
409 if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) {
410 printk_once(KERN_WARNING
411 "ext4: unsupported key mode %d\n",
412 ctx->key.mode);
413 return ERR_PTR(-ENOKEY);
414 }
415
416 /* As a first cut, we allocate a new tfm on every call.
417 * Later, we will keep the tfm around in case the key
418 * gets re-used */
419 if (ctx->ctfm == NULL) {
420 ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))",
421 0, 0);
422 }
423 if (IS_ERR(ctx->ctfm)) {
424 res = PTR_ERR(ctx->ctfm);
425 printk(
426 KERN_DEBUG "%s: error (%d) allocating crypto tfm\n",
427 __func__, res);
428 ctx->ctfm = NULL;
429 ext4_put_fname_crypto_ctx(&ctx);
430 return ERR_PTR(res);
431 }
432 if (ctx->ctfm == NULL) {
433 printk(
434 KERN_DEBUG "%s: could not allocate crypto tfm\n",
435 __func__);
436 ext4_put_fname_crypto_ctx(&ctx);
437 return ERR_PTR(-ENOMEM);
438 }
439 if (ctx->workpage == NULL)
440 ctx->workpage = alloc_page(GFP_NOFS);
441 if (IS_ERR(ctx->workpage)) {
442 res = PTR_ERR(ctx->workpage);
443 printk(
444 KERN_DEBUG "%s: error (%d) allocating work page\n",
445 __func__, res);
446 ctx->workpage = NULL;
447 ext4_put_fname_crypto_ctx(&ctx);
448 return ERR_PTR(res);
449 }
450 if (ctx->workpage == NULL) {
451 printk(
452 KERN_DEBUG "%s: could not allocate work page\n",
453 __func__);
454 ext4_put_fname_crypto_ctx(&ctx);
455 return ERR_PTR(-ENOMEM);
456 }
457 ctx->lim = max_ciphertext_len;
458 crypto_ablkcipher_clear_flags(ctx->ctfm, ~0);
459 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm),
460 CRYPTO_TFM_REQ_WEAK_KEY);
461
462 /* If we are lucky, we will get a context that is already
463 * set up with the right key. Else, we will have to
464 * set the key */
465 if (!ctx->ctfm_key_is_ready) {
466 /* Since our crypto objectives for filename
467 * encryption are pretty weak, we directly use
468 * the inode master key */
469 res = crypto_ablkcipher_setkey(ctx->ctfm,
470 ctx->key.raw, ctx->key.size);
471 if (res) {
472 ext4_put_fname_crypto_ctx(&ctx);
473 return ERR_PTR(-EIO);
474 }
475 ctx->ctfm_key_is_ready = 1;
476 } else {
477 /* In the current implementation, key should never be
478 * marked "ready" for a context that has just been
479 * allocated. So we should never reach here */
480 BUG();
481 }
482 }
483 if (ctx->htfm == NULL)
484 ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
485 if (IS_ERR(ctx->htfm)) {
486 res = PTR_ERR(ctx->htfm);
487 printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n",
488 __func__, res);
489 ctx->htfm = NULL;
490 ext4_put_fname_crypto_ctx(&ctx);
491 return ERR_PTR(res);
492 }
493 if (ctx->htfm == NULL) {
494 printk(KERN_DEBUG "%s: could not allocate hash tfm\n",
495 __func__);
496 ext4_put_fname_crypto_ctx(&ctx);
497 return ERR_PTR(-ENOMEM);
498 }
499
500 return ctx;
501}
502
503/**
504 * ext4_fname_crypto_round_up() - Round a size up to the crypto block size
505 *
506 * Return: @size rounded up to the next multiple of @blksize
507 */
508u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
509{
510 return ((size+blksize-1)/blksize)*blksize;
511}
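/*
 * Examples (illustrative): ext4_fname_crypto_round_up(1, 16) == 16 and
 * ext4_fname_crypto_round_up(17, 16) == 32; a size that is already a
 * multiple of blksize is returned unchanged.
 */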
512
513/**
514 * ext4_fname_crypto_namelen_on_disk() - Compute the on-disk (ciphertext) name length
515 */
516int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
517 u32 namelen)
518{
519 u32 ciphertext_len;
520
521 if (ctx == NULL)
522 return -EIO;
523 if (!(ctx->has_valid_key))
524 return -EACCES;
525 ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ?
526 EXT4_CRYPTO_BLOCK_SIZE : namelen;
527 ciphertext_len = (ciphertext_len > ctx->lim)
528 ? ctx->lim : ciphertext_len;
529 return (int) ciphertext_len;
530}
531
532/**
533 * ext4_fname_crypto_alloc_buffer() - Allocate an output buffer
534 *
535 * Allocates an output buffer that is sufficient for the crypto operation
536 * specified by the context and the direction.
537 */
538int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
539 u32 ilen, struct ext4_str *crypto_str)
540{
541 unsigned int olen;
542
543 if (!ctx)
544 return -EIO;
545 olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE);
546 crypto_str->len = olen;
547 if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
548 olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
549 /* Allocated buffer can hold one more character to null-terminate the
550 * string */
551 crypto_str->name = kmalloc(olen+1, GFP_NOFS);
552 if (!(crypto_str->name))
553 return -ENOMEM;
554 return 0;
555}
556
557/**
558 * ext4_fname_crypto_free_buffer() - Free a crypto output buffer
559 *
560 * Frees the buffer allocated for crypto operation.
561 */
562void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
563{
564 if (!crypto_str)
565 return;
566 kfree(crypto_str->name);
567 crypto_str->name = NULL;
568}
569
570/**
571 * _ext4_fname_disk_to_usr() - converts a filename from disk space to user space
572 */
573int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
574 const struct ext4_str *iname,
575 struct ext4_str *oname)
576{
577 if (ctx == NULL)
578 return -EIO;
579 if (iname->len < 3) {
580 /* Check for . and .. */
581 if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
582 oname->name[0] = '.';
583 oname->name[iname->len-1] = '.';
584 oname->len = iname->len;
585 return oname->len;
586 }
587 }
588 if (ctx->has_valid_key)
589 return ext4_fname_decrypt(ctx, iname, oname);
590 else
591 return ext4_fname_hash(ctx, iname, oname);
592}
593
594int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
595 const struct ext4_dir_entry_2 *de,
596 struct ext4_str *oname)
597{
598 struct ext4_str iname = {.name = (unsigned char *) de->name,
599 .len = de->name_len };
600
601 return _ext4_fname_disk_to_usr(ctx, &iname, oname);
602}
603
604
605/**
606 * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
607 */
608int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
609 const struct qstr *iname,
610 struct ext4_str *oname)
611{
612 int res;
613
614 if (ctx == NULL)
615 return -EIO;
616 if (iname->len < 3) {
617 /* Check for . and .. */
618 if (iname->name[0] == '.' &&
619 iname->name[iname->len-1] == '.') {
620 oname->name[0] = '.';
621 oname->name[iname->len-1] = '.';
622 oname->len = iname->len;
623 return oname->len;
624 }
625 }
626 if (ctx->has_valid_key) {
627 res = ext4_fname_encrypt(ctx, iname, oname);
628 return res;
629 }
630 /* Without a proper key, a user is not allowed to modify the filenames
631 * in a directory. Consequently, a user space name cannot be mapped to
632 * a disk-space name */
633 return -EACCES;
634}
635
636/*
637 * Calculate the htree hash from a filename from user space
638 */
639int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
640 const struct qstr *iname,
641 struct dx_hash_info *hinfo)
642{
643 struct ext4_str tmp, tmp2;
644 int ret = 0;
645
646 if (!ctx || !ctx->has_valid_key ||
647 ((iname->name[0] == '.') &&
648 ((iname->len == 1) ||
649 ((iname->name[1] == '.') && (iname->len == 2))))) {
650 ext4fs_dirhash(iname->name, iname->len, hinfo);
651 return 0;
652 }
653
654 /* First encrypt the plaintext name */
655 ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp);
656 if (ret < 0)
657 return ret;
658
659 ret = ext4_fname_encrypt(ctx, iname, &tmp);
660 if (ret < 0)
661 goto out;
662
663 tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
664 tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL);
665 if (tmp2.name == NULL) {
666 ret = -ENOMEM;
667 goto out;
668 }
669
670 ret = ext4_fname_hash(ctx, &tmp, &tmp2);
671 if (ret > 0)
672 ext4fs_dirhash(tmp2.name, tmp2.len, hinfo);
673 ext4_fname_crypto_free_buffer(&tmp2);
674out:
675 ext4_fname_crypto_free_buffer(&tmp);
676 return ret;
677}
678
679/**
680 * ext4_fname_disk_to_hash() - Calculate the htree hash from a filename on disk
681 */
682int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
683 const struct ext4_dir_entry_2 *de,
684 struct dx_hash_info *hinfo)
685{
686 struct ext4_str iname = {.name = (unsigned char *) de->name,
687 .len = de->name_len};
688 struct ext4_str tmp;
689 int ret;
690
691 if (!ctx ||
692 ((iname.name[0] == '.') &&
693 ((iname.len == 1) ||
694 ((iname.name[1] == '.') && (iname.len == 2))))) {
695 ext4fs_dirhash(iname.name, iname.len, hinfo);
696 return 0;
697 }
698
699 tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
700 tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL);
701 if (tmp.name == NULL)
702 return -ENOMEM;
703
704 ret = ext4_fname_hash(ctx, &iname, &tmp);
705 if (ret > 0)
706 ext4fs_dirhash(tmp.name, tmp.len, hinfo);
707 ext4_fname_crypto_free_buffer(&tmp);
708 return ret;
709}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
new file mode 100644
index 000000000000..c8392af8abbb
--- /dev/null
+++ b/fs/ext4/crypto_key.c
@@ -0,0 +1,165 @@
1/*
2 * linux/fs/ext4/crypto_key.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption key functions for ext4
7 *
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */
10
11#include <keys/encrypted-type.h>
12#include <keys/user-type.h>
13#include <linux/random.h>
14#include <linux/scatterlist.h>
15#include <uapi/linux/keyctl.h>
16
17#include "ext4.h"
18#include "xattr.h"
19
20static void derive_crypt_complete(struct crypto_async_request *req, int rc)
21{
22 struct ext4_completion_result *ecr = req->data;
23
24 if (rc == -EINPROGRESS)
25 return;
26
27 ecr->res = rc;
28 complete(&ecr->completion);
29}
30
31/**
32 * ext4_derive_key_aes() - Derive a key using AES-128-ECB
33 * @deriving_key: Encryption key used for derivation.
34 * @source_key: Source key to which to apply derivation.
35 * @derived_key: Derived key.
36 *
37 * Return: Zero on success; non-zero otherwise.
38 */
39static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
40 char source_key[EXT4_AES_256_XTS_KEY_SIZE],
41 char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
42{
43 int res = 0;
44 struct ablkcipher_request *req = NULL;
45 DECLARE_EXT4_COMPLETION_RESULT(ecr);
46 struct scatterlist src_sg, dst_sg;
47 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
48 0);
49
50 if (IS_ERR(tfm)) {
51 res = PTR_ERR(tfm);
52 tfm = NULL;
53 goto out;
54 }
55 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
56 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
57 if (!req) {
58 res = -ENOMEM;
59 goto out;
60 }
61 ablkcipher_request_set_callback(req,
62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
63 derive_crypt_complete, &ecr);
64 res = crypto_ablkcipher_setkey(tfm, deriving_key,
65 EXT4_AES_128_ECB_KEY_SIZE);
66 if (res < 0)
67 goto out;
68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
70 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
71 EXT4_AES_256_XTS_KEY_SIZE, NULL);
72 res = crypto_ablkcipher_encrypt(req);
73 if (res == -EINPROGRESS || res == -EBUSY) {
74 BUG_ON(req->base.data != &ecr);
75 wait_for_completion(&ecr.completion);
76 res = ecr.res;
77 }
78
79out:
80 if (req)
81 ablkcipher_request_free(req);
82 if (tfm)
83 crypto_free_ablkcipher(tfm);
84 return res;
85}
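/*
 * Sketch of the scheme implemented above: the derived per-inode key is
 * AES-128-ECB(key = deriving_key, data = source_key), i.e. the 16-byte
 * nonce from the inode's xattr keys a single encryption pass over the
 * 64-byte master key.
 */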
86
87/**
88 * ext4_generate_encryption_key() - generates an encryption key
89 * @inode: The inode to generate the encryption key for.
90 */
91int ext4_generate_encryption_key(struct inode *inode)
92{
93 struct ext4_inode_info *ei = EXT4_I(inode);
94 struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
95 char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
96 (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
97 struct key *keyring_key = NULL;
98 struct ext4_encryption_key *master_key;
99 struct ext4_encryption_context ctx;
100 struct user_key_payload *ukp;
101 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
102 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
103 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
104 &ctx, sizeof(ctx));
105
106 if (res != sizeof(ctx)) {
107 if (res > 0)
108 res = -EINVAL;
109 goto out;
110 }
111 res = 0;
112
113 if (S_ISREG(inode->i_mode))
114 crypt_key->mode = ctx.contents_encryption_mode;
115 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
116 crypt_key->mode = ctx.filenames_encryption_mode;
117 else {
118 printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n");
119 BUG();
120 }
121 crypt_key->size = ext4_encryption_key_size(crypt_key->mode);
122 BUG_ON(!crypt_key->size);
123 if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
124 memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
125 goto out;
126 }
127 memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
128 EXT4_KEY_DESC_PREFIX_SIZE);
129 sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
130 "%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
131 ctx.master_key_descriptor);
132 full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
133 (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
134 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
135 if (IS_ERR(keyring_key)) {
136 res = PTR_ERR(keyring_key);
137 keyring_key = NULL;
138 goto out;
139 }
140 BUG_ON(keyring_key->type != &key_type_logon);
141 ukp = ((struct user_key_payload *)keyring_key->payload.data);
142 if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
143 res = -EINVAL;
144 goto out;
145 }
146 master_key = (struct ext4_encryption_key *)ukp->data;
147 BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
148 EXT4_KEY_DERIVATION_NONCE_SIZE);
149 BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
150 res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw);
151out:
152 if (keyring_key)
153 key_put(keyring_key);
154 if (res < 0)
155 crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID;
156 return res;
157}
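/*
 * Example (illustrative): for a master key descriptor of eight 0x42
 * bytes, the logon key requested above would be named
 * "ext4:4242424242424242", assuming EXT4_KEY_DESC_PREFIX is "ext4:" and
 * EXT4_KEY_DESCRIPTOR_SIZE is 8.
 */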
158
159int ext4_has_encryption_key(struct inode *inode)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
163
164 return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID);
165}
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
new file mode 100644
index 000000000000..30eaf9e9864a
--- /dev/null
+++ b/fs/ext4/crypto_policy.c
@@ -0,0 +1,194 @@
1/*
2 * linux/fs/ext4/crypto_policy.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption policy functions for ext4
7 *
8 * Written by Michael Halcrow, 2015.
9 */
10
11#include <linux/random.h>
12#include <linux/string.h>
13#include <linux/types.h>
14
15#include "ext4.h"
16#include "xattr.h"
17
18static int ext4_inode_has_encryption_context(struct inode *inode)
19{
20 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
21 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
22 return (res > 0);
23}
24
25/*
26 * check whether the policy is consistent with the encryption context
27 * for the inode
28 */
29static int ext4_is_encryption_context_consistent_with_policy(
30 struct inode *inode, const struct ext4_encryption_policy *policy)
31{
32 struct ext4_encryption_context ctx;
33 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
34 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
35 sizeof(ctx));
36 if (res != sizeof(ctx))
37 return 0;
38 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
39 EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
40 (ctx.contents_encryption_mode ==
41 policy->contents_encryption_mode) &&
42 (ctx.filenames_encryption_mode ==
43 policy->filenames_encryption_mode));
44}
45
46static int ext4_create_encryption_context_from_policy(
47 struct inode *inode, const struct ext4_encryption_policy *policy)
48{
49 struct ext4_encryption_context ctx;
50 int res = 0;
51
52 ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
53 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
54 EXT4_KEY_DESCRIPTOR_SIZE);
55 if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
56 printk(KERN_WARNING
57 "%s: Invalid contents encryption mode %d\n", __func__,
58 policy->contents_encryption_mode);
59 res = -EINVAL;
60 goto out;
61 }
62 if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
63 printk(KERN_WARNING
64 "%s: Invalid filenames encryption mode %d\n", __func__,
65 policy->filenames_encryption_mode);
66 res = -EINVAL;
67 goto out;
68 }
69 ctx.contents_encryption_mode = policy->contents_encryption_mode;
70 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
71 BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
72 get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
73
74 res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
75 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
76 sizeof(ctx), 0);
77out:
78 if (!res)
79 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
80 return res;
81}
82
83int ext4_process_policy(const struct ext4_encryption_policy *policy,
84 struct inode *inode)
85{
86 if (policy->version != 0)
87 return -EINVAL;
88
89 if (!ext4_inode_has_encryption_context(inode)) {
90 if (!ext4_empty_dir(inode))
91 return -ENOTEMPTY;
92 return ext4_create_encryption_context_from_policy(inode,
93 policy);
94 }
95
96 if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
97 return 0;
98
99 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
100 __func__);
101 return -EINVAL;
102}
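/*
 * Illustrative caller sketch (hypothetical glue; the real handler lives
 * in ioctl.c): the EXT4_IOC_SET_ENCRYPTION_POLICY path is expected to
 * copy the policy in from userspace and hand it to ext4_process_policy():
 *
 *	struct ext4_encryption_policy policy;
 *
 *	if (copy_from_user(&policy, (void __user *)arg, sizeof(policy)))
 *		return -EFAULT;
 *	return ext4_process_policy(&policy, file_inode(filp));
 */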
103
104int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
105{
106 struct ext4_encryption_context ctx;
107
108 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
109 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
110 &ctx, sizeof(ctx));
111 if (res != sizeof(ctx))
112 return -ENOENT;
113 if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
114 return -EINVAL;
115 policy->version = 0;
116 policy->contents_encryption_mode = ctx.contents_encryption_mode;
117 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
118 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
119 EXT4_KEY_DESCRIPTOR_SIZE);
120 return 0;
121}
122
123int ext4_is_child_context_consistent_with_parent(struct inode *parent,
124 struct inode *child)
125{
126 struct ext4_encryption_context parent_ctx, child_ctx;
127 int res;
128
129 if ((parent == NULL) || (child == NULL)) {
130 pr_err("parent %p child %p\n", parent, child);
131 BUG_ON(1);
132 }
133 /* no restrictions if the parent directory is not encrypted */
134 if (!ext4_encrypted_inode(parent))
135 return 1;
136 res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
137 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
138 &parent_ctx, sizeof(parent_ctx));
139 if (res != sizeof(parent_ctx))
140 return 0;
141 /* if the child directory is not encrypted, this is always a problem */
142 if (!ext4_encrypted_inode(child))
143 return 0;
144 res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION,
145 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
146 &child_ctx, sizeof(child_ctx));
147 if (res != sizeof(child_ctx))
148 return 0;
149 return (memcmp(parent_ctx.master_key_descriptor,
150 child_ctx.master_key_descriptor,
151 EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
152 (parent_ctx.contents_encryption_mode ==
153 child_ctx.contents_encryption_mode) &&
154 (parent_ctx.filenames_encryption_mode ==
155 child_ctx.filenames_encryption_mode));
156}
157
158/**
159 * ext4_inherit_context() - Sets a child context from its parent
160 * @parent: Parent inode from which the context is inherited.
161 * @child: Child inode that inherits the context from @parent.
162 *
163 * Return: Zero on success, non-zero otherwise
164 */
165int ext4_inherit_context(struct inode *parent, struct inode *child)
166{
167 struct ext4_encryption_context ctx;
168 int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
169 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
170 &ctx, sizeof(ctx));
171
172 if (res != sizeof(ctx)) {
173 if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
174 ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
175 ctx.contents_encryption_mode =
176 EXT4_ENCRYPTION_MODE_AES_256_XTS;
177 ctx.filenames_encryption_mode =
178 EXT4_ENCRYPTION_MODE_AES_256_CTS;
179 memset(ctx.master_key_descriptor, 0x42,
180 EXT4_KEY_DESCRIPTOR_SIZE);
181 res = 0;
182 } else {
183 goto out;
184 }
185 }
186 get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
187 res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
188 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
189 sizeof(ctx), 0);
190out:
191 if (!res)
192 ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
193 return res;
194}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c24143ea9c08..61db51a5ce4c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -22,10 +22,8 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/rbtree.h>
29#include "ext4.h" 27#include "ext4.h"
30#include "xattr.h" 28#include "xattr.h"
31 29
@@ -110,7 +108,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
110 int err; 108 int err;
111 struct inode *inode = file_inode(file); 109 struct inode *inode = file_inode(file);
112 struct super_block *sb = inode->i_sb; 110 struct super_block *sb = inode->i_sb;
111 struct buffer_head *bh = NULL;
113 int dir_has_error = 0; 112 int dir_has_error = 0;
113 struct ext4_fname_crypto_ctx *enc_ctx = NULL;
114 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
114 115
115 if (is_dx_dir(inode)) { 116 if (is_dx_dir(inode)) {
116 err = ext4_dx_readdir(file, ctx); 117 err = ext4_dx_readdir(file, ctx);
@@ -127,17 +128,28 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
127 128
128 if (ext4_has_inline_data(inode)) { 129 if (ext4_has_inline_data(inode)) {
129 int has_inline_data = 1; 130 int has_inline_data = 1;
130 int ret = ext4_read_inline_dir(file, ctx, 131 err = ext4_read_inline_dir(file, ctx,
131 &has_inline_data); 132 &has_inline_data);
132 if (has_inline_data) 133 if (has_inline_data)
133 return ret; 134 return err;
135 }
136
137 enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN);
138 if (IS_ERR(enc_ctx))
139 return PTR_ERR(enc_ctx);
140 if (enc_ctx) {
141 err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN,
142 &fname_crypto_str);
143 if (err < 0) {
144 ext4_put_fname_crypto_ctx(&enc_ctx);
145 return err;
146 }
134 } 147 }
135 148
136 offset = ctx->pos & (sb->s_blocksize - 1); 149 offset = ctx->pos & (sb->s_blocksize - 1);
137 150
138 while (ctx->pos < inode->i_size) { 151 while (ctx->pos < inode->i_size) {
139 struct ext4_map_blocks map; 152 struct ext4_map_blocks map;
140 struct buffer_head *bh = NULL;
141 153
142 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); 154 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
143 map.m_len = 1; 155 map.m_len = 1;
@@ -180,6 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
180 (unsigned long long)ctx->pos); 192 (unsigned long long)ctx->pos);
181 ctx->pos += sb->s_blocksize - offset; 193 ctx->pos += sb->s_blocksize - offset;
182 brelse(bh); 194 brelse(bh);
195 bh = NULL;
183 continue; 196 continue;
184 } 197 }
185 set_buffer_verified(bh); 198 set_buffer_verified(bh);
@@ -226,25 +239,44 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
226 offset += ext4_rec_len_from_disk(de->rec_len, 239 offset += ext4_rec_len_from_disk(de->rec_len,
227 sb->s_blocksize); 240 sb->s_blocksize);
228 if (le32_to_cpu(de->inode)) { 241 if (le32_to_cpu(de->inode)) {
229 if (!dir_emit(ctx, de->name, 242 if (enc_ctx == NULL) {
230 de->name_len, 243 /* Directory is not encrypted */
231 le32_to_cpu(de->inode), 244 if (!dir_emit(ctx, de->name,
232 get_dtype(sb, de->file_type))) { 245 de->name_len,
233 brelse(bh); 246 le32_to_cpu(de->inode),
234 return 0; 247 get_dtype(sb, de->file_type)))
248 goto done;
249 } else {
250 /* Directory is encrypted */
251 err = ext4_fname_disk_to_usr(enc_ctx,
252 de, &fname_crypto_str);
253 if (err < 0)
254 goto errout;
255 if (!dir_emit(ctx,
256 fname_crypto_str.name, err,
257 le32_to_cpu(de->inode),
258 get_dtype(sb, de->file_type)))
259 goto done;
235 } 260 }
236 } 261 }
237 ctx->pos += ext4_rec_len_from_disk(de->rec_len, 262 ctx->pos += ext4_rec_len_from_disk(de->rec_len,
238 sb->s_blocksize); 263 sb->s_blocksize);
239 } 264 }
240 offset = 0; 265 if ((ctx->pos < inode->i_size) && !dir_relax(inode))
266 goto done;
241 brelse(bh); 267 brelse(bh);
242 if (ctx->pos < inode->i_size) { 268 bh = NULL;
243 if (!dir_relax(inode)) 269 offset = 0;
244 return 0;
245 }
246 } 270 }
247 return 0; 271done:
272 err = 0;
273errout:
274#ifdef CONFIG_EXT4_FS_ENCRYPTION
275 ext4_put_fname_crypto_ctx(&enc_ctx);
276 ext4_fname_crypto_free_buffer(&fname_crypto_str);
277#endif
278 brelse(bh);
279 return err;
248} 280}
249 281
250static inline int is_32bit_api(void) 282static inline int is_32bit_api(void)
@@ -384,10 +416,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p)
384 416
385/* 417/*
386 * Given a directory entry, enter it into the fname rb tree. 418 * Given a directory entry, enter it into the fname rb tree.
419 *
420 * When filename encryption is enabled, the dirent will hold the
421 * encrypted filename, while the htree will hold the decrypted filename.
422 * The decrypted filename is passed in via the ent_name parameter.
387 */ 423 */
388int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 424int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
389 __u32 minor_hash, 425 __u32 minor_hash,
390 struct ext4_dir_entry_2 *dirent) 426 struct ext4_dir_entry_2 *dirent,
427 struct ext4_str *ent_name)
391{ 428{
392 struct rb_node **p, *parent = NULL; 429 struct rb_node **p, *parent = NULL;
393 struct fname *fname, *new_fn; 430 struct fname *fname, *new_fn;
@@ -398,17 +435,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
398 p = &info->root.rb_node; 435 p = &info->root.rb_node;
399 436
400 /* Create and allocate the fname structure */ 437 /* Create and allocate the fname structure */
401 len = sizeof(struct fname) + dirent->name_len + 1; 438 len = sizeof(struct fname) + ent_name->len + 1;
402 new_fn = kzalloc(len, GFP_KERNEL); 439 new_fn = kzalloc(len, GFP_KERNEL);
403 if (!new_fn) 440 if (!new_fn)
404 return -ENOMEM; 441 return -ENOMEM;
405 new_fn->hash = hash; 442 new_fn->hash = hash;
406 new_fn->minor_hash = minor_hash; 443 new_fn->minor_hash = minor_hash;
407 new_fn->inode = le32_to_cpu(dirent->inode); 444 new_fn->inode = le32_to_cpu(dirent->inode);
408 new_fn->name_len = dirent->name_len; 445 new_fn->name_len = ent_name->len;
409 new_fn->file_type = dirent->file_type; 446 new_fn->file_type = dirent->file_type;
410 memcpy(new_fn->name, dirent->name, dirent->name_len); 447 memcpy(new_fn->name, ent_name->name, ent_name->len);
411 new_fn->name[dirent->name_len] = 0; 448 new_fn->name[ent_name->len] = 0;
412 449
413 while (*p) { 450 while (*p) {
414 parent = *p; 451 parent = *p;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c8eb32eefc3c..ef267adce19a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -422,7 +422,7 @@ enum {
422 EXT4_INODE_DIRTY = 8, 422 EXT4_INODE_DIRTY = 8,
423 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ 423 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
424 EXT4_INODE_NOCOMPR = 10, /* Don't compress */ 424 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
425 EXT4_INODE_ENCRYPT = 11, /* Compression error */ 425 EXT4_INODE_ENCRYPT = 11, /* Encrypted file */
426/* End compression flags --- maybe not all used */ 426/* End compression flags --- maybe not all used */
427 EXT4_INODE_INDEX = 12, /* hash-indexed directory */ 427 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
428 EXT4_INODE_IMAGIC = 13, /* AFS directory */ 428 EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -582,6 +582,15 @@ enum {
582#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 582#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
583#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 583#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
584 584
585/* Encryption algorithms */
586#define EXT4_ENCRYPTION_MODE_INVALID 0
587#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
588#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
589#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
590#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4
591
592#include "ext4_crypto.h"
593
585/* 594/*
586 * ioctl commands 595 * ioctl commands
587 */ 596 */
@@ -603,6 +612,9 @@ enum {
603#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 612#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
604#define EXT4_IOC_SWAP_BOOT _IO('f', 17) 613#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
605#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) 614#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
615#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy)
616#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
617#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
606 618
607#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 619#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
608/* 620/*
@@ -939,6 +951,11 @@ struct ext4_inode_info {
939 951
940 /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ 952 /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
941 __u32 i_csum_seed; 953 __u32 i_csum_seed;
954
955#ifdef CONFIG_EXT4_FS_ENCRYPTION
956 /* Encryption params */
957 struct ext4_encryption_key i_encryption_key;
958#endif
942}; 959};
943 960
944/* 961/*
@@ -1142,7 +1159,8 @@ struct ext4_super_block {
1142 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1159 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1143 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1160 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1144 __u8 s_checksum_type; /* metadata checksum algorithm used */ 1161 __u8 s_checksum_type; /* metadata checksum algorithm used */
1145 __le16 s_reserved_pad; 1162 __u8 s_encryption_level; /* versioning level for encryption */
1163 __u8 s_reserved_pad; /* Padding to next 32bits */
1146 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1164 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
1147 __le32 s_snapshot_inum; /* Inode number of active snapshot */ 1165 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1148 __le32 s_snapshot_id; /* sequential ID of active snapshot */ 1166 __le32 s_snapshot_id; /* sequential ID of active snapshot */
@@ -1169,7 +1187,9 @@ struct ext4_super_block {
1169 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ 1187 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
1170 __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ 1188 __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
1171 __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ 1189 __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
1172 __le32 s_reserved[105]; /* Padding to the end of the block */ 1190 __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
1191 __le32 s_lpf_ino; /* Location of the lost+found inode */
1192 __le32 s_reserved[100]; /* Padding to the end of the block */
1173 __le32 s_checksum; /* crc32c(superblock) */ 1193 __le32 s_checksum; /* crc32c(superblock) */
1174}; 1194};
1175 1195
@@ -1180,8 +1200,16 @@ struct ext4_super_block {
1180/* 1200/*
1181 * run-time mount flags 1201 * run-time mount flags
1182 */ 1202 */
1183#define EXT4_MF_MNTDIR_SAMPLED 0x0001 1203#define EXT4_MF_MNTDIR_SAMPLED 0x0001
1184#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ 1204#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
1205#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
1206
1207#ifdef CONFIG_EXT4_FS_ENCRYPTION
1208#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
1209 EXT4_MF_TEST_DUMMY_ENCRYPTION))
1210#else
1211#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
1212#endif
1185 1213
1186/* Number of quota types we support */ 1214/* Number of quota types we support */
1187#define EXT4_MAXQUOTAS 2 1215#define EXT4_MAXQUOTAS 2
@@ -1351,6 +1379,12 @@ struct ext4_sb_info {
1351 struct ratelimit_state s_err_ratelimit_state; 1379 struct ratelimit_state s_err_ratelimit_state;
1352 struct ratelimit_state s_warning_ratelimit_state; 1380 struct ratelimit_state s_warning_ratelimit_state;
1353 struct ratelimit_state s_msg_ratelimit_state; 1381 struct ratelimit_state s_msg_ratelimit_state;
1382
1383#ifdef CONFIG_EXT4_FS_ENCRYPTION
1384 /* Encryption */
1385 uint32_t s_file_encryption_mode;
1386 uint32_t s_dir_encryption_mode;
1387#endif
1354}; 1388};
1355 1389
1356static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1390static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1466,6 +1500,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1466#define EXT4_SB(sb) (sb) 1500#define EXT4_SB(sb) (sb)
1467#endif 1501#endif
1468 1502
1503/*
1504 * Returns true if the inode is encrypted
1505 */
1506static inline int ext4_encrypted_inode(struct inode *inode)
1507{
1508#ifdef CONFIG_EXT4_FS_ENCRYPTION
1509 return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
1510#else
1511 return 0;
1512#endif
1513}
1514
1469#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime 1515#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
1470 1516
1471/* 1517/*
@@ -1575,8 +1621,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1575 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1621 EXT4_FEATURE_INCOMPAT_EXTENTS| \
1576 EXT4_FEATURE_INCOMPAT_64BIT| \ 1622 EXT4_FEATURE_INCOMPAT_64BIT| \
1577 EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1623 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1578 EXT4_FEATURE_INCOMPAT_MMP | \ 1624 EXT4_FEATURE_INCOMPAT_MMP | \
1579 EXT4_FEATURE_INCOMPAT_INLINE_DATA) 1625 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
1626 EXT4_FEATURE_INCOMPAT_ENCRYPT)
1580#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1627#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1581 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1628 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1582 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ 1629 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2001,6 +2048,99 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
2001 struct ext4_group_desc *gdp); 2048 struct ext4_group_desc *gdp);
2002ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 2049ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
2003 2050
2051/* crypto_policy.c */
2052int ext4_is_child_context_consistent_with_parent(struct inode *parent,
2053 struct inode *child);
2054int ext4_inherit_context(struct inode *parent, struct inode *child);
2055void ext4_to_hex(char *dst, char *src, size_t src_size);
2056int ext4_process_policy(const struct ext4_encryption_policy *policy,
2057 struct inode *inode);
2058int ext4_get_policy(struct inode *inode,
2059 struct ext4_encryption_policy *policy);
2060
2061/* crypto.c */
2062bool ext4_valid_contents_enc_mode(uint32_t mode);
2063uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
2064extern struct workqueue_struct *ext4_read_workqueue;
2065struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
2066void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
2067void ext4_restore_control_page(struct page *data_page);
2068struct page *ext4_encrypt(struct inode *inode,
2069 struct page *plaintext_page);
2070int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
2071int ext4_decrypt_one(struct inode *inode, struct page *page);
2072int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
2073
2074#ifdef CONFIG_EXT4_FS_ENCRYPTION
2075int ext4_init_crypto(void);
2076void ext4_exit_crypto(void);
2077static inline int ext4_sb_has_crypto(struct super_block *sb)
2078{
2079 return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
2080}
2081#else
2082static inline int ext4_init_crypto(void) { return 0; }
2083static inline void ext4_exit_crypto(void) { }
2084static inline int ext4_sb_has_crypto(struct super_block *sb)
2085{
2086 return 0;
2087}
2088#endif
2089
2090/* crypto_fname.c */
2091bool ext4_valid_filenames_enc_mode(uint32_t mode);
2092u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
2093int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
2094 u32 ilen, struct ext4_str *crypto_str);
2095int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
2096 const struct ext4_str *iname,
2097 struct ext4_str *oname);
2098int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
2099 const struct ext4_dir_entry_2 *de,
2100 struct ext4_str *oname);
2101int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
2102 const struct qstr *iname,
2103 struct ext4_str *oname);
2104int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
2105 const struct qstr *iname,
2106 struct dx_hash_info *hinfo);
2107int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
2108 const struct ext4_dir_entry_2 *de,
2109 struct dx_hash_info *hinfo);
2110int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
2111 u32 namelen);
2112
2113#ifdef CONFIG_EXT4_FS_ENCRYPTION
2114void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx);
2115struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
2116 u32 max_len);
2117void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
2118#else
2119static inline
2120void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { }
2121static inline
2122struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
2123 u32 max_len)
2124{
2125 return NULL;
2126}
2127static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
2128#endif
2129
2130
2131/* crypto_key.c */
2132int ext4_generate_encryption_key(struct inode *inode);
2133
2134#ifdef CONFIG_EXT4_FS_ENCRYPTION
2135int ext4_has_encryption_key(struct inode *inode);
2136#else
2137static inline int ext4_has_encryption_key(struct inode *inode)
2138{
2139 return 0;
2140}
2141#endif
2142
2143
2004/* dir.c */ 2144/* dir.c */
2005extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 2145extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
2006 struct file *, 2146 struct file *,
@@ -2011,17 +2151,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
2011 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 2151 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
2012 (de), (bh), (buf), (size), (offset))) 2152 (de), (bh), (buf), (size), (offset)))
2013extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 2153extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
2014 __u32 minor_hash, 2154 __u32 minor_hash,
2015 struct ext4_dir_entry_2 *dirent); 2155 struct ext4_dir_entry_2 *dirent,
2156 struct ext4_str *ent_name);
2016extern void ext4_htree_free_dir_info(struct dir_private_info *p); 2157extern void ext4_htree_free_dir_info(struct dir_private_info *p);
2017extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, 2158extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
2018 struct buffer_head *bh, 2159 struct buffer_head *bh,
2019 void *buf, int buf_size, 2160 void *buf, int buf_size,
2020 const char *name, int namelen, 2161 const char *name, int namelen,
2021 struct ext4_dir_entry_2 **dest_de); 2162 struct ext4_dir_entry_2 **dest_de);
2022void ext4_insert_dentry(struct inode *inode, 2163int ext4_insert_dentry(struct inode *dir,
2164 struct inode *inode,
2023 struct ext4_dir_entry_2 *de, 2165 struct ext4_dir_entry_2 *de,
2024 int buf_size, 2166 int buf_size,
2167 const struct qstr *iname,
2025 const char *name, int namelen); 2168 const char *name, int namelen);
2026static inline void ext4_update_dx_flag(struct inode *inode) 2169static inline void ext4_update_dx_flag(struct inode *inode)
2027{ 2170{
@@ -2099,6 +2242,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
2099extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 2242extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
2100 2243
2101/* inode.c */ 2244/* inode.c */
2245int ext4_inode_is_fast_symlink(struct inode *inode);
2102struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2246struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2103struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2247struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2104int ext4_get_block_write(struct inode *inode, sector_t iblock, 2248int ext4_get_block_write(struct inode *inode, sector_t iblock,
@@ -2189,6 +2333,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
2189 void *entry_buf, 2333 void *entry_buf,
2190 int buf_size, 2334 int buf_size,
2191 int csum_size); 2335 int csum_size);
2336extern int ext4_empty_dir(struct inode *inode);
2192 2337
2193/* resize.c */ 2338/* resize.c */
2194extern int ext4_group_add(struct super_block *sb, 2339extern int ext4_group_add(struct super_block *sb,
@@ -2698,6 +2843,10 @@ static inline void ext4_set_de_type(struct super_block *sb,
2698 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 2843 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
2699} 2844}
2700 2845
2846/* readpages.c */
2847extern int ext4_mpage_readpages(struct address_space *mapping,
2848 struct list_head *pages, struct page *page,
2849 unsigned nr_pages);
2701 2850
2702/* symlink.c */ 2851/* symlink.c */
2703extern const struct inode_operations ext4_symlink_inode_operations; 2852extern const struct inode_operations ext4_symlink_inode_operations;
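
The ext4.h hunks above follow the kernel's usual config-stub idiom: under CONFIG_EXT4_FS_ENCRYPTION each crypto entry point gets a real prototype, while the #else branch supplies static inline no-ops (ext4_init_crypto() returning 0, ext4_sb_has_crypto() returning 0), so call sites compile unchanged either way. A minimal, self-contained sketch of that idiom; the CONFIG_FOO_CRYPTO macro and foo_* names are illustrative, not the kernel's:

/* stub_idiom.c - illustrative only; mirrors the #ifdef pattern above */
#include <stdio.h>

/* #define CONFIG_FOO_CRYPTO 1 */	/* toggled at build time */

#ifdef CONFIG_FOO_CRYPTO
int foo_init_crypto(void);		/* real implementation linked in elsewhere */
#else
static inline int foo_init_crypto(void) { return 0; }	/* no-op stub */
#endif

int main(void)
{
	/* the caller needs no #ifdef of its own; the stub keeps this line valid */
	printf("init: %d\n", foo_init_crypto());
	return 0;
}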
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
new file mode 100644
index 000000000000..c2ba35a914b6
--- /dev/null
+++ b/fs/ext4/ext4_crypto.h
@@ -0,0 +1,147 @@
1/*
2 * linux/fs/ext4/ext4_crypto.h
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption header content for ext4
7 *
8 * Written by Michael Halcrow, 2015.
9 */
10
11#ifndef _EXT4_CRYPTO_H
12#define _EXT4_CRYPTO_H
13
14#include <linux/fs.h>
15
16#define EXT4_KEY_DESCRIPTOR_SIZE 8
17
18/* Policy provided via an ioctl on the topmost directory */
19struct ext4_encryption_policy {
20 char version;
21 char contents_encryption_mode;
22 char filenames_encryption_mode;
23 char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
24} __attribute__((__packed__));
25
26#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
27#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
28
29/**
30 * Encryption context for inode
31 *
32 * Protector format:
33 * 1 byte: Protector format (1 = this version)
34 * 1 byte: File contents encryption mode
35 * 1 byte: File names encryption mode
36 * 1 byte: Reserved
37 * 8 bytes: Master Key descriptor
38 * 16 bytes: Encryption Key derivation nonce
39 */
40struct ext4_encryption_context {
41 char format;
42 char contents_encryption_mode;
43 char filenames_encryption_mode;
44 char reserved;
45 char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
46 char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
47} __attribute__((__packed__));
48
49/* Encryption parameters */
50#define EXT4_XTS_TWEAK_SIZE 16
51#define EXT4_AES_128_ECB_KEY_SIZE 16
52#define EXT4_AES_256_GCM_KEY_SIZE 32
53#define EXT4_AES_256_CBC_KEY_SIZE 32
54#define EXT4_AES_256_CTS_KEY_SIZE 32
55#define EXT4_AES_256_XTS_KEY_SIZE 64
56#define EXT4_MAX_KEY_SIZE 64
57
58#define EXT4_KEY_DESC_PREFIX "ext4:"
59#define EXT4_KEY_DESC_PREFIX_SIZE 5
60
61struct ext4_encryption_key {
62 uint32_t mode;
63 char raw[EXT4_MAX_KEY_SIZE];
64 uint32_t size;
65};
66
67#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
68#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002
69
70struct ext4_crypto_ctx {
71 struct crypto_tfm *tfm; /* Crypto API context */
72 struct page *bounce_page; /* Ciphertext page on write path */
73 struct page *control_page; /* Original page on write path */
74 struct bio *bio; /* The bio for this context */
75 struct work_struct work; /* Work queue for read complete path */
76 struct list_head free_list; /* Free list */
77 int flags; /* Flags */
78 int mode; /* Encryption mode for tfm */
79};
80
81struct ext4_completion_result {
82 struct completion completion;
83 int res;
84};
85
86#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
87 struct ext4_completion_result ecr = { \
88 COMPLETION_INITIALIZER((ecr).completion), 0 }
89
90static inline int ext4_encryption_key_size(int mode)
91{
92 switch (mode) {
93 case EXT4_ENCRYPTION_MODE_AES_256_XTS:
94 return EXT4_AES_256_XTS_KEY_SIZE;
95 case EXT4_ENCRYPTION_MODE_AES_256_GCM:
96 return EXT4_AES_256_GCM_KEY_SIZE;
97 case EXT4_ENCRYPTION_MODE_AES_256_CBC:
98 return EXT4_AES_256_CBC_KEY_SIZE;
99 case EXT4_ENCRYPTION_MODE_AES_256_CTS:
100 return EXT4_AES_256_CTS_KEY_SIZE;
101 default:
102 BUG();
103 }
104 return 0;
105}
106
107#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4
108#define EXT4_CRYPTO_BLOCK_SIZE 16
109#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32
110
111struct ext4_str {
112 unsigned char *name;
113 u32 len;
114};
115
116struct ext4_fname_crypto_ctx {
117 u32 lim;
118 char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE];
119 struct crypto_ablkcipher *ctfm;
120 struct crypto_hash *htfm;
121 struct page *workpage;
122 struct ext4_encryption_key key;
123 unsigned has_valid_key : 1;
124 unsigned ctfm_key_is_ready : 1;
125};
126
127/**
128 * For encrypted symlinks, the ciphertext length is stored at the beginning
129 * of the string in little-endian format.
130 */
131struct ext4_encrypted_symlink_data {
132 __le16 len;
133 char encrypted_path[1];
134} __attribute__((__packed__));
135
136/**
137 * This function is used to calculate the disk space required to
138 * store a filename of length l in encrypted symlink format.
139 */
140static inline u32 encrypted_symlink_data_len(u32 l)
141{
142 if (l < EXT4_CRYPTO_BLOCK_SIZE)
143 l = EXT4_CRYPTO_BLOCK_SIZE;
144 return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
145}
146
147#endif /* _EXT4_CRYPTO_H */
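
The context layout documented above packs to 1 + 1 + 1 + 1 + 8 + 16 = 28 bytes, and encrypted_symlink_data_len() rounds short targets up to one 16-byte cipher block before adding the header (sizeof(struct ext4_encrypted_symlink_data) is 3, minus the 1-byte placeholder array, so the net overhead is 2 bytes). A userspace sketch that checks both, re-declaring the structures locally rather than including kernel headers:

/* layout_check.c - userspace sketch; layouts copied from ext4_crypto.h above */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEY_DESCRIPTOR_SIZE	8
#define DERIVATION_NONCE_SIZE	16
#define CRYPTO_BLOCK_SIZE	16

struct encryption_context {
	char format;
	char contents_encryption_mode;
	char filenames_encryption_mode;
	char reserved;
	char master_key_descriptor[KEY_DESCRIPTOR_SIZE];
	char nonce[DERIVATION_NONCE_SIZE];
} __attribute__((__packed__));

struct encrypted_symlink_data {
	uint16_t len;			/* __le16 on disk */
	char encrypted_path[1];
} __attribute__((__packed__));

static uint32_t encrypted_symlink_data_len(uint32_t l)
{
	if (l < CRYPTO_BLOCK_SIZE)
		l = CRYPTO_BLOCK_SIZE;
	return l + sizeof(struct encrypted_symlink_data) - 1;
}

int main(void)
{
	assert(sizeof(struct encryption_context) == 28);	/* 1+1+1+1+8+16 */
	printf("len(5)  = %u\n", encrypted_symlink_data_len(5));	/* 18: padded to a block */
	printf("len(40) = %u\n", encrypted_symlink_data_len(40));	/* 42: 40 + 2-byte header */
	return 0;
}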
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bed43081720f..973816bfe4a9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1717,12 +1717,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1717{ 1717{
1718 unsigned short ext1_ee_len, ext2_ee_len; 1718 unsigned short ext1_ee_len, ext2_ee_len;
1719 1719
1720 /*
1721 * Make sure that both extents are initialized. We don't merge
1722 * unwritten extents so that we can be sure that end_io code has
1723 * the extent that was written properly split out and conversion to
1724 * initialized is trivial.
1725 */
1726 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) 1720 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1727 return 0; 1721 return 0;
1728 1722
@@ -3128,6 +3122,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3128 ee_len = ext4_ext_get_actual_len(ex); 3122 ee_len = ext4_ext_get_actual_len(ex);
3129 ee_pblock = ext4_ext_pblock(ex); 3123 ee_pblock = ext4_ext_pblock(ex);
3130 3124
3125 if (ext4_encrypted_inode(inode))
3126 return ext4_encrypted_zeroout(inode, ex);
3127
3131 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 3128 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3132 if (ret > 0) 3129 if (ret > 0)
3133 ret = 0; 3130 ret = 0;
@@ -4535,19 +4532,7 @@ got_allocated_blocks:
4535 */ 4532 */
4536 reserved_clusters = get_reserved_cluster_alloc(inode, 4533 reserved_clusters = get_reserved_cluster_alloc(inode,
4537 map->m_lblk, allocated); 4534 map->m_lblk, allocated);
4538 if (map_from_cluster) { 4535 if (!map_from_cluster) {
4539 if (reserved_clusters) {
4540 /*
4541 * We have clusters reserved for this range.
4542 * But since we are not doing actual allocation
4543 * and are simply using blocks from previously
4544 * allocated cluster, we should release the
4545 * reservation and not claim quota.
4546 */
4547 ext4_da_update_reserve_space(inode,
4548 reserved_clusters, 0);
4549 }
4550 } else {
4551 BUG_ON(allocated_clusters < reserved_clusters); 4536 BUG_ON(allocated_clusters < reserved_clusters);
4552 if (reserved_clusters < allocated_clusters) { 4537 if (reserved_clusters < allocated_clusters) {
4553 struct ext4_inode_info *ei = EXT4_I(inode); 4538 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4803,12 +4788,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4803 else 4788 else
4804 max_blocks -= lblk; 4789 max_blocks -= lblk;
4805 4790
4806 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4807 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4808 EXT4_EX_NOCACHE;
4809 if (mode & FALLOC_FL_KEEP_SIZE)
4810 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4811
4812 mutex_lock(&inode->i_mutex); 4791 mutex_lock(&inode->i_mutex);
4813 4792
4814 /* 4793 /*
@@ -4825,15 +4804,28 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4825 ret = inode_newsize_ok(inode, new_size); 4804 ret = inode_newsize_ok(inode, new_size);
4826 if (ret) 4805 if (ret)
4827 goto out_mutex; 4806 goto out_mutex;
4828 /*
4829 * If we have a partial block after EOF we have to allocate
4830 * the entire block.
4831 */
4832 if (partial_end)
4833 max_blocks += 1;
4834 } 4807 }
4835 4808
4809 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4810 if (mode & FALLOC_FL_KEEP_SIZE)
4811 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4812
4813 /* Preallocate the range including the unaligned edges */
4814 if (partial_begin || partial_end) {
4815 ret = ext4_alloc_file_blocks(file,
4816 round_down(offset, 1 << blkbits) >> blkbits,
4817 (round_up((offset + len), 1 << blkbits) -
4818 round_down(offset, 1 << blkbits)) >> blkbits,
4819 new_size, flags, mode);
4820 if (ret)
4821 goto out_mutex;
4822
4823 }
4824
4825 /* Zero range excluding the unaligned edges */
4836 if (max_blocks > 0) { 4826 if (max_blocks > 0) {
4827 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4828 EXT4_EX_NOCACHE);
4837 4829
4838 /* Now release the pages and zero block aligned part of pages*/ 4830 /* Now release the pages and zero block aligned part of pages*/
4839 truncate_pagecache_range(inode, start, end - 1); 4831 truncate_pagecache_range(inode, start, end - 1);
@@ -4847,19 +4839,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4847 flags, mode); 4839 flags, mode);
4848 if (ret) 4840 if (ret)
4849 goto out_dio; 4841 goto out_dio;
4850 /*
4851 * Remove entire range from the extent status tree.
4852 *
4853 * ext4_es_remove_extent(inode, lblk, max_blocks) is
4854 * NOT sufficient. I'm not sure why this is the case,
4855 * but let's be conservative and remove the extent
4856 * status tree for the entire inode. There should be
4857 * no outstanding delalloc extents thanks to the
4858 * filemap_write_and_wait_range() call above.
4859 */
4860 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
4861 if (ret)
4862 goto out_dio;
4863 } 4842 }
4864 if (!partial_begin && !partial_end) 4843 if (!partial_begin && !partial_end)
4865 goto out_dio; 4844 goto out_dio;
@@ -4922,6 +4901,20 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4922 ext4_lblk_t lblk; 4901 ext4_lblk_t lblk;
4923 unsigned int blkbits = inode->i_blkbits; 4902 unsigned int blkbits = inode->i_blkbits;
4924 4903
4904 /*
4905 * Encrypted inodes can't handle collapse range or insert
4906 * range since we would need to re-encrypt blocks with a
4907 * different IV or XTS tweak (which are based on the logical
4908 * block number).
4909 *
4910 * XXX It's not clear why zero range isn't working, but we'll
4911 * leave it disabled for encrypted inodes for now. This is a
4912 * bug we should fix....
4913 */
4914 if (ext4_encrypted_inode(inode) &&
4915 (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
4916 return -EOPNOTSUPP;
4917
4925 /* Return error if mode is not supported */ 4918 /* Return error if mode is not supported */
4926 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 4919 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4927 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) 4920 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
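
Because the XTS tweak is derived from the logical block number, moving ciphertext to a different logical offset would no longer decrypt correctly, which is why the hunk above rejects collapse range (and, pending the noted bug, zero range) on encrypted inodes. A hedged userspace illustration of what callers should expect; the mount path is hypothetical:

/* falloc_check.c - expects EOPNOTSUPP on an encrypted ext4 file */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/crypt/file", O_RDWR);	/* hypothetical encrypted file */
	if (fd < 0) { perror("open"); return 1; }
	/* collapsing 4 KiB at offset 0 would shift ciphertext onto new
	 * logical blocks, so the kernel refuses up front */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, 4096) < 0 &&
	    errno == EOPNOTSUPP)
		fprintf(stderr, "collapse range rejected, as expected\n");
	close(fd);
	return 0;
}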
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e04d45733976..d33d5a6852b9 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -9,12 +9,10 @@
9 * 9 *
10 * Ext4 extents status tree core functions. 10 * Ext4 extents status tree core functions.
11 */ 11 */
12#include <linux/rbtree.h>
13#include <linux/list_sort.h> 12#include <linux/list_sort.h>
14#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include "ext4.h" 15#include "ext4.h"
17#include "extents_status.h"
18 16
19#include <trace/events/ext4.h> 17#include <trace/events/ext4.h>
20 18
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e576d682b353..0613c256c344 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/mount.h> 23#include <linux/mount.h>
25#include <linux/path.h> 24#include <linux/path.h>
26#include <linux/quotaops.h> 25#include <linux/quotaops.h>
@@ -221,6 +220,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
221 220
222static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 221static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
223{ 222{
223 struct inode *inode = file->f_mapping->host;
224
225 if (ext4_encrypted_inode(inode)) {
226 int err = ext4_generate_encryption_key(inode);
227 if (err)
228 return 0;
229 }
224 file_accessed(file); 230 file_accessed(file);
225 if (IS_DAX(file_inode(file))) { 231 if (IS_DAX(file_inode(file))) {
226 vma->vm_ops = &ext4_dax_vm_ops; 232 vma->vm_ops = &ext4_dax_vm_ops;
@@ -238,6 +244,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
238 struct vfsmount *mnt = filp->f_path.mnt; 244 struct vfsmount *mnt = filp->f_path.mnt;
239 struct path path; 245 struct path path;
240 char buf[64], *cp; 246 char buf[64], *cp;
247 int ret;
241 248
242 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && 249 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
243 !(sb->s_flags & MS_RDONLY))) { 250 !(sb->s_flags & MS_RDONLY))) {
@@ -276,11 +283,17 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
276 * writing and the journal is present 283 * writing and the journal is present
277 */ 284 */
278 if (filp->f_mode & FMODE_WRITE) { 285 if (filp->f_mode & FMODE_WRITE) {
279 int ret = ext4_inode_attach_jinode(inode); 286 ret = ext4_inode_attach_jinode(inode);
280 if (ret < 0) 287 if (ret < 0)
281 return ret; 288 return ret;
282 } 289 }
283 return dquot_file_open(inode, filp); 290 ret = dquot_file_open(inode, filp);
291 if (!ret && ext4_encrypted_inode(inode)) {
292 ret = ext4_generate_encryption_key(inode);
293 if (ret)
294 ret = -EACCES;
295 }
296 return ret;
284} 297}
285 298
286/* 299/*
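
With the hunk above, ext4_file_open() attempts key setup only after dquot_file_open() succeeds, and any failure in ext4_generate_encryption_key() (typically no usable key in the keyring) is reported as -EACCES. From userspace that surfaces as an ordinary permission error; a short sketch with a hypothetical path:

/* open_nokey.c - opening an encrypted file without its key yields EACCES */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/mnt/crypt/secret.txt", O_RDONLY);	/* hypothetical path */
	if (fd < 0 && errno == EACCES)
		fprintf(stderr, "no usable encryption key for this inode\n");
	return fd < 0 ? 1 : 0;
}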
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f75fa0..e9d632e9aa4b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 29#include <linux/blkdev.h>
31 30
32#include "ext4.h" 31#include "ext4.h"
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3d586f02883e..e026aa941fd5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/cryptohash.h> 13#include <linux/cryptohash.h>
15#include "ext4.h" 14#include "ext4.h"
16 15
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ac644c31ca67..2cf18a2d5c72 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -14,7 +14,6 @@
14 14
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/stat.h> 17#include <linux/stat.h>
19#include <linux/string.h> 18#include <linux/string.h>
20#include <linux/quotaops.h> 19#include <linux/quotaops.h>
@@ -997,6 +996,12 @@ got:
997 ei->i_block_group = group; 996 ei->i_block_group = group;
998 ei->i_last_alloc_group = ~0; 997 ei->i_last_alloc_group = ~0;
999 998
 999 /* If the directory is encrypted, then we should encrypt the inode. */
1000 if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
1001 (ext4_encrypted_inode(dir) ||
1002 DUMMY_ENCRYPTION_ENABLED(sbi)))
1003 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1004
1000 ext4_set_inode_flags(inode); 1005 ext4_set_inode_flags(inode);
1001 if (IS_DIRSYNC(inode)) 1006 if (IS_DIRSYNC(inode))
1002 ext4_handle_sync(handle); 1007 ext4_handle_sync(handle);
@@ -1029,11 +1034,28 @@ got:
1029 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1034 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1030 1035
1031 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1036 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1032 1037#ifdef CONFIG_EXT4_FS_ENCRYPTION
1038 if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) &&
1039 (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) {
1040 ei->i_inline_off = 0;
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1042 EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1043 ext4_set_inode_state(inode,
1044 EXT4_STATE_MAY_INLINE_DATA);
1045 } else {
 1046 /* Inline data and encryption are incompatible;
 1047 * we turn off inline data since encryption is enabled. */
1048 ei->i_inline_off = 1;
1049 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1050 EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1051 ext4_clear_inode_state(inode,
1052 EXT4_STATE_MAY_INLINE_DATA);
1053 }
1054#else
1033 ei->i_inline_off = 0; 1055 ei->i_inline_off = 0;
1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) 1056 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1035 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1057 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1036 1058#endif
1037 ret = inode; 1059 ret = inode;
1038 err = dquot_alloc_inode(inode); 1060 err = dquot_alloc_inode(inode);
1039 if (err) 1061 if (err)
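
The allocation-side policy above boils down to two decisions: new regular files, directories, and symlinks inherit EXT4_INODE_ENCRYPT from an encrypted (or dummy-encrypted) parent, and an inode on a filesystem with encryption modes configured never gets EXT4_STATE_MAY_INLINE_DATA. A standalone sketch of that decision table; it simplifies the sbi mode check in the hunk to a single boolean:

/* inherit.c - sketch of the inode-allocation decisions above (simplified) */
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

struct new_inode_flags {
	bool encrypt;		/* EXT4_INODE_ENCRYPT */
	bool may_inline;	/* EXT4_STATE_MAY_INLINE_DATA */
};

static struct new_inode_flags decide(mode_t mode, bool parent_encrypted,
				     bool encryption_configured,
				     bool fs_supports_inline)
{
	struct new_inode_flags f = { false, false };

	/* only files, directories and symlinks carry encryption state */
	if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
	    parent_encrypted)
		f.encrypt = true;
	/* inline data and encryption are mutually exclusive */
	f.may_inline = fs_supports_inline && !encryption_configured;
	return f;
}

int main(void)
{
	struct new_inode_flags f = decide(S_IFREG | 0644, true, true, true);
	printf("encrypt=%d may_inline=%d\n", f.encrypt, f.may_inline);
	return 0;
}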
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 4b143febf21f..feb2cafbeace 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -11,11 +11,13 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 */ 13 */
14
15#include <linux/fiemap.h>
16
14#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
15#include "ext4.h" 18#include "ext4.h"
16#include "xattr.h" 19#include "xattr.h"
17#include "truncate.h" 20#include "truncate.h"
18#include <linux/fiemap.h>
19 21
20#define EXT4_XATTR_SYSTEM_DATA "data" 22#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 23#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -972,7 +974,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
972 offset = 0; 974 offset = 0;
973 while ((void *)de < dlimit) { 975 while ((void *)de < dlimit) {
974 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); 976 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
975 trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", 977 trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
976 offset, de_len, de->name_len, de->name, 978 offset, de_len, de->name_len, de->name,
977 de->name_len, le32_to_cpu(de->inode)); 979 de->name_len, le32_to_cpu(de->inode));
978 if (ext4_check_dir_entry(dir, NULL, de, bh, 980 if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -1014,7 +1016,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
1014 err = ext4_journal_get_write_access(handle, iloc->bh); 1016 err = ext4_journal_get_write_access(handle, iloc->bh);
1015 if (err) 1017 if (err)
1016 return err; 1018 return err;
1017 ext4_insert_dentry(inode, de, inline_size, name, namelen); 1019 ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name,
1020 name, namelen);
1018 1021
1019 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 1022 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
1020 1023
@@ -1327,6 +1330,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
1327 struct ext4_iloc iloc; 1330 struct ext4_iloc iloc;
1328 void *dir_buf = NULL; 1331 void *dir_buf = NULL;
1329 struct ext4_dir_entry_2 fake; 1332 struct ext4_dir_entry_2 fake;
1333 struct ext4_str tmp_str;
1330 1334
1331 ret = ext4_get_inode_loc(inode, &iloc); 1335 ret = ext4_get_inode_loc(inode, &iloc);
1332 if (ret) 1336 if (ret)
@@ -1398,8 +1402,10 @@ int htree_inlinedir_to_tree(struct file *dir_file,
1398 continue; 1402 continue;
1399 if (de->inode == 0) 1403 if (de->inode == 0)
1400 continue; 1404 continue;
1401 err = ext4_htree_store_dirent(dir_file, 1405 tmp_str.name = de->name;
1402 hinfo->hash, hinfo->minor_hash, de); 1406 tmp_str.len = de->name_len;
1407 err = ext4_htree_store_dirent(dir_file, hinfo->hash,
1408 hinfo->minor_hash, de, &tmp_str);
1403 if (err) { 1409 if (err) {
1404 count = err; 1410 count = err;
1405 goto out; 1411 goto out;
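
ext4_htree_store_dirent() now takes the entry's name as a separate ext4_str, so a caller can hand it either the raw on-disk bytes (as the inline-dir path above does) or a decrypted buffer of different length. The pattern is a plain pointer/length pair borrowing the dirent's storage; a toy sketch with local stand-in types:

/* ent_name.c - sketch of wrapping a dirent name in a (pointer, length) pair */
#include <stdio.h>

struct str { unsigned char *name; unsigned int len; };
struct dirent2 { unsigned int inode; unsigned char name_len; char name[255]; };

int main(void)
{
	struct dirent2 de = { 11, 5, "hello" };
	/* no copy: tmp_str borrows the bytes that live inside the dirent */
	struct str tmp_str = { (unsigned char *)de.name, de.name_len };
	printf("%.*s\n", (int)tmp_str.len, (const char *)tmp_str.name);
	return 0;
}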
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b49cf6e59953..366476e71e10 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/highuid.h> 23#include <linux/highuid.h>
25#include <linux/pagemap.h> 24#include <linux/pagemap.h>
26#include <linux/quotaops.h> 25#include <linux/quotaops.h>
@@ -36,7 +35,6 @@
36#include <linux/kernel.h> 35#include <linux/kernel.h>
37#include <linux/printk.h> 36#include <linux/printk.h>
38#include <linux/slab.h> 37#include <linux/slab.h>
39#include <linux/ratelimit.h>
40#include <linux/bitops.h> 38#include <linux/bitops.h>
41 39
42#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -140,7 +138,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
140/* 138/*
141 * Test whether an inode is a fast symlink. 139 * Test whether an inode is a fast symlink.
142 */ 140 */
143static int ext4_inode_is_fast_symlink(struct inode *inode) 141int ext4_inode_is_fast_symlink(struct inode *inode)
144{ 142{
145 int ea_blocks = EXT4_I(inode)->i_file_acl ? 143 int ea_blocks = EXT4_I(inode)->i_file_acl ?
146 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; 144 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -887,6 +885,95 @@ int do_journal_get_write_access(handle_t *handle,
887 885
888static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 886static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
889 struct buffer_head *bh_result, int create); 887 struct buffer_head *bh_result, int create);
888
889#ifdef CONFIG_EXT4_FS_ENCRYPTION
890static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
891 get_block_t *get_block)
892{
893 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
894 unsigned to = from + len;
895 struct inode *inode = page->mapping->host;
896 unsigned block_start, block_end;
897 sector_t block;
898 int err = 0;
899 unsigned blocksize = inode->i_sb->s_blocksize;
900 unsigned bbits;
901 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
902 bool decrypt = false;
903
904 BUG_ON(!PageLocked(page));
905 BUG_ON(from > PAGE_CACHE_SIZE);
906 BUG_ON(to > PAGE_CACHE_SIZE);
907 BUG_ON(from > to);
908
909 if (!page_has_buffers(page))
910 create_empty_buffers(page, blocksize, 0);
911 head = page_buffers(page);
912 bbits = ilog2(blocksize);
913 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
914
915 for (bh = head, block_start = 0; bh != head || !block_start;
916 block++, block_start = block_end, bh = bh->b_this_page) {
917 block_end = block_start + blocksize;
918 if (block_end <= from || block_start >= to) {
919 if (PageUptodate(page)) {
920 if (!buffer_uptodate(bh))
921 set_buffer_uptodate(bh);
922 }
923 continue;
924 }
925 if (buffer_new(bh))
926 clear_buffer_new(bh);
927 if (!buffer_mapped(bh)) {
928 WARN_ON(bh->b_size != blocksize);
929 err = get_block(inode, block, bh, 1);
930 if (err)
931 break;
932 if (buffer_new(bh)) {
933 unmap_underlying_metadata(bh->b_bdev,
934 bh->b_blocknr);
935 if (PageUptodate(page)) {
936 clear_buffer_new(bh);
937 set_buffer_uptodate(bh);
938 mark_buffer_dirty(bh);
939 continue;
940 }
941 if (block_end > to || block_start < from)
942 zero_user_segments(page, to, block_end,
943 block_start, from);
944 continue;
945 }
946 }
947 if (PageUptodate(page)) {
948 if (!buffer_uptodate(bh))
949 set_buffer_uptodate(bh);
950 continue;
951 }
952 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
953 !buffer_unwritten(bh) &&
954 (block_start < from || block_end > to)) {
955 ll_rw_block(READ, 1, &bh);
956 *wait_bh++ = bh;
957 decrypt = ext4_encrypted_inode(inode) &&
958 S_ISREG(inode->i_mode);
959 }
960 }
961 /*
962 * If we issued read requests, let them complete.
963 */
964 while (wait_bh > wait) {
965 wait_on_buffer(*--wait_bh);
966 if (!buffer_uptodate(*wait_bh))
967 err = -EIO;
968 }
969 if (unlikely(err))
970 page_zero_new_buffers(page, from, to);
971 else if (decrypt)
972 err = ext4_decrypt_one(inode, page);
973 return err;
974}
975#endif
976
890static int ext4_write_begin(struct file *file, struct address_space *mapping, 977static int ext4_write_begin(struct file *file, struct address_space *mapping,
891 loff_t pos, unsigned len, unsigned flags, 978 loff_t pos, unsigned len, unsigned flags,
892 struct page **pagep, void **fsdata) 979 struct page **pagep, void **fsdata)
@@ -949,11 +1036,19 @@ retry_journal:
949 /* In case writeback began while the page was unlocked */ 1036 /* In case writeback began while the page was unlocked */
950 wait_for_stable_page(page); 1037 wait_for_stable_page(page);
951 1038
1039#ifdef CONFIG_EXT4_FS_ENCRYPTION
1040 if (ext4_should_dioread_nolock(inode))
1041 ret = ext4_block_write_begin(page, pos, len,
1042 ext4_get_block_write);
1043 else
1044 ret = ext4_block_write_begin(page, pos, len,
1045 ext4_get_block);
1046#else
952 if (ext4_should_dioread_nolock(inode)) 1047 if (ext4_should_dioread_nolock(inode))
953 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1048 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
954 else 1049 else
955 ret = __block_write_begin(page, pos, len, ext4_get_block); 1050 ret = __block_write_begin(page, pos, len, ext4_get_block);
956 1051#endif
957 if (!ret && ext4_should_journal_data(inode)) { 1052 if (!ret && ext4_should_journal_data(inode)) {
958 ret = ext4_walk_page_buffers(handle, page_buffers(page), 1053 ret = ext4_walk_page_buffers(handle, page_buffers(page),
959 from, to, NULL, 1054 from, to, NULL,
@@ -2575,7 +2670,12 @@ retry_journal:
2575 /* In case writeback began while the page was unlocked */ 2670 /* In case writeback began while the page was unlocked */
2576 wait_for_stable_page(page); 2671 wait_for_stable_page(page);
2577 2672
2673#ifdef CONFIG_EXT4_FS_ENCRYPTION
2674 ret = ext4_block_write_begin(page, pos, len,
2675 ext4_da_get_block_prep);
2676#else
2578 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 2677 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2678#endif
2579 if (ret < 0) { 2679 if (ret < 0) {
2580 unlock_page(page); 2680 unlock_page(page);
2581 ext4_journal_stop(handle); 2681 ext4_journal_stop(handle);
@@ -2821,7 +2921,7 @@ static int ext4_readpage(struct file *file, struct page *page)
2821 ret = ext4_readpage_inline(inode, page); 2921 ret = ext4_readpage_inline(inode, page);
2822 2922
2823 if (ret == -EAGAIN) 2923 if (ret == -EAGAIN)
2824 return mpage_readpage(page, ext4_get_block); 2924 return ext4_mpage_readpages(page->mapping, NULL, page, 1);
2825 2925
2826 return ret; 2926 return ret;
2827} 2927}
@@ -2836,7 +2936,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2836 if (ext4_has_inline_data(inode)) 2936 if (ext4_has_inline_data(inode))
2837 return 0; 2937 return 0;
2838 2938
2839 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2939 return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
2840} 2940}
2841 2941
2842static void ext4_invalidatepage(struct page *page, unsigned int offset, 2942static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3033,6 +3133,9 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3033 get_block_func = ext4_get_block_write; 3133 get_block_func = ext4_get_block_write;
3034 dio_flags = DIO_LOCKING; 3134 dio_flags = DIO_LOCKING;
3035 } 3135 }
3136#ifdef CONFIG_EXT4_FS_ENCRYPTION
3137 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3138#endif
3036 if (IS_DAX(inode)) 3139 if (IS_DAX(inode))
3037 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3140 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
3038 ext4_end_io_dio, dio_flags); 3141 ext4_end_io_dio, dio_flags);
@@ -3097,6 +3200,11 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3097 size_t count = iov_iter_count(iter); 3200 size_t count = iov_iter_count(iter);
3098 ssize_t ret; 3201 ssize_t ret;
3099 3202
3203#ifdef CONFIG_EXT4_FS_ENCRYPTION
3204 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
3205 return 0;
3206#endif
3207
3100 /* 3208 /*
3101 * If we are doing data journalling we don't support O_DIRECT 3209 * If we are doing data journalling we don't support O_DIRECT
3102 */ 3210 */
@@ -3261,6 +3369,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
3261 /* Uhhuh. Read error. Complain and punt. */ 3369 /* Uhhuh. Read error. Complain and punt. */
3262 if (!buffer_uptodate(bh)) 3370 if (!buffer_uptodate(bh))
3263 goto unlock; 3371 goto unlock;
3372 if (S_ISREG(inode->i_mode) &&
3373 ext4_encrypted_inode(inode)) {
3374 /* We expect the key to be set. */
3375 BUG_ON(!ext4_has_encryption_key(inode));
3376 BUG_ON(blocksize != PAGE_CACHE_SIZE);
3377 WARN_ON_ONCE(ext4_decrypt_one(inode, page));
3378 }
3264 } 3379 }
3265 if (ext4_should_journal_data(inode)) { 3380 if (ext4_should_journal_data(inode)) {
3266 BUFFER_TRACE(bh, "get write access"); 3381 BUFFER_TRACE(bh, "get write access");
@@ -4096,7 +4211,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4096 inode->i_op = &ext4_dir_inode_operations; 4211 inode->i_op = &ext4_dir_inode_operations;
4097 inode->i_fop = &ext4_dir_operations; 4212 inode->i_fop = &ext4_dir_operations;
4098 } else if (S_ISLNK(inode->i_mode)) { 4213 } else if (S_ISLNK(inode->i_mode)) {
4099 if (ext4_inode_is_fast_symlink(inode)) { 4214 if (ext4_inode_is_fast_symlink(inode) &&
4215 !ext4_encrypted_inode(inode)) {
4100 inode->i_op = &ext4_fast_symlink_inode_operations; 4216 inode->i_op = &ext4_fast_symlink_inode_operations;
4101 nd_terminate_link(ei->i_data, inode->i_size, 4217 nd_terminate_link(ei->i_data, inode->i_size,
4102 sizeof(ei->i_data) - 1); 4218 sizeof(ei->i_data) - 1);
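
Returning 0 from ext4_direct_IO() for encrypted regular files makes the generic code fall back to buffered I/O, since every page must pass through the bounce-page encryption path. Userspace can still open with O_DIRECT; the read simply goes through the page cache. A hedged sketch with a hypothetical path:

/* odirect.c - O_DIRECT on an encrypted file falls back to buffered I/O */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char *buf;
	int fd = open("/mnt/crypt/data.bin", O_RDONLY | O_DIRECT);	/* hypothetical */
	if (fd < 0) { perror("open"); return 1; }
	/* O_DIRECT normally demands aligned buffers; keep that habit even
	 * though the encrypted path services this read from the page cache */
	if (posix_memalign((void **)&buf, 4096, 4096)) { close(fd); return 1; }
	ssize_t n = read(fd, buf, 4096);
	printf("read %zd bytes\n", n);
	free(buf);
	close(fd);
	return 0;
}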
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f58a0d106726..2cb9e178d1c5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -8,12 +8,12 @@
8 */ 8 */
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h> 11#include <linux/capability.h>
13#include <linux/time.h> 12#include <linux/time.h>
14#include <linux/compat.h> 13#include <linux/compat.h>
15#include <linux/mount.h> 14#include <linux/mount.h>
16#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/random.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
@@ -196,6 +196,16 @@ journal_err_out:
196 return err; 196 return err;
197} 197}
198 198
199static int uuid_is_zero(__u8 u[16])
200{
201 int i;
202
203 for (i = 0; i < 16; i++)
204 if (u[i])
205 return 0;
206 return 1;
207}
208
199long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 209long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
200{ 210{
201 struct inode *inode = file_inode(filp); 211 struct inode *inode = file_inode(filp);
@@ -615,7 +625,78 @@ resizefs_out:
615 } 625 }
616 case EXT4_IOC_PRECACHE_EXTENTS: 626 case EXT4_IOC_PRECACHE_EXTENTS:
617 return ext4_ext_precache(inode); 627 return ext4_ext_precache(inode);
628 case EXT4_IOC_SET_ENCRYPTION_POLICY: {
629#ifdef CONFIG_EXT4_FS_ENCRYPTION
630 struct ext4_encryption_policy policy;
631 int err = 0;
632
633 if (copy_from_user(&policy,
634 (struct ext4_encryption_policy __user *)arg,
635 sizeof(policy))) {
636 err = -EFAULT;
637 goto encryption_policy_out;
638 }
618 639
640 err = ext4_process_policy(&policy, inode);
641encryption_policy_out:
642 return err;
643#else
644 return -EOPNOTSUPP;
645#endif
646 }
647 case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
648 int err, err2;
649 struct ext4_sb_info *sbi = EXT4_SB(sb);
650 handle_t *handle;
651
652 if (!ext4_sb_has_crypto(sb))
653 return -EOPNOTSUPP;
654 if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
655 err = mnt_want_write_file(filp);
656 if (err)
657 return err;
658 handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
659 if (IS_ERR(handle)) {
660 err = PTR_ERR(handle);
661 goto pwsalt_err_exit;
662 }
663 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
664 if (err)
665 goto pwsalt_err_journal;
666 generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
667 err = ext4_handle_dirty_metadata(handle, NULL,
668 sbi->s_sbh);
669 pwsalt_err_journal:
670 err2 = ext4_journal_stop(handle);
671 if (err2 && !err)
672 err = err2;
673 pwsalt_err_exit:
674 mnt_drop_write_file(filp);
675 if (err)
676 return err;
677 }
678 if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt,
679 16))
680 return -EFAULT;
681 return 0;
682 }
683 case EXT4_IOC_GET_ENCRYPTION_POLICY: {
684#ifdef CONFIG_EXT4_FS_ENCRYPTION
685 struct ext4_encryption_policy policy;
686 int err = 0;
687
688 if (!ext4_encrypted_inode(inode))
689 return -ENOENT;
690 err = ext4_get_policy(inode, &policy);
691 if (err)
692 return err;
693 if (copy_to_user((void *)arg, &policy, sizeof(policy)))
694 return -EFAULT;
695 return 0;
696#else
697 return -EOPNOTSUPP;
698#endif
699 }
619 default: 700 default:
620 return -ENOTTY; 701 return -ENOTTY;
621 } 702 }
@@ -680,6 +761,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
680 case FITRIM: 761 case FITRIM:
681 case EXT4_IOC_RESIZE_FS: 762 case EXT4_IOC_RESIZE_FS:
682 case EXT4_IOC_PRECACHE_EXTENTS: 763 case EXT4_IOC_PRECACHE_EXTENTS:
764 case EXT4_IOC_SET_ENCRYPTION_POLICY:
765 case EXT4_IOC_GET_ENCRYPTION_PWSALT:
766 case EXT4_IOC_GET_ENCRYPTION_POLICY:
683 break; 767 break;
684 default: 768 default:
685 return -ENOIOCTLCMD; 769 return -ENOIOCTLCMD;
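
A hedged userspace sketch of driving EXT4_IOC_SET_ENCRYPTION_POLICY on an empty directory. The structure mirrors ext4_encryption_policy from ext4_crypto.h above; the mode values (1 = AES-256-XTS for contents, 4 = AES-256-CTS for names) and the _IOR('f', 19, ...) request come from the kernel's ext4.h in this series and are restated here as assumptions:

/* setpolicy.c - mark a directory as encrypted (constants assumed, see above) */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define KEY_DESCRIPTOR_SIZE 8

struct encryption_policy {
	char version;			/* 0 in this series */
	char contents_encryption_mode;	/* assumed: 1 = AES-256-XTS */
	char filenames_encryption_mode;	/* assumed: 4 = AES-256-CTS */
	char master_key_descriptor[KEY_DESCRIPTOR_SIZE];
} __attribute__((__packed__));

/* assumed to match EXT4_IOC_SET_ENCRYPTION_POLICY in fs/ext4/ext4.h */
#define IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct encryption_policy)

int main(void)
{
	struct encryption_policy p = { 0, 1, 4, "testkey1" };	/* 8-byte descriptor */
	int fd = open("/mnt/crypt/newdir", O_RDONLY);	/* hypothetical empty dir */
	if (fd < 0) { perror("open"); return 1; }
	if (ioctl(fd, IOC_SET_ENCRYPTION_POLICY, &p) < 0)
		perror("ioctl");
	close(fd);
	return 0;
}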
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..ef22cd951c0c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h> 29#include <linux/time.h>
31#include <linux/fcntl.h> 30#include <linux/fcntl.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
@@ -254,8 +253,9 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
254 struct dx_hash_info *hinfo, 253 struct dx_hash_info *hinfo,
255 struct dx_frame *frame); 254 struct dx_frame *frame);
256static void dx_release(struct dx_frame *frames); 255static void dx_release(struct dx_frame *frames);
257static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 256static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
258 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 257 unsigned blocksize, struct dx_hash_info *hinfo,
258 struct dx_map_entry map[]);
259static void dx_sort_map(struct dx_map_entry *map, unsigned count); 259static void dx_sort_map(struct dx_map_entry *map, unsigned count);
260static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 260static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
261 struct dx_map_entry *offsets, int count, unsigned blocksize); 261 struct dx_map_entry *offsets, int count, unsigned blocksize);
@@ -586,8 +586,10 @@ struct stats
586 unsigned bcount; 586 unsigned bcount;
587}; 587};
588 588
589static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, 589static struct stats dx_show_leaf(struct inode *dir,
590 int size, int show_names) 590 struct dx_hash_info *hinfo,
591 struct ext4_dir_entry_2 *de,
592 int size, int show_names)
591{ 593{
592 unsigned names = 0, space = 0; 594 unsigned names = 0, space = 0;
593 char *base = (char *) de; 595 char *base = (char *) de;
@@ -600,12 +602,80 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
600 { 602 {
601 if (show_names) 603 if (show_names)
602 { 604 {
605#ifdef CONFIG_EXT4_FS_ENCRYPTION
606 int len;
607 char *name;
608 struct ext4_str fname_crypto_str
609 = {.name = NULL, .len = 0};
610 struct ext4_fname_crypto_ctx *ctx = NULL;
611 int res;
612
613 name = de->name;
614 len = de->name_len;
615 ctx = ext4_get_fname_crypto_ctx(dir,
616 EXT4_NAME_LEN);
617 if (IS_ERR(ctx)) {
618 printk(KERN_WARNING "Error acquiring"
619 " crypto ctxt--skipping crypto\n");
620 ctx = NULL;
621 }
622 if (ctx == NULL) {
623 /* Directory is not encrypted */
624 ext4fs_dirhash(de->name,
625 de->name_len, &h);
626 printk("%*.s:(U)%x.%u ", len,
627 name, h.hash,
628 (unsigned) ((char *) de
629 - base));
630 } else {
631 /* Directory is encrypted */
632 res = ext4_fname_crypto_alloc_buffer(
633 ctx, de->name_len,
634 &fname_crypto_str);
635 if (res < 0) {
636 printk(KERN_WARNING "Error "
637 "allocating crypto "
638 "buffer--skipping "
639 "crypto\n");
640 ext4_put_fname_crypto_ctx(&ctx);
641 ctx = NULL;
642 }
643 res = ext4_fname_disk_to_usr(ctx, de,
644 &fname_crypto_str);
645 if (res < 0) {
646 printk(KERN_WARNING "Error "
647 "converting filename "
648 "from disk to usr"
649 "\n");
650 name = "??";
651 len = 2;
652 } else {
653 name = fname_crypto_str.name;
654 len = fname_crypto_str.len;
655 }
656 res = ext4_fname_disk_to_hash(ctx, de,
657 &h);
658 if (res < 0) {
659 printk(KERN_WARNING "Error "
660 "converting filename "
661 "from disk to htree"
662 "\n");
663 h.hash = 0xDEADBEEF;
664 }
665 printk("%*.s:(E)%x.%u ", len, name,
666 h.hash, (unsigned) ((char *) de
667 - base));
668 ext4_put_fname_crypto_ctx(&ctx);
669 ext4_fname_crypto_free_buffer(
670 &fname_crypto_str);
671 }
672#else
603 int len = de->name_len; 673 int len = de->name_len;
604 char *name = de->name; 674 char *name = de->name;
605 while (len--) printk("%c", *name++);
606 ext4fs_dirhash(de->name, de->name_len, &h); 675 ext4fs_dirhash(de->name, de->name_len, &h);
607 printk(":%x.%u ", h.hash, 676 printk("%*.s:%x.%u ", len, name, h.hash,
608 (unsigned) ((char *) de - base)); 677 (unsigned) ((char *) de - base));
678#endif
609 } 679 }
610 space += EXT4_DIR_REC_LEN(de->name_len); 680 space += EXT4_DIR_REC_LEN(de->name_len);
611 names++; 681 names++;
@@ -623,7 +693,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
623 unsigned count = dx_get_count(entries), names = 0, space = 0, i; 693 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
624 unsigned bcount = 0; 694 unsigned bcount = 0;
625 struct buffer_head *bh; 695 struct buffer_head *bh;
626 int err;
627 printk("%i indexed blocks...\n", count); 696 printk("%i indexed blocks...\n", count);
628 for (i = 0; i < count; i++, entries++) 697 for (i = 0; i < count; i++, entries++)
629 { 698 {
@@ -637,7 +706,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
637 continue; 706 continue;
638 stats = levels? 707 stats = levels?
639 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): 708 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
640 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); 709 dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
710 bh->b_data, blocksize, 0);
641 names += stats.names; 711 names += stats.names;
642 space += stats.space; 712 space += stats.space;
643 bcount += stats.bcount; 713 bcount += stats.bcount;
@@ -687,8 +757,28 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
687 if (hinfo->hash_version <= DX_HASH_TEA) 757 if (hinfo->hash_version <= DX_HASH_TEA)
688 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 758 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
689 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 759 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
760#ifdef CONFIG_EXT4_FS_ENCRYPTION
761 if (d_name) {
762 struct ext4_fname_crypto_ctx *ctx = NULL;
763 int res;
764
765 /* Check if the directory is encrypted */
766 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
767 if (IS_ERR(ctx)) {
768 ret_err = ERR_PTR(PTR_ERR(ctx));
769 goto fail;
770 }
771 res = ext4_fname_usr_to_hash(ctx, d_name, hinfo);
772 if (res < 0) {
773 ret_err = ERR_PTR(res);
774 goto fail;
775 }
776 ext4_put_fname_crypto_ctx(&ctx);
777 }
778#else
690 if (d_name) 779 if (d_name)
691 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 780 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
781#endif
692 hash = hinfo->hash; 782 hash = hinfo->hash;
693 783
694 if (root->info.unused_flags & 1) { 784 if (root->info.unused_flags & 1) {
@@ -773,6 +863,7 @@ fail:
773 brelse(frame->bh); 863 brelse(frame->bh);
774 frame--; 864 frame--;
775 } 865 }
866
776 if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) 867 if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
777 ext4_warning(dir->i_sb, 868 ext4_warning(dir->i_sb,
778 "Corrupt dir inode %lu, running e2fsck is " 869 "Corrupt dir inode %lu, running e2fsck is "
@@ -878,6 +969,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
878 struct buffer_head *bh; 969 struct buffer_head *bh;
879 struct ext4_dir_entry_2 *de, *top; 970 struct ext4_dir_entry_2 *de, *top;
880 int err = 0, count = 0; 971 int err = 0, count = 0;
972 struct ext4_fname_crypto_ctx *ctx = NULL;
973 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
881 974
882 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 975 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
883 (unsigned long)block)); 976 (unsigned long)block));
@@ -889,6 +982,24 @@ static int htree_dirblock_to_tree(struct file *dir_file,
889 top = (struct ext4_dir_entry_2 *) ((char *) de + 982 top = (struct ext4_dir_entry_2 *) ((char *) de +
890 dir->i_sb->s_blocksize - 983 dir->i_sb->s_blocksize -
891 EXT4_DIR_REC_LEN(0)); 984 EXT4_DIR_REC_LEN(0));
985#ifdef CONFIG_EXT4_FS_ENCRYPTION
986 /* Check if the directory is encrypted */
987 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
988 if (IS_ERR(ctx)) {
989 err = PTR_ERR(ctx);
990 brelse(bh);
991 return err;
992 }
993 if (ctx != NULL) {
994 err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
995 &fname_crypto_str);
996 if (err < 0) {
997 ext4_put_fname_crypto_ctx(&ctx);
998 brelse(bh);
999 return err;
1000 }
1001 }
1002#endif
892 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 1003 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
893 if (ext4_check_dir_entry(dir, NULL, de, bh, 1004 if (ext4_check_dir_entry(dir, NULL, de, bh,
894 bh->b_data, bh->b_size, 1005 bh->b_data, bh->b_size,
@@ -897,21 +1008,52 @@ static int htree_dirblock_to_tree(struct file *dir_file,
897 /* silently ignore the rest of the block */ 1008 /* silently ignore the rest of the block */
898 break; 1009 break;
899 } 1010 }
1011#ifdef CONFIG_EXT4_FS_ENCRYPTION
1012 err = ext4_fname_disk_to_hash(ctx, de, hinfo);
1013 if (err < 0) {
1014 count = err;
1015 goto errout;
1016 }
1017#else
900 ext4fs_dirhash(de->name, de->name_len, hinfo); 1018 ext4fs_dirhash(de->name, de->name_len, hinfo);
1019#endif
901 if ((hinfo->hash < start_hash) || 1020 if ((hinfo->hash < start_hash) ||
902 ((hinfo->hash == start_hash) && 1021 ((hinfo->hash == start_hash) &&
903 (hinfo->minor_hash < start_minor_hash))) 1022 (hinfo->minor_hash < start_minor_hash)))
904 continue; 1023 continue;
905 if (de->inode == 0) 1024 if (de->inode == 0)
906 continue; 1025 continue;
907 if ((err = ext4_htree_store_dirent(dir_file, 1026 if (ctx == NULL) {
908 hinfo->hash, hinfo->minor_hash, de)) != 0) { 1027 /* Directory is not encrypted */
909 brelse(bh); 1028 tmp_str.name = de->name;
910 return err; 1029 tmp_str.len = de->name_len;
1030 err = ext4_htree_store_dirent(dir_file,
1031 hinfo->hash, hinfo->minor_hash, de,
1032 &tmp_str);
1033 } else {
1034 /* Directory is encrypted */
1035 err = ext4_fname_disk_to_usr(ctx, de,
1036 &fname_crypto_str);
1037 if (err < 0) {
1038 count = err;
1039 goto errout;
1040 }
1041 err = ext4_htree_store_dirent(dir_file,
1042 hinfo->hash, hinfo->minor_hash, de,
1043 &fname_crypto_str);
1044 }
1045 if (err != 0) {
1046 count = err;
1047 goto errout;
911 } 1048 }
912 count++; 1049 count++;
913 } 1050 }
1051errout:
914 brelse(bh); 1052 brelse(bh);
1053#ifdef CONFIG_EXT4_FS_ENCRYPTION
1054 ext4_put_fname_crypto_ctx(&ctx);
1055 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1056#endif
915 return count; 1057 return count;
916} 1058}
917 1059
@@ -935,6 +1077,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
935 int count = 0; 1077 int count = 0;
936 int ret, err; 1078 int ret, err;
937 __u32 hashval; 1079 __u32 hashval;
1080 struct ext4_str tmp_str;
938 1081
939 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 1082 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
940 start_hash, start_minor_hash)); 1083 start_hash, start_minor_hash));
@@ -970,14 +1113,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
970 /* Add '.' and '..' from the htree header */ 1113 /* Add '.' and '..' from the htree header */
971 if (!start_hash && !start_minor_hash) { 1114 if (!start_hash && !start_minor_hash) {
972 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 1115 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
973 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) 1116 tmp_str.name = de->name;
1117 tmp_str.len = de->name_len;
1118 err = ext4_htree_store_dirent(dir_file, 0, 0,
1119 de, &tmp_str);
1120 if (err != 0)
974 goto errout; 1121 goto errout;
975 count++; 1122 count++;
976 } 1123 }
977 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 1124 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
978 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 1125 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
979 de = ext4_next_entry(de, dir->i_sb->s_blocksize); 1126 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
980 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 1127 tmp_str.name = de->name;
1128 tmp_str.len = de->name_len;
1129 err = ext4_htree_store_dirent(dir_file, 2, 0,
1130 de, &tmp_str);
1131 if (err != 0)
981 goto errout; 1132 goto errout;
982 count++; 1133 count++;
983 } 1134 }
@@ -1035,17 +1186,33 @@ static inline int search_dirblock(struct buffer_head *bh,
1035 * Create map of hash values, offsets, and sizes, stored at end of block. 1186 * Create map of hash values, offsets, and sizes, stored at end of block.
1036 * Returns number of entries mapped. 1187 * Returns number of entries mapped.
1037 */ 1188 */
1038static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 1189static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
1039 struct dx_hash_info *hinfo, 1190 unsigned blocksize, struct dx_hash_info *hinfo,
1040 struct dx_map_entry *map_tail) 1191 struct dx_map_entry *map_tail)
1041{ 1192{
1042 int count = 0; 1193 int count = 0;
1043 char *base = (char *) de; 1194 char *base = (char *) de;
1044 struct dx_hash_info h = *hinfo; 1195 struct dx_hash_info h = *hinfo;
1196#ifdef CONFIG_EXT4_FS_ENCRYPTION
1197 struct ext4_fname_crypto_ctx *ctx = NULL;
1198 int err;
1199
1200 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1201 if (IS_ERR(ctx))
1202 return PTR_ERR(ctx);
1203#endif
1045 1204
1046 while ((char *) de < base + blocksize) { 1205 while ((char *) de < base + blocksize) {
1047 if (de->name_len && de->inode) { 1206 if (de->name_len && de->inode) {
1207#ifdef CONFIG_EXT4_FS_ENCRYPTION
1208 err = ext4_fname_disk_to_hash(ctx, de, &h);
1209 if (err < 0) {
1210 ext4_put_fname_crypto_ctx(&ctx);
1211 return err;
1212 }
1213#else
1048 ext4fs_dirhash(de->name, de->name_len, &h); 1214 ext4fs_dirhash(de->name, de->name_len, &h);
1215#endif
1049 map_tail--; 1216 map_tail--;
1050 map_tail->hash = h.hash; 1217 map_tail->hash = h.hash;
1051 map_tail->offs = ((char *) de - base)>>2; 1218 map_tail->offs = ((char *) de - base)>>2;
@@ -1056,6 +1223,9 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1056 /* XXX: do we need to check rec_len == 0 case? -Chris */ 1223 /* XXX: do we need to check rec_len == 0 case? -Chris */
1057 de = ext4_next_entry(de, blocksize); 1224 de = ext4_next_entry(de, blocksize);
1058 } 1225 }
1226#ifdef CONFIG_EXT4_FS_ENCRYPTION
1227 ext4_put_fname_crypto_ctx(&ctx);
1228#endif
1059 return count; 1229 return count;
1060} 1230}
1061 1231
@@ -1106,57 +1276,107 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1106 * `len <= EXT4_NAME_LEN' is guaranteed by caller. 1276 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
1107 * `de != NULL' is guaranteed by caller. 1277 * `de != NULL' is guaranteed by caller.
1108 */ 1278 */
1109static inline int ext4_match (int len, const char * const name, 1279static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx,
1110 struct ext4_dir_entry_2 * de) 1280 struct ext4_str *fname_crypto_str,
1281 int len, const char * const name,
1282 struct ext4_dir_entry_2 *de)
1111{ 1283{
1112 if (len != de->name_len) 1284 int res;
1113 return 0; 1285
1114 if (!de->inode) 1286 if (!de->inode)
1115 return 0; 1287 return 0;
1116 return !memcmp(name, de->name, len); 1288
1289#ifdef CONFIG_EXT4_FS_ENCRYPTION
1290 if (ctx) {
1291 /* Directory is encrypted */
1292 res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str);
1293 if (res < 0)
1294 return res;
1295 if (len != res)
1296 return 0;
1297 res = memcmp(name, fname_crypto_str->name, len);
1298 return (res == 0) ? 1 : 0;
1299 }
1300#endif
1301 if (len != de->name_len)
1302 return 0;
1303 res = memcmp(name, de->name, len);
1304 return (res == 0) ? 1 : 0;
1117} 1305}
1118 1306
1119/* 1307/*
1120 * Returns 0 if not found, -1 on failure, and 1 on success 1308 * Returns 0 if not found, -1 on failure, and 1 on success
1121 */ 1309 */
1122int search_dir(struct buffer_head *bh, 1310int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
1123 char *search_buf, 1311 struct inode *dir, const struct qstr *d_name,
1124 int buf_size, 1312 unsigned int offset, struct ext4_dir_entry_2 **res_dir)
1125 struct inode *dir,
1126 const struct qstr *d_name,
1127 unsigned int offset,
1128 struct ext4_dir_entry_2 **res_dir)
1129{ 1313{
1130 struct ext4_dir_entry_2 * de; 1314 struct ext4_dir_entry_2 * de;
1131 char * dlimit; 1315 char * dlimit;
1132 int de_len; 1316 int de_len;
1133 const char *name = d_name->name; 1317 const char *name = d_name->name;
1134 int namelen = d_name->len; 1318 int namelen = d_name->len;
1319 struct ext4_fname_crypto_ctx *ctx = NULL;
1320 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1321 int res;
1322
1323 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1324 if (IS_ERR(ctx))
1325 return -1;
1326
1327 if (ctx != NULL) {
1328 /* Allocate buffer to hold maximum name length */
1329 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1330 &fname_crypto_str);
1331 if (res < 0) {
1332 ext4_put_fname_crypto_ctx(&ctx);
1333 return -1;
1334 }
1335 }
1135 1336
1136 de = (struct ext4_dir_entry_2 *)search_buf; 1337 de = (struct ext4_dir_entry_2 *)search_buf;
1137 dlimit = search_buf + buf_size; 1338 dlimit = search_buf + buf_size;
1138 while ((char *) de < dlimit) { 1339 while ((char *) de < dlimit) {
1139 /* this code is executed quadratically often */ 1340 /* this code is executed quadratically often */
1140 /* do minimal checking `by hand' */ 1341 /* do minimal checking `by hand' */
1342 if ((char *) de + de->name_len <= dlimit) {
1343 res = ext4_match(ctx, &fname_crypto_str, namelen,
1344 name, de);
1345 if (res < 0) {
1346 res = -1;
1347 goto return_result;
1348 }
1349 if (res > 0) {
1350 /* found a match - just to be sure, do
1351 * a full check */
1352 if (ext4_check_dir_entry(dir, NULL, de, bh,
1353 bh->b_data,
1354 bh->b_size, offset)) {
1355 res = -1;
1356 goto return_result;
1357 }
1358 *res_dir = de;
1359 res = 1;
1360 goto return_result;
1361 }
1141 1362
1142 if ((char *) de + namelen <= dlimit &&
1143 ext4_match (namelen, name, de)) {
1144 /* found a match - just to be sure, do a full check */
1145 if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
1146 bh->b_size, offset))
1147 return -1;
1148 *res_dir = de;
1149 return 1;
1150 } 1363 }
1151 /* prevent looping on a bad block */ 1364 /* prevent looping on a bad block */
1152 de_len = ext4_rec_len_from_disk(de->rec_len, 1365 de_len = ext4_rec_len_from_disk(de->rec_len,
1153 dir->i_sb->s_blocksize); 1366 dir->i_sb->s_blocksize);
1154 if (de_len <= 0) 1367 if (de_len <= 0) {
1155 return -1; 1368 res = -1;
1369 goto return_result;
1370 }
1156 offset += de_len; 1371 offset += de_len;
1157 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); 1372 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
1158 } 1373 }
1159 return 0; 1374
1375 res = 0;
1376return_result:
1377 ext4_put_fname_crypto_ctx(&ctx);
1378 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1379 return res;
1160} 1380}
1161 1381
1162static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, 1382static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
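
After the rework above, ext4_match() is no longer a boolean: a negative return propagates a decryption error, 0 means no match, and a positive value means the (possibly decrypted) names compared equal, which is why search_dir() funnels every exit through the res/return_result path instead of returning directly. The calling convention in isolation, with a toy matcher standing in for the real one:

/* tristate.c - sketch of the <0 / 0 / >0 convention adopted by ext4_match() */
#include <stdio.h>
#include <string.h>

/* stand-in: a NULL candidate models a decryption failure */
static int match(const char *want, const char *have)
{
	if (!have)
		return -1;				/* error decoding the entry */
	return strcmp(want, have) == 0 ? 1 : 0;		/* 1 = match, 0 = no match */
}

int main(void)
{
	const char *entries[] = { "alpha", NULL, "beta" };
	for (int i = 0; i < 3; i++) {
		int res = match("beta", entries[i]);
		if (res < 0) { fprintf(stderr, "entry %d: error\n", i); continue; }
		if (res > 0) { printf("entry %d matches\n", i); break; }
	}
	return 0;
}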
@@ -1345,6 +1565,9 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1345 ext4_lblk_t block; 1565 ext4_lblk_t block;
1346 int retval; 1566 int retval;
1347 1567
1568#ifdef CONFIG_EXT4_FS_ENCRYPTION
1569 *res_dir = NULL;
1570#endif
1348 frame = dx_probe(d_name, dir, &hinfo, frames); 1571 frame = dx_probe(d_name, dir, &hinfo, frames);
1349 if (IS_ERR(frame)) 1572 if (IS_ERR(frame))
1350 return (struct buffer_head *) frame; 1573 return (struct buffer_head *) frame;
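Note: once the crypto context is threaded through search_dir(), ext4_match() can no longer simply memcmp() the caller's name against de->name; for a keyed encrypted directory each on-disk name must first be decrypted into the preallocated fname_crypto_str buffer. A rough sketch of that comparison, assuming a hypothetical fname_decrypt_to_buffer() helper (the real logic lives in ext4_match()/crypto_fname.c):

	/* Sketch only: <0 error, 0 no match, >0 match, mirroring the
	 * res convention used by the callers above. */
	static int match_name(struct ext4_fname_crypto_ctx *ctx,
			      struct ext4_str *workbuf, int namelen,
			      const char *name, struct ext4_dir_entry_2 *de)
	{
		int len;

		if (ctx == NULL)	/* unencrypted directory */
			return de->name_len == namelen &&
			       !memcmp(de->name, name, namelen);
		/* hypothetical helper: decrypt de->name into workbuf */
		len = fname_decrypt_to_buffer(ctx, de->name, de->name_len,
					      workbuf);
		if (len < 0)
			return len;
		return len == namelen && !memcmp(workbuf->name, name, namelen);
	}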
@@ -1417,6 +1640,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1417 ino); 1640 ino);
1418 return ERR_PTR(-EIO); 1641 return ERR_PTR(-EIO);
1419 } 1642 }
1643 if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
1644 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1645 S_ISLNK(inode->i_mode)) &&
1646 !ext4_is_child_context_consistent_with_parent(dir,
1647 inode)) {
1648 iput(inode);
1649 ext4_warning(inode->i_sb,
1650 "Inconsistent encryption contexts: %lu/%lu\n",
1651 (unsigned long) dir->i_ino,
1652 (unsigned long) inode->i_ino);
1653 return ERR_PTR(-EPERM);
1654 }
1420 } 1655 }
1421 return d_splice_alias(inode, dentry); 1656 return d_splice_alias(inode, dentry);
1422} 1657}
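Note: the new -EPERM branch in ext4_lookup() refuses to reveal an inode whose encryption policy does not match its parent directory's. A minimal sketch of what such a test has to compare, with the context layout assumed from ext4_crypto.h (the real check is ext4_is_child_context_consistent_with_parent()):

	static int contexts_match(const struct ext4_encryption_context *parent,
				  const struct ext4_encryption_context *child)
	{
		/* same master key and same modes => same policy (sketch) */
		return parent->contents_encryption_mode ==
			       child->contents_encryption_mode &&
		       parent->filenames_encryption_mode ==
			       child->filenames_encryption_mode &&
		       !memcmp(parent->master_key_descriptor,
			       child->master_key_descriptor,
			       EXT4_KEY_DESCRIPTOR_SIZE);
	}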
@@ -1541,7 +1776,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1541 1776
1542 /* create map in the end of data2 block */ 1777 /* create map in the end of data2 block */
1543 map = (struct dx_map_entry *) (data2 + blocksize); 1778 map = (struct dx_map_entry *) (data2 + blocksize);
1544 count = dx_make_map((struct ext4_dir_entry_2 *) data1, 1779 count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
1545 blocksize, hinfo, map); 1780 blocksize, hinfo, map);
1546 map -= count; 1781 map -= count;
1547 dx_sort_map(map, count); 1782 dx_sort_map(map, count);
@@ -1564,7 +1799,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1564 hash2, split, count-split)); 1799 hash2, split, count-split));
1565 1800
1566 /* Fancy dance to stay within two buffers */ 1801 /* Fancy dance to stay within two buffers */
1567 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); 1802 de2 = dx_move_dirents(data1, data2, map + split, count - split,
1803 blocksize);
1568 de = dx_pack_dirents(data1, blocksize); 1804 de = dx_pack_dirents(data1, blocksize);
1569 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - 1805 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1570 (char *) de, 1806 (char *) de,
@@ -1580,8 +1816,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1580 initialize_dirent_tail(t, blocksize); 1816 initialize_dirent_tail(t, blocksize);
1581 } 1817 }
1582 1818
1583 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1819 dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
1584 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1820 blocksize, 1));
1821 dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
1822 blocksize, 1));
1585 1823
1586 /* Which block gets the new entry? */ 1824 /* Which block gets the new entry? */
1587 if (hinfo->hash >= hash2) { 1825 if (hinfo->hash >= hash2) {
@@ -1618,15 +1856,48 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1618 int nlen, rlen; 1856 int nlen, rlen;
1619 unsigned int offset = 0; 1857 unsigned int offset = 0;
1620 char *top; 1858 char *top;
1859 struct ext4_fname_crypto_ctx *ctx = NULL;
1860 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1861 int res;
1862
1863 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1864 if (IS_ERR(ctx))
1865 return -1;
1866
1867 if (ctx != NULL) {
1868 /* Calculate record length needed to store the entry */
1869 res = ext4_fname_crypto_namelen_on_disk(ctx, namelen);
1870 if (res < 0) {
1871 ext4_put_fname_crypto_ctx(&ctx);
1872 return res;
1873 }
1874 reclen = EXT4_DIR_REC_LEN(res);
1875
1876 /* Allocate buffer to hold maximum name length */
1877 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1878 &fname_crypto_str);
1879 if (res < 0) {
1880 ext4_put_fname_crypto_ctx(&ctx);
1881 return -1;
1882 }
1883 }
1621 1884
1622 de = (struct ext4_dir_entry_2 *)buf; 1885 de = (struct ext4_dir_entry_2 *)buf;
1623 top = buf + buf_size - reclen; 1886 top = buf + buf_size - reclen;
1624 while ((char *) de <= top) { 1887 while ((char *) de <= top) {
1625 if (ext4_check_dir_entry(dir, NULL, de, bh, 1888 if (ext4_check_dir_entry(dir, NULL, de, bh,
1626 buf, buf_size, offset)) 1889 buf, buf_size, offset)) {
1627 return -EIO; 1890 res = -EIO;
1628 if (ext4_match(namelen, name, de)) 1891 goto return_result;
1629 return -EEXIST; 1892 }
 1893 /* Provide crypto context and crypto buffer to ext4_match() */
1894 res = ext4_match(ctx, &fname_crypto_str, namelen, name, de);
1895 if (res < 0)
1896 goto return_result;
1897 if (res > 0) {
1898 res = -EEXIST;
1899 goto return_result;
1900 }
1630 nlen = EXT4_DIR_REC_LEN(de->name_len); 1901 nlen = EXT4_DIR_REC_LEN(de->name_len);
1631 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1902 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1632 if ((de->inode ? rlen - nlen : rlen) >= reclen) 1903 if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1634,26 +1905,62 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1634 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1905 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1635 offset += rlen; 1906 offset += rlen;
1636 } 1907 }
1637 if ((char *) de > top)
1638 return -ENOSPC;
1639 1908
1640 *dest_de = de; 1909 if ((char *) de > top)
1641 return 0; 1910 res = -ENOSPC;
1911 else {
1912 *dest_de = de;
1913 res = 0;
1914 }
1915return_result:
1916 ext4_put_fname_crypto_ctx(&ctx);
1917 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1918 return res;
1642} 1919}
1643 1920
1644void ext4_insert_dentry(struct inode *inode, 1921int ext4_insert_dentry(struct inode *dir,
1645 struct ext4_dir_entry_2 *de, 1922 struct inode *inode,
1646 int buf_size, 1923 struct ext4_dir_entry_2 *de,
1647 const char *name, int namelen) 1924 int buf_size,
1925 const struct qstr *iname,
1926 const char *name, int namelen)
1648{ 1927{
1649 1928
1650 int nlen, rlen; 1929 int nlen, rlen;
1930 struct ext4_fname_crypto_ctx *ctx = NULL;
1931 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1932 struct ext4_str tmp_str;
1933 int res;
1934
1935 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1936 if (IS_ERR(ctx))
1937 return -EIO;
 1938 /* By default, the input name is written to the disk */
1939 tmp_str.name = (unsigned char *)name;
1940 tmp_str.len = namelen;
1941 if (ctx != NULL) {
1942 /* Directory is encrypted */
1943 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1944 &fname_crypto_str);
1945 if (res < 0) {
1946 ext4_put_fname_crypto_ctx(&ctx);
1947 return -ENOMEM;
1948 }
1949 res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str);
1950 if (res < 0) {
1951 ext4_put_fname_crypto_ctx(&ctx);
1952 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1953 return res;
1954 }
1955 tmp_str.name = fname_crypto_str.name;
1956 tmp_str.len = fname_crypto_str.len;
1957 }
1651 1958
1652 nlen = EXT4_DIR_REC_LEN(de->name_len); 1959 nlen = EXT4_DIR_REC_LEN(de->name_len);
1653 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1960 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1654 if (de->inode) { 1961 if (de->inode) {
1655 struct ext4_dir_entry_2 *de1 = 1962 struct ext4_dir_entry_2 *de1 =
1656 (struct ext4_dir_entry_2 *)((char *)de + nlen); 1963 (struct ext4_dir_entry_2 *)((char *)de + nlen);
1657 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); 1964 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
1658 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); 1965 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
1659 de = de1; 1966 de = de1;
@@ -1661,9 +1968,14 @@ void ext4_insert_dentry(struct inode *inode,
1661 de->file_type = EXT4_FT_UNKNOWN; 1968 de->file_type = EXT4_FT_UNKNOWN;
1662 de->inode = cpu_to_le32(inode->i_ino); 1969 de->inode = cpu_to_le32(inode->i_ino);
1663 ext4_set_de_type(inode->i_sb, de, inode->i_mode); 1970 ext4_set_de_type(inode->i_sb, de, inode->i_mode);
1664 de->name_len = namelen; 1971 de->name_len = tmp_str.len;
1665 memcpy(de->name, name, namelen); 1972
1973 memcpy(de->name, tmp_str.name, tmp_str.len);
1974 ext4_put_fname_crypto_ctx(&ctx);
1975 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1976 return 0;
1666} 1977}
1978
1667/* 1979/*
1668 * Add a new entry into a directory (leaf) block. If de is non-NULL, 1980 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1669 * it points to a directory entry which is guaranteed to be large 1981 * it points to a directory entry which is guaranteed to be large
@@ -1700,8 +2012,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1700 return err; 2012 return err;
1701 } 2013 }
1702 2014
1703 /* By now the buffer is marked for journaling */ 2015 /* By now the buffer is marked for journaling. Due to crypto operations,
1704 ext4_insert_dentry(inode, de, blocksize, name, namelen); 2016 * the following function call may fail */
2017 err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name,
2018 name, namelen);
2019 if (err < 0)
2020 return err;
1705 2021
1706 /* 2022 /*
1707 * XXX shouldn't update any times until successful 2023 * XXX shouldn't update any times until successful
@@ -1733,8 +2049,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1733 struct inode *inode, struct buffer_head *bh) 2049 struct inode *inode, struct buffer_head *bh)
1734{ 2050{
1735 struct inode *dir = dentry->d_parent->d_inode; 2051 struct inode *dir = dentry->d_parent->d_inode;
2052#ifdef CONFIG_EXT4_FS_ENCRYPTION
2053 struct ext4_fname_crypto_ctx *ctx = NULL;
2054 int res;
2055#else
1736 const char *name = dentry->d_name.name; 2056 const char *name = dentry->d_name.name;
1737 int namelen = dentry->d_name.len; 2057 int namelen = dentry->d_name.len;
2058#endif
1738 struct buffer_head *bh2; 2059 struct buffer_head *bh2;
1739 struct dx_root *root; 2060 struct dx_root *root;
1740 struct dx_frame frames[2], *frame; 2061 struct dx_frame frames[2], *frame;
@@ -1748,7 +2069,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1748 struct dx_hash_info hinfo; 2069 struct dx_hash_info hinfo;
1749 ext4_lblk_t block; 2070 ext4_lblk_t block;
1750 struct fake_dirent *fde; 2071 struct fake_dirent *fde;
1751 int csum_size = 0; 2072 int csum_size = 0;
2073
2074#ifdef CONFIG_EXT4_FS_ENCRYPTION
2075 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
2076 if (IS_ERR(ctx))
2077 return PTR_ERR(ctx);
2078#endif
1752 2079
1753 if (ext4_has_metadata_csum(inode->i_sb)) 2080 if (ext4_has_metadata_csum(inode->i_sb))
1754 csum_size = sizeof(struct ext4_dir_entry_tail); 2081 csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -1815,7 +2142,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1815 if (hinfo.hash_version <= DX_HASH_TEA) 2142 if (hinfo.hash_version <= DX_HASH_TEA)
1816 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 2143 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1817 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 2144 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2145#ifdef CONFIG_EXT4_FS_ENCRYPTION
2146 res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo);
2147 if (res < 0) {
2148 ext4_put_fname_crypto_ctx(&ctx);
2149 ext4_mark_inode_dirty(handle, dir);
2150 brelse(bh);
2151 return res;
2152 }
2153 ext4_put_fname_crypto_ctx(&ctx);
2154#else
1818 ext4fs_dirhash(name, namelen, &hinfo); 2155 ext4fs_dirhash(name, namelen, &hinfo);
2156#endif
1819 memset(frames, 0, sizeof(frames)); 2157 memset(frames, 0, sizeof(frames));
1820 frame = frames; 2158 frame = frames;
1821 frame->entries = entries; 2159 frame->entries = entries;
@@ -1865,7 +2203,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1865 struct inode *inode) 2203 struct inode *inode)
1866{ 2204{
1867 struct inode *dir = dentry->d_parent->d_inode; 2205 struct inode *dir = dentry->d_parent->d_inode;
1868 struct buffer_head *bh; 2206 struct buffer_head *bh = NULL;
1869 struct ext4_dir_entry_2 *de; 2207 struct ext4_dir_entry_2 *de;
1870 struct ext4_dir_entry_tail *t; 2208 struct ext4_dir_entry_tail *t;
1871 struct super_block *sb; 2209 struct super_block *sb;
@@ -1889,14 +2227,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1889 return retval; 2227 return retval;
1890 if (retval == 1) { 2228 if (retval == 1) {
1891 retval = 0; 2229 retval = 0;
1892 return retval; 2230 goto out;
1893 } 2231 }
1894 } 2232 }
1895 2233
1896 if (is_dx(dir)) { 2234 if (is_dx(dir)) {
1897 retval = ext4_dx_add_entry(handle, dentry, inode); 2235 retval = ext4_dx_add_entry(handle, dentry, inode);
1898 if (!retval || (retval != ERR_BAD_DX_DIR)) 2236 if (!retval || (retval != ERR_BAD_DX_DIR))
1899 return retval; 2237 goto out;
1900 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); 2238 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1901 dx_fallback++; 2239 dx_fallback++;
1902 ext4_mark_inode_dirty(handle, dir); 2240 ext4_mark_inode_dirty(handle, dir);
@@ -1908,14 +2246,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1908 return PTR_ERR(bh); 2246 return PTR_ERR(bh);
1909 2247
1910 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 2248 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1911 if (retval != -ENOSPC) { 2249 if (retval != -ENOSPC)
1912 brelse(bh); 2250 goto out;
1913 return retval;
1914 }
1915 2251
1916 if (blocks == 1 && !dx_fallback && 2252 if (blocks == 1 && !dx_fallback &&
1917 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 2253 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1918 return make_indexed_dir(handle, dentry, inode, bh); 2254 retval = make_indexed_dir(handle, dentry, inode, bh);
2255 bh = NULL; /* make_indexed_dir releases bh */
2256 goto out;
2257 }
1919 brelse(bh); 2258 brelse(bh);
1920 } 2259 }
1921 bh = ext4_append(handle, dir, &block); 2260 bh = ext4_append(handle, dir, &block);
@@ -1931,6 +2270,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1931 } 2270 }
1932 2271
1933 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 2272 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
2273out:
1934 brelse(bh); 2274 brelse(bh);
1935 if (retval == 0) 2275 if (retval == 0)
1936 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); 2276 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
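Note the ownership handoff above: make_indexed_dir() consumes the buffer head (per the new comment), so ext4_add_entry() nulls its local bh before falling through to the shared brelse() at out:. That turns what used to be several early returns with per-path brelse() calls into a single release point and removes the risk of a double brelse() on the make_indexed_dir() path.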
@@ -2237,7 +2577,20 @@ retry:
2237 inode->i_op = &ext4_file_inode_operations; 2577 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2578 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2579 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2580 err = 0;
2581#ifdef CONFIG_EXT4_FS_ENCRYPTION
2582 if (!err && (ext4_encrypted_inode(dir) ||
2583 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) {
2584 err = ext4_inherit_context(dir, inode);
2585 if (err) {
2586 clear_nlink(inode);
2587 unlock_new_inode(inode);
2588 iput(inode);
2589 }
2590 }
2591#endif
2592 if (!err)
2593 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2594 if (!err && IS_DIRSYNC(dir))
2242 ext4_handle_sync(handle); 2595 ext4_handle_sync(handle);
2243 } 2596 }
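Both ext4_create() above and ext4_mkdir() below gate ext4_inherit_context() on DUMMY_ENCRYPTION_ENABLED(). Its presumed definition, keyed off the mount flag set in the super.c hunks later in this patch (the macro itself lands in ext4.h; reproduced here as an assumption):

	#define DUMMY_ENCRYPTION_ENABLED(sbi) \
		(unlikely((sbi)->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION))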
@@ -2418,6 +2771,14 @@ retry:
2418 err = ext4_init_new_dir(handle, dir, inode); 2771 err = ext4_init_new_dir(handle, dir, inode);
2419 if (err) 2772 if (err)
2420 goto out_clear_inode; 2773 goto out_clear_inode;
2774#ifdef CONFIG_EXT4_FS_ENCRYPTION
2775 if (ext4_encrypted_inode(dir) ||
2776 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) {
2777 err = ext4_inherit_context(dir, inode);
2778 if (err)
2779 goto out_clear_inode;
2780 }
2781#endif
2421 err = ext4_mark_inode_dirty(handle, inode); 2782 err = ext4_mark_inode_dirty(handle, inode);
2422 if (!err) 2783 if (!err)
2423 err = ext4_add_entry(handle, dentry, inode); 2784 err = ext4_add_entry(handle, dentry, inode);
@@ -2450,7 +2811,7 @@ out_stop:
2450/* 2811/*
2451 * routine to check that the specified directory is empty (for rmdir) 2812 * routine to check that the specified directory is empty (for rmdir)
2452 */ 2813 */
2453static int empty_dir(struct inode *inode) 2814int ext4_empty_dir(struct inode *inode)
2454{ 2815{
2455 unsigned int offset; 2816 unsigned int offset;
2456 struct buffer_head *bh; 2817 struct buffer_head *bh;
@@ -2718,7 +3079,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2718 goto end_rmdir; 3079 goto end_rmdir;
2719 3080
2720 retval = -ENOTEMPTY; 3081 retval = -ENOTEMPTY;
2721 if (!empty_dir(inode)) 3082 if (!ext4_empty_dir(inode))
2722 goto end_rmdir; 3083 goto end_rmdir;
2723 3084
2724 handle = ext4_journal_start(dir, EXT4_HT_DIR, 3085 handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -2828,16 +3189,25 @@ static int ext4_symlink(struct inode *dir,
2828{ 3189{
2829 handle_t *handle; 3190 handle_t *handle;
2830 struct inode *inode; 3191 struct inode *inode;
2831 int l, err, retries = 0; 3192 int err, len = strlen(symname);
2832 int credits; 3193 int credits;
2833 3194 bool encryption_required;
2834 l = strlen(symname)+1; 3195 struct ext4_str disk_link;
2835 if (l > dir->i_sb->s_blocksize) 3196 struct ext4_encrypted_symlink_data *sd = NULL;
3197
3198 disk_link.len = len + 1;
3199 disk_link.name = (char *) symname;
3200
3201 encryption_required = (ext4_encrypted_inode(dir) ||
3202 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
3203 if (encryption_required)
3204 disk_link.len = encrypted_symlink_data_len(len) + 1;
3205 if (disk_link.len > dir->i_sb->s_blocksize)
2836 return -ENAMETOOLONG; 3206 return -ENAMETOOLONG;
2837 3207
2838 dquot_initialize(dir); 3208 dquot_initialize(dir);
2839 3209
2840 if (l > EXT4_N_BLOCKS * 4) { 3210 if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
2841 /* 3211 /*
2842 * For non-fast symlinks, we just allocate inode and put it on 3212 * For non-fast symlinks, we just allocate inode and put it on
2843 * orphan list in the first transaction => we need bitmap, 3213 * orphan list in the first transaction => we need bitmap,
@@ -2856,16 +3226,49 @@ static int ext4_symlink(struct inode *dir,
2856 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 3226 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2857 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; 3227 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
2858 } 3228 }
2859retry: 3229
2860 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, 3230 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
2861 &dentry->d_name, 0, NULL, 3231 &dentry->d_name, 0, NULL,
2862 EXT4_HT_DIR, credits); 3232 EXT4_HT_DIR, credits);
2863 handle = ext4_journal_current_handle(); 3233 handle = ext4_journal_current_handle();
2864 err = PTR_ERR(inode); 3234 if (IS_ERR(inode)) {
2865 if (IS_ERR(inode)) 3235 if (handle)
2866 goto out_stop; 3236 ext4_journal_stop(handle);
3237 return PTR_ERR(inode);
3238 }
3239
3240 if (encryption_required) {
3241 struct ext4_fname_crypto_ctx *ctx = NULL;
3242 struct qstr istr;
3243 struct ext4_str ostr;
3244
3245 sd = kzalloc(disk_link.len, GFP_NOFS);
3246 if (!sd) {
3247 err = -ENOMEM;
3248 goto err_drop_inode;
3249 }
3250 err = ext4_inherit_context(dir, inode);
3251 if (err)
3252 goto err_drop_inode;
3253 ctx = ext4_get_fname_crypto_ctx(inode,
3254 inode->i_sb->s_blocksize);
3255 if (IS_ERR_OR_NULL(ctx)) {
3256 /* We just set the policy, so ctx should not be NULL */
3257 err = (ctx == NULL) ? -EIO : PTR_ERR(ctx);
3258 goto err_drop_inode;
3259 }
3260 istr.name = (const unsigned char *) symname;
3261 istr.len = len;
3262 ostr.name = sd->encrypted_path;
3263 err = ext4_fname_usr_to_disk(ctx, &istr, &ostr);
3264 ext4_put_fname_crypto_ctx(&ctx);
3265 if (err < 0)
3266 goto err_drop_inode;
3267 sd->len = cpu_to_le16(ostr.len);
3268 disk_link.name = (char *) sd;
3269 }
2867 3270
2868 if (l > EXT4_N_BLOCKS * 4) { 3271 if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
2869 inode->i_op = &ext4_symlink_inode_operations; 3272 inode->i_op = &ext4_symlink_inode_operations;
2870 ext4_set_aops(inode); 3273 ext4_set_aops(inode);
2871 /* 3274 /*
@@ -2881,9 +3284,10 @@ retry:
2881 drop_nlink(inode); 3284 drop_nlink(inode);
2882 err = ext4_orphan_add(handle, inode); 3285 err = ext4_orphan_add(handle, inode);
2883 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3287 handle = NULL;
2884 if (err) 3288 if (err)
2885 goto err_drop_inode; 3289 goto err_drop_inode;
2886 err = __page_symlink(inode, symname, l, 1); 3290 err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
2887 if (err) 3291 if (err)
2888 goto err_drop_inode; 3292 goto err_drop_inode;
2889 /* 3293 /*
@@ -2895,34 +3299,37 @@ retry:
2895 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); 3299 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2896 if (IS_ERR(handle)) { 3300 if (IS_ERR(handle)) {
2897 err = PTR_ERR(handle); 3301 err = PTR_ERR(handle);
3302 handle = NULL;
2898 goto err_drop_inode; 3303 goto err_drop_inode;
2899 } 3304 }
2900 set_nlink(inode, 1); 3305 set_nlink(inode, 1);
2901 err = ext4_orphan_del(handle, inode); 3306 err = ext4_orphan_del(handle, inode);
2902 if (err) { 3307 if (err)
2903 ext4_journal_stop(handle);
2904 clear_nlink(inode);
2905 goto err_drop_inode; 3308 goto err_drop_inode;
2906 }
2907 } else { 3309 } else {
2908 /* clear the extent format for fast symlink */ 3310 /* clear the extent format for fast symlink */
2909 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); 3311 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2910 inode->i_op = &ext4_fast_symlink_inode_operations; 3312 inode->i_op = encryption_required ?
2911 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 3313 &ext4_symlink_inode_operations :
2912 inode->i_size = l-1; 3314 &ext4_fast_symlink_inode_operations;
3315 memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
3316 disk_link.len);
3317 inode->i_size = disk_link.len - 1;
2913 } 3318 }
2914 EXT4_I(inode)->i_disksize = inode->i_size; 3319 EXT4_I(inode)->i_disksize = inode->i_size;
2915 err = ext4_add_nondir(handle, dentry, inode); 3320 err = ext4_add_nondir(handle, dentry, inode);
2916 if (!err && IS_DIRSYNC(dir)) 3321 if (!err && IS_DIRSYNC(dir))
2917 ext4_handle_sync(handle); 3322 ext4_handle_sync(handle);
2918 3323
2919out_stop:
2920 if (handle) 3324 if (handle)
2921 ext4_journal_stop(handle); 3325 ext4_journal_stop(handle);
2922 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 3326 kfree(sd);
2923 goto retry;
2924 return err; 3327 return err;
2925err_drop_inode: 3328err_drop_inode:
3329 if (handle)
3330 ext4_journal_stop(handle);
3331 kfree(sd);
3332 clear_nlink(inode);
2926 unlock_new_inode(inode); 3333 unlock_new_inode(inode);
2927 iput(inode); 3334 iput(inode);
2928 return err; 3335 return err;
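For reference, the on-disk layout behind sd->encrypted_path and encrypted_symlink_data_len() in the symlink code above, as presumably declared in ext4_crypto.h (quoted as an assumption): a little-endian ciphertext length followed by the ciphertext itself, which is why disk_link.len is the encrypted length plus header plus NUL.

	struct ext4_encrypted_symlink_data {
		__le16 len;		/* ciphertext length; cpu_to_le16() above */
		char encrypted_path[1];	/* variable-length ciphertext */
	} __attribute__((__packed__));

	#define encrypted_symlink_data_len(l) \
		((l) + sizeof(struct ext4_encrypted_symlink_data) - 1)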
@@ -2937,7 +3344,9 @@ static int ext4_link(struct dentry *old_dentry,
2937 3344
2938 if (inode->i_nlink >= EXT4_LINK_MAX) 3345 if (inode->i_nlink >= EXT4_LINK_MAX)
2939 return -EMLINK; 3346 return -EMLINK;
2940 3347 if (ext4_encrypted_inode(dir) &&
3348 !ext4_is_child_context_consistent_with_parent(dir, inode))
3349 return -EPERM;
2941 dquot_initialize(dir); 3350 dquot_initialize(dir);
2942 3351
2943retry: 3352retry:
@@ -3238,6 +3647,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3238 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) 3647 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3239 goto end_rename; 3648 goto end_rename;
3240 3649
3650 if ((old.dir != new.dir) &&
3651 ext4_encrypted_inode(new.dir) &&
3652 !ext4_is_child_context_consistent_with_parent(new.dir,
3653 old.inode)) {
3654 retval = -EPERM;
3655 goto end_rename;
3656 }
3657
3241 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3658 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3242 &new.de, &new.inlined); 3659 &new.de, &new.inlined);
3243 if (IS_ERR(new.bh)) { 3660 if (IS_ERR(new.bh)) {
@@ -3258,12 +3675,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3258 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 3675 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
3259 if (!(flags & RENAME_WHITEOUT)) { 3676 if (!(flags & RENAME_WHITEOUT)) {
3260 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); 3677 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
3261 if (IS_ERR(handle)) 3678 if (IS_ERR(handle)) {
3262 return PTR_ERR(handle); 3679 retval = PTR_ERR(handle);
3680 handle = NULL;
3681 goto end_rename;
3682 }
3263 } else { 3683 } else {
3264 whiteout = ext4_whiteout_for_rename(&old, credits, &handle); 3684 whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
3265 if (IS_ERR(whiteout)) 3685 if (IS_ERR(whiteout)) {
3266 return PTR_ERR(whiteout); 3686 retval = PTR_ERR(whiteout);
3687 whiteout = NULL;
3688 goto end_rename;
3689 }
3267 } 3690 }
3268 3691
3269 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3692 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
@@ -3272,7 +3695,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3272 if (S_ISDIR(old.inode->i_mode)) { 3695 if (S_ISDIR(old.inode->i_mode)) {
3273 if (new.inode) { 3696 if (new.inode) {
3274 retval = -ENOTEMPTY; 3697 retval = -ENOTEMPTY;
3275 if (!empty_dir(new.inode)) 3698 if (!ext4_empty_dir(new.inode))
3276 goto end_rename; 3699 goto end_rename;
3277 } else { 3700 } else {
3278 retval = -EMLINK; 3701 retval = -EMLINK;
@@ -3346,8 +3769,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3346 3769
3347 ext4_dec_count(handle, old.dir); 3770 ext4_dec_count(handle, old.dir);
3348 if (new.inode) { 3771 if (new.inode) {
3349 /* checked empty_dir above, can't have another parent, 3772 /* checked ext4_empty_dir above, can't have another
3350 * ext4_dec_count() won't work for many-linked dirs */ 3773 * parent, ext4_dec_count() won't work for many-linked
3774 * dirs */
3351 clear_nlink(new.inode); 3775 clear_nlink(new.inode);
3352 } else { 3776 } else {
3353 ext4_inc_count(handle, new.dir); 3777 ext4_inc_count(handle, new.dir);
@@ -3427,8 +3851,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3427 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, 3851 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3428 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 3852 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3429 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3853 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3430 if (IS_ERR(handle)) 3854 if (IS_ERR(handle)) {
3431 return PTR_ERR(handle); 3855 retval = PTR_ERR(handle);
3856 handle = NULL;
3857 goto end_rename;
3858 }
3432 3859
3433 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3860 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3434 ext4_handle_sync(handle); 3861 ext4_handle_sync(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 464984261e69..5765f88b3904 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/jbd2.h>
12#include <linux/highuid.h> 11#include <linux/highuid.h>
13#include <linux/pagemap.h> 12#include <linux/pagemap.h>
14#include <linux/quotaops.h> 13#include <linux/quotaops.h>
@@ -24,7 +23,6 @@
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/mm.h> 25#include <linux/mm.h>
27#include <linux/ratelimit.h>
28 26
29#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
30#include "xattr.h" 28#include "xattr.h"
@@ -68,6 +66,10 @@ static void ext4_finish_bio(struct bio *bio)
68 66
69 bio_for_each_segment_all(bvec, bio, i) { 67 bio_for_each_segment_all(bvec, bio, i) {
70 struct page *page = bvec->bv_page; 68 struct page *page = bvec->bv_page;
69#ifdef CONFIG_EXT4_FS_ENCRYPTION
70 struct page *data_page = NULL;
71 struct ext4_crypto_ctx *ctx = NULL;
72#endif
71 struct buffer_head *bh, *head; 73 struct buffer_head *bh, *head;
72 unsigned bio_start = bvec->bv_offset; 74 unsigned bio_start = bvec->bv_offset;
73 unsigned bio_end = bio_start + bvec->bv_len; 75 unsigned bio_end = bio_start + bvec->bv_len;
@@ -77,6 +79,15 @@ static void ext4_finish_bio(struct bio *bio)
77 if (!page) 79 if (!page)
78 continue; 80 continue;
79 81
82#ifdef CONFIG_EXT4_FS_ENCRYPTION
83 if (!page->mapping) {
84 /* The bounce data pages are unmapped. */
85 data_page = page;
86 ctx = (struct ext4_crypto_ctx *)page_private(data_page);
87 page = ctx->control_page;
88 }
89#endif
90
80 if (error) { 91 if (error) {
81 SetPageError(page); 92 SetPageError(page);
82 set_bit(AS_EIO, &page->mapping->flags); 93 set_bit(AS_EIO, &page->mapping->flags);
@@ -101,8 +112,13 @@ static void ext4_finish_bio(struct bio *bio)
101 } while ((bh = bh->b_this_page) != head); 112 } while ((bh = bh->b_this_page) != head);
102 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 113 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
103 local_irq_restore(flags); 114 local_irq_restore(flags);
104 if (!under_io) 115 if (!under_io) {
116#ifdef CONFIG_EXT4_FS_ENCRYPTION
117 if (ctx)
118 ext4_restore_control_page(data_page);
119#endif
105 end_page_writeback(page); 120 end_page_writeback(page);
121 }
106 } 122 }
107} 123}
108 124
@@ -377,6 +393,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
377 393
378static int io_submit_add_bh(struct ext4_io_submit *io, 394static int io_submit_add_bh(struct ext4_io_submit *io,
379 struct inode *inode, 395 struct inode *inode,
396 struct page *page,
380 struct buffer_head *bh) 397 struct buffer_head *bh)
381{ 398{
382 int ret; 399 int ret;
@@ -390,7 +407,7 @@ submit_and_retry:
390 if (ret) 407 if (ret)
391 return ret; 408 return ret;
392 } 409 }
393 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 410 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
394 if (ret != bh->b_size) 411 if (ret != bh->b_size)
395 goto submit_and_retry; 412 goto submit_and_retry;
396 io->io_next_block++; 413 io->io_next_block++;
@@ -403,6 +420,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
403 struct writeback_control *wbc, 420 struct writeback_control *wbc,
404 bool keep_towrite) 421 bool keep_towrite)
405{ 422{
423 struct page *data_page = NULL;
406 struct inode *inode = page->mapping->host; 424 struct inode *inode = page->mapping->host;
407 unsigned block_start, blocksize; 425 unsigned block_start, blocksize;
408 struct buffer_head *bh, *head; 426 struct buffer_head *bh, *head;
@@ -462,19 +480,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
462 set_buffer_async_write(bh); 480 set_buffer_async_write(bh);
463 } while ((bh = bh->b_this_page) != head); 481 } while ((bh = bh->b_this_page) != head);
464 482
465 /* Now submit buffers to write */
466 bh = head = page_buffers(page); 483 bh = head = page_buffers(page);
484
485 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
486 data_page = ext4_encrypt(inode, page);
487 if (IS_ERR(data_page)) {
488 ret = PTR_ERR(data_page);
489 data_page = NULL;
490 goto out;
491 }
492 }
493
494 /* Now submit buffers to write */
467 do { 495 do {
468 if (!buffer_async_write(bh)) 496 if (!buffer_async_write(bh))
469 continue; 497 continue;
470 ret = io_submit_add_bh(io, inode, bh); 498 ret = io_submit_add_bh(io, inode,
499 data_page ? data_page : page, bh);
471 if (ret) { 500 if (ret) {
472 /* 501 /*
473 * We only get here on ENOMEM. Not much else 502 * We only get here on ENOMEM. Not much else
474 * we can do but mark the page as dirty, and 503 * we can do but mark the page as dirty, and
475 * better luck next time. 504 * better luck next time.
476 */ 505 */
477 redirty_page_for_writepage(wbc, page);
478 break; 506 break;
479 } 507 }
480 nr_submitted++; 508 nr_submitted++;
@@ -483,6 +511,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
483 511
484 /* Error stopped previous loop? Clean up buffers... */ 512 /* Error stopped previous loop? Clean up buffers... */
485 if (ret) { 513 if (ret) {
514 out:
515 if (data_page)
516 ext4_restore_control_page(data_page);
517 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
518 redirty_page_for_writepage(wbc, page);
486 do { 519 do {
487 clear_buffer_async_write(bh); 520 clear_buffer_async_write(bh);
488 bh = bh->b_this_page; 521 bh = bh->b_this_page;
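The write path above never encrypts in place: ext4_encrypt() returns a separate bounce page holding the ciphertext, and the plaintext cache page stays untouched. A sketch of the linkage that ext4_finish_bio() depends on; only the page_private()/ctx->control_page wiring is taken from the code above, the allocation details are assumptions:

	/* inside ext4_encrypt(), roughly: */
	struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
	struct page *bounce_page = alloc_page(GFP_NOFS);   /* ciphertext */

	ctx->control_page = page;	/* the plaintext cache page */
	set_page_private(bounce_page, (unsigned long)ctx);

	/* ext4_finish_bio() inverts it: page_private(data_page) -> ctx,
	 * ctx->control_page -> cache page to end writeback on, and
	 * ext4_restore_control_page(data_page) frees bounce page + ctx. */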
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
new file mode 100644
index 000000000000..171b9ac4b45e
--- /dev/null
+++ b/fs/ext4/readpage.c
@@ -0,0 +1,328 @@
1/*
2 * linux/fs/ext4/readpage.c
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2015, Google, Inc.
6 *
7 * This was originally taken from fs/mpage.c
8 *
 9 * The ext4_mpage_readpages() function here is intended to replace
 10 * mpage_readpages() in the general case, not just for encrypted
 11 * files. It has some limitations (see below), where it will fall
 12 * back to block_read_full_page(), but these limitations should
 13 * only be hit when page_size != block_size.
14 *
15 * This will allow us to attach a callback function to support ext4
16 * encryption.
17 *
18 * If anything unusual happens, such as:
19 *
20 * - encountering a page which has buffers
21 * - encountering a page which has a non-hole after a hole
22 * - encountering a page with non-contiguous blocks
23 *
24 * then this code just gives up and calls the buffer_head-based read function.
25 * It does handle a page which has holes at the end - that is a common case:
26 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
27 *
28 */
29
30#include <linux/kernel.h>
31#include <linux/export.h>
32#include <linux/mm.h>
33#include <linux/kdev_t.h>
34#include <linux/gfp.h>
35#include <linux/bio.h>
36#include <linux/fs.h>
37#include <linux/buffer_head.h>
38#include <linux/blkdev.h>
39#include <linux/highmem.h>
40#include <linux/prefetch.h>
41#include <linux/mpage.h>
42#include <linux/writeback.h>
43#include <linux/backing-dev.h>
44#include <linux/pagevec.h>
45#include <linux/cleancache.h>
46
47#include "ext4.h"
48
49/*
50 * Call ext4_decrypt on every single page, reusing the encryption
51 * context.
52 */
53static void completion_pages(struct work_struct *work)
54{
55#ifdef CONFIG_EXT4_FS_ENCRYPTION
56 struct ext4_crypto_ctx *ctx =
57 container_of(work, struct ext4_crypto_ctx, work);
58 struct bio *bio = ctx->bio;
59 struct bio_vec *bv;
60 int i;
61
62 bio_for_each_segment_all(bv, bio, i) {
63 struct page *page = bv->bv_page;
64
65 int ret = ext4_decrypt(ctx, page);
66 if (ret) {
67 WARN_ON_ONCE(1);
68 SetPageError(page);
69 } else
70 SetPageUptodate(page);
71 unlock_page(page);
72 }
73 ext4_release_crypto_ctx(ctx);
74 bio_put(bio);
75#else
76 BUG();
77#endif
78}
79
80static inline bool ext4_bio_encrypted(struct bio *bio)
81{
82#ifdef CONFIG_EXT4_FS_ENCRYPTION
83 return unlikely(bio->bi_private != NULL);
84#else
85 return false;
86#endif
87}
88
89/*
90 * I/O completion handler for multipage BIOs.
91 *
92 * The mpage code never puts partial pages into a BIO (except for end-of-file).
93 * If a page does not map to a contiguous run of blocks then it simply falls
94 * back to block_read_full_page().
95 *
96 * Why is this? If a page's completion depends on a number of different BIOs
97 * which can complete in any order (or at the same time) then determining the
98 * status of that page is hard. See end_buffer_async_read() for the details.
99 * There is no point in duplicating all that complexity.
100 */
101static void mpage_end_io(struct bio *bio, int err)
102{
103 struct bio_vec *bv;
104 int i;
105
106 if (ext4_bio_encrypted(bio)) {
107 struct ext4_crypto_ctx *ctx = bio->bi_private;
108
109 if (err) {
110 ext4_release_crypto_ctx(ctx);
111 } else {
112 INIT_WORK(&ctx->work, completion_pages);
113 ctx->bio = bio;
114 queue_work(ext4_read_workqueue, &ctx->work);
115 return;
116 }
117 }
118 bio_for_each_segment_all(bv, bio, i) {
119 struct page *page = bv->bv_page;
120
121 if (!err) {
122 SetPageUptodate(page);
123 } else {
124 ClearPageUptodate(page);
125 SetPageError(page);
126 }
127 unlock_page(page);
128 }
129
130 bio_put(bio);
131}
132
133int ext4_mpage_readpages(struct address_space *mapping,
134 struct list_head *pages, struct page *page,
135 unsigned nr_pages)
136{
137 struct bio *bio = NULL;
138 unsigned page_idx;
139 sector_t last_block_in_bio = 0;
140
141 struct inode *inode = mapping->host;
142 const unsigned blkbits = inode->i_blkbits;
143 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
144 const unsigned blocksize = 1 << blkbits;
145 sector_t block_in_file;
146 sector_t last_block;
147 sector_t last_block_in_file;
148 sector_t blocks[MAX_BUF_PER_PAGE];
149 unsigned page_block;
150 struct block_device *bdev = inode->i_sb->s_bdev;
151 int length;
152 unsigned relative_block = 0;
153 struct ext4_map_blocks map;
154
155 map.m_pblk = 0;
156 map.m_lblk = 0;
157 map.m_len = 0;
158 map.m_flags = 0;
159
160 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
161 int fully_mapped = 1;
162 unsigned first_hole = blocks_per_page;
163
164 prefetchw(&page->flags);
165 if (pages) {
166 page = list_entry(pages->prev, struct page, lru);
167 list_del(&page->lru);
168 if (add_to_page_cache_lru(page, mapping,
169 page->index, GFP_KERNEL))
170 goto next_page;
171 }
172
173 if (page_has_buffers(page))
174 goto confused;
175
176 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
177 last_block = block_in_file + nr_pages * blocks_per_page;
178 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
179 if (last_block > last_block_in_file)
180 last_block = last_block_in_file;
181 page_block = 0;
182
183 /*
184 * Map blocks using the previous result first.
185 */
186 if ((map.m_flags & EXT4_MAP_MAPPED) &&
187 block_in_file > map.m_lblk &&
188 block_in_file < (map.m_lblk + map.m_len)) {
189 unsigned map_offset = block_in_file - map.m_lblk;
190 unsigned last = map.m_len - map_offset;
191
192 for (relative_block = 0; ; relative_block++) {
193 if (relative_block == last) {
194 /* needed? */
195 map.m_flags &= ~EXT4_MAP_MAPPED;
196 break;
197 }
198 if (page_block == blocks_per_page)
199 break;
200 blocks[page_block] = map.m_pblk + map_offset +
201 relative_block;
202 page_block++;
203 block_in_file++;
204 }
205 }
206
207 /*
208 * Then do more ext4_map_blocks() calls until we are
209 * done with this page.
210 */
211 while (page_block < blocks_per_page) {
212 if (block_in_file < last_block) {
213 map.m_lblk = block_in_file;
214 map.m_len = last_block - block_in_file;
215
216 if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
217 set_error_page:
218 SetPageError(page);
219 zero_user_segment(page, 0,
220 PAGE_CACHE_SIZE);
221 unlock_page(page);
222 goto next_page;
223 }
224 }
225 if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
226 fully_mapped = 0;
227 if (first_hole == blocks_per_page)
228 first_hole = page_block;
229 page_block++;
230 block_in_file++;
231 continue;
232 }
233 if (first_hole != blocks_per_page)
234 goto confused; /* hole -> non-hole */
235
236 /* Contiguous blocks? */
237 if (page_block && blocks[page_block-1] != map.m_pblk-1)
238 goto confused;
239 for (relative_block = 0; ; relative_block++) {
240 if (relative_block == map.m_len) {
241 /* needed? */
242 map.m_flags &= ~EXT4_MAP_MAPPED;
243 break;
244 } else if (page_block == blocks_per_page)
245 break;
246 blocks[page_block] = map.m_pblk+relative_block;
247 page_block++;
248 block_in_file++;
249 }
250 }
251 if (first_hole != blocks_per_page) {
252 zero_user_segment(page, first_hole << blkbits,
253 PAGE_CACHE_SIZE);
254 if (first_hole == 0) {
255 SetPageUptodate(page);
256 unlock_page(page);
257 goto next_page;
258 }
259 } else if (fully_mapped) {
260 SetPageMappedToDisk(page);
261 }
262 if (fully_mapped && blocks_per_page == 1 &&
263 !PageUptodate(page) && cleancache_get_page(page) == 0) {
264 SetPageUptodate(page);
265 goto confused;
266 }
267
268 /*
269 * This page will go to BIO. Do we need to send this
270 * BIO off first?
271 */
272 if (bio && (last_block_in_bio != blocks[0] - 1)) {
273 submit_and_realloc:
274 submit_bio(READ, bio);
275 bio = NULL;
276 }
277 if (bio == NULL) {
278 struct ext4_crypto_ctx *ctx = NULL;
279
280 if (ext4_encrypted_inode(inode) &&
281 S_ISREG(inode->i_mode)) {
282 ctx = ext4_get_crypto_ctx(inode);
283 if (IS_ERR(ctx))
284 goto set_error_page;
285 }
286 bio = bio_alloc(GFP_KERNEL,
287 min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
288 if (!bio) {
289 if (ctx)
290 ext4_release_crypto_ctx(ctx);
291 goto set_error_page;
292 }
293 bio->bi_bdev = bdev;
294 bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
295 bio->bi_end_io = mpage_end_io;
296 bio->bi_private = ctx;
297 }
298
299 length = first_hole << blkbits;
300 if (bio_add_page(bio, page, length, 0) < length)
301 goto submit_and_realloc;
302
303 if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
304 (relative_block == map.m_len)) ||
305 (first_hole != blocks_per_page)) {
306 submit_bio(READ, bio);
307 bio = NULL;
308 } else
309 last_block_in_bio = blocks[blocks_per_page - 1];
310 goto next_page;
311 confused:
312 if (bio) {
313 submit_bio(READ, bio);
314 bio = NULL;
315 }
316 if (!PageUptodate(page))
317 block_read_full_page(page, ext4_get_block);
318 else
319 unlock_page(page);
320 next_page:
321 if (pages)
322 page_cache_release(page);
323 }
324 BUG_ON(pages && !list_empty(pages));
325 if (bio)
326 submit_bio(READ, bio);
327 return 0;
328}
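Two notes on ext4_mpage_readpages(). First, a worked example of the fallback: with 4 KiB pages and 1 KiB blocks, blkbits = 10 and blocks_per_page = 4096 >> 10 = 4, so a page is sent in a single bio only if all four logical blocks map to consecutive physical blocks; any discontiguity, or data after a hole, takes the confused: path into block_read_full_page(). Second, the completion dispatch, summarized below (the atomic-context rationale is an inference, not stated in the patch):

	/* mpage_end_io() dispatch, as implemented above:
	 *   bio->bi_private == NULL -> plain read: SetPageUptodate()
	 *                              and unlock_page() inline;
	 *   bio->bi_private == ctx  -> encrypted read: defer to
	 *                              ext4_read_workqueue, since
	 *                              ext4_decrypt() may sleep and bio
	 *                              completion runs in atomic context.
	 */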
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d348c7d29d80..821f22dbe825 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -21,7 +21,6 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/jbd2.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/init.h> 25#include <linux/init.h>
27#include <linux/blkdev.h> 26#include <linux/blkdev.h>
@@ -323,22 +322,6 @@ static void save_error_info(struct super_block *sb, const char *func,
323 ext4_commit_super(sb, 1); 322 ext4_commit_super(sb, 1);
324} 323}
325 324
326/*
327 * The del_gendisk() function uninitializes the disk-specific data
328 * structures, including the bdi structure, without telling anyone
329 * else. Once this happens, any attempt to call mark_buffer_dirty()
330 * (for example, by ext4_commit_super), will cause a kernel OOPS.
331 * This is a kludge to prevent these oops until we can put in a proper
332 * hook in del_gendisk() to inform the VFS and file system layers.
333 */
334static int block_device_ejected(struct super_block *sb)
335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338
339 return bdi->dev == NULL;
340}
341
342static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) 325static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
343{ 326{
344 struct super_block *sb = journal->j_private; 327 struct super_block *sb = journal->j_private;
@@ -893,6 +876,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
893 atomic_set(&ei->i_ioend_count, 0); 876 atomic_set(&ei->i_ioend_count, 0);
894 atomic_set(&ei->i_unwritten, 0); 877 atomic_set(&ei->i_unwritten, 0);
895 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 878 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
879#ifdef CONFIG_EXT4_FS_ENCRYPTION
880 ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
881#endif
896 882
897 return &ei->vfs_inode; 883 return &ei->vfs_inode;
898} 884}
@@ -1120,7 +1106,7 @@ enum {
1120 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, 1106 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
1121 Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, 1107 Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
1122 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1108 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1123 Opt_data_err_abort, Opt_data_err_ignore, 1109 Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1110 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1111 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1112 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
@@ -1211,6 +1197,7 @@ static const match_table_t tokens = {
1211 {Opt_init_itable, "init_itable"}, 1197 {Opt_init_itable, "init_itable"},
1212 {Opt_noinit_itable, "noinit_itable"}, 1198 {Opt_noinit_itable, "noinit_itable"},
1213 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, 1199 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1200 {Opt_test_dummy_encryption, "test_dummy_encryption"},
1214 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1201 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1215 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1202 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1216 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 1203 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1412,6 +1399,7 @@ static const struct mount_opts {
1412 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 1399 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1413 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 1400 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1414 {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 1401 {Opt_max_dir_size_kb, 0, MOPT_GTE0},
1402 {Opt_test_dummy_encryption, 0, MOPT_GTE0},
1415 {Opt_err, 0, 0} 1403 {Opt_err, 0, 0}
1416}; 1404};
1417 1405
@@ -1588,6 +1576,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1588 } 1576 }
1589 *journal_ioprio = 1577 *journal_ioprio =
1590 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); 1578 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1579 } else if (token == Opt_test_dummy_encryption) {
1580#ifdef CONFIG_EXT4_FS_ENCRYPTION
1581 sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
1582 ext4_msg(sb, KERN_WARNING,
1583 "Test dummy encryption mode enabled");
1584#else
1585 ext4_msg(sb, KERN_WARNING,
1586 "Test dummy encryption mount option ignored");
1587#endif
1591 } else if (m->flags & MOPT_DATAJ) { 1588 } else if (m->flags & MOPT_DATAJ) {
1592 if (is_remount) { 1589 if (is_remount) {
1593 if (!sbi->s_journal) 1590 if (!sbi->s_journal)
@@ -2685,11 +2682,13 @@ static struct attribute *ext4_attrs[] = {
2685EXT4_INFO_ATTR(lazy_itable_init); 2682EXT4_INFO_ATTR(lazy_itable_init);
2686EXT4_INFO_ATTR(batched_discard); 2683EXT4_INFO_ATTR(batched_discard);
2687EXT4_INFO_ATTR(meta_bg_resize); 2684EXT4_INFO_ATTR(meta_bg_resize);
2685EXT4_INFO_ATTR(encryption);
2688 2686
2689static struct attribute *ext4_feat_attrs[] = { 2687static struct attribute *ext4_feat_attrs[] = {
2690 ATTR_LIST(lazy_itable_init), 2688 ATTR_LIST(lazy_itable_init),
2691 ATTR_LIST(batched_discard), 2689 ATTR_LIST(batched_discard),
2692 ATTR_LIST(meta_bg_resize), 2690 ATTR_LIST(meta_bg_resize),
2691 ATTR_LIST(encryption),
2693 NULL, 2692 NULL,
2694}; 2693};
2695 2694
@@ -3448,6 +3447,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3448 if (sb->s_bdev->bd_part) 3447 if (sb->s_bdev->bd_part)
3449 sbi->s_sectors_written_start = 3448 sbi->s_sectors_written_start =
3450 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3449 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
3450#ifdef CONFIG_EXT4_FS_ENCRYPTION
3451 /* Modes of operations for file and directory encryption. */
3452 sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
3453 sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
3454#endif
3451 3455
3452 /* Cleanup superblock name */ 3456 /* Cleanup superblock name */
3453 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3457 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
@@ -3692,6 +3696,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3692 } 3696 }
3693 } 3697 }
3694 3698
3699 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
3700 es->s_encryption_level) {
3701 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
3702 es->s_encryption_level);
3703 goto failed_mount;
3704 }
3705
3695 if (sb->s_blocksize != blocksize) { 3706 if (sb->s_blocksize != blocksize) {
3696 /* Validate the filesystem blocksize */ 3707 /* Validate the filesystem blocksize */
3697 if (!sb_set_blocksize(sb, blocksize)) { 3708 if (!sb_set_blocksize(sb, blocksize)) {
@@ -4054,6 +4065,13 @@ no_journal:
4054 } 4065 }
4055 } 4066 }
4056 4067
4068 if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
4069 !(sb->s_flags & MS_RDONLY) &&
4070 !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
4071 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
4072 ext4_commit_super(sb, 1);
4073 }
4074
4057 /* 4075 /*
4058 * Get the # of file system overhead blocks from the 4076 * Get the # of file system overhead blocks from the
4059 * superblock if present. 4077 * superblock if present.
@@ -4570,7 +4588,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4570 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 4588 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4571 int error = 0; 4589 int error = 0;
4572 4590
4573 if (!sbh || block_device_ejected(sb)) 4591 if (!sbh)
4574 return error; 4592 return error;
4575 if (buffer_write_io_error(sbh)) { 4593 if (buffer_write_io_error(sbh)) {
4576 /* 4594 /*
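Operational note on the super.c changes: mounting writable with -o test_dummy_encryption permanently sets EXT4_FEATURE_INCOMPAT_ENCRYPT on the superblock (see the EXT4_SET_INCOMPAT_FEATURE() + ext4_commit_super() hunk above), after which kernels without encryption support will refuse to mount the filesystem. As the warning text says, this mode is for testing only.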
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ff3711932018..136ca0e911fd 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -18,13 +18,101 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/namei.h> 21#include <linux/namei.h>
23#include "ext4.h" 22#include "ext4.h"
24#include "xattr.h" 23#include "xattr.h"
25 24
25#ifdef CONFIG_EXT4_FS_ENCRYPTION
26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct page *cpage = NULL;
29 char *caddr, *paddr = NULL;
30 struct ext4_str cstr, pstr;
31 struct inode *inode = dentry->d_inode;
32 struct ext4_fname_crypto_ctx *ctx = NULL;
33 struct ext4_encrypted_symlink_data *sd;
34 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
35 int res;
36 u32 plen, max_size = inode->i_sb->s_blocksize;
37
38 if (!ext4_encrypted_inode(inode))
39 return page_follow_link_light(dentry, nd);
40
41 ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
42 if (IS_ERR(ctx))
43 return ctx;
44
45 if (ext4_inode_is_fast_symlink(inode)) {
46 caddr = (char *) EXT4_I(dentry->d_inode)->i_data;
47 max_size = sizeof(EXT4_I(dentry->d_inode)->i_data);
48 } else {
49 cpage = read_mapping_page(inode->i_mapping, 0, NULL);
50 if (IS_ERR(cpage)) {
51 ext4_put_fname_crypto_ctx(&ctx);
52 return cpage;
53 }
54 caddr = kmap(cpage);
55 caddr[size] = 0;
56 }
57
58 /* Symlink is encrypted */
59 sd = (struct ext4_encrypted_symlink_data *)caddr;
60 cstr.name = sd->encrypted_path;
 61 cstr.len = le16_to_cpu(sd->len); /* sd->len is __le16 */
62 if ((cstr.len +
63 sizeof(struct ext4_encrypted_symlink_data) - 1) >
64 max_size) {
65 /* Symlink data on the disk is corrupted */
66 res = -EIO;
67 goto errout;
68 }
69 plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
70 EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
71 paddr = kmalloc(plen + 1, GFP_NOFS);
72 if (!paddr) {
73 res = -ENOMEM;
74 goto errout;
75 }
76 pstr.name = paddr;
77 res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr);
78 if (res < 0)
79 goto errout;
80 /* Null-terminate the name */
81 if (res <= plen)
82 paddr[res] = '\0';
83 nd_set_link(nd, paddr);
84 ext4_put_fname_crypto_ctx(&ctx);
85 if (cpage) {
86 kunmap(cpage);
87 page_cache_release(cpage);
88 }
89 return NULL;
90errout:
91 ext4_put_fname_crypto_ctx(&ctx);
92 if (cpage) {
93 kunmap(cpage);
94 page_cache_release(cpage);
95 }
96 kfree(paddr);
97 return ERR_PTR(res);
98}
99
100static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
101 void *cookie)
102{
103 struct page *page = cookie;
104
105 if (!page) {
106 kfree(nd_get_link(nd));
107 } else {
108 kunmap(page);
109 page_cache_release(page);
110 }
111}
112#endif
113
114static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
115{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 116 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char *) ei->i_data); 117 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 118 return NULL;
@@ -32,8 +120,13 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
32 120
33const struct inode_operations ext4_symlink_inode_operations = { 121const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 122 .readlink = generic_readlink,
123#ifdef CONFIG_EXT4_FS_ENCRYPTION
124 .follow_link = ext4_follow_link,
125 .put_link = ext4_put_link,
126#else
35 .follow_link = page_follow_link_light, 127 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 128 .put_link = page_put_link,
129#endif
37 .setattr = ext4_setattr, 130 .setattr = ext4_setattr,
38 .setxattr = generic_setxattr, 131 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 132 .getxattr = generic_getxattr,
@@ -43,7 +136,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
43 136
44const struct inode_operations ext4_fast_symlink_inode_operations = { 137const struct inode_operations ext4_fast_symlink_inode_operations = {
45 .readlink = generic_readlink, 138 .readlink = generic_readlink,
46 .follow_link = ext4_follow_link, 139 .follow_link = ext4_follow_fast_link,
47 .setattr = ext4_setattr, 140 .setattr = ext4_setattr,
48 .setxattr = generic_setxattr, 141 .setxattr = generic_setxattr,
49 .getxattr = generic_getxattr, 142 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1e09fc77395c..759842ff8af0 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -55,7 +55,6 @@
55#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/mbcache.h> 56#include <linux/mbcache.h>
57#include <linux/quotaops.h> 57#include <linux/quotaops.h>
58#include <linux/rwsem.h>
59#include "ext4_jbd2.h" 58#include "ext4_jbd2.h"
60#include "ext4.h" 59#include "ext4.h"
61#include "xattr.h" 60#include "xattr.h"
@@ -639,8 +638,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
639 free += EXT4_XATTR_LEN(name_len); 638 free += EXT4_XATTR_LEN(name_len);
640 } 639 }
641 if (i->value) { 640 if (i->value) {
642 if (free < EXT4_XATTR_SIZE(i->value_len) || 641 if (free < EXT4_XATTR_LEN(name_len) +
643 free < EXT4_XATTR_LEN(name_len) +
644 EXT4_XATTR_SIZE(i->value_len)) 642 EXT4_XATTR_SIZE(i->value_len))
645 return -ENOSPC; 643 return -ENOSPC;
646 } 644 }
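The dropped first comparison in ext4_xattr_set_entry() was redundant rather than wrong: EXT4_XATTR_LEN(name_len) is always positive, so whenever free < EXT4_XATTR_SIZE(i->value_len) holds, free < EXT4_XATTR_LEN(name_len) + EXT4_XATTR_SIZE(i->value_len) holds as well. The remaining test is the stronger of the two, and the -ENOSPC behaviour is unchanged.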
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 29bedf5589f6..ddc0957760ba 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -23,6 +23,7 @@
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7 24#define EXT4_XATTR_INDEX_SYSTEM 7
25#define EXT4_XATTR_INDEX_RICHACL 8 25#define EXT4_XATTR_INDEX_RICHACL 8
26#define EXT4_XATTR_INDEX_ENCRYPTION 9
26 27
27struct ext4_xattr_header { 28struct ext4_xattr_header {
28 __le32 h_magic; /* magic number for identification */ 29 __le32 h_magic; /* magic number for identification */
@@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler;
98extern const struct xattr_handler ext4_xattr_trusted_handler; 99extern const struct xattr_handler ext4_xattr_trusted_handler;
99extern const struct xattr_handler ext4_xattr_security_handler; 100extern const struct xattr_handler ext4_xattr_security_handler;
100 101
102#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
103
101extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 104extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
102 105
103extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); 106extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
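The new index/name pair means an inode's encryption context is stored as an ordinary xattr. A sketch of the presumed fetch, using the ext4_xattr_get() prototype above (the context struct layout is an assumption from ext4_crypto.h; the return value is the byte count read, or -errno):

	struct ext4_encryption_context ctx;
	int res;

	res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
			     &ctx, sizeof(ctx));
	if (res != sizeof(ctx))
		return res < 0 ? res : -EINVAL;	/* missing or short context */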
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 94e2d2ffabe1..05f0f663f14c 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,5 +1,5 @@
1config F2FS_FS 1config F2FS_FS
2 tristate "F2FS filesystem support (EXPERIMENTAL)" 2 tristate "F2FS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 help 4 help
5 F2FS is based on Log-structured File System (LFS), which supports 5 F2FS is based on Log-structured File System (LFS), which supports
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 742202779bd5..4320ffab3495 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -351,13 +351,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
 
 	*acl = f2fs_acl_clone(p, GFP_NOFS);
 	if (!*acl)
-		return -ENOMEM;
+		goto no_mem;
 
 	ret = f2fs_acl_create_masq(*acl, mode);
-	if (ret < 0) {
-		posix_acl_release(*acl);
-		return -ENOMEM;
-	}
+	if (ret < 0)
+		goto no_mem_clone;
 
 	if (ret == 0) {
 		posix_acl_release(*acl);
@@ -378,6 +376,12 @@ no_acl:
 	*default_acl = NULL;
 	*acl = NULL;
 	return 0;
+
+no_mem_clone:
+	posix_acl_release(*acl);
+no_mem:
+	posix_acl_release(p);
+	return -ENOMEM;
 }
 
 int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
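
The acl hunk above replaces duplicated cleanup with the usual goto-based
unwind, so both the clone (*acl) and the source ACL (p) are released on every
failure path. A minimal sketch of the idiom with hypothetical alloc/release
helpers (not f2fs code):

	#include <stdlib.h>

	static void *alloc_res(void) { return malloc(16); }
	static void release_res(void *p) { free(p); }

	static int do_setup(void)
	{
		void *a, *b;

		a = alloc_res();
		if (!a)
			goto fail;
		b = alloc_res();
		if (!b)
			goto fail_a;		/* unwind in reverse order */

		/* ... use a and b ... */
		release_res(b);
		release_res(a);
		return 0;

	fail_a:
		release_res(a);
	fail:
		return -1;			/* stands in for -ENOMEM */
	}

	int main(void) { return do_setup(); }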
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7f794b72b3b7..a5e17a2a0781 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -276,7 +276,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			if (f2fs_write_meta_page(page, &wbc)) {
+			if (mapping->a_ops->writepage(page, &wbc)) {
 				unlock_page(page);
 				break;
 			}
@@ -464,20 +464,19 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
 void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
-	block_t start_blk, orphan_blkaddr, i, j;
+	block_t start_blk, orphan_blocks, i, j;
 
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return;
 
 	set_sbi_flag(sbi, SBI_POR_DOING);
 
-	start_blk = __start_cp_addr(sbi) + 1 +
-		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
-	orphan_blkaddr = __start_sum_addr(sbi) - 1;
+	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
+	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
-	ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+	ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
 
-	for (i = 0; i < orphan_blkaddr; i++) {
+	for (i = 0; i < orphan_blocks; i++) {
 		struct page *page = get_meta_page(sbi, start_blk + i);
 		struct f2fs_orphan_block *orphan_blk;
 
@@ -615,7 +614,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
 	unsigned long blk_size = sbi->blocksize;
 	unsigned long long cp1_version = 0, cp2_version = 0;
 	unsigned long long cp_start_blk_no;
-	unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+	unsigned int cp_blks = 1 + __cp_payload(sbi);
 	block_t cp_blk_no;
 	int i;
 
@@ -796,6 +795,7 @@ retry:
 		 * writebacking dentry pages in the freeing inode.
 		 */
 		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+		cond_resched();
 	}
 	goto retry;
 }
@@ -884,7 +884,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	__u32 crc32 = 0;
 	void *kaddr;
 	int i;
-	int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+	int cp_payload_blks = __cp_payload(sbi);
 
 	/*
 	 * This avoids to conduct wrong roll-forward operations and uses
@@ -1048,17 +1048,18 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
 
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
-
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-		cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
+		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC))
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
 	if (f2fs_readonly(sbi->sb))
 		goto out;
+
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+
 	if (block_operations(sbi))
 		goto out;
 
@@ -1085,6 +1086,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
+
+	if (cpc->reason == CP_RECOVERY)
+		f2fs_msg(sbi->sb, KERN_NOTICE,
+			"checkpoint: version = %llx", ckpt_ver);
 out:
 	mutex_unlock(&sbi->cp_mutex);
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
@@ -1103,14 +1108,9 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
 		im->ino_num = 0;
 	}
 
-	/*
-	 * considering 512 blocks in a segment 8 blocks are needed for cp
-	 * and log segment summaries. Remaining blocks are used to keep
-	 * orphan entries with the limitation one reserved segment
-	 * for cp pack we can have max 1020*504 orphan entries
-	 */
 	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
-			NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
+			NR_CURSEG_TYPE - __cp_payload(sbi)) *
+				F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init create_checkpoint_caches(void)
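
With the change above, max_orphans also accounts for the checkpoint payload
blocks, so the orphan capacity matches what one cp pack can really hold. A
quick userspace check of the formula (the constants — 512 blocks per segment,
2 cp pack blocks, 6 active logs, 1020 orphan entries per block, 1 payload
block — are plausible values for illustration, not read from the headers):

	#include <stdio.h>

	int main(void)
	{
		unsigned blocks_per_seg = 512, cp_packs = 2, nr_curseg = 6;
		unsigned orphans_per_blk = 1020, cp_payload = 1;
		unsigned max_orphans = (blocks_per_seg - cp_packs -
				nr_curseg - cp_payload) * orphans_per_blk;

		printf("max_orphans = %u\n", max_orphans);	/* 513060 */
		return 0;
	}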
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 319eda511c4f..b91b0e10678e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -25,6 +25,9 @@
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
 static void f2fs_read_end_io(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
@@ -197,7 +200,7 @@ alloc_new:
  * ->node_page
  *	update block addresses in the node page
  */
-static void __set_data_blkaddr(struct dnode_of_data *dn)
+void set_data_blkaddr(struct dnode_of_data *dn)
 {
 	struct f2fs_node *rn;
 	__le32 *addr_array;
@@ -226,7 +229,7 @@ int reserve_new_block(struct dnode_of_data *dn)
 	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
 
 	dn->data_blkaddr = NEW_ADDR;
-	__set_data_blkaddr(dn);
+	set_data_blkaddr(dn);
 	mark_inode_dirty(dn->inode);
 	sync_inode_page(dn);
 	return 0;
@@ -248,73 +251,62 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
-					struct buffer_head *bh_result)
+static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs,
+			struct extent_info *ei, struct buffer_head *bh_result)
+{
+	unsigned int blkbits = sb->s_blocksize_bits;
+	size_t max_size = bh_result->b_size;
+	size_t mapped_size;
+
+	clear_buffer_new(bh_result);
+	map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs);
+	mapped_size = (ei->fofs + ei->len - pgofs) << blkbits;
+	bh_result->b_size = min(max_size, mapped_size);
+}
+
+static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return 0;
-
-	read_lock(&fi->ext.ext_lock);
+	read_lock(&fi->ext_lock);
 	if (fi->ext.len == 0) {
-		read_unlock(&fi->ext.ext_lock);
-		return 0;
+		read_unlock(&fi->ext_lock);
+		return false;
 	}
 
 	stat_inc_total_hit(inode->i_sb);
 
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
+	start_blkaddr = fi->ext.blk;
 
 	if (pgofs >= start_fofs && pgofs <= end_fofs) {
-		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
-		size_t count;
-
-		set_buffer_new(bh_result);
-		map_bh(bh_result, inode->i_sb,
-				start_blkaddr + pgofs - start_fofs);
-		count = end_fofs - pgofs + 1;
-		if (count < (UINT_MAX >> blkbits))
-			bh_result->b_size = (count << blkbits);
-		else
-			bh_result->b_size = UINT_MAX;
-
+		*ei = fi->ext;
 		stat_inc_read_hit(inode->i_sb);
-		read_unlock(&fi->ext.ext_lock);
-		return 1;
+		read_unlock(&fi->ext_lock);
+		return true;
 	}
-	read_unlock(&fi->ext.ext_lock);
-	return 0;
+	read_unlock(&fi->ext_lock);
+	return false;
 }
 
-void update_extent_cache(struct dnode_of_data *dn)
+static bool update_extent_info(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
 {
-	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
-	pgoff_t fofs, start_fofs, end_fofs;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
-	/* Update the page address in the parent node */
-	__set_data_blkaddr(dn);
-
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return;
-
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
-
-	write_lock(&fi->ext.ext_lock);
+	write_lock(&fi->ext_lock);
 
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
-	end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk;
+	end_blkaddr = fi->ext.blk + fi->ext.len - 1;
 
 	/* Drop and initialize the matched extent */
 	if (fi->ext.len == 1 && fofs == start_fofs)
@@ -322,24 +314,24 @@ void update_extent_cache(struct dnode_of_data *dn)
 
 	/* Initial extent */
 	if (fi->ext.len == 0) {
-		if (dn->data_blkaddr != NULL_ADDR) {
+		if (blkaddr != NULL_ADDR) {
 			fi->ext.fofs = fofs;
-			fi->ext.blk_addr = dn->data_blkaddr;
+			fi->ext.blk = blkaddr;
 			fi->ext.len = 1;
 		}
 		goto end_update;
 	}
 
 	/* Front merge */
-	if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
+	if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
 		fi->ext.fofs--;
-		fi->ext.blk_addr--;
+		fi->ext.blk--;
 		fi->ext.len++;
 		goto end_update;
 	}
 
 	/* Back merge */
-	if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
+	if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
 		fi->ext.len++;
 		goto end_update;
 	}
@@ -351,8 +343,7 @@ void update_extent_cache(struct dnode_of_data *dn)
 			fi->ext.len = fofs - start_fofs;
 		} else {
 			fi->ext.fofs = fofs + 1;
-			fi->ext.blk_addr = start_blkaddr +
-					fofs - start_fofs + 1;
+			fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
 			fi->ext.len -= fofs - start_fofs + 1;
 		}
 	} else {
@@ -366,27 +357,583 @@ void update_extent_cache(struct dnode_of_data *dn)
 		need_update = true;
 	}
 end_update:
-	write_unlock(&fi->ext.ext_lock);
-	if (need_update)
-		sync_inode_page(dn);
+	write_unlock(&fi->ext_lock);
+	return need_update;
+}
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct rb_node *parent, struct rb_node **p)
+{
+	struct extent_node *en;
+
+	en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+	if (!en)
+		return NULL;
+
+	en->ei = *ei;
+	INIT_LIST_HEAD(&en->list);
+
+	rb_link_node(&en->rb_node, parent, p);
+	rb_insert_color(&en->rb_node, &et->root);
+	et->count++;
+	atomic_inc(&sbi->total_ext_node);
+	return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	rb_erase(&en->rb_node, &et->root);
+	et->count--;
+	atomic_dec(&sbi->total_ext_node);
+
+	if (et->cached_en == en)
+		et->cached_en = NULL;
+}
+
+static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
+							nid_t ino)
+{
+	struct extent_tree *et;
+
+	down_read(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		up_read(&sbi->extent_tree_lock);
+		return NULL;
+	}
+	atomic_inc(&et->refcount);
+	up_read(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	nid_t ino = inode->i_ino;
+
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+		memset(et, 0, sizeof(struct extent_tree));
+		et->ino = ino;
+		et->root = RB_ROOT;
+		et->cached_en = NULL;
+		rwlock_init(&et->lock);
+		atomic_set(&et->refcount, 0);
+		et->count = 0;
+		sbi->total_ext_tree++;
+	}
+	atomic_inc(&et->refcount);
+	up_write(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
+							unsigned int fofs)
+{
+	struct rb_node *node = et->root.rb_node;
+	struct extent_node *en;
+
+	if (et->cached_en) {
+		struct extent_info *cei = &et->cached_en->ei;
+
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+			return et->cached_en;
+	}
+
+	while (node) {
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (fofs < en->ei.fofs) {
+			node = node->rb_left;
+		} else if (fofs >= en->ei.fofs + en->ei.len) {
+			node = node->rb_right;
+		} else {
+			et->cached_en = en;
+			return en;
+		}
+	}
+	return NULL;
+}
+
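__lookup_extent_tree above probes cached_en — the most recently hit extent —
before walking the rb-tree, which pays off for sequential reads. A
self-contained sketch of the containment test it relies on (plain struct
instead of the rb-tree; illustrative, not f2fs code):

	#include <stdio.h>
	#include <stdbool.h>

	struct ext { unsigned fofs, blk, len; };

	/* does file offset fofs fall inside extent e? */
	static bool ext_contains(const struct ext *e, unsigned fofs)
	{
		return e->fofs <= fofs && fofs < e->fofs + e->len;
	}

	int main(void)
	{
		struct ext cached = { .fofs = 100, .blk = 5000, .len = 8 };
		unsigned fofs = 104;

		if (ext_contains(&cached, fofs))	/* hit: block 5004 */
			printf("hit: block %u\n",
				cached.blk + (fofs - cached.fofs));
		else
			printf("miss: walk the tree\n");
		return 0;
	}
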
+static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *prev;
+	struct rb_node *node;
+
+	node = rb_prev(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	prev = rb_entry(node, struct extent_node, rb_node);
+	if (__is_back_mergeable(&en->ei, &prev->ei)) {
+		en->ei.fofs = prev->ei.fofs;
+		en->ei.blk = prev->ei.blk;
+		en->ei.len += prev->ei.len;
+		__detach_extent_node(sbi, et, prev);
+		return prev;
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *next;
+	struct rb_node *node;
+
+	node = rb_next(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	next = rb_entry(node, struct extent_node, rb_node);
+	if (__is_front_mergeable(&en->ei, &next->ei)) {
+		en->ei.len += next->ei.len;
+		__detach_extent_node(sbi, et, next);
+		return next;
+	}
+	return NULL;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct extent_node **den)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en;
+
+	while (*p) {
+		parent = *p;
+		en = rb_entry(parent, struct extent_node, rb_node);
+
+		if (ei->fofs < en->ei.fofs) {
+			if (__is_front_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.fofs = ei->fofs;
+				en->ei.blk = ei->blk;
+				en->ei.len += ei->len;
+				*den = __try_back_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_left;
+		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
+			if (__is_back_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.len += ei->len;
+				*den = __try_front_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			f2fs_bug_on(sbi, 1);
+		}
+	}
+
+	return __attach_extent_node(sbi, et, ei, parent, p);
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+					struct extent_tree *et, bool free_all)
+{
+	struct rb_node *node, *next;
+	struct extent_node *en;
+	unsigned int count = et->count;
+
+	node = rb_first(&et->root);
+	while (node) {
+		next = rb_next(node);
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (free_all) {
+			spin_lock(&sbi->extent_lock);
+			if (!list_empty(&en->list))
+				list_del_init(&en->list);
+			spin_unlock(&sbi->extent_lock);
+		}
+
+		if (free_all || list_empty(&en->list)) {
+			__detach_extent_node(sbi, et, en);
+			kmem_cache_free(extent_node_slab, en);
+		}
+		node = next;
+	}
+
+	return count - et->count;
+}
+
+static void f2fs_init_extent_tree(struct inode *inode,
+						struct f2fs_extent *i_ext)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+	struct extent_info ei;
+
+	if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+		return;
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+	if (et->count)
+		goto out;
+
+	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+	en = __insert_extent_tree(sbi, et, &ei, NULL);
+	if (en) {
+		et->cached_en = en;
+
+		spin_lock(&sbi->extent_lock);
+		list_add_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+	}
+out:
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+
+	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		return false;
+
+	read_lock(&et->lock);
+	en = __lookup_extent_tree(et, pgofs);
+	if (en) {
+		*ei = en->ei;
+		spin_lock(&sbi->extent_lock);
+		if (!list_empty(&en->list))
+			list_move_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+		stat_inc_read_hit(sbi->sb);
+	}
+	stat_inc_total_hit(sbi->sb);
+	read_unlock(&et->lock);
+
+	trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
+
+	atomic_dec(&et->refcount);
+	return en ? true : false;
+}
+
+static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+	struct extent_node *den = NULL;
+	struct extent_info ei, dei;
+	unsigned int endofs;
+
+	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+
+	/* 1. lookup and remove existing extent info in cache */
+	en = __lookup_extent_tree(et, fofs);
+	if (!en)
+		goto update_extent;
+
+	dei = en->ei;
+	__detach_extent_node(sbi, et, en);
+
+	/* 2. if extent can be split more, split and insert the left part */
+	if (dei.len > 1) {
+		/* insert left part of split extent into cache */
+		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, dei.fofs, dei.blk,
+							fofs - dei.fofs);
+			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+
+		/* insert right part of split extent into cache */
+		endofs = dei.fofs + dei.len - 1;
+		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, fofs + 1,
+				fofs - dei.fofs + dei.blk, endofs - fofs);
+			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+	}
+
+update_extent:
+	/* 3. update extent in extent cache */
+	if (blkaddr) {
+		set_extent_info(&ei, fofs, blkaddr, 1);
+		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+	}
+
+	/* 4. update in global extent list */
+	spin_lock(&sbi->extent_lock);
+	if (en && !list_empty(&en->list))
+		list_del(&en->list);
+	/*
+	 * en1 and en2 split from en, they will become more and more smaller
+	 * fragments after splitting several times. So if the length is smaller
+	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
+	 */
+	if (en1)
+		list_add_tail(&en1->list, &sbi->extent_list);
+	if (en2)
+		list_add_tail(&en2->list, &sbi->extent_list);
+	if (en3) {
+		if (list_empty(&en3->list))
+			list_add_tail(&en3->list, &sbi->extent_list);
+		else
+			list_move_tail(&en3->list, &sbi->extent_list);
+	}
+	if (den && !list_empty(&den->list))
+		list_del(&den->list);
+	spin_unlock(&sbi->extent_lock);
+
+	/* 5. release extent node */
+	if (en)
+		kmem_cache_free(extent_node_slab, en);
+	if (den)
+		kmem_cache_free(extent_node_slab, den);
+
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
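Step 2 of f2fs_update_extent_tree splits an existing extent when a block
inside it is rewritten: the pieces left and right of the rewritten offset are
re-inserted only if each is still at least F2FS_MIN_EXTENT_LEN blocks. A
self-contained sketch of the split arithmetic (MIN_LEN stands in for
F2FS_MIN_EXTENT_LEN; the values are made up):

	#include <stdio.h>

	#define MIN_LEN	64

	struct ext { unsigned fofs, blk, len; };

	int main(void)
	{
		struct ext dei = { .fofs = 0, .blk = 1000, .len = 200 };
		unsigned fofs = 128;			/* block being rewritten */
		unsigned endofs = dei.fofs + dei.len - 1;

		if (fofs - dei.fofs >= MIN_LEN)		/* left piece [0,127] */
			printf("left: fofs=%u blk=%u len=%u\n",
				dei.fofs, dei.blk, fofs - dei.fofs);
		if (endofs - fofs >= MIN_LEN)		/* right piece [129,199] */
			printf("right: fofs=%u blk=%u len=%u\n",
				fofs + 1, dei.blk + (fofs + 1 - dei.fofs),
				endofs - fofs);
		return 0;
	}
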
+void f2fs_preserve_extent_tree(struct inode *inode)
+{
+	struct extent_tree *et;
+	struct extent_info *ext = &F2FS_I(inode)->ext;
+	bool sync = false;
+
+	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
+	if (!et) {
+		if (ext->len) {
+			ext->len = 0;
+			update_inode_page(inode);
+		}
+		return;
+	}
+
+	read_lock(&et->lock);
+	if (et->count) {
+		struct extent_node *en;
+
+		if (et->cached_en) {
+			en = et->cached_en;
+		} else {
+			struct rb_node *node = rb_first(&et->root);
+
+			if (!node)
+				node = rb_last(&et->root);
+			en = rb_entry(node, struct extent_node, rb_node);
+		}
+
+		if (__is_extent_same(ext, &en->ei))
+			goto out;
+
+		*ext = en->ei;
+		sync = true;
+	} else if (ext->len) {
+		ext->len = 0;
+		sync = true;
+	}
+out:
+	read_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+
+	if (sync)
+		update_inode_page(inode);
+}
+
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+	struct extent_node *en, *tmp;
+	unsigned long ino = F2FS_ROOT_INO(sbi);
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned int found;
+	unsigned int node_cnt = 0, tree_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	if (available_free_memory(sbi, EXTENT_CACHE))
+		return;
+
+	spin_lock(&sbi->extent_lock);
+	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+		if (!nr_shrink--)
+			break;
+		list_del_init(&en->list);
+	}
+	spin_unlock(&sbi->extent_lock);
+
+	down_read(&sbi->extent_tree_lock);
+	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			atomic_inc(&et->refcount);
+			write_lock(&et->lock);
+			node_cnt += __free_extent_tree(sbi, et, false);
+			write_unlock(&et->lock);
+			atomic_dec(&et->refcount);
+		}
+	}
+	up_read(&sbi->extent_tree_lock);
+
+	down_write(&sbi->extent_tree_lock);
+	radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
+							F2FS_ROOT_INO(sbi)) {
+		struct extent_tree *et = (struct extent_tree *)*slot;
+
+		if (!atomic_read(&et->refcount) && !et->count) {
+			radix_tree_delete(&sbi->extent_tree_root, et->ino);
+			kmem_cache_free(extent_tree_slab, et);
+			sbi->total_ext_tree--;
+			tree_cnt++;
+		}
+	}
+	up_write(&sbi->extent_tree_lock);
+
+	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+}
+
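The shrinker first unhooks the coldest entries from the global LRU under
extent_lock, then frees any node that is no longer on the LRU while holding
each tree's write lock. A condensed userspace sketch of that detach-and-free
pattern on a toy doubly linked list (illustrative only; no locking, and the
surviving hot entries are simply leaked here):

	#include <stdio.h>
	#include <stdlib.h>

	struct node {
		struct node *prev, *next;	/* self-linked means off-LRU */
		int id;
	};

	static void list_del_init(struct node *n)
	{
		n->prev->next = n->next;
		n->next->prev = n->prev;
		n->prev = n->next = n;		/* mark as detached */
	}

	int main(void)
	{
		struct node head = { &head, &head, -1 };
		struct node *n, *next;
		int i, nr_shrink = 2;

		for (i = 0; i < 4; i++) {	/* build a 4-entry LRU */
			n = malloc(sizeof(*n));
			n->id = i;
			n->prev = head.prev;
			n->next = &head;
			head.prev->next = n;
			head.prev = n;
		}

		/* detach the nr_shrink coldest entries, then free them */
		for (n = head.next; n != &head && nr_shrink--; n = next) {
			next = n->next;
			list_del_init(n);
			printf("freeing node %d\n", n->id);
			free(n);
		}
		return 0;
	}
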
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	unsigned int node_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		goto out;
+
+	/* free all extent info belonging to this extent tree */
+	write_lock(&et->lock);
+	node_cnt = __free_extent_tree(sbi, et, true);
+	write_unlock(&et->lock);
+
+	atomic_dec(&et->refcount);
+
+	/* try to find and delete extent tree entry in radix tree */
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
+	if (!et) {
+		up_write(&sbi->extent_tree_lock);
+		goto out;
+	}
+	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+	kmem_cache_free(extent_tree_slab, et);
+	sbi->total_ext_tree--;
+	up_write(&sbi->extent_tree_lock);
+out:
+	trace_f2fs_destroy_extent_tree(inode, node_cnt);
 	return;
 }
 
+void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
+{
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		f2fs_init_extent_tree(inode, i_ext);
+
+	write_lock(&F2FS_I(inode)->ext_lock);
+	get_extent_info(&F2FS_I(inode)->ext, *i_ext);
+	write_unlock(&F2FS_I(inode)->ext_lock);
+}
+
+static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return false;
+
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return f2fs_lookup_extent_tree(inode, pgofs, ei);
+
+	return lookup_extent_info(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+	if (is_inode_flag_set(fi, FI_NO_EXTENT))
+		return;
+
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
+	if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
+		return f2fs_update_extent_tree(dn->inode, fofs,
+							dn->data_blkaddr);
+
+	if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
+		sync_inode_page(dn);
+}
+
 struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
+	struct extent_info ei;
 	int err;
 	struct f2fs_io_info fio = {
 		.type = DATA,
 		.rw = sync ? READ_SYNC : READA,
 	};
 
+	/*
+	 * If sync is false, it needs to check its block allocation.
+	 * This is needed and triggered by two flows:
+	 * gc and truncate_partial_data_page.
+	 */
+	if (!sync)
+		goto search;
+
 	page = find_get_page(mapping, index);
 	if (page && PageUptodate(page))
 		return page;
 	f2fs_put_page(page, 0);
+search:
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
@@ -401,6 +948,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	if (unlikely(dn.data_blkaddr == NEW_ADDR))
 		return ERR_PTR(-EINVAL);
 
+got_it:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
@@ -435,6 +983,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
+	struct extent_info ei;
 	int err;
 	struct f2fs_io_info fio = {
 		.type = DATA,
@@ -445,6 +994,11 @@ repeat:
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
+
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 	if (err) {
@@ -458,6 +1012,7 @@ repeat:
 		return ERR_PTR(-ENOENT);
 	}
 
+got_it:
 	if (PageUptodate(page))
 		return page;
 
@@ -569,19 +1124,26 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
+
+	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+	if (dn->data_blkaddr == NEW_ADDR)
+		goto alloc;
+
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
+alloc:
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
 	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
 		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
+	allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
+								&sum, seg);
 
 	/* direct IO doesn't use extent cache to maximize the performance */
-	__set_data_blkaddr(dn);
+	set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -615,7 +1177,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 
 		while (dn.ofs_in_node < end_offset && len) {
-			if (dn.data_blkaddr == NULL_ADDR) {
+			block_t blkaddr;
+
+			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+			if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
 				if (__allocate_data_block(&dn))
 					goto sync_out;
 				allocated = true;
@@ -659,13 +1224,16 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 	pgoff_t pgofs, end_offset;
 	int err = 0, ofs = 1;
+	struct extent_info ei;
 	bool allocated = false;
 
 	/* Get the page offset from the block offset(iblock) */
 	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
 
-	if (check_extent_cache(inode, pgofs, bh_result))
+	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+		f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result);
 		goto out;
+	}
 
 	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
@@ -682,7 +1250,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
-		set_buffer_new(bh_result);
+		clear_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
@@ -727,6 +1295,7 @@ get_next:
 			if (err)
 				goto sync_out;
 			allocated = true;
+			set_buffer_new(bh_result);
 			blkaddr = dn.data_blkaddr;
 		}
 		/* Give more consecutive addresses for the readahead */
@@ -813,8 +1382,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (fio->blk_addr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR) {
+		ClearPageUptodate(page);
 		goto out_writepage;
+	}
 
 	set_page_writeback(page);
 
@@ -827,10 +1398,15 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 			need_inplace_update(inode))) {
 		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+		trace_f2fs_do_write_data_page(page, IPU);
 	} else {
 		write_data_page(page, &dn, fio);
-		update_extent_cache(&dn);
+		set_data_blkaddr(&dn);
+		f2fs_update_extent_cache(&dn);
+		trace_f2fs_do_write_data_page(page, OPU);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+		if (page->index == 0)
+			set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
 	}
 out_writepage:
 	f2fs_put_dnode(&dn);
@@ -909,6 +1485,8 @@ done:
 	clear_cold_data(page);
 out:
 	inode_dec_dirty_pages(inode);
+	if (err)
+		ClearPageUptodate(page);
 	unlock_page(page);
 	if (need_balance_fs)
 		f2fs_balance_fs(sbi);
@@ -935,7 +1513,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	bool locked = false;
 	int ret;
 	long diff;
 
@@ -950,15 +1527,13 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
+	/* during POR, we don't need to trigger writepage at all. */
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto skip_write;
+
 	diff = nr_pages_to_write(sbi, DATA, wbc);
 
-	if (!S_ISDIR(inode->i_mode)) {
-		mutex_lock(&sbi->writepages);
-		locked = true;
-	}
 	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
-	if (locked)
-		mutex_unlock(&sbi->writepages);
 
 	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
@@ -1236,6 +1811,37 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, get_data_block);
 }
 
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+	INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+	init_rwsem(&sbi->extent_tree_lock);
+	INIT_LIST_HEAD(&sbi->extent_list);
+	spin_lock_init(&sbi->extent_lock);
+	sbi->total_ext_tree = 0;
+	atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+	extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+			sizeof(struct extent_tree));
+	if (!extent_tree_slab)
+		return -ENOMEM;
+	extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+			sizeof(struct extent_node));
+	if (!extent_node_slab) {
+		kmem_cache_destroy(extent_tree_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_extent_cache(void)
+{
+	kmem_cache_destroy(extent_node_slab);
+	kmem_cache_destroy(extent_tree_slab);
+}
+
 const struct address_space_operations f2fs_dblock_aops = {
 	.readpage	= f2fs_read_data_page,
 	.readpages	= f2fs_read_data_pages,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index e671373cc8ab..f5388f37217e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -35,6 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	/* validation check of the segment numbers */
 	si->hit_ext = sbi->read_hit_ext;
 	si->total_ext = sbi->total_hit_ext;
+	si->ext_tree = sbi->total_ext_tree;
+	si->ext_node = atomic_read(&sbi->total_ext_node);
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
 	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
 	si->ndirty_dirs = sbi->n_dirty_dirs;
@@ -185,6 +187,9 @@ get_cache:
 	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+	si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+	si->cache_mem += atomic_read(&sbi->total_ext_node) *
+						sizeof(struct extent_node);
 
 	si->page_mem = 0;
 	npages = NODE_MAPPING(sbi)->nrpages;
@@ -260,13 +265,20 @@ static int stat_show(struct seq_file *s, void *v)
 	seq_printf(s, "CP calls: %d\n", si->cp_count);
 	seq_printf(s, "GC calls: %d (BG: %d)\n",
 			si->call_count, si->bg_gc);
-	seq_printf(s, "  - data segments : %d\n", si->data_segs);
-	seq_printf(s, "  - node segments : %d\n", si->node_segs);
-	seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
-	seq_printf(s, "  - data blocks : %d\n", si->data_blks);
-	seq_printf(s, "  - node blocks : %d\n", si->node_blks);
+	seq_printf(s, "  - data segments : %d (%d)\n",
+			si->data_segs, si->bg_data_segs);
+	seq_printf(s, "  - node segments : %d (%d)\n",
+			si->node_segs, si->bg_node_segs);
+	seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
+			si->bg_data_blks + si->bg_node_blks);
+	seq_printf(s, "  - data blocks : %d (%d)\n", si->data_blks,
+			si->bg_data_blks);
+	seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
+			si->bg_node_blks);
 	seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			si->hit_ext, si->total_ext);
+	seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
+	seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
 	seq_puts(s, "\nBalancing F2FS Async:\n");
 	seq_printf(s, " - inmem: %4d, wb: %4d\n",
 			si->inmem_pages, si->wb_pages);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b74097a7f6d9..3a3302ab7871 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -59,9 +59,8 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
 };
 
-void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
 {
-	umode_t mode = inode->i_mode;
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
@@ -127,22 +126,19 @@ struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
 	*max_slots = 0;
 	while (bit_pos < d->max) {
 		if (!test_bit_le(bit_pos, d->bitmap)) {
-			if (bit_pos == 0)
-				max_len = 1;
-			else if (!test_bit_le(bit_pos - 1, d->bitmap))
-				max_len++;
 			bit_pos++;
+			max_len++;
 			continue;
 		}
+
 		de = &d->dentry[bit_pos];
 		if (early_match_name(name->len, namehash, de) &&
 			!memcmp(d->filename[bit_pos], name->name, name->len))
 			goto found;
 
-		if (max_slots && *max_slots >= 0 && max_len > *max_slots) {
+		if (max_slots && max_len > *max_slots)
 			*max_slots = max_len;
-			max_len = 0;
-		}
+		max_len = 0;
 
 		/* remain bug on condition */
 		if (unlikely(!de->name_len))
@@ -219,14 +215,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 	unsigned int max_depth;
 	unsigned int level;
 
+	*res_page = NULL;
+
 	if (f2fs_has_inline_dentry(dir))
 		return find_in_inline_dir(dir, child, res_page);
 
 	if (npages == 0)
 		return NULL;
 
-	*res_page = NULL;
-
 	name_hash = f2fs_dentry_hash(child);
 	max_depth = F2FS_I(dir)->i_current_depth;
 
@@ -285,7 +281,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	lock_page(page);
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
+	set_de_type(de, inode->i_mode);
 	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -331,14 +327,14 @@ void do_make_empty_dir(struct inode *inode, struct inode *parent,
 	de->hash_code = 0;
 	de->ino = cpu_to_le32(inode->i_ino);
 	memcpy(d->filename[0], ".", 1);
-	set_de_type(de, inode);
+	set_de_type(de, inode->i_mode);
 
 	de = &d->dentry[1];
 	de->hash_code = 0;
 	de->name_len = cpu_to_le16(2);
 	de->ino = cpu_to_le32(parent->i_ino);
 	memcpy(d->filename[1], "..", 2);
-	set_de_type(de, inode);
+	set_de_type(de, parent->i_mode);
 
 	test_and_set_bit_le(0, (void *)d->bitmap);
 	test_and_set_bit_le(1, (void *)d->bitmap);
@@ -435,7 +431,7 @@ error:
 void update_parent_metadata(struct inode *dir, struct inode *inode,
 						unsigned int current_depth)
 {
-	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
 		if (S_ISDIR(inode->i_mode)) {
 			inc_nlink(dir);
 			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -450,7 +446,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
 		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
 
@@ -474,30 +470,47 @@ next:
 	goto next;
 }
 
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+			const struct qstr *name, f2fs_hash_t name_hash,
+			unsigned int bit_pos)
+{
+	struct f2fs_dir_entry *de;
+	int slots = GET_DENTRY_SLOTS(name->len);
+	int i;
+
+	de = &d->dentry[bit_pos];
+	de->hash_code = name_hash;
+	de->name_len = cpu_to_le16(name->len);
+	memcpy(d->filename[bit_pos], name->name, name->len);
+	de->ino = cpu_to_le32(ino);
+	set_de_type(de, mode);
+	for (i = 0; i < slots; i++)
+		test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+}
+
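f2fs_update_dentry fills in one entry and then marks every slot the name
occupies in the dentry bitmap, since long names span several slots. A sketch
of the slot math (SLOT_LEN mirrors what F2FS_SLOT_LEN is assumed to be, 8
bytes of name per slot; verify against f2fs.h):

	#include <stdio.h>

	#define SLOT_LEN	8			/* name bytes per slot */
	#define SLOTS(len)	(((len) + SLOT_LEN - 1) / SLOT_LEN)

	int main(void)
	{
		unsigned long bitmap = 0;		/* toy dentry bitmap */
		unsigned bit_pos = 3, len = 20, i;

		/* a 20-byte name occupies ceil(20/8) = 3 consecutive slots */
		for (i = 0; i < SLOTS(len); i++)
			bitmap |= 1UL << (bit_pos + i);
		printf("slots=%u bitmap=%#lx\n", SLOTS(len), bitmap);
		return 0;
	}
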
 /*
  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
  * f2fs_unlock_op().
  */
 int __f2fs_add_link(struct inode *dir, const struct qstr *name,
-				struct inode *inode)
+				struct inode *inode, nid_t ino, umode_t mode)
 {
 	unsigned int bit_pos;
 	unsigned int level;
 	unsigned int current_depth;
 	unsigned long bidx, block;
 	f2fs_hash_t dentry_hash;
-	struct f2fs_dir_entry *de;
 	unsigned int nbucket, nblock;
 	size_t namelen = name->len;
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
 	int slots = GET_DENTRY_SLOTS(namelen);
-	struct page *page;
+	struct page *page = NULL;
 	int err = 0;
-	int i;
 
 	if (f2fs_has_inline_dentry(dir)) {
-		err = f2fs_add_inline_entry(dir, name, inode);
+		err = f2fs_add_inline_entry(dir, name, inode, ino, mode);
 		if (!err || err != -EAGAIN)
 			return err;
 		else
@@ -547,30 +560,31 @@ start:
 add_dentry:
 	f2fs_wait_on_page_writeback(dentry_page, DATA);
 
-	down_write(&F2FS_I(inode)->i_sem);
-	page = init_inode_metadata(inode, dir, name, NULL);
-	if (IS_ERR(page)) {
-		err = PTR_ERR(page);
-		goto fail;
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
 	}
-	de = &dentry_blk->dentry[bit_pos];
-	de->hash_code = dentry_hash;
-	de->name_len = cpu_to_le16(namelen);
-	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
-	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
-	for (i = 0; i < slots; i++)
-		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos);
+
 	set_page_dirty(dentry_page);
 
-	/* we don't need to mark_inode_dirty now */
-	F2FS_I(inode)->i_pino = dir->i_ino;
-	update_inode(inode, page);
-	f2fs_put_page(page, 1);
+	if (inode) {
+		/* we don't need to mark_inode_dirty now */
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
 
 	update_parent_metadata(dir, inode, current_depth);
 fail:
-	up_write(&F2FS_I(inode)->i_sem);
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
 
 	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
 		update_inode_page(dir);
@@ -669,6 +683,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	if (bit_pos == NR_DENTRY_IN_BLOCK) {
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
+		ClearPagePrivate(page);
 		ClearPageUptodate(page);
 		inode_dec_dirty_pages(dir);
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fa3313ab0e2..c06a25e5cec3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -50,6 +50,7 @@
 #define F2FS_MOUNT_FLUSH_MERGE		0x00000400
 #define F2FS_MOUNT_NOBARRIER		0x00000800
 #define F2FS_MOUNT_FASTBOOT		0x00001000
+#define F2FS_MOUNT_EXTENT_CACHE		0x00002000
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -102,6 +103,7 @@ enum {
 	CP_UMOUNT,
 	CP_FASTBOOT,
 	CP_SYNC,
+	CP_RECOVERY,
 	CP_DISCARD,
 };
 
@@ -216,6 +218,15 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
+/*
+ * should be same as XFS_IOC_GOINGDOWN.
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define F2FS_IOC_SHUTDOWN	_IOR('X', 125, __u32)	/* Shutdown */
+#define F2FS_GOING_DOWN_FULLSYNC	0x0	/* going down with full sync */
+#define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
+#define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -273,14 +284,34 @@ enum {
 
 #define MAX_DIR_RA_PAGES	4	/* maximum ra pages of dir */
 
+/* vector size for gang look-up from extent cache that consists of radix tree */
+#define EXT_TREE_VEC_SIZE	64
+
 /* for in-memory extent cache entry */
-#define F2FS_MIN_EXTENT_LEN	16	/* minimum extent length */
+#define F2FS_MIN_EXTENT_LEN	64	/* minimum extent length */
+
+/* number of extent info in extent cache we try to shrink */
+#define EXTENT_CACHE_SHRINK_NUMBER	128
 
 struct extent_info {
-	rwlock_t ext_lock;	/* rwlock for consistency */
-	unsigned int fofs;	/* start offset in a file */
-	u32 blk_addr;		/* start block address of the extent */
-	unsigned int len;	/* length of the extent */
+	unsigned int fofs;	/* start offset in a file */
+	u32 blk;		/* start block address of the extent */
+	unsigned int len;	/* length of the extent */
+};
+
+struct extent_node {
+	struct rb_node rb_node;		/* rb node located in rb-tree */
+	struct list_head list;		/* node in global extent list of sbi */
+	struct extent_info ei;		/* extent info */
+};
+
+struct extent_tree {
+	nid_t ino;			/* inode number */
+	struct rb_root root;		/* root of extent info rb-tree */
+	struct extent_node *cached_en;	/* recently accessed extent node */
+	rwlock_t lock;			/* protect extent info rb-tree */
+	atomic_t refcount;		/* reference count of rb-tree */
+	unsigned int count;		/* # of extent node in rb-tree */
 };
 
 /*
@@ -309,6 +340,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
+	rwlock_t ext_lock;		/* rwlock for single extent cache */
 	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
@@ -319,21 +351,51 @@ struct f2fs_inode_info {
 static inline void get_extent_info(struct extent_info *ext,
 					struct f2fs_extent i_ext)
 {
-	write_lock(&ext->ext_lock);
 	ext->fofs = le32_to_cpu(i_ext.fofs);
-	ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+	ext->blk = le32_to_cpu(i_ext.blk);
 	ext->len = le32_to_cpu(i_ext.len);
-	write_unlock(&ext->ext_lock);
 }
 
 static inline void set_raw_extent(struct extent_info *ext,
 					struct f2fs_extent *i_ext)
 {
-	read_lock(&ext->ext_lock);
 	i_ext->fofs = cpu_to_le32(ext->fofs);
-	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+	i_ext->blk = cpu_to_le32(ext->blk);
 	i_ext->len = cpu_to_le32(ext->len);
-	read_unlock(&ext->ext_lock);
+}
+
+static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
+					u32 blk, unsigned int len)
+{
+	ei->fofs = fofs;
+	ei->blk = blk;
+	ei->len = len;
+}
+
+static inline bool __is_extent_same(struct extent_info *ei1,
+					struct extent_info *ei2)
+{
+	return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk &&
+					ei1->len == ei2->len);
+}
+
+static inline bool __is_extent_mergeable(struct extent_info *back,
+					struct extent_info *front)
+{
+	return (back->fofs + back->len == front->fofs &&
+			back->blk + back->len == front->blk);
+}
+
+static inline bool __is_back_mergeable(struct extent_info *cur,
+					struct extent_info *back)
+{
+	return __is_extent_mergeable(back, cur);
+}
+
+static inline bool __is_front_mergeable(struct extent_info *cur,
+					struct extent_info *front)
+{
+	return __is_extent_mergeable(cur, front);
 }
338 400
339struct f2fs_nm_info { 401struct f2fs_nm_info {
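Note: the mergeability helpers above encode a single rule: two extents can collapse into one only when both the file-offset range and the on-disk block range are contiguous. A worked example with hypothetical values:

/* back covers file offsets [0, 4) mapped to blocks [100, 104) */
struct extent_info back  = { .fofs = 0, .blk = 100, .len = 4 };
struct extent_info front = { .fofs = 4, .blk = 104, .len = 2 };

/*
 * 0 + 4 == 4 and 100 + 4 == 104, so __is_extent_mergeable(&back, &front)
 * is true and the pair can be replaced by {fofs = 0, blk = 100, len = 6}.
 * Had front started at blk 200 instead, the file offsets would still
 * touch but the blocks would not, and no merge would happen.
 */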
@@ -502,6 +564,10 @@ enum page_type {
 	META,
 	NR_PAGE_TYPE,
 	META_FLUSH,
+	INMEM,		/* the below types are used by tracepoints only. */
+	INMEM_DROP,
+	IPU,
+	OPU,
 };
 
 struct f2fs_io_info {
@@ -559,7 +625,6 @@ struct f2fs_sb_info {
 	struct mutex cp_mutex;			/* checkpoint procedure lock */
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
-	struct mutex writepages;		/* mutex for writepages() */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -571,6 +636,14 @@ struct f2fs_sb_info {
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
 
+	/* for extent tree cache */
+	struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+	struct rw_semaphore extent_tree_lock;	/* locking extent radix tree */
+	struct list_head extent_list;		/* lru list for shrinker */
+	spinlock_t extent_lock;			/* locking extent lru list */
+	int total_ext_tree;			/* extent tree count */
+	atomic_t total_ext_node;		/* extent info count */
+
 	/* basic filesystem units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
 	unsigned int log_blocksize;		/* log2 block size */
@@ -920,12 +993,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
 	return 0;
 }
 
+static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
+{
+	return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+}
+
 static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	int offset;
 
-	if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) {
+	if (__cp_payload(sbi) > 0) {
 		if (flag == NAT_BITMAP)
 			return &ckpt->sit_nat_version_bitmap;
 		else
@@ -1166,8 +1244,10 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_FIRST_BLOCK_WRITTEN,	/* indicate #0 data block was written */
 	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
+	FI_INLINE_DOTS,		/* indicate inline dot dentries */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1204,6 +1284,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
 		set_inode_flag(fi, FI_INLINE_DENTRY);
 	if (ri->i_inline & F2FS_DATA_EXIST)
 		set_inode_flag(fi, FI_DATA_EXIST);
+	if (ri->i_inline & F2FS_INLINE_DOTS)
+		set_inode_flag(fi, FI_INLINE_DOTS);
 }
 
 static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -1219,6 +1301,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
 		ri->i_inline |= F2FS_INLINE_DENTRY;
 	if (is_inode_flag_set(fi, FI_DATA_EXIST))
 		ri->i_inline |= F2FS_DATA_EXIST;
+	if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+		ri->i_inline |= F2FS_INLINE_DOTS;
 }
 
 static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -1264,6 +1348,11 @@ static inline int f2fs_exist_data(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
 }
 
+static inline int f2fs_has_inline_dots(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+}
+
 static inline bool f2fs_is_atomic_file(struct inode *inode)
 {
 	return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
@@ -1274,6 +1363,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_first_block_written(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+}
+
 static inline bool f2fs_is_drop_cache(struct inode *inode)
 {
 	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
@@ -1290,12 +1384,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
 }
 
-static inline void *inline_dentry_addr(struct page *page)
-{
-	struct f2fs_inode *ri = F2FS_INODE(page);
-	return (void *)&(ri->i_addr[1]);
-}
-
 static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
 {
 	if (!f2fs_has_inline_dentry(dir))
@@ -1363,7 +1451,7 @@ struct dentry *f2fs_get_parent(struct dentry *child);
  * dir.c
  */
 extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
-void set_de_type(struct f2fs_dir_entry *, struct inode *);
+void set_de_type(struct f2fs_dir_entry *, umode_t);
 struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
 						struct f2fs_dentry_ptr *);
 bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
@@ -1382,7 +1470,10 @@ ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
 int update_dent_inode(struct inode *, const struct qstr *);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
+			const struct qstr *, f2fs_hash_t , unsigned int);
-int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
+			umode_t);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
 							struct inode *);
 int f2fs_do_tmpfile(struct inode *, struct inode *);
@@ -1392,7 +1483,7 @@ bool f2fs_empty_dir(struct inode *);
 static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
-				inode);
+				inode, inode->i_ino, inode->i_mode);
 }
 
 /*
@@ -1519,14 +1610,22 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
 				struct f2fs_io_info *);
 void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 						struct f2fs_io_info *);
+void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(struct dnode_of_data *);
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_destroy_extent_tree(struct inode *);
+void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_preserve_extent_tree(struct inode *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
 
@@ -1554,7 +1653,7 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_ext, total_ext;
+	int hit_ext, total_ext, ext_tree, ext_node;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
@@ -1566,7 +1665,9 @@ struct f2fs_stat_info {
 	int dirty_count, node_pages, meta_pages;
 	int prefree_count, call_count, cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
+	int bg_node_segs, bg_data_segs;
 	int tot_blks, data_blks, node_blks;
+	int bg_data_blks, bg_node_blks;
 	int curseg[NR_CURSEG_TYPE];
 	int cursec[NR_CURSEG_TYPE];
 	int curzone[NR_CURSEG_TYPE];
@@ -1615,31 +1716,36 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 		((sbi)->block_count[(curseg)->alloc_type]++)
 #define stat_inc_inplace_blocks(sbi)					\
 		(atomic_inc(&(sbi)->inplace_count))
-#define stat_inc_seg_count(sbi, type)					\
+#define stat_inc_seg_count(sbi, type, gc_type)				\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		(si)->tot_segs++;					\
-		if (type == SUM_TYPE_DATA)				\
+		if (type == SUM_TYPE_DATA) {				\
 			si->data_segs++;				\
-		else							\
+			si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		} else {						\
 			si->node_segs++;				\
+			si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		}							\
 	} while (0)
 
 #define stat_inc_tot_blk_count(si, blks)				\
 	(si->tot_blks += (blks))
 
-#define stat_inc_data_blk_count(sbi, blks)				\
+#define stat_inc_data_blk_count(sbi, blks, gc_type)			\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		stat_inc_tot_blk_count(si, blks);			\
 		si->data_blks += (blks);				\
+		si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0;	\
 	} while (0)
 
-#define stat_inc_node_blk_count(sbi, blks)				\
+#define stat_inc_node_blk_count(sbi, blks, gc_type)			\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		stat_inc_tot_blk_count(si, blks);			\
 		si->node_blks += (blks);				\
+		si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0;	\
 	} while (0)
 
 int f2fs_build_stats(struct f2fs_sb_info *);
@@ -1661,10 +1767,10 @@ void f2fs_destroy_root_stats(void);
 #define stat_inc_seg_type(sbi, curseg)
 #define stat_inc_block_count(sbi, curseg)
 #define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(si, type)
+#define stat_inc_seg_count(sbi, type, gc_type)
 #define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(si, blks)
-#define stat_inc_node_blk_count(sbi, blks)
+#define stat_inc_data_blk_count(sbi, blks, gc_type)
+#define stat_inc_node_blk_count(sbi, blks, gc_type)
 
 static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -1688,6 +1794,7 @@ extern struct kmem_cache *inode_entry_slab;
 */
 bool f2fs_may_inline(struct inode *);
 void read_inline_data(struct page *, struct page *);
+bool truncate_inline_inode(struct page *, u64);
 int f2fs_read_inline_data(struct inode *, struct page *);
 int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
 int f2fs_convert_inline_inode(struct inode *);
@@ -1697,7 +1804,8 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
 							struct page **);
 struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
 int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
+							nid_t, umode_t);
 void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 						struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index df6a0596eccf..a6f3f6186588 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -241,6 +241,8 @@ go_write:
 		 * will be used only for fsynced inodes after checkpoint.
 		 */
 		try_to_fix_pino(inode);
+		clear_inode_flag(fi, FI_APPEND_WRITE);
+		clear_inode_flag(fi, FI_UPDATE_WRITE);
 		goto out;
 	}
sync_nodes:
@@ -433,8 +435,12 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 			continue;
 
 		dn->data_blkaddr = NULL_ADDR;
-		update_extent_cache(dn);
+		set_data_blkaddr(dn);
+		f2fs_update_extent_cache(dn);
 		invalidate_blocks(sbi, blkaddr);
+		if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
+			clear_inode_flag(F2FS_I(dn->inode),
+						FI_FIRST_BLOCK_WRITTEN);
 		nr_free++;
 	}
 	if (nr_free) {
@@ -454,15 +460,16 @@ void truncate_data_blocks(struct dnode_of_data *dn)
 	truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
 }
 
-static int truncate_partial_data_page(struct inode *inode, u64 from)
+static int truncate_partial_data_page(struct inode *inode, u64 from,
+								bool force)
 {
 	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
 	struct page *page;
 
-	if (!offset)
+	if (!offset && !force)
 		return 0;
 
-	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
+	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force);
 	if (IS_ERR(page))
 		return 0;
 
@@ -473,7 +480,8 @@ static int truncate_partial_data_page(struct inode *inode, u64 from)
 
 	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
-	set_page_dirty(page);
+	if (!force)
+		set_page_dirty(page);
out:
 	f2fs_put_page(page, 1);
 	return 0;
@@ -487,6 +495,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 	pgoff_t free_from;
 	int count = 0, err = 0;
 	struct page *ipage;
+	bool truncate_page = false;
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
@@ -502,7 +511,10 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 	}
 
 	if (f2fs_has_inline_data(inode)) {
+		if (truncate_inline_inode(ipage, from))
+			set_page_dirty(ipage);
 		f2fs_put_page(ipage, 1);
+		truncate_page = true;
 		goto out;
 	}
 
@@ -533,7 +545,7 @@ out:
 
 	/* lastly zero out the first data page */
 	if (!err)
-		err = truncate_partial_data_page(inode, from);
+		err = truncate_partial_data_page(inode, from, truncate_page);
 
 	trace_f2fs_truncate_blocks_exit(inode, err);
 	return err;
@@ -997,6 +1009,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	if (!f2fs_is_volatile_file(inode))
 		return 0;
 
+	if (!f2fs_is_first_block_written(inode))
+		return truncate_partial_data_page(inode, 0, true);
+
 	punch_hole(inode, 0, F2FS_BLKSIZE);
 	return 0;
 }
@@ -1029,6 +1044,41 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	return ret;
 }
 
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct super_block *sb = sbi->sb;
+	__u32 in;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(in, (__u32 __user *)arg))
+		return -EFAULT;
+
+	switch (in) {
+	case F2FS_GOING_DOWN_FULLSYNC:
+		sb = freeze_bdev(sb->s_bdev);
+		if (sb && !IS_ERR(sb)) {
+			f2fs_stop_checkpoint(sbi);
+			thaw_bdev(sb->s_bdev, sb);
+		}
+		break;
+	case F2FS_GOING_DOWN_METASYNC:
+		/* do checkpoint only */
+		f2fs_sync_fs(sb, 1);
+		f2fs_stop_checkpoint(sbi);
+		break;
+	case F2FS_GOING_DOWN_NOSYNC:
+		f2fs_stop_checkpoint(sbi);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
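Note: a userspace sketch of driving the new shutdown ioctl (assumptions: the handler only uses the descriptor to reach the superblock, so any fd on the mounted f2fs volume works; the mountpoint path is a placeholder; CAP_SYS_ADMIN is required, per the capable() check above):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define F2FS_IOC_SHUTDOWN		_IOR('X', 125, __u32)
#define F2FS_GOING_DOWN_METASYNC	0x1

int main(void)
{
	__u32 how = F2FS_GOING_DOWN_METASYNC;	/* checkpoint, then stop */
	int fd = open("/mnt/f2fs", O_RDONLY);	/* hypothetical mountpoint */

	if (fd < 0 || ioctl(fd, F2FS_IOC_SHUTDOWN, &how) < 0) {
		perror("F2FS_IOC_SHUTDOWN");
		return 1;
	}
	close(fd);
	return 0;
}

FULLSYNC additionally freezes the block device before stopping checkpoints, and NOSYNC stops them with no sync at all, matching the three cases in the switch above.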
@@ -1078,6 +1128,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_release_volatile_write(filp);
 	case F2FS_IOC_ABORT_VOLATILE_WRITE:
 		return f2fs_ioc_abort_volatile_write(filp);
+	case F2FS_IOC_SHUTDOWN:
+		return f2fs_ioc_shutdown(filp, arg);
 	case FITRIM:
 		return f2fs_ioc_fitrim(filp, arg);
 	default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 76adbc3641f1..ed58211fe79b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -435,7 +435,7 @@ next_step:
 			set_page_dirty(node_page);
 		}
 		f2fs_put_page(node_page, 1);
-		stat_inc_node_blk_count(sbi, 1);
+		stat_inc_node_blk_count(sbi, 1, gc_type);
 	}
 
 	if (initial) {
@@ -622,7 +622,7 @@ next_step:
 			if (IS_ERR(data_page))
 				continue;
 			move_data_page(inode, data_page, gc_type);
-			stat_inc_data_blk_count(sbi, 1);
+			stat_inc_data_blk_count(sbi, 1, gc_type);
 		}
 	}
 
@@ -680,7 +680,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 	}
 	blk_finish_plug(&plug);
 
-	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
 	stat_inc_call_count(sbi->stat_info);
 
 	f2fs_put_page(sum_page, 1);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1484c00133cd..8140e4f0e538 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -21,7 +21,7 @@ bool f2fs_may_inline(struct inode *inode)
 	if (f2fs_is_atomic_file(inode))
 		return false;
 
-	if (!S_ISREG(inode->i_mode))
+	if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
 		return false;
 
 	if (i_size_read(inode) > MAX_INLINE_DATA)
@@ -50,10 +50,19 @@ void read_inline_data(struct page *page, struct page *ipage)
 	SetPageUptodate(page);
 }
 
-static void truncate_inline_data(struct page *ipage)
+bool truncate_inline_inode(struct page *ipage, u64 from)
 {
+	void *addr;
+
+	if (from >= MAX_INLINE_DATA)
+		return false;
+
+	addr = inline_data_addr(ipage);
+
 	f2fs_wait_on_page_writeback(ipage, NODE);
-	memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+	memset(addr + from, 0, MAX_INLINE_DATA - from);
+
+	return true;
 }
 
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
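Note: the new helper generalizes the old all-or-nothing clear: bytes [0, from) of the inline area survive and [from, MAX_INLINE_DATA) are zeroed, with the boolean return telling the caller whether anything was touched. A userspace model of the rule (MAX_INLINE_DATA is assumed here to be the 3488 bytes of the default inode layout):

#include <string.h>

#define MAX_INLINE_DATA_DEMO	3488	/* assumed default inline capacity */

/* Mirror of the zeroing rule: keep [0, from), clear [from, end). */
static int truncate_inline_demo(char *area, unsigned long from)
{
	if (from >= MAX_INLINE_DATA_DEMO)
		return 0;	/* nothing inline to clear */
	memset(area + from, 0, MAX_INLINE_DATA_DEMO - from);
	return 1;	/* caller should mark the node page dirty */
}

A full clear is the from == 0 case, which is why every old truncate_inline_data() call site below becomes truncate_inline_inode(..., 0).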
@@ -122,7 +131,8 @@ no_update:
 	set_page_writeback(page);
 	fio.blk_addr = dn->data_blkaddr;
 	write_data_page(page, dn, &fio);
-	update_extent_cache(dn);
+	set_data_blkaddr(dn);
+	f2fs_update_extent_cache(dn);
 	f2fs_wait_on_page_writeback(page, DATA);
 	if (dirty)
 		inode_dec_dirty_pages(dn->inode);
@@ -131,7 +141,7 @@ no_update:
 	set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
 
 	/* clear inline data and flag after data writeback */
-	truncate_inline_data(dn->inode_page);
+	truncate_inline_inode(dn->inode_page, 0);
clear_out:
 	stat_dec_inline_inode(dn->inode);
 	f2fs_clear_inline_inode(dn->inode);
@@ -245,7 +255,7 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_data(ipage);
+		truncate_inline_inode(ipage, 0);
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
@@ -363,7 +373,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	set_page_dirty(page);
 
 	/* clear inline dir and flag after data writeback */
-	truncate_inline_data(ipage);
+	truncate_inline_inode(ipage, 0);
 
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
@@ -380,21 +390,18 @@ out:
 }
 
 int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
-						struct inode *inode)
+			struct inode *inode, nid_t ino, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct page *ipage;
 	unsigned int bit_pos;
 	f2fs_hash_t name_hash;
-	struct f2fs_dir_entry *de;
 	size_t namelen = name->len;
 	struct f2fs_inline_dentry *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
 	int slots = GET_DENTRY_SLOTS(namelen);
-	struct page *page;
+	struct page *page = NULL;
 	int err = 0;
-	int i;
-
-	name_hash = f2fs_dentry_hash(name);
 
 	ipage = get_node_page(sbi, dir->i_ino);
 	if (IS_ERR(ipage))
@@ -410,32 +417,34 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 		goto out;
 	}
 
-	down_write(&F2FS_I(inode)->i_sem);
-	page = init_inode_metadata(inode, dir, name, ipage);
-	if (IS_ERR(page)) {
-		err = PTR_ERR(page);
-		goto fail;
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, ipage);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
 	}
 
 	f2fs_wait_on_page_writeback(ipage, NODE);
-	de = &dentry_blk->dentry[bit_pos];
-	de->hash_code = name_hash;
-	de->name_len = cpu_to_le16(namelen);
-	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
-	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
-	for (i = 0; i < slots; i++)
-		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	name_hash = f2fs_dentry_hash(name);
+	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+
 	set_page_dirty(ipage);
 
 	/* we don't need to mark_inode_dirty now */
-	F2FS_I(inode)->i_pino = dir->i_ino;
-	update_inode(inode, page);
-	f2fs_put_page(page, 1);
+	if (inode) {
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
 
 	update_parent_metadata(dir, inode, 0);
fail:
-	up_write(&F2FS_I(inode)->i_sem);
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
 
 	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
 		update_inode(dir, ipage);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2d002e3738a7..e622ec95409e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -51,6 +51,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 	}
 }
 
+static bool __written_first_block(struct f2fs_inode *ri)
+{
+	block_t addr = le32_to_cpu(ri->i_addr[0]);
+
+	if (addr != NEW_ADDR && addr != NULL_ADDR)
+		return true;
+	return false;
+}
+
 static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 {
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -130,7 +139,8 @@ static int do_read_inode(struct inode *inode)
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	fi->i_dir_level = ri->i_dir_level;
 
-	get_extent_info(&fi->ext, ri->i_ext);
+	f2fs_init_extent_cache(inode, &ri->i_ext);
+
 	get_inline_info(fi, ri);
 
 	/* check data exist */
@@ -140,6 +150,9 @@ static int do_read_inode(struct inode *inode)
 	/* get rdev by using inline_info */
 	__get_inode_rdev(inode, ri);
 
+	if (__written_first_block(ri))
+		set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+
 	f2fs_put_page(node_page, 1);
 
 	stat_inc_inline_inode(inode);
@@ -220,7 +233,11 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_links = cpu_to_le32(inode->i_nlink);
 	ri->i_size = cpu_to_le64(i_size_read(inode));
 	ri->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	read_lock(&F2FS_I(inode)->ext_lock);
 	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+	read_unlock(&F2FS_I(inode)->ext_lock);
+
 	set_raw_inline(F2FS_I(inode), ri);
 
 	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -328,6 +345,12 @@ void f2fs_evict_inode(struct inode *inode)
no_delete:
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
+
+	/* update extent info in inode */
+	if (inode->i_nlink)
+		f2fs_preserve_extent_tree(inode);
+	f2fs_destroy_extent_tree(inode);
+
 	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e79639a9787a..407dde3d7a92 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 #include <linux/ctype.h>
 #include <linux/dcache.h>
+#include <linux/namei.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -187,6 +188,44 @@ struct dentry *f2fs_get_parent(struct dentry *child)
 	return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
 }
 
+static int __recover_dot_dentries(struct inode *dir, nid_t pino)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct qstr dot = QSTR_INIT(".", 1);
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct f2fs_dir_entry *de;
+	struct page *page;
+	int err = 0;
+
+	f2fs_lock_op(sbi);
+
+	de = f2fs_find_entry(dir, &dot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+		if (err)
+			goto out;
+	}
+
+	de = f2fs_find_entry(dir, &dotdot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+	}
+out:
+	if (!err) {
+		clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
+		mark_inode_dirty(dir);
+	}
+
+	f2fs_unlock_op(sbi);
+	return err;
+}
+
 static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 							unsigned int flags)
 {
@@ -206,6 +245,16 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = f2fs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
+
+		if (f2fs_has_inline_dots(inode)) {
+			int err;
+
+			err = __recover_dot_dentries(inode, dir->i_ino);
+			if (err) {
+				iget_failed(inode);
+				return ERR_PTR(err);
+			}
+		}
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -247,6 +296,23 @@ fail:
 	return err;
 }
 
+static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page;
+
+	page = page_follow_link_light(dentry, nd);
+	if (IS_ERR(page))
+		return page;
+
+	/* this is broken symlink case */
+	if (*nd_get_link(nd) == 0) {
+		kunmap(page);
+		page_cache_release(page);
+		return ERR_PTR(-ENOENT);
+	}
+	return page;
+}
+
 static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 					const char *symname)
 {
@@ -276,6 +342,17 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 
+	/*
+	 * Let's flush symlink data in order to avoid broken symlink as much as
+	 * possible. Nevertheless, fsyncing is the best way, but there is no
+	 * way to get a file descriptor in order to flush that.
+	 *
+	 * Note that, it needs to do dir->fsync to make this recoverable.
+	 * If the symlink path is stored into inline_data, there is no
+	 * performance regression.
+	 */
+	filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1);
+
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
 	return err;
@@ -693,6 +770,8 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	f2fs_unlock_op(sbi);
 
 	alloc_nid_done(sbi, inode->i_ino);
+
+	stat_inc_inline_inode(inode);
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
@@ -729,7 +808,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
 
 const struct inode_operations f2fs_symlink_inode_operations = {
 	.readlink       = generic_readlink,
-	.follow_link    = page_follow_link_light,
+	.follow_link    = f2fs_follow_link,
 	.put_link       = page_put_link,
 	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 97bd9d3db882..8ab0cf1930bd 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -41,7 +41,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	/* only uses low memory */
 	avail_ram = val.totalram - val.totalhigh;
 
-	/* give 25%, 25%, 50%, 50% memory for each components respectively */
+	/*
+	 * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+	 */
 	if (type == FREE_NIDS) {
 		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
 							PAGE_CACHE_SHIFT;
@@ -62,6 +64,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size += (sbi->im[i].ino_num *
 				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else if (type == EXTENT_CACHE) {
+		mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+				atomic_read(&sbi->total_ext_node) *
+				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
 		if (sbi->sb->s_bdi->dirty_exceeded)
 			return false;
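Note: a worked instance of the EXTENT_CACHE sizing above, with assumed numbers (struct sizes vary by kernel config). With 4 KiB pages (PAGE_CACHE_SHIFT == 12), say 1,000 cached trees at an assumed 56 bytes each plus 10,000 nodes at an assumed 40 bytes each, the charge is (1000 * 56 + 10000 * 40) >> 12 = 456000 / 4096, about 111 pages; the branch then allows further caching only while that figure stays below half of ram_thresh percent of low memory, exactly mirroring the NAT_ENTRIES and INO_ENTRIES branches.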
@@ -494,7 +501,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
494 501
495 /* if inline_data is set, should not report any block indices */ 502 /* if inline_data is set, should not report any block indices */
496 if (f2fs_has_inline_data(dn->inode) && index) { 503 if (f2fs_has_inline_data(dn->inode) && index) {
497 err = -EINVAL; 504 err = -ENOENT;
498 f2fs_put_page(npage[0], 1); 505 f2fs_put_page(npage[0], 1);
499 goto release_out; 506 goto release_out;
500 } 507 }
@@ -995,6 +1002,7 @@ static int read_node_page(struct page *page, int rw)
995 get_node_info(sbi, page->index, &ni); 1002 get_node_info(sbi, page->index, &ni);
996 1003
997 if (unlikely(ni.blk_addr == NULL_ADDR)) { 1004 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1005 ClearPageUptodate(page);
998 f2fs_put_page(page, 1); 1006 f2fs_put_page(page, 1);
999 return -ENOENT; 1007 return -ENOENT;
1000 } 1008 }
@@ -1306,6 +1314,7 @@ static int f2fs_write_node_page(struct page *page,
1306 1314
1307 /* This page is already truncated */ 1315 /* This page is already truncated */
1308 if (unlikely(ni.blk_addr == NULL_ADDR)) { 1316 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1317 ClearPageUptodate(page);
1309 dec_page_count(sbi, F2FS_DIRTY_NODES); 1318 dec_page_count(sbi, F2FS_DIRTY_NODES);
1310 unlock_page(page); 1319 unlock_page(page);
1311 return 0; 1320 return 0;
@@ -1821,6 +1830,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1821 struct f2fs_nat_block *nat_blk; 1830 struct f2fs_nat_block *nat_blk;
1822 struct nat_entry *ne, *cur; 1831 struct nat_entry *ne, *cur;
1823 struct page *page = NULL; 1832 struct page *page = NULL;
1833 struct f2fs_nm_info *nm_i = NM_I(sbi);
1824 1834
1825 /* 1835 /*
1826 * there are two steps to flush nat entries: 1836 * there are two steps to flush nat entries:
@@ -1874,7 +1884,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1874 1884
1875 f2fs_bug_on(sbi, set->entry_cnt); 1885 f2fs_bug_on(sbi, set->entry_cnt);
1876 1886
1887 down_write(&nm_i->nat_tree_lock);
1877 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); 1888 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1889 up_write(&nm_i->nat_tree_lock);
1878 kmem_cache_free(nat_entry_set_slab, set); 1890 kmem_cache_free(nat_entry_set_slab, set);
1879} 1891}
1880 1892
@@ -1902,6 +1914,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1902 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) 1914 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1903 remove_nats_in_journal(sbi); 1915 remove_nats_in_journal(sbi);
1904 1916
1917 down_write(&nm_i->nat_tree_lock);
1905 while ((found = __gang_lookup_nat_set(nm_i, 1918 while ((found = __gang_lookup_nat_set(nm_i,
1906 set_idx, SETVEC_SIZE, setvec))) { 1919 set_idx, SETVEC_SIZE, setvec))) {
1907 unsigned idx; 1920 unsigned idx;
@@ -1910,6 +1923,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1910 __adjust_nat_entry_set(setvec[idx], &sets, 1923 __adjust_nat_entry_set(setvec[idx], &sets,
1911 MAX_NAT_JENTRIES(sum)); 1924 MAX_NAT_JENTRIES(sum));
1912 } 1925 }
1926 up_write(&nm_i->nat_tree_lock);
1913 1927
1914 /* flush dirty nats in nat entry set */ 1928 /* flush dirty nats in nat entry set */
1915 list_for_each_entry_safe(set, tmp, &sets, set_list) 1929 list_for_each_entry_safe(set, tmp, &sets, set_list)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index f405bbf2435a..c56026f1725c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -120,6 +120,7 @@ enum mem_type {
120 NAT_ENTRIES, /* indicates the cached nat entry */ 120 NAT_ENTRIES, /* indicates the cached nat entry */
121 DIRTY_DENTS, /* indicates dirty dentry pages */ 121 DIRTY_DENTS, /* indicates dirty dentry pages */
122 INO_ENTRIES, /* indicates inode entries */ 122 INO_ENTRIES, /* indicates inode entries */
123 EXTENT_CACHE, /* indicates extent cache */
123 BASE_CHECK, /* check kernel status */ 124 BASE_CHECK, /* check kernel status */
124}; 125};
125 126
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 41afb9534bbd..8d8ea99f2156 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -93,10 +93,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
93 } 93 }
94retry: 94retry:
95 de = f2fs_find_entry(dir, &name, &page); 95 de = f2fs_find_entry(dir, &name, &page);
96 if (de && inode->i_ino == le32_to_cpu(de->ino)) { 96 if (de && inode->i_ino == le32_to_cpu(de->ino))
97 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
98 goto out_unmap_put; 97 goto out_unmap_put;
99 } 98
100 if (de) { 99 if (de) {
101 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); 100 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
102 if (IS_ERR(einode)) { 101 if (IS_ERR(einode)) {
@@ -115,7 +114,7 @@ retry:
115 iput(einode); 114 iput(einode);
116 goto retry; 115 goto retry;
117 } 116 }
118 err = __f2fs_add_link(dir, &name, inode); 117 err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
119 if (err) 118 if (err)
120 goto out_err; 119 goto out_err;
121 120
@@ -187,11 +186,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
187 goto next; 186 goto next;
188 187
189 entry = get_fsync_inode(head, ino_of_node(page)); 188 entry = get_fsync_inode(head, ino_of_node(page));
190 if (entry) { 189 if (!entry) {
191 if (IS_INODE(page) && is_dent_dnode(page))
192 set_inode_flag(F2FS_I(entry->inode),
193 FI_INC_LINK);
194 } else {
195 if (IS_INODE(page) && is_dent_dnode(page)) { 190 if (IS_INODE(page) && is_dent_dnode(page)) {
196 err = recover_inode_page(sbi, page); 191 err = recover_inode_page(sbi, page);
197 if (err) 192 if (err)
@@ -212,8 +207,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
212 if (IS_ERR(entry->inode)) { 207 if (IS_ERR(entry->inode)) {
213 err = PTR_ERR(entry->inode); 208 err = PTR_ERR(entry->inode);
214 kmem_cache_free(fsync_entry_slab, entry); 209 kmem_cache_free(fsync_entry_slab, entry);
215 if (err == -ENOENT) 210 if (err == -ENOENT) {
211 err = 0;
216 goto next; 212 goto next;
213 }
217 break; 214 break;
218 } 215 }
219 list_add_tail(&entry->list, head); 216 list_add_tail(&entry->list, head);
@@ -256,6 +253,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
256 struct f2fs_summary_block *sum_node; 253 struct f2fs_summary_block *sum_node;
257 struct f2fs_summary sum; 254 struct f2fs_summary sum;
258 struct page *sum_page, *node_page; 255 struct page *sum_page, *node_page;
256 struct dnode_of_data tdn = *dn;
259 nid_t ino, nid; 257 nid_t ino, nid;
260 struct inode *inode; 258 struct inode *inode;
261 unsigned int offset; 259 unsigned int offset;
@@ -283,17 +281,15 @@ got_it:
283 /* Use the locked dnode page and inode */ 281 /* Use the locked dnode page and inode */
284 nid = le32_to_cpu(sum.nid); 282 nid = le32_to_cpu(sum.nid);
285 if (dn->inode->i_ino == nid) { 283 if (dn->inode->i_ino == nid) {
286 struct dnode_of_data tdn = *dn;
287 tdn.nid = nid; 284 tdn.nid = nid;
285 if (!dn->inode_page_locked)
286 lock_page(dn->inode_page);
288 tdn.node_page = dn->inode_page; 287 tdn.node_page = dn->inode_page;
289 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); 288 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
290 truncate_data_blocks_range(&tdn, 1); 289 goto truncate_out;
291 return 0;
292 } else if (dn->nid == nid) { 290 } else if (dn->nid == nid) {
293 struct dnode_of_data tdn = *dn;
294 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); 291 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
295 truncate_data_blocks_range(&tdn, 1); 292 goto truncate_out;
296 return 0;
297 } 293 }
298 294
299 /* Get the node page */ 295 /* Get the node page */
@@ -317,18 +313,33 @@ got_it:
317 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
318 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
319 315
320 if (ino != dn->inode->i_ino) { 316 /*
321 truncate_hole(inode, bidx, bidx + 1); 317 * if inode page is locked, unlock temporarily, but its reference
318 * count keeps alive.
319 */
320 if (ino == dn->inode->i_ino && dn->inode_page_locked)
321 unlock_page(dn->inode_page);
322
323 set_new_dnode(&tdn, inode, NULL, NULL, 0);
324 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
325 goto out;
326
327 if (tdn.data_blkaddr == blkaddr)
328 truncate_data_blocks_range(&tdn, 1);
329
330 f2fs_put_dnode(&tdn);
331out:
332 if (ino != dn->inode->i_ino)
322 iput(inode); 333 iput(inode);
323 } else { 334 else if (dn->inode_page_locked)
324 struct dnode_of_data tdn; 335 lock_page(dn->inode_page);
325 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); 336 return 0;
326 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) 337
327 return 0; 338truncate_out:
328 if (tdn.data_blkaddr != NULL_ADDR) 339 if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
329 truncate_data_blocks_range(&tdn, 1); 340 truncate_data_blocks_range(&tdn, 1);
330 f2fs_put_page(tdn.node_page, 1); 341 if (dn->inode->i_ino == nid && !dn->inode_page_locked)
331 } 342 unlock_page(dn->inode_page);
332 return 0; 343 return 0;
333} 344}
334 345
@@ -384,7 +395,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
384 src = datablock_addr(dn.node_page, dn.ofs_in_node); 395 src = datablock_addr(dn.node_page, dn.ofs_in_node);
385 dest = datablock_addr(page, dn.ofs_in_node); 396 dest = datablock_addr(page, dn.ofs_in_node);
386 397
387 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { 398 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
399 dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) {
400
388 if (src == NULL_ADDR) { 401 if (src == NULL_ADDR) {
389 err = reserve_new_block(&dn); 402 err = reserve_new_block(&dn);
390 /* We should not get -ENOSPC */ 403 /* We should not get -ENOSPC */
@@ -401,14 +414,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
401 /* write dummy data page */ 414 /* write dummy data page */
402 recover_data_page(sbi, NULL, &sum, src, dest); 415 recover_data_page(sbi, NULL, &sum, src, dest);
403 dn.data_blkaddr = dest; 416 dn.data_blkaddr = dest;
404 update_extent_cache(&dn); 417 set_data_blkaddr(&dn);
418 f2fs_update_extent_cache(&dn);
405 recovered++; 419 recovered++;
406 } 420 }
407 dn.ofs_in_node++; 421 dn.ofs_in_node++;
408 } 422 }
409 423
410 /* write node page in place */
411 set_summary(&sum, dn.nid, 0, 0);
412 if (IS_INODE(dn.node_page)) 424 if (IS_INODE(dn.node_page))
413 sync_inode_page(&dn); 425 sync_inode_page(&dn);
414 426
@@ -552,7 +564,7 @@ out:
552 mutex_unlock(&sbi->cp_mutex); 564 mutex_unlock(&sbi->cp_mutex);
553 } else if (need_writecp) { 565 } else if (need_writecp) {
554 struct cp_control cpc = { 566 struct cp_control cpc = {
555 .reason = CP_SYNC, 567 .reason = CP_RECOVERY,
556 }; 568 };
557 mutex_unlock(&sbi->cp_mutex); 569 mutex_unlock(&sbi->cp_mutex);
558 write_checkpoint(sbi, &cpc); 570 write_checkpoint(sbi, &cpc);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index daee4ab913da..f939660941bb 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -205,6 +205,8 @@ retry:
205 list_add_tail(&new->list, &fi->inmem_pages); 205 list_add_tail(&new->list, &fi->inmem_pages);
206 inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 206 inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
207 mutex_unlock(&fi->inmem_lock); 207 mutex_unlock(&fi->inmem_lock);
208
209 trace_f2fs_register_inmem_page(page, INMEM);
208} 210}
209 211
210void commit_inmem_pages(struct inode *inode, bool abort) 212void commit_inmem_pages(struct inode *inode, bool abort)
@@ -238,11 +240,13 @@ void commit_inmem_pages(struct inode *inode, bool abort)
238 f2fs_wait_on_page_writeback(cur->page, DATA); 240 f2fs_wait_on_page_writeback(cur->page, DATA);
239 if (clear_page_dirty_for_io(cur->page)) 241 if (clear_page_dirty_for_io(cur->page))
240 inode_dec_dirty_pages(inode); 242 inode_dec_dirty_pages(inode);
243 trace_f2fs_commit_inmem_page(cur->page, INMEM);
241 do_write_data_page(cur->page, &fio); 244 do_write_data_page(cur->page, &fio);
242 submit_bio = true; 245 submit_bio = true;
243 } 246 }
244 f2fs_put_page(cur->page, 1); 247 f2fs_put_page(cur->page, 1);
245 } else { 248 } else {
249 trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
246 put_page(cur->page); 250 put_page(cur->page);
247 } 251 }
248 radix_tree_delete(&fi->inmem_root, cur->page->index); 252 radix_tree_delete(&fi->inmem_root, cur->page->index);
@@ -277,6 +281,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
277 281
278void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) 282void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
279{ 283{
284 /* try to shrink extent cache when there is no enough memory */
285 f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
286
280 /* check the # of cached NAT entries and prefree segments */ 287 /* check the # of cached NAT entries and prefree segments */
281 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || 288 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
282 excess_prefree_segs(sbi) || 289 excess_prefree_segs(sbi) ||
@@ -549,7 +556,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
549 556
550 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 557 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
551 558
552 if (end - start < cpc->trim_minlen) 559 if (force && end - start < cpc->trim_minlen)
553 continue; 560 continue;
554 561
555 __add_discard_entry(sbi, cpc, start, end); 562 __add_discard_entry(sbi, cpc, start, end);
@@ -1164,6 +1171,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1164 curseg = CURSEG_I(sbi, type); 1171 curseg = CURSEG_I(sbi, type);
1165 1172
1166 mutex_lock(&curseg->curseg_mutex); 1173 mutex_lock(&curseg->curseg_mutex);
1174 mutex_lock(&sit_i->sentry_lock);
1167 1175
1168 /* direct_io'ed data is aligned to the segment for better performance */ 1176 /* direct_io'ed data is aligned to the segment for better performance */
1169 if (direct_io && curseg->next_blkoff) 1177 if (direct_io && curseg->next_blkoff)
@@ -1178,7 +1186,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1178 */ 1186 */
1179 __add_sum_entry(sbi, type, sum); 1187 __add_sum_entry(sbi, type, sum);
1180 1188
1181 mutex_lock(&sit_i->sentry_lock);
1182 __refresh_next_blkoff(sbi, curseg); 1189 __refresh_next_blkoff(sbi, curseg);
1183 1190
1184 stat_inc_block_count(sbi, curseg); 1191 stat_inc_block_count(sbi, curseg);
@@ -1730,6 +1737,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1730 mutex_lock(&curseg->curseg_mutex); 1737 mutex_lock(&curseg->curseg_mutex);
1731 mutex_lock(&sit_i->sentry_lock); 1738 mutex_lock(&sit_i->sentry_lock);
1732 1739
1740 if (!sit_i->dirty_sentries)
1741 goto out;
1742
1733 /* 1743 /*
1734 * add and account sit entries of dirty bitmap in sit entry 1744 * add and account sit entries of dirty bitmap in sit entry
1735 * set temporarily 1745 * set temporarily
@@ -1744,9 +1754,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1744 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) 1754 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1745 remove_sits_in_journal(sbi); 1755 remove_sits_in_journal(sbi);
1746 1756
1747 if (!sit_i->dirty_sentries)
1748 goto out;
1749
1750 /* 1757 /*
1751 * there are two steps to flush sit entries: 1758 * there are two steps to flush sit entries:
1752 * #1, flush sit entries to journal in current cold data summary block. 1759 * #1, flush sit entries to journal in current cold data summary block.
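
Two of the segment.c hunks above follow one locking idea: allocate_data_block() now takes sentry_lock immediately after curseg_mutex, so both locks are held in a fixed order before any segment state is consulted, and flush_sit_entries() performs its cheap "nothing dirty" test first thing under those locks. A minimal userspace sketch of that shape, assuming pthreads; seg_state and flush_dirty() are invented for illustration and are not kernel APIs:

/* Sketch only: take both locks up front in a fixed order, then bail
 * out early on the cheap "nothing dirty" test before doing any work. */
#include <pthread.h>
#include <stdio.h>

struct seg_state {
	pthread_mutex_t curseg_mutex;	/* protects the current segment */
	pthread_mutex_t sentry_lock;	/* protects SIT entries */
	int dirty_sentries;
};

static void flush_dirty(struct seg_state *s)
{
	/* fixed acquisition order: curseg_mutex, then sentry_lock */
	pthread_mutex_lock(&s->curseg_mutex);
	pthread_mutex_lock(&s->sentry_lock);

	if (!s->dirty_sentries)
		goto out;		/* early exit before any work */

	printf("flushing %d dirty entries\n", s->dirty_sentries);
	s->dirty_sentries = 0;
out:
	pthread_mutex_unlock(&s->sentry_lock);
	pthread_mutex_unlock(&s->curseg_mutex);
}

int main(void)
{
	struct seg_state s = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 3 };
	flush_dirty(&s);	/* flushes once */
	flush_dirty(&s);	/* nothing dirty: exits early */
	return 0;
}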
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7fd35111cf62..85d7fa7514b2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -336,7 +336,8 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
336 clear_bit(segno, free_i->free_segmap); 336 clear_bit(segno, free_i->free_segmap);
337 free_i->free_segments++; 337 free_i->free_segments++;
338 338
339 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); 339 next = find_next_bit(free_i->free_segmap,
340 start_segno + sbi->segs_per_sec, start_segno);
340 if (next >= start_segno + sbi->segs_per_sec) { 341 if (next >= start_segno + sbi->segs_per_sec) {
341 clear_bit(secno, free_i->free_secmap); 342 clear_bit(secno, free_i->free_secmap);
342 free_i->free_sections++; 343 free_i->free_sections++;
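
The segment.h change bounds the free-segment scan to the current section (start_segno + sbi->segs_per_sec) instead of scanning the whole main area and only comparing afterwards. A userspace sketch of the same bounded scan; find_next_bit() is reduced to a plain loop that returns its size argument when nothing is found, matching the kernel helper's convention, and the bitmap is simplified to one byte per segment:

/* Sketch only: search just this section's segments, not the whole map. */
#include <stdio.h>

static unsigned find_next_bit(const unsigned char *map, unsigned size,
			      unsigned start)
{
	for (unsigned i = start; i < size; i++)
		if (map[i])
			return i;
	return size;	/* kernel convention: "size" means not found */
}

int main(void)
{
	unsigned char free_segmap[16] = { 0 };	/* set bit = segment in use */
	unsigned start_segno = 4, segs_per_sec = 4;

	free_segmap[9] = 1;	/* a used segment outside our section */

	/* bounded scan: only segments [4, 8) are examined */
	unsigned next = find_next_bit(free_segmap,
				      start_segno + segs_per_sec, start_segno);
	if (next >= start_segno + segs_per_sec)
		printf("whole section is free\n");
	return 0;
}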
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f2fe666a6ea9..160b88346b24 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -57,6 +57,8 @@ enum {
57 Opt_flush_merge, 57 Opt_flush_merge,
58 Opt_nobarrier, 58 Opt_nobarrier,
59 Opt_fastboot, 59 Opt_fastboot,
60 Opt_extent_cache,
61 Opt_noinline_data,
60 Opt_err, 62 Opt_err,
61}; 63};
62 64
@@ -78,6 +80,8 @@ static match_table_t f2fs_tokens = {
78 {Opt_flush_merge, "flush_merge"}, 80 {Opt_flush_merge, "flush_merge"},
79 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
80 {Opt_fastboot, "fastboot"}, 82 {Opt_fastboot, "fastboot"},
83 {Opt_extent_cache, "extent_cache"},
84 {Opt_noinline_data, "noinline_data"},
81 {Opt_err, NULL}, 85 {Opt_err, NULL},
82}; 86};
83 87
@@ -367,6 +371,12 @@ static int parse_options(struct super_block *sb, char *options)
367 case Opt_fastboot: 371 case Opt_fastboot:
368 set_opt(sbi, FASTBOOT); 372 set_opt(sbi, FASTBOOT);
369 break; 373 break;
374 case Opt_extent_cache:
375 set_opt(sbi, EXTENT_CACHE);
376 break;
377 case Opt_noinline_data:
378 clear_opt(sbi, INLINE_DATA);
379 break;
370 default: 380 default:
371 f2fs_msg(sb, KERN_ERR, 381 f2fs_msg(sb, KERN_ERR,
372 "Unrecognized mount option \"%s\" or missing value", 382 "Unrecognized mount option \"%s\" or missing value",
@@ -392,7 +402,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
392 atomic_set(&fi->dirty_pages, 0); 402 atomic_set(&fi->dirty_pages, 0);
393 fi->i_current_depth = 1; 403 fi->i_current_depth = 1;
394 fi->i_advise = 0; 404 fi->i_advise = 0;
395 rwlock_init(&fi->ext.ext_lock); 405 rwlock_init(&fi->ext_lock);
396 init_rwsem(&fi->i_sem); 406 init_rwsem(&fi->i_sem);
397 INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); 407 INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
398 INIT_LIST_HEAD(&fi->inmem_pages); 408 INIT_LIST_HEAD(&fi->inmem_pages);
@@ -591,6 +601,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
591 seq_puts(seq, ",disable_ext_identify"); 601 seq_puts(seq, ",disable_ext_identify");
592 if (test_opt(sbi, INLINE_DATA)) 602 if (test_opt(sbi, INLINE_DATA))
593 seq_puts(seq, ",inline_data"); 603 seq_puts(seq, ",inline_data");
604 else
605 seq_puts(seq, ",noinline_data");
594 if (test_opt(sbi, INLINE_DENTRY)) 606 if (test_opt(sbi, INLINE_DENTRY))
595 seq_puts(seq, ",inline_dentry"); 607 seq_puts(seq, ",inline_dentry");
596 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) 608 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
@@ -599,6 +611,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
599 seq_puts(seq, ",nobarrier"); 611 seq_puts(seq, ",nobarrier");
600 if (test_opt(sbi, FASTBOOT)) 612 if (test_opt(sbi, FASTBOOT))
601 seq_puts(seq, ",fastboot"); 613 seq_puts(seq, ",fastboot");
614 if (test_opt(sbi, EXTENT_CACHE))
615 seq_puts(seq, ",extent_cache");
602 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 616 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
603 617
604 return 0; 618 return 0;
@@ -959,7 +973,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
959 struct buffer_head *raw_super_buf; 973 struct buffer_head *raw_super_buf;
960 struct inode *root; 974 struct inode *root;
961 long err = -EINVAL; 975 long err = -EINVAL;
962 bool retry = true; 976 bool retry = true, need_fsck = false;
963 char *options = NULL; 977 char *options = NULL;
964 int i; 978 int i;
965 979
@@ -984,6 +998,7 @@ try_onemore:
984 sbi->active_logs = NR_CURSEG_TYPE; 998 sbi->active_logs = NR_CURSEG_TYPE;
985 999
986 set_opt(sbi, BG_GC); 1000 set_opt(sbi, BG_GC);
1001 set_opt(sbi, INLINE_DATA);
987 1002
988#ifdef CONFIG_F2FS_FS_XATTR 1003#ifdef CONFIG_F2FS_FS_XATTR
989 set_opt(sbi, XATTR_USER); 1004 set_opt(sbi, XATTR_USER);
@@ -1020,7 +1035,6 @@ try_onemore:
1020 sbi->raw_super = raw_super; 1035 sbi->raw_super = raw_super;
1021 sbi->raw_super_buf = raw_super_buf; 1036 sbi->raw_super_buf = raw_super_buf;
1022 mutex_init(&sbi->gc_mutex); 1037 mutex_init(&sbi->gc_mutex);
1023 mutex_init(&sbi->writepages);
1024 mutex_init(&sbi->cp_mutex); 1038 mutex_init(&sbi->cp_mutex);
1025 init_rwsem(&sbi->node_write); 1039 init_rwsem(&sbi->node_write);
1026 clear_sbi_flag(sbi, SBI_POR_DOING); 1040 clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -1072,6 +1086,8 @@ try_onemore:
1072 INIT_LIST_HEAD(&sbi->dir_inode_list); 1086 INIT_LIST_HEAD(&sbi->dir_inode_list);
1073 spin_lock_init(&sbi->dir_inode_lock); 1087 spin_lock_init(&sbi->dir_inode_lock);
1074 1088
1089 init_extent_cache_info(sbi);
1090
1075 init_ino_entry_info(sbi); 1091 init_ino_entry_info(sbi);
1076 1092
1077 /* setup f2fs internal modules */ 1093 /* setup f2fs internal modules */
@@ -1146,9 +1162,6 @@ try_onemore:
1146 if (err) 1162 if (err)
1147 goto free_proc; 1163 goto free_proc;
1148 1164
1149 if (!retry)
1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1151
1152 /* recover fsynced data */ 1165 /* recover fsynced data */
1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1166 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /* 1167 /*
@@ -1160,8 +1173,13 @@ try_onemore:
1160 err = -EROFS; 1173 err = -EROFS;
1161 goto free_kobj; 1174 goto free_kobj;
1162 } 1175 }
1176
1177 if (need_fsck)
1178 set_sbi_flag(sbi, SBI_NEED_FSCK);
1179
1163 err = recover_fsync_data(sbi); 1180 err = recover_fsync_data(sbi);
1164 if (err) { 1181 if (err) {
1182 need_fsck = true;
1165 f2fs_msg(sb, KERN_ERR, 1183 f2fs_msg(sb, KERN_ERR,
1166 "Cannot recover all fsync data errno=%ld", err); 1184 "Cannot recover all fsync data errno=%ld", err);
1167 goto free_kobj; 1185 goto free_kobj;
@@ -1212,7 +1230,7 @@ free_sbi:
1212 1230
1213 /* give only one more chance */ 1231 /* give only one more chance */
1214 if (retry) { 1232 if (retry) {
1215 retry = 0; 1233 retry = false;
1216 shrink_dcache_sb(sb); 1234 shrink_dcache_sb(sb);
1217 goto try_onemore; 1235 goto try_onemore;
1218 } 1236 }
@@ -1278,10 +1296,13 @@ static int __init init_f2fs_fs(void)
1278 err = create_checkpoint_caches(); 1296 err = create_checkpoint_caches();
1279 if (err) 1297 if (err)
1280 goto free_segment_manager_caches; 1298 goto free_segment_manager_caches;
1299 err = create_extent_cache();
1300 if (err)
1301 goto free_checkpoint_caches;
1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1302 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1282 if (!f2fs_kset) { 1303 if (!f2fs_kset) {
1283 err = -ENOMEM; 1304 err = -ENOMEM;
1284 goto free_checkpoint_caches; 1305 goto free_extent_cache;
1285 } 1306 }
1286 err = register_filesystem(&f2fs_fs_type); 1307 err = register_filesystem(&f2fs_fs_type);
1287 if (err) 1308 if (err)
@@ -1292,6 +1313,8 @@ static int __init init_f2fs_fs(void)
1292 1313
1293free_kset: 1314free_kset:
1294 kset_unregister(f2fs_kset); 1315 kset_unregister(f2fs_kset);
1316free_extent_cache:
1317 destroy_extent_cache();
1295free_checkpoint_caches: 1318free_checkpoint_caches:
1296 destroy_checkpoint_caches(); 1319 destroy_checkpoint_caches();
1297free_segment_manager_caches: 1320free_segment_manager_caches:
@@ -1309,6 +1332,7 @@ static void __exit exit_f2fs_fs(void)
1309 remove_proc_entry("fs/f2fs", NULL); 1332 remove_proc_entry("fs/f2fs", NULL);
1310 f2fs_destroy_root_stats(); 1333 f2fs_destroy_root_stats();
1311 unregister_filesystem(&f2fs_fs_type); 1334 unregister_filesystem(&f2fs_fs_type);
1335 destroy_extent_cache();
1312 destroy_checkpoint_caches(); 1336 destroy_checkpoint_caches();
1313 destroy_segment_manager_caches(); 1337 destroy_segment_manager_caches();
1314 destroy_node_manager_caches(); 1338 destroy_node_manager_caches();
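
The super.c hunks show the usual three places a new f2fs mount option touches: a token plus its string in the match table, a case in parse_options() that sets or clears the flag, and a mirror line in show_options() (with the default, INLINE_DATA, established in fill_super). A reduced, table-driven sketch of that flow in plain C; the set/clear bit values and the parsing loop are illustrative, not the kernel's match_token() machinery:

/* Sketch only: table-driven mount-option parsing. */
#include <stdio.h>
#include <string.h>

enum { OPT_EXTENT_CACHE = 1 << 0, OPT_INLINE_DATA = 1 << 1 };

static const struct { const char *name; unsigned set, clear; } tokens[] = {
	{ "extent_cache",  OPT_EXTENT_CACHE, 0 },
	{ "noinline_data", 0, OPT_INLINE_DATA },
	{ NULL, 0, 0 },
};

static int parse_options(unsigned *flags, char *options)
{
	for (char *p = strtok(options, ","); p; p = strtok(NULL, ",")) {
		int hit = 0;
		for (int i = 0; tokens[i].name; i++) {
			if (!strcmp(p, tokens[i].name)) {
				*flags |= tokens[i].set;
				*flags &= ~tokens[i].clear;
				hit = 1;
			}
		}
		if (!hit) {
			fprintf(stderr, "Unrecognized mount option \"%s\"\n", p);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	unsigned flags = OPT_INLINE_DATA;	/* default, as in fill_super */
	char opts[] = "extent_cache,noinline_data";

	if (parse_options(&flags, opts))
		return 1;
	/* show_options mirror: report inline_data or noinline_data */
	printf("%s%s\n", flags & OPT_EXTENT_CACHE ? ",extent_cache" : "",
	       flags & OPT_INLINE_DATA ? ",inline_data" : ",noinline_data");
	return 0;
}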
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 5072bf9ae0ef..b0fd2f2d0716 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -135,7 +135,8 @@ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
135 if (strcmp(name, "") != 0) 135 if (strcmp(name, "") != 0)
136 return -EINVAL; 136 return -EINVAL;
137 137
138 *((char *)buffer) = F2FS_I(inode)->i_advise; 138 if (buffer)
139 *((char *)buffer) = F2FS_I(inode)->i_advise;
139 return sizeof(char); 140 return sizeof(char);
140} 141}
141 142
@@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
152 return -EINVAL; 153 return -EINVAL;
153 154
154 F2FS_I(inode)->i_advise |= *(char *)value; 155 F2FS_I(inode)->i_advise |= *(char *)value;
156 mark_inode_dirty(inode);
155 return 0; 157 return 0;
156} 158}
157 159
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c98796afb..611b5408f6ec 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
9void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
10{ 10{
11 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
12 hlist_del(&pin->m_list); 12 hlist_del_init(&pin->m_list);
13 hlist_del(&pin->s_list); 13 hlist_del_init(&pin->s_list);
14 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock); 15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1; 16 pin->done = 1;
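
The fs_pin fix swaps hlist_del() for hlist_del_init(): after _init the node's pointers are reset, so later code can ask whether the pin is still hashed, and a second removal is a harmless no-op rather than a corruption. A reduced userspace model of the two helpers (the real ones live in <linux/list.h> and also poison the pointers):

/* Sketch only: hlist_del() vs hlist_del_init() semantics. */
#include <stdio.h>
#include <stddef.h>

struct hnode { struct hnode *next, **pprev; };

static int hlist_unhashed(const struct hnode *n) { return !n->pprev; }

static void hlist_del(struct hnode *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
	/* n->pprev still points into the old list */
}

static void hlist_del_init(struct hnode *n)
{
	if (!hlist_unhashed(n)) {
		hlist_del(n);
		n->next = NULL;
		n->pprev = NULL;	/* mark the node as unhashed */
	}
}

int main(void)
{
	struct hnode *head = NULL;
	struct hnode a = { NULL, &head };

	head = &a;
	hlist_del_init(&a);
	printf("unhashed: %d\n", hlist_unhashed(&a));	/* prints 1 */
	hlist_del_init(&a);	/* safe: no-op on an unhashed node */
	return 0;
}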
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 762c7a3cf43d..2eac55379239 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
1266 if (rc) { 1266 if (rc) {
1267 JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", 1267 JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
1268 __func__, rc, totlen); 1268 __func__, rc, totlen);
1269 rc = rc ? rc : -EBADFD;
1270 goto out; 1269 goto out;
1271 } 1270 }
1272 rc = save_xattr_ref(c, ref); 1271 rc = save_xattr_ref(c, ref);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 665ef5a05183..a563ddbc19e6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -31,7 +31,7 @@
31static struct hlist_head nlm_files[FILE_NRHASH]; 31static struct hlist_head nlm_files[FILE_NRHASH];
32static DEFINE_MUTEX(nlm_file_mutex); 32static DEFINE_MUTEX(nlm_file_mutex);
33 33
34#ifdef NFSD_DEBUG 34#ifdef CONFIG_SUNRPC_DEBUG
35static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) 35static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
36{ 36{
37 u32 *fhp = (u32*)f->data; 37 u32 *fhp = (u32*)f->data;
diff --git a/fs/namespace.c b/fs/namespace.c
index 82ef1405260e..1f4f9dac6e5a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
632 */ 632 */
633struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 633struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
634{ 634{
635 struct mount *p, *res; 635 struct mount *p, *res = NULL;
636 res = p = __lookup_mnt(mnt, dentry); 636 p = __lookup_mnt(mnt, dentry);
637 if (!p) 637 if (!p)
638 goto out; 638 goto out;
639 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
640 res = p;
639 hlist_for_each_entry_continue(p, mnt_hash) { 641 hlist_for_each_entry_continue(p, mnt_hash) {
640 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) 642 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
641 break; 643 break;
642 res = p; 644 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
645 res = p;
643 } 646 }
644out: 647out:
645 return res; 648 return res;
@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
795/* 798/*
796 * vfsmount lock must be held for write 799 * vfsmount lock must be held for write
797 */ 800 */
798static void detach_mnt(struct mount *mnt, struct path *old_path) 801static void unhash_mnt(struct mount *mnt)
799{ 802{
800 old_path->dentry = mnt->mnt_mountpoint;
801 old_path->mnt = &mnt->mnt_parent->mnt;
802 mnt->mnt_parent = mnt; 803 mnt->mnt_parent = mnt;
803 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 804 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
804 list_del_init(&mnt->mnt_child); 805 list_del_init(&mnt->mnt_child);
@@ -811,6 +812,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
811/* 812/*
812 * vfsmount lock must be held for write 813 * vfsmount lock must be held for write
813 */ 814 */
815static void detach_mnt(struct mount *mnt, struct path *old_path)
816{
817 old_path->dentry = mnt->mnt_mountpoint;
818 old_path->mnt = &mnt->mnt_parent->mnt;
819 unhash_mnt(mnt);
820}
821
822/*
823 * vfsmount lock must be held for write
824 */
825static void umount_mnt(struct mount *mnt)
826{
827 /* old mountpoint will be dropped when we can do that */
828 mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
829 unhash_mnt(mnt);
830}
831
832/*
833 * vfsmount lock must be held for write
834 */
814void mnt_set_mountpoint(struct mount *mnt, 835void mnt_set_mountpoint(struct mount *mnt,
815 struct mountpoint *mp, 836 struct mountpoint *mp,
816 struct mount *child_mnt) 837 struct mount *child_mnt)
@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
1078 rcu_read_unlock(); 1099 rcu_read_unlock();
1079 1100
1080 list_del(&mnt->mnt_instance); 1101 list_del(&mnt->mnt_instance);
1102
1103 if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1104 struct mount *p, *tmp;
1105 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1106 umount_mnt(p);
1107 }
1108 }
1081 unlock_mount_hash(); 1109 unlock_mount_hash();
1082 1110
1083 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1111 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1298 1326
1299static void namespace_unlock(void) 1327static void namespace_unlock(void)
1300{ 1328{
1301 struct hlist_head head = unmounted; 1329 struct hlist_head head;
1302 1330
1303 if (likely(hlist_empty(&head))) { 1331 hlist_move_list(&unmounted, &head);
1304 up_write(&namespace_sem);
1305 return;
1306 }
1307 1332
1308 head.first->pprev = &head.first;
1309 INIT_HLIST_HEAD(&unmounted);
1310 up_write(&namespace_sem); 1333 up_write(&namespace_sem);
1311 1334
1335 if (likely(hlist_empty(&head)))
1336 return;
1337
1312 synchronize_rcu(); 1338 synchronize_rcu();
1313 1339
1314 group_pin_kill(&head); 1340 group_pin_kill(&head);
@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
1319 down_write(&namespace_sem); 1345 down_write(&namespace_sem);
1320} 1346}
1321 1347
1348enum umount_tree_flags {
1349 UMOUNT_SYNC = 1,
1350 UMOUNT_PROPAGATE = 2,
1351 UMOUNT_CONNECTED = 4,
1352};
1322/* 1353/*
1323 * mount_lock must be held 1354 * mount_lock must be held
1324 * namespace_sem must be held for write 1355 * namespace_sem must be held for write
1325 * how = 0 => just this tree, don't propagate
1326 * how = 1 => propagate; we know that nobody else has reference to any victims
1327 * how = 2 => lazy umount
1328 */ 1356 */
1329void umount_tree(struct mount *mnt, int how) 1357static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1330{ 1358{
1331 HLIST_HEAD(tmp_list); 1359 LIST_HEAD(tmp_list);
1332 struct mount *p; 1360 struct mount *p;
1333 1361
1362 if (how & UMOUNT_PROPAGATE)
1363 propagate_mount_unlock(mnt);
1364
1365 /* Gather the mounts to umount */
1334 for (p = mnt; p; p = next_mnt(p, mnt)) { 1366 for (p = mnt; p; p = next_mnt(p, mnt)) {
1335 hlist_del_init_rcu(&p->mnt_hash); 1367 p->mnt.mnt_flags |= MNT_UMOUNT;
1336 hlist_add_head(&p->mnt_hash, &tmp_list); 1368 list_move(&p->mnt_list, &tmp_list);
1337 } 1369 }
1338 1370
1339 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1371 /* Hide the mounts from mnt_mounts */
1372 list_for_each_entry(p, &tmp_list, mnt_list) {
1340 list_del_init(&p->mnt_child); 1373 list_del_init(&p->mnt_child);
1374 }
1341 1375
1342 if (how) 1376 /* Add propagated mounts to the tmp_list */
1377 if (how & UMOUNT_PROPAGATE)
1343 propagate_umount(&tmp_list); 1378 propagate_umount(&tmp_list);
1344 1379
1345 while (!hlist_empty(&tmp_list)) { 1380 while (!list_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash); 1381 bool disconnect;
1347 hlist_del_init_rcu(&p->mnt_hash); 1382 p = list_first_entry(&tmp_list, struct mount, mnt_list);
1348 list_del_init(&p->mnt_expire); 1383 list_del_init(&p->mnt_expire);
1349 list_del_init(&p->mnt_list); 1384 list_del_init(&p->mnt_list);
1350 __touch_mnt_namespace(p->mnt_ns); 1385 __touch_mnt_namespace(p->mnt_ns);
1351 p->mnt_ns = NULL; 1386 p->mnt_ns = NULL;
1352 if (how < 2) 1387 if (how & UMOUNT_SYNC)
1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1388 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354 1389
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); 1390 disconnect = !(((how & UMOUNT_CONNECTED) &&
1391 mnt_has_parent(p) &&
1392 (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
1393 IS_MNT_LOCKED_AND_LAZY(p));
1394
1395 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
1396 disconnect ? &unmounted : NULL);
1356 if (mnt_has_parent(p)) { 1397 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1398 mnt_add_count(p->mnt_parent, -1);
1360 /* old mountpoint will be dropped when we can do that */ 1399 if (!disconnect) {
1361 p->mnt_ex_mountpoint = p->mnt_mountpoint; 1400 /* Don't forget about p */
1362 p->mnt_mountpoint = p->mnt.mnt_root; 1401 list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1363 p->mnt_parent = p; 1402 } else {
1364 p->mnt_mp = NULL; 1403 umount_mnt(p);
1404 }
1365 } 1405 }
1366 change_mnt_propagation(p, MS_PRIVATE); 1406 change_mnt_propagation(p, MS_PRIVATE);
1367 } 1407 }
@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)
1447 1487
1448 if (flags & MNT_DETACH) { 1488 if (flags & MNT_DETACH) {
1449 if (!list_empty(&mnt->mnt_list)) 1489 if (!list_empty(&mnt->mnt_list))
1450 umount_tree(mnt, 2); 1490 umount_tree(mnt, UMOUNT_PROPAGATE);
1451 retval = 0; 1491 retval = 0;
1452 } else { 1492 } else {
1453 shrink_submounts(mnt); 1493 shrink_submounts(mnt);
1454 retval = -EBUSY; 1494 retval = -EBUSY;
1455 if (!propagate_mount_busy(mnt, 2)) { 1495 if (!propagate_mount_busy(mnt, 2)) {
1456 if (!list_empty(&mnt->mnt_list)) 1496 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 1); 1497 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1458 retval = 0; 1498 retval = 0;
1459 } 1499 }
1460 } 1500 }
@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)
1480 1520
1481 namespace_lock(); 1521 namespace_lock();
1482 mp = lookup_mountpoint(dentry); 1522 mp = lookup_mountpoint(dentry);
1483 if (!mp) 1523 if (IS_ERR_OR_NULL(mp))
1484 goto out_unlock; 1524 goto out_unlock;
1485 1525
1486 lock_mount_hash(); 1526 lock_mount_hash();
1487 while (!hlist_empty(&mp->m_list)) { 1527 while (!hlist_empty(&mp->m_list)) {
1488 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1528 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1489 umount_tree(mnt, 2); 1529 if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1530 struct mount *p, *tmp;
1531 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1532 hlist_add_head(&p->mnt_umount.s_list, &unmounted);
1533 umount_mnt(p);
1534 }
1535 }
1536 else umount_tree(mnt, UMOUNT_CONNECTED);
1490 } 1537 }
1491 unlock_mount_hash(); 1538 unlock_mount_hash();
1492 put_mountpoint(mp); 1539 put_mountpoint(mp);
@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1648out: 1695out:
1649 if (res) { 1696 if (res) {
1650 lock_mount_hash(); 1697 lock_mount_hash();
1651 umount_tree(res, 0); 1698 umount_tree(res, UMOUNT_SYNC);
1652 unlock_mount_hash(); 1699 unlock_mount_hash();
1653 } 1700 }
1654 return q; 1701 return q;
@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
1660{ 1707{
1661 struct mount *tree; 1708 struct mount *tree;
1662 namespace_lock(); 1709 namespace_lock();
1663 tree = copy_tree(real_mount(path->mnt), path->dentry, 1710 if (!check_mnt(real_mount(path->mnt)))
1664 CL_COPY_ALL | CL_PRIVATE); 1711 tree = ERR_PTR(-EINVAL);
1712 else
1713 tree = copy_tree(real_mount(path->mnt), path->dentry,
1714 CL_COPY_ALL | CL_PRIVATE);
1665 namespace_unlock(); 1715 namespace_unlock();
1666 if (IS_ERR(tree)) 1716 if (IS_ERR(tree))
1667 return ERR_CAST(tree); 1717 return ERR_CAST(tree);
@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
1672{ 1722{
1673 namespace_lock(); 1723 namespace_lock();
1674 lock_mount_hash(); 1724 lock_mount_hash();
1675 umount_tree(real_mount(mnt), 0); 1725 umount_tree(real_mount(mnt), UMOUNT_SYNC);
1676 unlock_mount_hash(); 1726 unlock_mount_hash();
1677 namespace_unlock(); 1727 namespace_unlock();
1678} 1728}
@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1855 out_cleanup_ids: 1905 out_cleanup_ids:
1856 while (!hlist_empty(&tree_list)) { 1906 while (!hlist_empty(&tree_list)) {
1857 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1907 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1858 umount_tree(child, 0); 1908 umount_tree(child, UMOUNT_SYNC);
1859 } 1909 }
1860 unlock_mount_hash(); 1910 unlock_mount_hash();
1861 cleanup_group_ids(source_mnt, NULL); 1911 cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
2035 err = graft_tree(mnt, parent, mp); 2085 err = graft_tree(mnt, parent, mp);
2036 if (err) { 2086 if (err) {
2037 lock_mount_hash(); 2087 lock_mount_hash();
2038 umount_tree(mnt, 0); 2088 umount_tree(mnt, UMOUNT_SYNC);
2039 unlock_mount_hash(); 2089 unlock_mount_hash();
2040 } 2090 }
2041out2: 2091out2:
@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
2406 while (!list_empty(&graveyard)) { 2456 while (!list_empty(&graveyard)) {
2407 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 2457 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
2408 touch_mnt_namespace(mnt->mnt_ns); 2458 touch_mnt_namespace(mnt->mnt_ns);
2409 umount_tree(mnt, 1); 2459 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2410 } 2460 }
2411 unlock_mount_hash(); 2461 unlock_mount_hash();
2412 namespace_unlock(); 2462 namespace_unlock();
@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
2477 m = list_first_entry(&graveyard, struct mount, 2527 m = list_first_entry(&graveyard, struct mount,
2478 mnt_expire); 2528 mnt_expire);
2479 touch_mnt_namespace(m->mnt_ns); 2529 touch_mnt_namespace(m->mnt_ns);
2480 umount_tree(m, 1); 2530 umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2481 } 2531 }
2482 } 2532 }
2483} 2533}
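
The umount_tree() rework replaces the old numeric how argument (0/1/2, documented only in the comment being deleted) with an or-able enum, so each caller names the behaviour it wants and the callee tests individual bits. The pattern in miniature; the enum values are copied from the diff, the function body is illustrative:

/* Sketch only: "magic int" argument converted to named bit flags. */
#include <stdio.h>

enum umount_tree_flags {
	UMOUNT_SYNC      = 1,
	UMOUNT_PROPAGATE = 2,
	UMOUNT_CONNECTED = 4,
};

static void umount_tree(const char *mnt, enum umount_tree_flags how)
{
	if (how & UMOUNT_PROPAGATE)
		printf("%s: propagate to peers and slaves\n", mnt);
	if (how & UMOUNT_SYNC)
		printf("%s: synchronous, mark MNT_SYNC_UMOUNT\n", mnt);
	if (how & UMOUNT_CONNECTED)
		printf("%s: keep connected children attached\n", mnt);
}

int main(void)
{
	umount_tree("/mnt/a", UMOUNT_PROPAGATE | UMOUNT_SYNC); /* old how=1 */
	umount_tree("/mnt/b", UMOUNT_PROPAGATE);               /* old how=2 */
	return 0;
}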
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index fc2d108f5272..a0b77fc1bd39 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -108,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL
108 108
109config NFSD_FAULT_INJECTION 109config NFSD_FAULT_INJECTION
110 bool "NFS server manual fault injection" 110 bool "NFS server manual fault injection"
111 depends on NFSD_V4 && DEBUG_KERNEL 111 depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
112 help 112 help
113 This option enables support for manually injecting faults 113 This option enables support for manually injecting faults
114 into the NFS server. This is intended to be used for 114 into the NFS server. This is intended to be used for
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c3e3b6e55ae2..900c3ae94adc 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
691 struct svc_export *orig = container_of(a, struct svc_export, h); 691 struct svc_export *orig = container_of(a, struct svc_export, h);
692 struct svc_export *new = container_of(b, struct svc_export, h); 692 struct svc_export *new = container_of(b, struct svc_export, h);
693 return orig->ex_client == new->ex_client && 693 return orig->ex_client == new->ex_client &&
694 orig->ex_path.dentry == new->ex_path.dentry && 694 path_equal(&orig->ex_path, &new->ex_path);
695 orig->ex_path.mnt == new->ex_path.mnt;
696} 695}
697 696
698static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) 697static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
@@ -1159,6 +1158,7 @@ static struct flags {
1159 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1158 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1160 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1159 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1161 { NFSEXP_V4ROOT, {"v4root", ""}}, 1160 { NFSEXP_V4ROOT, {"v4root", ""}},
1161 { NFSEXP_PNFS, {"pnfs", ""}},
1162 { 0, {"", ""}} 1162 { 0, {"", ""}}
1163}; 1163};
1164 1164
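
svc_export_match() now calls path_equal(), which compares both members of struct path in one place; open-coding the comparison and forgetting one member is exactly the bug class the helper removes. A self-contained sketch, with opaque pointers standing in for the real vfsmount and dentry types:

/* Sketch only: compare both halves of a struct path together. */
#include <stdio.h>

struct path { const void *mnt; const void *dentry; };

static int path_equal(const struct path *a, const struct path *b)
{
	return a->mnt == b->mnt && a->dentry == b->dentry;
}

int main(void)
{
	int m1, d1, d2;
	struct path a = { &m1, &d1 }, b = { &m1, &d2 };

	printf("%d\n", path_equal(&a, &b)); /* 0: same mount, other dentry */
	return 0;
}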
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 59fd76651781..eaf4605a4b9e 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -499,43 +499,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s
499 state->mask.allow |= astate->allow; 499 state->mask.allow |= astate->allow;
500} 500}
501 501
502/*
503 * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
504 * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
505 * to traditional read/write/execute permissions.
506 *
507 * It's problematic to reject acls that use certain mode bits, because it
508 * places the burden on users to learn the rules about which bits one
509 * particular server sets, without giving the user a lot of help--we return an
510 * error that could mean any number of different things. To make matters
511 * worse, the problematic bits might be introduced by some application that's
512 * automatically mapping from some other acl model.
513 *
514 * So wherever possible we accept anything, possibly erring on the side of
515 * denying more permissions than necessary.
516 *
517 * However we do reject *explicit* DENY's of a few bits representing
518 * permissions we could never deny:
519 */
520
521static inline int check_deny(u32 mask, int isowner)
522{
523 if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
524 return -EINVAL;
525 if (!isowner)
526 return 0;
527 if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
528 return -EINVAL;
529 return 0;
530}
531
532static struct posix_acl * 502static struct posix_acl *
533posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) 503posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
534{ 504{
535 struct posix_acl_entry *pace; 505 struct posix_acl_entry *pace;
536 struct posix_acl *pacl; 506 struct posix_acl *pacl;
537 int nace; 507 int nace;
538 int i, error = 0; 508 int i;
539 509
540 /* 510 /*
541 * ACLs with no ACEs are treated differently in the inheritable 511 * ACLs with no ACEs are treated differently in the inheritable
@@ -560,17 +530,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
560 530
561 pace = pacl->a_entries; 531 pace = pacl->a_entries;
562 pace->e_tag = ACL_USER_OBJ; 532 pace->e_tag = ACL_USER_OBJ;
563 error = check_deny(state->owner.deny, 1);
564 if (error)
565 goto out_err;
566 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); 533 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
567 534
568 for (i=0; i < state->users->n; i++) { 535 for (i=0; i < state->users->n; i++) {
569 pace++; 536 pace++;
570 pace->e_tag = ACL_USER; 537 pace->e_tag = ACL_USER;
571 error = check_deny(state->users->aces[i].perms.deny, 0);
572 if (error)
573 goto out_err;
574 low_mode_from_nfs4(state->users->aces[i].perms.allow, 538 low_mode_from_nfs4(state->users->aces[i].perms.allow,
575 &pace->e_perm, flags); 539 &pace->e_perm, flags);
576 pace->e_uid = state->users->aces[i].uid; 540 pace->e_uid = state->users->aces[i].uid;
@@ -579,18 +543,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
579 543
580 pace++; 544 pace++;
581 pace->e_tag = ACL_GROUP_OBJ; 545 pace->e_tag = ACL_GROUP_OBJ;
582 error = check_deny(state->group.deny, 0);
583 if (error)
584 goto out_err;
585 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); 546 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
586 add_to_mask(state, &state->group); 547 add_to_mask(state, &state->group);
587 548
588 for (i=0; i < state->groups->n; i++) { 549 for (i=0; i < state->groups->n; i++) {
589 pace++; 550 pace++;
590 pace->e_tag = ACL_GROUP; 551 pace->e_tag = ACL_GROUP;
591 error = check_deny(state->groups->aces[i].perms.deny, 0);
592 if (error)
593 goto out_err;
594 low_mode_from_nfs4(state->groups->aces[i].perms.allow, 552 low_mode_from_nfs4(state->groups->aces[i].perms.allow,
595 &pace->e_perm, flags); 553 &pace->e_perm, flags);
596 pace->e_gid = state->groups->aces[i].gid; 554 pace->e_gid = state->groups->aces[i].gid;
@@ -605,15 +563,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
605 563
606 pace++; 564 pace++;
607 pace->e_tag = ACL_OTHER; 565 pace->e_tag = ACL_OTHER;
608 error = check_deny(state->other.deny, 0);
609 if (error)
610 goto out_err;
611 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); 566 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
612 567
613 return pacl; 568 return pacl;
614out_err:
615 posix_acl_release(pacl);
616 return ERR_PTR(error);
617} 569}
618 570
619static inline void allow_bits(struct posix_ace_state *astate, u32 mask) 571static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 92b9d97aff4f..4a8314f08a0e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -470,7 +470,7 @@ out:
470 fh_put(resfh); 470 fh_put(resfh);
471 kfree(resfh); 471 kfree(resfh);
472 } 472 }
473 nfsd4_cleanup_open_state(cstate, open, status); 473 nfsd4_cleanup_open_state(cstate, open);
474 nfsd4_bump_seqid(cstate, status); 474 nfsd4_bump_seqid(cstate, status);
475 return status; 475 return status;
476} 476}
@@ -1030,6 +1030,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1030 dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); 1030 dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
1031 return status; 1031 return status;
1032 } 1032 }
1033 if (!file)
1034 return nfserr_bad_stateid;
1033 1035
1034 status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, 1036 status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
1035 fallocate->falloc_offset, 1037 fallocate->falloc_offset,
@@ -1069,6 +1071,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1069 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); 1071 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1070 return status; 1072 return status;
1071 } 1073 }
1074 if (!file)
1075 return nfserr_bad_stateid;
1072 1076
1073 switch (seek->seek_whence) { 1077 switch (seek->seek_whence) {
1074 case NFS4_CONTENT_DATA: 1078 case NFS4_CONTENT_DATA:
@@ -1815,7 +1819,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
1815 bmap0 &= ~FATTR4_WORD0_FILEHANDLE; 1819 bmap0 &= ~FATTR4_WORD0_FILEHANDLE;
1816 } 1820 }
1817 if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { 1821 if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) {
1818 ret += NFSD4_MAX_SEC_LABEL_LEN + 12; 1822 ret += NFS4_MAXLABELLEN + 12;
1819 bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; 1823 bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL;
1820 } 1824 }
1821 /* 1825 /*
@@ -2282,13 +2286,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
2282 .op_func = (nfsd4op_func)nfsd4_allocate, 2286 .op_func = (nfsd4op_func)nfsd4_allocate,
2283 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, 2287 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
2284 .op_name = "OP_ALLOCATE", 2288 .op_name = "OP_ALLOCATE",
2285 .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 2289 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
2286 }, 2290 },
2287 [OP_DEALLOCATE] = { 2291 [OP_DEALLOCATE] = {
2288 .op_func = (nfsd4op_func)nfsd4_deallocate, 2292 .op_func = (nfsd4op_func)nfsd4_deallocate,
2289 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, 2293 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
2290 .op_name = "OP_DEALLOCATE", 2294 .op_name = "OP_DEALLOCATE",
2291 .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 2295 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
2292 }, 2296 },
2293 [OP_SEEK] = { 2297 [OP_SEEK] = {
2294 .op_func = (nfsd4op_func)nfsd4_seek, 2298 .op_func = (nfsd4op_func)nfsd4_seek,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 326a545ea7b2..d42786ee39af 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1139,7 +1139,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid)
1139 return sid->sequence % SESSION_HASH_SIZE; 1139 return sid->sequence % SESSION_HASH_SIZE;
1140} 1140}
1141 1141
1142#ifdef NFSD_DEBUG 1142#ifdef CONFIG_SUNRPC_DEBUG
1143static inline void 1143static inline void
1144dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) 1144dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
1145{ 1145{
@@ -4049,7 +4049,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
4049 status = nfserr_bad_stateid; 4049 status = nfserr_bad_stateid;
4050 if (nfsd4_is_deleg_cur(open)) 4050 if (nfsd4_is_deleg_cur(open))
4051 goto out; 4051 goto out;
4052 status = nfserr_jukebox;
4053 } 4052 }
4054 4053
4055 /* 4054 /*
@@ -4118,7 +4117,7 @@ out:
4118} 4117}
4119 4118
4120void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, 4119void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
4121 struct nfsd4_open *open, __be32 status) 4120 struct nfsd4_open *open)
4122{ 4121{
4123 if (open->op_openowner) { 4122 if (open->op_openowner) {
4124 struct nfs4_stateowner *so = &open->op_openowner->oo_owner; 4123 struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5fb7e78169a6..a45032ce7b80 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,7 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
424 len += 4; 424 len += 4;
425 dummy32 = be32_to_cpup(p++); 425 dummy32 = be32_to_cpup(p++);
426 READ_BUF(dummy32); 426 READ_BUF(dummy32);
427 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) 427 if (dummy32 > NFS4_MAXLABELLEN)
428 return nfserr_badlabel; 428 return nfserr_badlabel;
429 len += (XDR_QUADLEN(dummy32) << 2); 429 len += (XDR_QUADLEN(dummy32) << 2);
430 READMEM(buf, dummy32); 430 READMEM(buf, dummy32);
@@ -2020,7 +2020,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
2020 * dentries/path components in an array. 2020 * dentries/path components in an array.
2021 */ 2021 */
2022 for (;;) { 2022 for (;;) {
2023 if (cur.dentry == root->dentry && cur.mnt == root->mnt) 2023 if (path_equal(&cur, root))
2024 break; 2024 break;
2025 if (cur.dentry == cur.mnt->mnt_root) { 2025 if (cur.dentry == cur.mnt->mnt_root) {
2026 if (follow_up(&cur)) 2026 if (follow_up(&cur))
@@ -3422,6 +3422,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3422 unsigned long maxcount; 3422 unsigned long maxcount;
3423 struct xdr_stream *xdr = &resp->xdr; 3423 struct xdr_stream *xdr = &resp->xdr;
3424 struct file *file = read->rd_filp; 3424 struct file *file = read->rd_filp;
3425 struct svc_fh *fhp = read->rd_fhp;
3425 int starting_len = xdr->buf->len; 3426 int starting_len = xdr->buf->len;
3426 struct raparms *ra; 3427 struct raparms *ra;
3427 __be32 *p; 3428 __be32 *p;
@@ -3445,12 +3446,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3445 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); 3446 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
3446 maxcount = min_t(unsigned long, maxcount, read->rd_length); 3447 maxcount = min_t(unsigned long, maxcount, read->rd_length);
3447 3448
3448 if (!read->rd_filp) { 3449 if (read->rd_filp)
3450 err = nfsd_permission(resp->rqstp, fhp->fh_export,
3451 fhp->fh_dentry,
3452 NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
3453 else
3449 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, 3454 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
3450 &file, &ra); 3455 &file, &ra);
3451 if (err) 3456 if (err)
3452 goto err_truncate; 3457 goto err_truncate;
3453 }
3454 3458
3455 if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) 3459 if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
3456 err = nfsd4_encode_splice_read(resp, read, file, maxcount); 3460 err = nfsd4_encode_splice_read(resp, read, file, maxcount);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index aa47d75ddb26..9690cb4dd588 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1250,15 +1250,15 @@ static int __init init_nfsd(void)
1250 int retval; 1250 int retval;
1251 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); 1251 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
1252 1252
1253 retval = register_cld_notifier();
1254 if (retval)
1255 return retval;
1256 retval = register_pernet_subsys(&nfsd_net_ops); 1253 retval = register_pernet_subsys(&nfsd_net_ops);
1257 if (retval < 0) 1254 if (retval < 0)
1258 goto out_unregister_notifier; 1255 return retval;
1259 retval = nfsd4_init_slabs(); 1256 retval = register_cld_notifier();
1260 if (retval) 1257 if (retval)
1261 goto out_unregister_pernet; 1258 goto out_unregister_pernet;
1259 retval = nfsd4_init_slabs();
1260 if (retval)
1261 goto out_unregister_notifier;
1262 retval = nfsd4_init_pnfs(); 1262 retval = nfsd4_init_pnfs();
1263 if (retval) 1263 if (retval)
1264 goto out_free_slabs; 1264 goto out_free_slabs;
@@ -1290,10 +1290,10 @@ out_exit_pnfs:
1290 nfsd4_exit_pnfs(); 1290 nfsd4_exit_pnfs();
1291out_free_slabs: 1291out_free_slabs:
1292 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1293out_unregister_pernet:
1294 unregister_pernet_subsys(&nfsd_net_ops);
1295out_unregister_notifier: 1293out_unregister_notifier:
1296 unregister_cld_notifier(); 1294 unregister_cld_notifier();
1295out_unregister_pernet:
1296 unregister_pernet_subsys(&nfsd_net_ops);
1297 return retval; 1297 return retval;
1298} 1298}
1299 1299
@@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void)
1308 nfsd4_exit_pnfs(); 1308 nfsd4_exit_pnfs();
1309 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1310 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1311 unregister_pernet_subsys(&nfsd_net_ops);
1312 unregister_cld_notifier(); 1311 unregister_cld_notifier();
1312 unregister_pernet_subsys(&nfsd_net_ops);
1313} 1313}
1314 1314
1315MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); 1315MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
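
The nfsctl.c reordering makes registration order and teardown order strict mirrors, both in the init error path and in exit_nfsd(). That is the classic goto-unwind idiom: each setup step gets a label that undoes everything already done, in reverse. A sketch with placeholder steps, where the third step fails:

/* Sketch only: goto-based unwind in reverse registration order. */
#include <stdio.h>

static int reg_pernet(void)     { puts("+pernet");   return 0; }
static void unreg_pernet(void)  { puts("-pernet"); }
static int reg_notifier(void)   { puts("+notifier"); return 0; }
static void unreg_notifier(void){ puts("-notifier"); }
static int init_slabs(void)     { puts("+slabs");    return -1; } /* fails */

static int init_module_sketch(void)
{
	int retval;

	retval = reg_pernet();
	if (retval)
		return retval;
	retval = reg_notifier();
	if (retval)
		goto out_unregister_pernet;
	retval = init_slabs();
	if (retval)
		goto out_unregister_notifier;
	return 0;

out_unregister_notifier:
	unreg_notifier();
out_unregister_pernet:
	unreg_pernet();
	return retval;
}

int main(void)
{
	return init_module_sketch() ? 1 : 0;
}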
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 565c4da1a9eb..cf980523898b 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
24#include "export.h" 24#include "export.h"
25 25
26#undef ifdebug 26#undef ifdebug
27#ifdef NFSD_DEBUG 27#ifdef CONFIG_SUNRPC_DEBUG
28# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) 28# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
29#else 29#else
30# define ifdebug(flag) if (0) 30# define ifdebug(flag) if (0)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0bda93e58e1b..556ce2e47555 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,7 +40,6 @@
40#include "state.h" 40#include "state.h"
41#include "nfsd.h" 41#include "nfsd.h"
42 42
43#define NFSD4_MAX_SEC_LABEL_LEN 2048
44#define NFSD4_MAX_TAGLEN 128 43#define NFSD4_MAX_TAGLEN 128
45#define XDR_LEN(n) (((n) + 3) & ~3) 44#define XDR_LEN(n) (((n) + 3) & ~3)
46 45
@@ -683,7 +682,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
683 struct svc_fh *current_fh, struct nfsd4_open *open); 682 struct svc_fh *current_fh, struct nfsd4_open *open);
684extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); 683extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
685extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, 684extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
686 struct nfsd4_open *open, __be32 status); 685 struct nfsd4_open *open);
687extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, 686extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
688 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); 687 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
689extern __be32 nfsd4_close(struct svc_rqst *rqstp, 688extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 8e19b9d7aba8..16eff45727ee 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1312,9 +1312,7 @@ static int o2hb_debug_init(void)
1312 int ret = -ENOMEM; 1312 int ret = -ENOMEM;
1313 1313
1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1315 if (IS_ERR_OR_NULL(o2hb_debug_dir)) { 1315 if (!o2hb_debug_dir) {
1316 ret = o2hb_debug_dir ?
1317 PTR_ERR(o2hb_debug_dir) : -ENOMEM;
1318 mlog_errno(ret); 1316 mlog_errno(ret);
1319 goto bail; 1317 goto bail;
1320 } 1318 }
@@ -1327,9 +1325,7 @@ static int o2hb_debug_init(void)
1327 sizeof(o2hb_live_node_bitmap), 1325 sizeof(o2hb_live_node_bitmap),
1328 O2NM_MAX_NODES, 1326 O2NM_MAX_NODES,
1329 o2hb_live_node_bitmap); 1327 o2hb_live_node_bitmap);
1330 if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { 1328 if (!o2hb_debug_livenodes) {
1331 ret = o2hb_debug_livenodes ?
1332 PTR_ERR(o2hb_debug_livenodes) : -ENOMEM;
1333 mlog_errno(ret); 1329 mlog_errno(ret);
1334 goto bail; 1330 goto bail;
1335 } 1331 }
@@ -1342,9 +1338,7 @@ static int o2hb_debug_init(void)
1342 sizeof(o2hb_live_region_bitmap), 1338 sizeof(o2hb_live_region_bitmap),
1343 O2NM_MAX_REGIONS, 1339 O2NM_MAX_REGIONS,
1344 o2hb_live_region_bitmap); 1340 o2hb_live_region_bitmap);
1345 if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { 1341 if (!o2hb_debug_liveregions) {
1346 ret = o2hb_debug_liveregions ?
1347 PTR_ERR(o2hb_debug_liveregions) : -ENOMEM;
1348 mlog_errno(ret); 1342 mlog_errno(ret);
1349 goto bail; 1343 goto bail;
1350 } 1344 }
@@ -1358,9 +1352,7 @@ static int o2hb_debug_init(void)
1358 sizeof(o2hb_quorum_region_bitmap), 1352 sizeof(o2hb_quorum_region_bitmap),
1359 O2NM_MAX_REGIONS, 1353 O2NM_MAX_REGIONS,
1360 o2hb_quorum_region_bitmap); 1354 o2hb_quorum_region_bitmap);
1361 if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { 1355 if (!o2hb_debug_quorumregions) {
1362 ret = o2hb_debug_quorumregions ?
1363 PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM;
1364 mlog_errno(ret); 1356 mlog_errno(ret);
1365 goto bail; 1357 goto bail;
1366 } 1358 }
@@ -1374,9 +1366,7 @@ static int o2hb_debug_init(void)
1374 sizeof(o2hb_failed_region_bitmap), 1366 sizeof(o2hb_failed_region_bitmap),
1375 O2NM_MAX_REGIONS, 1367 O2NM_MAX_REGIONS,
1376 o2hb_failed_region_bitmap); 1368 o2hb_failed_region_bitmap);
1377 if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { 1369 if (!o2hb_debug_failedregions) {
1378 ret = o2hb_debug_failedregions ?
1379 PTR_ERR(o2hb_debug_failedregions) : -ENOMEM;
1380 mlog_errno(ret); 1370 mlog_errno(ret);
1381 goto bail; 1371 goto bail;
1382 } 1372 }
@@ -2010,8 +2000,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2010 2000
2011 reg->hr_debug_dir = 2001 reg->hr_debug_dir =
2012 debugfs_create_dir(config_item_name(&reg->hr_item), dir); 2002 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
2013 if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { 2003 if (!reg->hr_debug_dir) {
2014 ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM;
2015 mlog_errno(ret); 2004 mlog_errno(ret);
2016 goto bail; 2005 goto bail;
2017 } 2006 }
@@ -2024,9 +2013,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2024 O2HB_DB_TYPE_REGION_LIVENODES, 2013 O2HB_DB_TYPE_REGION_LIVENODES,
2025 sizeof(reg->hr_live_node_bitmap), 2014 sizeof(reg->hr_live_node_bitmap),
2026 O2NM_MAX_NODES, reg); 2015 O2NM_MAX_NODES, reg);
2027 if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { 2016 if (!reg->hr_debug_livenodes) {
2028 ret = reg->hr_debug_livenodes ?
2029 PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM;
2030 mlog_errno(ret); 2017 mlog_errno(ret);
2031 goto bail; 2018 goto bail;
2032 } 2019 }
@@ -2038,9 +2025,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2038 sizeof(*(reg->hr_db_regnum)), 2025 sizeof(*(reg->hr_db_regnum)),
2039 O2HB_DB_TYPE_REGION_NUMBER, 2026 O2HB_DB_TYPE_REGION_NUMBER,
2040 0, O2NM_MAX_NODES, reg); 2027 0, O2NM_MAX_NODES, reg);
2041 if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { 2028 if (!reg->hr_debug_regnum) {
2042 ret = reg->hr_debug_regnum ?
2043 PTR_ERR(reg->hr_debug_regnum) : -ENOMEM;
2044 mlog_errno(ret); 2029 mlog_errno(ret);
2045 goto bail; 2030 goto bail;
2046 } 2031 }
@@ -2052,9 +2037,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2052 sizeof(*(reg->hr_db_elapsed_time)), 2037 sizeof(*(reg->hr_db_elapsed_time)),
2053 O2HB_DB_TYPE_REGION_ELAPSED_TIME, 2038 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2054 0, 0, reg); 2039 0, 0, reg);
2055 if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { 2040 if (!reg->hr_debug_elapsed_time) {
2056 ret = reg->hr_debug_elapsed_time ?
2057 PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM;
2058 mlog_errno(ret); 2041 mlog_errno(ret);
2059 goto bail; 2042 goto bail;
2060 } 2043 }
@@ -2066,16 +2049,13 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2066 sizeof(*(reg->hr_db_pinned)), 2049 sizeof(*(reg->hr_db_pinned)),
2067 O2HB_DB_TYPE_REGION_PINNED, 2050 O2HB_DB_TYPE_REGION_PINNED,
2068 0, 0, reg); 2051 0, 0, reg);
2069 if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { 2052 if (!reg->hr_debug_pinned) {
2070 ret = reg->hr_debug_pinned ?
2071 PTR_ERR(reg->hr_debug_pinned) : -ENOMEM;
2072 mlog_errno(ret); 2053 mlog_errno(ret);
2073 goto bail; 2054 goto bail;
2074 } 2055 }
2075 2056
2076 return 0; 2057 ret = 0;
2077bail: 2058bail:
2078 debugfs_remove_recursive(reg->hr_debug_dir);
2079 return ret; 2059 return ret;
2080} 2060}
2081 2061
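
The ocfs2 hunks revert to plain NULL checks because, with the debugfs API of this kernel, the create helpers return NULL on failure (an ERR_PTR shows up only when debugfs is compiled out), so IS_ERR_OR_NULL() was the wrong test. A sketch contrasting the two C error-return conventions, using simplified userspace stand-ins for the kernel's ERR_PTR macros:

/* Sketch only: "NULL on failure" vs "errno encoded in the pointer". */
#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *create_null_on_failure(void)   { return NULL; }
static void *create_errptr_on_failure(void) { return ERR_PTR(-12); }

int main(void)
{
	void *a = create_null_on_failure();
	void *b = create_errptr_on_failure();

	if (!a)			/* debugfs-style: NULL means failure */
		puts("a failed: treat as -ENOMEM");
	if (IS_ERR(b))		/* ERR_PTR-style: decode the errno */
		printf("b failed: %ld\n", PTR_ERR(b));	/* -12 == -ENOMEM */
	return 0;
}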
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 956edf67be20..8b23aa2f52dd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2959,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2959 osb->osb_debug_root, 2959 osb->osb_debug_root,
2960 osb, 2960 osb,
2961 &ocfs2_dlm_debug_fops); 2961 &ocfs2_dlm_debug_fops);
2962 if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { 2962 if (!dlm_debug->d_locking_state) {
2963 ret = -EINVAL; 2963 ret = -EINVAL;
2964 mlog(ML_ERROR, 2964 mlog(ML_ERROR,
2965 "Unable to create locking state debugfs file.\n"); 2965 "Unable to create locking state debugfs file.\n");
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 837ddce4b659..403c5660b306 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1112 1112
1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1114 ocfs2_debugfs_root); 1114 ocfs2_debugfs_root);
1115 if (IS_ERR_OR_NULL(osb->osb_debug_root)) { 1115 if (!osb->osb_debug_root) {
1116 status = -EINVAL; 1116 status = -EINVAL;
1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1118 goto read_super_error; 1118 goto read_super_error;
@@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1122 osb->osb_debug_root, 1122 osb->osb_debug_root,
1123 osb, 1123 osb,
1124 &ocfs2_osb_debug_fops); 1124 &ocfs2_osb_debug_fops);
1125 if (IS_ERR_OR_NULL(osb->osb_ctxt)) { 1125 if (!osb->osb_ctxt) {
1126 status = -EINVAL; 1126 status = -EINVAL;
1127 mlog_errno(status); 1127 mlog_errno(status);
1128 goto read_super_error; 1128 goto read_super_error;
@@ -1606,9 +1606,8 @@ static int __init ocfs2_init(void)
1606 } 1606 }
1607 1607
1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1609 if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { 1609 if (!ocfs2_debugfs_root) {
1610 status = ocfs2_debugfs_root ? 1610 status = -ENOMEM;
1611 PTR_ERR(ocfs2_debugfs_root) : -ENOMEM;
1612 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1611 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1613 goto out4; 1612 goto out4;
1614 } 1613 }
diff --git a/fs/open.c b/fs/open.c
index 6796f04d6032..98e5a52dc68c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 234 if (mode & ~FALLOC_FL_SUPPORTED_MASK)
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP; 235 return -EOPNOTSUPP;
237 236
238 /* Punch hole and zero range are mutually exclusive */ 237 /* Punch hole and zero range are mutually exclusive */
@@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE)) 249 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL; 250 return -EINVAL;
252 251
252 /* Insert range should only be used exclusively. */
253 if ((mode & FALLOC_FL_INSERT_RANGE) &&
254 (mode & ~FALLOC_FL_INSERT_RANGE))
255 return -EINVAL;
256
253 if (!(file->f_mode & FMODE_WRITE)) 257 if (!(file->f_mode & FMODE_WRITE))
254 return -EBADF; 258 return -EBADF;
255 259
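
vfs_fallocate() now validates mode in two moves: any bit outside FALLOC_FL_SUPPORTED_MASK yields -EOPNOTSUPP, and an exclusive flag such as the new FALLOC_FL_INSERT_RANGE must appear alone, enforced with the (mode & FLAG) && (mode & ~FLAG) shape. A compact sketch; the flag values follow the uapi header of the period, but treat them here as illustrative:

/* Sketch only: mask validation plus an exclusivity check. */
#include <stdio.h>

#define FL_KEEP_SIZE		0x01
#define FL_PUNCH_HOLE		0x02
#define FL_COLLAPSE_RANGE	0x08
#define FL_INSERT_RANGE		0x20
#define FL_SUPPORTED_MASK	(FL_KEEP_SIZE | FL_PUNCH_HOLE | \
				 FL_COLLAPSE_RANGE | FL_INSERT_RANGE)

static int check_mode(int mode)
{
	if (mode & ~FL_SUPPORTED_MASK)
		return -95;			/* -EOPNOTSUPP */
	/* insert range should only be used exclusively */
	if ((mode & FL_INSERT_RANGE) && (mode & ~FL_INSERT_RANGE))
		return -22;			/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("%d\n", check_mode(FL_INSERT_RANGE));			/* 0 */
	printf("%d\n", check_mode(FL_INSERT_RANGE | FL_KEEP_SIZE));	/* -22 */
	printf("%d\n", check_mode(0x100));				/* -95 */
	return 0;
}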
diff --git a/fs/pnode.c b/fs/pnode.c
index 260ac8f898a4..6367e1e435c6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
362} 362}
363 363
364/* 364/*
365 * Clear MNT_LOCKED when it can be shown to be safe.
366 *
367 * mount_lock lock must be held for write
368 */
369void propagate_mount_unlock(struct mount *mnt)
370{
371 struct mount *parent = mnt->mnt_parent;
372 struct mount *m, *child;
373
374 BUG_ON(parent == mnt);
375
376 for (m = propagation_next(parent, parent); m;
377 m = propagation_next(m, parent)) {
378 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
379 if (child)
380 child->mnt.mnt_flags &= ~MNT_LOCKED;
381 }
382}
383
384/*
385 * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
386 */
387static void mark_umount_candidates(struct mount *mnt)
388{
389 struct mount *parent = mnt->mnt_parent;
390 struct mount *m;
391
392 BUG_ON(parent == mnt);
393
394 for (m = propagation_next(parent, parent); m;
395 m = propagation_next(m, parent)) {
396 struct mount *child = __lookup_mnt_last(&m->mnt,
397 mnt->mnt_mountpoint);
398 if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
399 SET_MNT_MARK(child);
400 }
401 }
402}
403
404/*
365 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its 405 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
366 * parent propagates to. 406 * parent propagates to.
367 */ 407 */
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
378 struct mount *child = __lookup_mnt_last(&m->mnt, 418 struct mount *child = __lookup_mnt_last(&m->mnt,
379 mnt->mnt_mountpoint); 419 mnt->mnt_mountpoint);
380 /* 420 /*
381 * umount the child only if the child has no 421 * umount the child only if the child has no children
382 * other children 422 * and the child is marked safe to unmount.
383 */ 423 */
384 if (child && list_empty(&child->mnt_mounts)) { 424 if (!child || !IS_MNT_MARKED(child))
425 continue;
426 CLEAR_MNT_MARK(child);
427 if (list_empty(&child->mnt_mounts)) {
385 list_del_init(&child->mnt_child); 428 list_del_init(&child->mnt_child);
386 hlist_del_init_rcu(&child->mnt_hash); 429 child->mnt.mnt_flags |= MNT_UMOUNT;
387 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); 430 list_move_tail(&child->mnt_list, &mnt->mnt_list);
388 } 431 }
389 } 432 }
390} 433}
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
396 * 439 *
397 * vfsmount lock must be held for write 440 * vfsmount lock must be held for write
398 */ 441 */
399int propagate_umount(struct hlist_head *list) 442int propagate_umount(struct list_head *list)
400{ 443{
401 struct mount *mnt; 444 struct mount *mnt;
402 445
403 hlist_for_each_entry(mnt, list, mnt_hash) 446 list_for_each_entry_reverse(mnt, list, mnt_list)
447 mark_umount_candidates(mnt);
448
449 list_for_each_entry(mnt, list, mnt_list)
404 __propagate_umount(mnt); 450 __propagate_umount(mnt);
405 return 0; 451 return 0;
406} 452}
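
propagate_umount() now walks the list twice: a reverse pass marks umount candidates without touching the tree, then a forward pass acts only on the marked children. Mark-then-sweep keeps every decision based on the pre-umount state rather than on a list mutating under the walk. The two-pass shape in miniature, over a plain array:

/* Sketch only: mark candidates first, then act on the marks. */
#include <stdio.h>

struct mnt { const char *name; int busy; int marked; };

int main(void)
{
	struct mnt list[] = {
		{ "a", 0, 0 }, { "b", 1, 0 }, { "c", 0, 0 },
	};
	int n = 3, i;

	/* pass 1: mark umount candidates (reverse, as in the diff) */
	for (i = n - 1; i >= 0; i--)
		if (!list[i].busy)
			list[i].marked = 1;

	/* pass 2: act only on the marked entries */
	for (i = 0; i < n; i++)
		if (list[i].marked)
			printf("umount %s\n", list[i].name);
	return 0;
}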
diff --git a/fs/pnode.h b/fs/pnode.h
index 4a246358b031..7114ce6e6b9e 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,6 +19,9 @@
19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) 19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) 20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) 21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
22#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
23#define IS_MNT_LOCKED_AND_LAZY(m) \
24 (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)
22 25
23#define CL_EXPIRE 0x01 26#define CL_EXPIRE 0x01
24#define CL_SLAVE 0x02 27#define CL_SLAVE 0x02
@@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt)
40void change_mnt_propagation(struct mount *, int); 43void change_mnt_propagation(struct mount *, int);
41int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, 44int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
42 struct hlist_head *); 45 struct hlist_head *);
43int propagate_umount(struct hlist_head *); 46int propagate_umount(struct list_head *);
44int propagate_mount_busy(struct mount *, int); 47int propagate_mount_busy(struct mount *, int);
48void propagate_mount_unlock(struct mount *);
45void mnt_release_group_id(struct mount *); 49void mnt_release_group_id(struct mount *);
46int get_dominating_id(struct mount *mnt, const struct path *root); 50int get_dominating_id(struct mount *mnt, const struct path *root);
47unsigned int mnt_get_count(struct mount *mnt); 51unsigned int mnt_get_count(struct mount *mnt);
48void mnt_set_mountpoint(struct mount *, struct mountpoint *, 52void mnt_set_mountpoint(struct mount *, struct mountpoint *,
49 struct mount *); 53 struct mount *);
50void umount_tree(struct mount *, int);
51struct mount *copy_tree(struct mount *, struct dentry *, int); 54struct mount *copy_tree(struct mount *, struct dentry *, int);
52bool is_path_reachable(struct mount *, struct dentry *, 55bool is_path_reachable(struct mount *, struct dentry *,
53 const struct path *root); 56 const struct path *root);
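
IS_MNT_LOCKED_AND_LAZY tests two bits with one mask: it is true only when MNT_LOCKED is set and MNT_SYNC_UMOUNT is clear, via the ((flags & (A|B)) == A) shape. A three-line demonstration; the bit values are arbitrary here:

/* Sketch only: "A set, B clear" in a single mask comparison. */
#include <stdio.h>

#define MNT_LOCKED		0x01
#define MNT_SYNC_UMOUNT		0x02
#define LOCKED_AND_LAZY(f) \
	(((f) & (MNT_LOCKED | MNT_SYNC_UMOUNT)) == MNT_LOCKED)

int main(void)
{
	printf("%d\n", LOCKED_AND_LAZY(MNT_LOCKED));			/* 1 */
	printf("%d\n", LOCKED_AND_LAZY(MNT_LOCKED | MNT_SYNC_UMOUNT));	/* 0 */
	printf("%d\n", LOCKED_AND_LAZY(0));				/* 0 */
	return 0;
}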
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf4472017..516162be1398 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -260,6 +260,7 @@ xfs_alloc_fix_len(
260 rlen = rlen - (k - args->mod); 260 rlen = rlen - (k - args->mod);
261 else 261 else
262 rlen = rlen - args->prod + (args->mod - k); 262 rlen = rlen - args->prod + (args->mod - k);
263 /* casts to (int) catch length underflows */
263 if ((int)rlen < (int)args->minlen) 264 if ((int)rlen < (int)args->minlen)
264 return; 265 return;
265 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 266 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
@@ -286,7 +287,8 @@ xfs_alloc_fix_minleft(
286 if (diff >= 0) 287 if (diff >= 0)
287 return 1; 288 return 1;
288 args->len += diff; /* shrink the allocated space */ 289 args->len += diff; /* shrink the allocated space */
289 if (args->len >= args->minlen) 290 /* casts to (int) catch length underflows */
291 if ((int)args->len >= (int)args->minlen)
290 return 1; 292 return 1;
291 args->agbno = NULLAGBLOCK; 293 args->agbno = NULLAGBLOCK;
292 return 0; 294 return 0;
@@ -315,6 +317,9 @@ xfs_alloc_fixup_trees(
315 xfs_agblock_t nfbno2; /* second new free startblock */ 317 xfs_agblock_t nfbno2; /* second new free startblock */
316 xfs_extlen_t nflen1=0; /* first new free length */ 318 xfs_extlen_t nflen1=0; /* first new free length */
317 xfs_extlen_t nflen2=0; /* second new free length */ 319 xfs_extlen_t nflen2=0; /* second new free length */
320 struct xfs_mount *mp;
321
322 mp = cnt_cur->bc_mp;
318 323
319 /* 324 /*
320 * Look up the record in the by-size tree if necessary. 325 * Look up the record in the by-size tree if necessary.
@@ -323,13 +328,13 @@ xfs_alloc_fixup_trees(
323#ifdef DEBUG 328#ifdef DEBUG
324 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 329 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
325 return error; 330 return error;
326 XFS_WANT_CORRUPTED_RETURN( 331 XFS_WANT_CORRUPTED_RETURN(mp,
327 i == 1 && nfbno1 == fbno && nflen1 == flen); 332 i == 1 && nfbno1 == fbno && nflen1 == flen);
328#endif 333#endif
329 } else { 334 } else {
330 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 335 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
331 return error; 336 return error;
332 XFS_WANT_CORRUPTED_RETURN(i == 1); 337 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
333 } 338 }
334 /* 339 /*
335 * Look up the record in the by-block tree if necessary. 340 * Look up the record in the by-block tree if necessary.
@@ -338,13 +343,13 @@ xfs_alloc_fixup_trees(
338#ifdef DEBUG 343#ifdef DEBUG
339 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 344 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
340 return error; 345 return error;
341 XFS_WANT_CORRUPTED_RETURN( 346 XFS_WANT_CORRUPTED_RETURN(mp,
342 i == 1 && nfbno1 == fbno && nflen1 == flen); 347 i == 1 && nfbno1 == fbno && nflen1 == flen);
343#endif 348#endif
344 } else { 349 } else {
345 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 350 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
346 return error; 351 return error;
347 XFS_WANT_CORRUPTED_RETURN(i == 1); 352 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
348 } 353 }
349 354
350#ifdef DEBUG 355#ifdef DEBUG
@@ -355,7 +360,7 @@ xfs_alloc_fixup_trees(
355 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); 360 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
356 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); 361 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
357 362
358 XFS_WANT_CORRUPTED_RETURN( 363 XFS_WANT_CORRUPTED_RETURN(mp,
359 bnoblock->bb_numrecs == cntblock->bb_numrecs); 364 bnoblock->bb_numrecs == cntblock->bb_numrecs);
360 } 365 }
361#endif 366#endif
@@ -386,25 +391,25 @@ xfs_alloc_fixup_trees(
386 */ 391 */
387 if ((error = xfs_btree_delete(cnt_cur, &i))) 392 if ((error = xfs_btree_delete(cnt_cur, &i)))
388 return error; 393 return error;
389 XFS_WANT_CORRUPTED_RETURN(i == 1); 394 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
390 /* 395 /*
391 * Add new by-size btree entry(s). 396 * Add new by-size btree entry(s).
392 */ 397 */
393 if (nfbno1 != NULLAGBLOCK) { 398 if (nfbno1 != NULLAGBLOCK) {
394 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 399 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
395 return error; 400 return error;
396 XFS_WANT_CORRUPTED_RETURN(i == 0); 401 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
397 if ((error = xfs_btree_insert(cnt_cur, &i))) 402 if ((error = xfs_btree_insert(cnt_cur, &i)))
398 return error; 403 return error;
399 XFS_WANT_CORRUPTED_RETURN(i == 1); 404 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
400 } 405 }
401 if (nfbno2 != NULLAGBLOCK) { 406 if (nfbno2 != NULLAGBLOCK) {
402 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 407 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
403 return error; 408 return error;
404 XFS_WANT_CORRUPTED_RETURN(i == 0); 409 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
405 if ((error = xfs_btree_insert(cnt_cur, &i))) 410 if ((error = xfs_btree_insert(cnt_cur, &i)))
406 return error; 411 return error;
407 XFS_WANT_CORRUPTED_RETURN(i == 1); 412 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
408 } 413 }
409 /* 414 /*
410 * Fix up the by-block btree entry(s). 415 * Fix up the by-block btree entry(s).
@@ -415,7 +420,7 @@ xfs_alloc_fixup_trees(
415 */ 420 */
416 if ((error = xfs_btree_delete(bno_cur, &i))) 421 if ((error = xfs_btree_delete(bno_cur, &i)))
417 return error; 422 return error;
418 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
419 } else { 424 } else {
420 /* 425 /*
421 * Update the by-block entry to start later|be shorter. 426 * Update the by-block entry to start later|be shorter.
@@ -429,10 +434,10 @@ xfs_alloc_fixup_trees(
429 */ 434 */
430 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 435 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
431 return error; 436 return error;
432 XFS_WANT_CORRUPTED_RETURN(i == 0); 437 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
433 if ((error = xfs_btree_insert(bno_cur, &i))) 438 if ((error = xfs_btree_insert(bno_cur, &i)))
434 return error; 439 return error;
435 XFS_WANT_CORRUPTED_RETURN(i == 1); 440 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
436 } 441 }
437 return 0; 442 return 0;
438} 443}
@@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact(
682 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 687 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
683 if (error) 688 if (error)
684 goto error0; 689 goto error0;
685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 690 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
686 ASSERT(fbno <= args->agbno); 691 ASSERT(fbno <= args->agbno);
687 692
688 /* 693 /*
@@ -783,7 +788,7 @@ xfs_alloc_find_best_extent(
783 error = xfs_alloc_get_rec(*scur, sbno, slen, &i); 788 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
784 if (error) 789 if (error)
785 goto error0; 790 goto error0;
786 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 791 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
787 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); 792 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
788 793
789 /* 794 /*
@@ -946,7 +951,7 @@ restart:
946 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, 951 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
947 &ltlen, &i))) 952 &ltlen, &i)))
948 goto error0; 953 goto error0;
949 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 954 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
950 if (ltlen >= args->minlen) 955 if (ltlen >= args->minlen)
951 break; 956 break;
952 if ((error = xfs_btree_increment(cnt_cur, 0, &i))) 957 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
@@ -966,7 +971,7 @@ restart:
966 */ 971 */
967 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 972 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
968 goto error0; 973 goto error0;
969 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 974 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
970 xfs_alloc_compute_aligned(args, ltbno, ltlen, 975 xfs_alloc_compute_aligned(args, ltbno, ltlen,
971 &ltbnoa, &ltlena); 976 &ltbnoa, &ltlena);
972 if (ltlena < args->minlen) 977 if (ltlena < args->minlen)
@@ -999,7 +1004,7 @@ restart:
999 cnt_cur->bc_ptrs[0] = besti; 1004 cnt_cur->bc_ptrs[0] = besti;
1000 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 1005 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
1001 goto error0; 1006 goto error0;
1002 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1007 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1003 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1008 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1004 args->len = blen; 1009 args->len = blen;
1005 if (!xfs_alloc_fix_minleft(args)) { 1010 if (!xfs_alloc_fix_minleft(args)) {
@@ -1088,7 +1093,7 @@ restart:
1088 if (bno_cur_lt) { 1093 if (bno_cur_lt) {
1089 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1094 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
1090 goto error0; 1095 goto error0;
1091 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1096 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1092 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1097 xfs_alloc_compute_aligned(args, ltbno, ltlen,
1093 &ltbnoa, &ltlena); 1098 &ltbnoa, &ltlena);
1094 if (ltlena >= args->minlen) 1099 if (ltlena >= args->minlen)
@@ -1104,7 +1109,7 @@ restart:
1104 if (bno_cur_gt) { 1109 if (bno_cur_gt) {
1105 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1110 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1106 goto error0; 1111 goto error0;
1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1112 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1108 xfs_alloc_compute_aligned(args, gtbno, gtlen, 1113 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1109 &gtbnoa, &gtlena); 1114 &gtbnoa, &gtlena);
1110 if (gtlena >= args->minlen) 1115 if (gtlena >= args->minlen)
@@ -1303,7 +1308,7 @@ restart:
1303 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1308 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1304 if (error) 1309 if (error)
1305 goto error0; 1310 goto error0;
1306 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1311 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1307 1312
1308 xfs_alloc_compute_aligned(args, fbno, flen, 1313 xfs_alloc_compute_aligned(args, fbno, flen,
1309 &rbno, &rlen); 1314 &rbno, &rlen);
@@ -1342,7 +1347,7 @@ restart:
1342 * This can't happen in the second case above. 1347 * This can't happen in the second case above.
1343 */ 1348 */
1344 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1349 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1345 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1350 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
1346 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1351 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1347 if (rlen < args->maxlen) { 1352 if (rlen < args->maxlen) {
1348 xfs_agblock_t bestfbno; 1353 xfs_agblock_t bestfbno;
@@ -1362,13 +1367,13 @@ restart:
1362 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1367 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1363 &i))) 1368 &i)))
1364 goto error0; 1369 goto error0;
1365 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1370 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1366 if (flen < bestrlen) 1371 if (flen < bestrlen)
1367 break; 1372 break;
1368 xfs_alloc_compute_aligned(args, fbno, flen, 1373 xfs_alloc_compute_aligned(args, fbno, flen,
1369 &rbno, &rlen); 1374 &rbno, &rlen);
1370 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1375 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1371 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1376 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
1372 (rlen <= flen && rbno + rlen <= fbno + flen), 1377 (rlen <= flen && rbno + rlen <= fbno + flen),
1373 error0); 1378 error0);
1374 if (rlen > bestrlen) { 1379 if (rlen > bestrlen) {
@@ -1383,7 +1388,7 @@ restart:
1383 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1388 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1384 &i))) 1389 &i)))
1385 goto error0; 1390 goto error0;
1386 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1391 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1387 rlen = bestrlen; 1392 rlen = bestrlen;
1388 rbno = bestrbno; 1393 rbno = bestrbno;
1389 flen = bestflen; 1394 flen = bestflen;
@@ -1408,7 +1413,7 @@ restart:
1408 if (!xfs_alloc_fix_minleft(args)) 1413 if (!xfs_alloc_fix_minleft(args))
1409 goto out_nominleft; 1414 goto out_nominleft;
1410 rlen = args->len; 1415 rlen = args->len;
1411 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1416 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
1412 /* 1417 /*
1413 * Allocate and initialize a cursor for the by-block tree. 1418 * Allocate and initialize a cursor for the by-block tree.
1414 */ 1419 */
@@ -1422,7 +1427,7 @@ restart:
1422 cnt_cur = bno_cur = NULL; 1427 cnt_cur = bno_cur = NULL;
1423 args->len = rlen; 1428 args->len = rlen;
1424 args->agbno = rbno; 1429 args->agbno = rbno;
1425 XFS_WANT_CORRUPTED_GOTO( 1430 XFS_WANT_CORRUPTED_GOTO(args->mp,
1426 args->agbno + args->len <= 1431 args->agbno + args->len <=
1427 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), 1432 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1428 error0); 1433 error0);
@@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small(
1467 if (i) { 1472 if (i) {
1468 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1473 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
1469 goto error0; 1474 goto error0;
1470 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1475 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1471 } 1476 }
1472 /* 1477 /*
1473 * Nothing in the btree, try the freelist. Make sure 1478 * Nothing in the btree, try the freelist. Make sure
@@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small(
1493 } 1498 }
1494 args->len = 1; 1499 args->len = 1;
1495 args->agbno = fbno; 1500 args->agbno = fbno;
1496 XFS_WANT_CORRUPTED_GOTO( 1501 XFS_WANT_CORRUPTED_GOTO(args->mp,
1497 args->agbno + args->len <= 1502 args->agbno + args->len <=
1498 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), 1503 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1499 error0); 1504 error0);
@@ -1579,7 +1584,7 @@ xfs_free_ag_extent(
1579 */ 1584 */
1580 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) 1585 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1581 goto error0; 1586 goto error0;
1582 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1587 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1583 /* 1588 /*
1584 * It's not contiguous, though. 1589 * It's not contiguous, though.
1585 */ 1590 */
@@ -1591,7 +1596,8 @@ xfs_free_ag_extent(
1591 * space was invalid, it's (partly) already free. 1596 * space was invalid, it's (partly) already free.
1592 * Very bad. 1597 * Very bad.
1593 */ 1598 */
1594 XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); 1599 XFS_WANT_CORRUPTED_GOTO(mp,
1600 ltbno + ltlen <= bno, error0);
1595 } 1601 }
1596 } 1602 }
1597 /* 1603 /*
@@ -1606,7 +1612,7 @@ xfs_free_ag_extent(
1606 */ 1612 */
1607 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) 1613 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1608 goto error0; 1614 goto error0;
1609 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1615 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1610 /* 1616 /*
1611 * It's not contiguous, though. 1617 * It's not contiguous, though.
1612 */ 1618 */
@@ -1618,7 +1624,7 @@ xfs_free_ag_extent(
1618 * space was invalid, it's (partly) already free. 1624 * space was invalid, it's (partly) already free.
1619 * Very bad. 1625 * Very bad.
1620 */ 1626 */
1621 XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); 1627 XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
1622 } 1628 }
1623 } 1629 }
1624 /* 1630 /*
@@ -1635,31 +1641,31 @@ xfs_free_ag_extent(
1635 */ 1641 */
1636 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1642 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1637 goto error0; 1643 goto error0;
1638 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1644 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1639 if ((error = xfs_btree_delete(cnt_cur, &i))) 1645 if ((error = xfs_btree_delete(cnt_cur, &i)))
1640 goto error0; 1646 goto error0;
1641 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1647 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1642 /* 1648 /*
1643 * Delete the old by-size entry on the right. 1649 * Delete the old by-size entry on the right.
1644 */ 1650 */
1645 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1651 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1646 goto error0; 1652 goto error0;
1647 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1653 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1648 if ((error = xfs_btree_delete(cnt_cur, &i))) 1654 if ((error = xfs_btree_delete(cnt_cur, &i)))
1649 goto error0; 1655 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1656 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1651 /* 1657 /*
1652 * Delete the old by-block entry for the right block. 1658 * Delete the old by-block entry for the right block.
1653 */ 1659 */
1654 if ((error = xfs_btree_delete(bno_cur, &i))) 1660 if ((error = xfs_btree_delete(bno_cur, &i)))
1655 goto error0; 1661 goto error0;
1656 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1662 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1657 /* 1663 /*
1658 * Move the by-block cursor back to the left neighbor. 1664 * Move the by-block cursor back to the left neighbor.
1659 */ 1665 */
1660 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 1666 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1661 goto error0; 1667 goto error0;
1662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1668 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1663#ifdef DEBUG 1669#ifdef DEBUG
1664 /* 1670 /*
1665 * Check that this is the right record: delete didn't 1671 * Check that this is the right record: delete didn't
@@ -1672,7 +1678,7 @@ xfs_free_ag_extent(
1672 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 1678 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1673 &i))) 1679 &i)))
1674 goto error0; 1680 goto error0;
1675 XFS_WANT_CORRUPTED_GOTO( 1681 XFS_WANT_CORRUPTED_GOTO(mp,
1676 i == 1 && xxbno == ltbno && xxlen == ltlen, 1682 i == 1 && xxbno == ltbno && xxlen == ltlen,
1677 error0); 1683 error0);
1678 } 1684 }
@@ -1695,17 +1701,17 @@ xfs_free_ag_extent(
1695 */ 1701 */
1696 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1702 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1697 goto error0; 1703 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1704 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1699 if ((error = xfs_btree_delete(cnt_cur, &i))) 1705 if ((error = xfs_btree_delete(cnt_cur, &i)))
1700 goto error0; 1706 goto error0;
1701 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1707 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1702 /* 1708 /*
1703 * Back up the by-block cursor to the left neighbor, and 1709 * Back up the by-block cursor to the left neighbor, and
1704 * update its length. 1710 * update its length.
1705 */ 1711 */
1706 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 1712 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1707 goto error0; 1713 goto error0;
1708 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1714 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1709 nbno = ltbno; 1715 nbno = ltbno;
1710 nlen = len + ltlen; 1716 nlen = len + ltlen;
1711 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 1717 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1721,10 +1727,10 @@ xfs_free_ag_extent(
1721 */ 1727 */
1722 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1728 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1723 goto error0; 1729 goto error0;
1724 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1730 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1725 if ((error = xfs_btree_delete(cnt_cur, &i))) 1731 if ((error = xfs_btree_delete(cnt_cur, &i)))
1726 goto error0; 1732 goto error0;
1727 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1728 /* 1734 /*
1729 * Update the starting block and length of the right 1735 * Update the starting block and length of the right
1730 * neighbor in the by-block tree. 1736 * neighbor in the by-block tree.
@@ -1743,7 +1749,7 @@ xfs_free_ag_extent(
1743 nlen = len; 1749 nlen = len;
1744 if ((error = xfs_btree_insert(bno_cur, &i))) 1750 if ((error = xfs_btree_insert(bno_cur, &i)))
1745 goto error0; 1751 goto error0;
1746 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1752 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1747 } 1753 }
1748 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1754 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1749 bno_cur = NULL; 1755 bno_cur = NULL;
@@ -1752,10 +1758,10 @@ xfs_free_ag_extent(
1752 */ 1758 */
1753 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1759 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1754 goto error0; 1760 goto error0;
1755 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1761 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
1756 if ((error = xfs_btree_insert(cnt_cur, &i))) 1762 if ((error = xfs_btree_insert(cnt_cur, &i)))
1757 goto error0; 1763 goto error0;
1758 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1764 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1759 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1765 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1760 cnt_cur = NULL; 1766 cnt_cur = NULL;
1761 1767
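Almost all of the churn in this file is mechanical: XFS_WANT_CORRUPTED_RETURN and XFS_WANT_CORRUPTED_GOTO grow a struct xfs_mount argument, so the failure path knows which filesystem tripped the check instead of merely jumping to an error label. A hedged sketch of the resulting macro shape; the real definition lives in the XFS error headers and may differ in detail:

/*
 * Sketch only, not the actual fs/xfs definition. With @mp threaded
 * in, the corruption report can identify the mount before bailing
 * out with -EFSCORRUPTED through the caller's error label.
 */
#define XFS_WANT_CORRUPTED_GOTO(mp, expr, label)                   \
        do {                                                        \
                if (!(expr)) {                                      \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
                                         XFS_ERRLEVEL_LOW, (mp));   \
                        error = -EFSCORRUPTED;                      \
                        goto label;                                 \
                }                                                   \
        } while (0)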
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15105dbc9e28..04e79d57bca6 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
86 int move_count); 86 int move_count);
87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
88 88
89/*
90 * attr3 block 'firstused' conversion helpers.
91 *
92 * firstused refers to the offset of the first used byte of the nameval region
93 * of an attr leaf block. The region starts at the tail of the block and expands
94 * backwards towards the middle. As such, firstused is initialized to the block
95 * size for an empty leaf block and is reduced from there.
96 *
97 * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
98 * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
99 * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
100 * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
101 * the attr block size. The following helpers manage the conversion between the
102 * in-core and on-disk formats.
103 */
104
105static void
106xfs_attr3_leaf_firstused_from_disk(
107 struct xfs_da_geometry *geo,
108 struct xfs_attr3_icleaf_hdr *to,
109 struct xfs_attr_leafblock *from)
110{
111 struct xfs_attr3_leaf_hdr *hdr3;
112
113 if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
114 hdr3 = (struct xfs_attr3_leaf_hdr *) from;
115 to->firstused = be16_to_cpu(hdr3->firstused);
116 } else {
117 to->firstused = be16_to_cpu(from->hdr.firstused);
118 }
119
120 /*
121 * Convert from the magic fsb size value to actual blocksize. This
 122 * should only occur for empty blocks when the block size overflows
 123 * 16 bits.
124 */
125 if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
126 ASSERT(!to->count && !to->usedbytes);
127 ASSERT(geo->blksize > USHRT_MAX);
128 to->firstused = geo->blksize;
129 }
130}
131
132static void
133xfs_attr3_leaf_firstused_to_disk(
134 struct xfs_da_geometry *geo,
135 struct xfs_attr_leafblock *to,
136 struct xfs_attr3_icleaf_hdr *from)
137{
138 struct xfs_attr3_leaf_hdr *hdr3;
139 uint32_t firstused;
140
141 /* magic value should only be seen on disk */
142 ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
143
144 /*
145 * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
146 * value. This only overflows at the max supported value of 64k. Use the
147 * magic on-disk value to represent block size in this case.
148 */
149 firstused = from->firstused;
150 if (firstused > USHRT_MAX) {
151 ASSERT(from->firstused == geo->blksize);
152 firstused = XFS_ATTR3_LEAF_NULLOFF;
153 }
154
155 if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
156 hdr3 = (struct xfs_attr3_leaf_hdr *) to;
157 hdr3->firstused = cpu_to_be16(firstused);
158 } else {
159 to->hdr.firstused = cpu_to_be16(firstused);
160 }
161}
162
89void 163void
90xfs_attr3_leaf_hdr_from_disk( 164xfs_attr3_leaf_hdr_from_disk(
165 struct xfs_da_geometry *geo,
91 struct xfs_attr3_icleaf_hdr *to, 166 struct xfs_attr3_icleaf_hdr *to,
92 struct xfs_attr_leafblock *from) 167 struct xfs_attr_leafblock *from)
93{ 168{
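The helper pair added in the hunk above is easiest to see as a round trip: only an empty leaf on a 64k-block filesystem can have firstused == 65536, which does not fit in the 16-bit on-disk field, so that single value is encoded as the XFS_ATTR3_LEAF_NULLOFF magic (zero, per the comment) and decoded back to the block size on read. A userspace sketch of the encoding, with the structures reduced to the one field that matters and the magic value assumed to be zero:

/*
 * Round-trip sketch of the firstused encoding described above.
 * Geometry fixed at the one overflowing case (64k blocks); the
 * magic on-disk value is assumed to be zero.
 */
#include <assert.h>
#include <limits.h>
#include <stdint.h>

#define BLKSIZE              65536u   /* 64k fsb: the only overflow case */
#define ATTR3_LEAF_NULLOFF   0        /* assumed magic on-disk value */

static uint16_t firstused_to_disk(uint32_t firstused)
{
        if (firstused > USHRT_MAX) {
                assert(firstused == BLKSIZE); /* only an empty 64k block */
                return ATTR3_LEAF_NULLOFF;
        }
        return (uint16_t)firstused;
}

static uint32_t firstused_from_disk(uint16_t ondisk)
{
        if (ondisk == ATTR3_LEAF_NULLOFF)
                return BLKSIZE;       /* decode magic back to blocksize */
        return ondisk;
}

int main(void)
{
        assert(firstused_from_disk(firstused_to_disk(BLKSIZE)) == BLKSIZE);
        assert(firstused_from_disk(firstused_to_disk(1024)) == 1024);
        return 0;
}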
@@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk(
104 to->magic = be16_to_cpu(hdr3->info.hdr.magic); 179 to->magic = be16_to_cpu(hdr3->info.hdr.magic);
105 to->count = be16_to_cpu(hdr3->count); 180 to->count = be16_to_cpu(hdr3->count);
106 to->usedbytes = be16_to_cpu(hdr3->usedbytes); 181 to->usedbytes = be16_to_cpu(hdr3->usedbytes);
107 to->firstused = be16_to_cpu(hdr3->firstused); 182 xfs_attr3_leaf_firstused_from_disk(geo, to, from);
108 to->holes = hdr3->holes; 183 to->holes = hdr3->holes;
109 184
110 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { 185 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk(
118 to->magic = be16_to_cpu(from->hdr.info.magic); 193 to->magic = be16_to_cpu(from->hdr.info.magic);
119 to->count = be16_to_cpu(from->hdr.count); 194 to->count = be16_to_cpu(from->hdr.count);
120 to->usedbytes = be16_to_cpu(from->hdr.usedbytes); 195 to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
121 to->firstused = be16_to_cpu(from->hdr.firstused); 196 xfs_attr3_leaf_firstused_from_disk(geo, to, from);
122 to->holes = from->hdr.holes; 197 to->holes = from->hdr.holes;
123 198
124 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { 199 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk(
129 204
130void 205void
131xfs_attr3_leaf_hdr_to_disk( 206xfs_attr3_leaf_hdr_to_disk(
207 struct xfs_da_geometry *geo,
132 struct xfs_attr_leafblock *to, 208 struct xfs_attr_leafblock *to,
133 struct xfs_attr3_icleaf_hdr *from) 209 struct xfs_attr3_icleaf_hdr *from)
134{ 210{
135 int i; 211 int i;
136 212
137 ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || 213 ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
138 from->magic == XFS_ATTR3_LEAF_MAGIC); 214 from->magic == XFS_ATTR3_LEAF_MAGIC);
@@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk(
145 hdr3->info.hdr.magic = cpu_to_be16(from->magic); 221 hdr3->info.hdr.magic = cpu_to_be16(from->magic);
146 hdr3->count = cpu_to_be16(from->count); 222 hdr3->count = cpu_to_be16(from->count);
147 hdr3->usedbytes = cpu_to_be16(from->usedbytes); 223 hdr3->usedbytes = cpu_to_be16(from->usedbytes);
148 hdr3->firstused = cpu_to_be16(from->firstused); 224 xfs_attr3_leaf_firstused_to_disk(geo, to, from);
149 hdr3->holes = from->holes; 225 hdr3->holes = from->holes;
150 hdr3->pad1 = 0; 226 hdr3->pad1 = 0;
151 227
@@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk(
160 to->hdr.info.magic = cpu_to_be16(from->magic); 236 to->hdr.info.magic = cpu_to_be16(from->magic);
161 to->hdr.count = cpu_to_be16(from->count); 237 to->hdr.count = cpu_to_be16(from->count);
162 to->hdr.usedbytes = cpu_to_be16(from->usedbytes); 238 to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
163 to->hdr.firstused = cpu_to_be16(from->firstused); 239 xfs_attr3_leaf_firstused_to_disk(geo, to, from);
164 to->hdr.holes = from->holes; 240 to->hdr.holes = from->holes;
165 to->hdr.pad1 = 0; 241 to->hdr.pad1 = 0;
166 242
@@ -178,7 +254,7 @@ xfs_attr3_leaf_verify(
178 struct xfs_attr_leafblock *leaf = bp->b_addr; 254 struct xfs_attr_leafblock *leaf = bp->b_addr;
179 struct xfs_attr3_icleaf_hdr ichdr; 255 struct xfs_attr3_icleaf_hdr ichdr;
180 256
181 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 257 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
182 258
183 if (xfs_sb_version_hascrc(&mp->m_sb)) { 259 if (xfs_sb_version_hascrc(&mp->m_sb)) {
184 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 260 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -757,9 +833,10 @@ xfs_attr_shortform_allfit(
757 struct xfs_attr3_icleaf_hdr leafhdr; 833 struct xfs_attr3_icleaf_hdr leafhdr;
758 int bytes; 834 int bytes;
759 int i; 835 int i;
836 struct xfs_mount *mp = bp->b_target->bt_mount;
760 837
761 leaf = bp->b_addr; 838 leaf = bp->b_addr;
762 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 839 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
763 entry = xfs_attr3_leaf_entryp(leaf); 840 entry = xfs_attr3_leaf_entryp(leaf);
764 841
765 bytes = sizeof(struct xfs_attr_sf_hdr); 842 bytes = sizeof(struct xfs_attr_sf_hdr);
@@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform(
812 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 889 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
813 890
814 leaf = (xfs_attr_leafblock_t *)tmpbuffer; 891 leaf = (xfs_attr_leafblock_t *)tmpbuffer;
815 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 892 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
816 entry = xfs_attr3_leaf_entryp(leaf); 893 entry = xfs_attr3_leaf_entryp(leaf);
817 894
818 /* XXX (dgc): buffer is about to be marked stale - why zero it? */ 895 /* XXX (dgc): buffer is about to be marked stale - why zero it? */
@@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node(
923 btree = dp->d_ops->node_tree_p(node); 1000 btree = dp->d_ops->node_tree_p(node);
924 1001
925 leaf = bp2->b_addr; 1002 leaf = bp2->b_addr;
926 xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); 1003 xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
927 entries = xfs_attr3_leaf_entryp(leaf); 1004 entries = xfs_attr3_leaf_entryp(leaf);
928 1005
929 /* both on-disk, don't endian-flip twice */ 1006 /* both on-disk, don't endian-flip twice */
@@ -988,7 +1065,7 @@ xfs_attr3_leaf_create(
988 } 1065 }
989 ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; 1066 ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
990 1067
991 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 1068 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
992 xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); 1069 xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
993 1070
994 *bpp = bp; 1071 *bpp = bp;
@@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add(
1073 trace_xfs_attr_leaf_add(args); 1150 trace_xfs_attr_leaf_add(args);
1074 1151
1075 leaf = bp->b_addr; 1152 leaf = bp->b_addr;
1076 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1153 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
1077 ASSERT(args->index >= 0 && args->index <= ichdr.count); 1154 ASSERT(args->index >= 0 && args->index <= ichdr.count);
1078 entsize = xfs_attr_leaf_newentsize(args, NULL); 1155 entsize = xfs_attr_leaf_newentsize(args, NULL);
1079 1156
@@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add(
1126 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); 1203 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
1127 1204
1128out_log_hdr: 1205out_log_hdr:
1129 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 1206 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
1130 xfs_trans_log_buf(args->trans, bp, 1207 xfs_trans_log_buf(args->trans, bp,
1131 XFS_DA_LOGRANGE(leaf, &leaf->hdr, 1208 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1132 xfs_attr3_leaf_hdr_size(leaf))); 1209 xfs_attr3_leaf_hdr_size(leaf)));
@@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact(
1294 ichdr_dst->freemap[0].base; 1371 ichdr_dst->freemap[0].base;
1295 1372
1296 /* write the header back to initialise the underlying buffer */ 1373 /* write the header back to initialise the underlying buffer */
1297 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); 1374 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
1298 1375
1299 /* 1376 /*
 1300 * Copy all entries in the same (sorted) order, 1377 * Copy all entries in the same (sorted) order,
@@ -1344,9 +1421,10 @@ xfs_attr_leaf_order(
1344{ 1421{
1345 struct xfs_attr3_icleaf_hdr ichdr1; 1422 struct xfs_attr3_icleaf_hdr ichdr1;
1346 struct xfs_attr3_icleaf_hdr ichdr2; 1423 struct xfs_attr3_icleaf_hdr ichdr2;
1424 struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
1347 1425
1348 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); 1426 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
1349 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); 1427 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
1350 return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); 1428 return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
1351} 1429}
1352 1430
@@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance(
1388 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); 1466 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
1389 leaf1 = blk1->bp->b_addr; 1467 leaf1 = blk1->bp->b_addr;
1390 leaf2 = blk2->bp->b_addr; 1468 leaf2 = blk2->bp->b_addr;
1391 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); 1469 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
1392 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); 1470 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
1393 ASSERT(ichdr2.count == 0); 1471 ASSERT(ichdr2.count == 0);
1394 args = state->args; 1472 args = state->args;
1395 1473
@@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance(
1490 ichdr1.count, count); 1568 ichdr1.count, count);
1491 } 1569 }
1492 1570
1493 xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); 1571 xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
1494 xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); 1572 xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
1495 xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); 1573 xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
1496 xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); 1574 xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
1497 1575
@@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall(
1684 */ 1762 */
1685 blk = &state->path.blk[ state->path.active-1 ]; 1763 blk = &state->path.blk[ state->path.active-1 ];
1686 leaf = blk->bp->b_addr; 1764 leaf = blk->bp->b_addr;
1687 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1765 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
1688 bytes = xfs_attr3_leaf_hdr_size(leaf) + 1766 bytes = xfs_attr3_leaf_hdr_size(leaf) +
1689 ichdr.count * sizeof(xfs_attr_leaf_entry_t) + 1767 ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
1690 ichdr.usedbytes; 1768 ichdr.usedbytes;
@@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall(
1740 if (error) 1818 if (error)
1741 return error; 1819 return error;
1742 1820
1743 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); 1821 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
1744 1822
1745 bytes = state->args->geo->blksize - 1823 bytes = state->args->geo->blksize -
1746 (state->args->geo->blksize >> 2) - 1824 (state->args->geo->blksize >> 2) -
@@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove(
1805 trace_xfs_attr_leaf_remove(args); 1883 trace_xfs_attr_leaf_remove(args);
1806 1884
1807 leaf = bp->b_addr; 1885 leaf = bp->b_addr;
1808 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1886 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
1809 1887
1810 ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); 1888 ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
1811 ASSERT(args->index >= 0 && args->index < ichdr.count); 1889 ASSERT(args->index >= 0 && args->index < ichdr.count);
@@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove(
1918 tmp = be16_to_cpu(entry->nameidx); 1996 tmp = be16_to_cpu(entry->nameidx);
1919 } 1997 }
1920 ichdr.firstused = tmp; 1998 ichdr.firstused = tmp;
1921 if (!ichdr.firstused) 1999 ASSERT(ichdr.firstused != 0);
1922 ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
1923 } else { 2000 } else {
1924 ichdr.holes = 1; /* mark as needing compaction */ 2001 ichdr.holes = 1; /* mark as needing compaction */
1925 } 2002 }
1926 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 2003 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
1927 xfs_trans_log_buf(args->trans, bp, 2004 xfs_trans_log_buf(args->trans, bp,
1928 XFS_DA_LOGRANGE(leaf, &leaf->hdr, 2005 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1929 xfs_attr3_leaf_hdr_size(leaf))); 2006 xfs_attr3_leaf_hdr_size(leaf)));
@@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance(
1957 2034
1958 drop_leaf = drop_blk->bp->b_addr; 2035 drop_leaf = drop_blk->bp->b_addr;
1959 save_leaf = save_blk->bp->b_addr; 2036 save_leaf = save_blk->bp->b_addr;
1960 xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); 2037 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
1961 xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); 2038 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
1962 entry = xfs_attr3_leaf_entryp(drop_leaf); 2039 entry = xfs_attr3_leaf_entryp(drop_leaf);
1963 2040
1964 /* 2041 /*
@@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance(
2012 tmphdr.firstused = state->args->geo->blksize; 2089 tmphdr.firstused = state->args->geo->blksize;
2013 2090
2014 /* write the header to the temp buffer to initialise it */ 2091 /* write the header to the temp buffer to initialise it */
2015 xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); 2092 xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
2016 2093
2017 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, 2094 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
2018 drop_blk->bp, &drophdr)) { 2095 drop_blk->bp, &drophdr)) {
@@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance(
2039 kmem_free(tmp_leaf); 2116 kmem_free(tmp_leaf);
2040 } 2117 }
2041 2118
2042 xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); 2119 xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
2043 xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, 2120 xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
2044 state->args->geo->blksize - 1); 2121 state->args->geo->blksize - 1);
2045 2122
@@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int(
2085 trace_xfs_attr_leaf_lookup(args); 2162 trace_xfs_attr_leaf_lookup(args);
2086 2163
2087 leaf = bp->b_addr; 2164 leaf = bp->b_addr;
2088 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2165 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2089 entries = xfs_attr3_leaf_entryp(leaf); 2166 entries = xfs_attr3_leaf_entryp(leaf);
2090 ASSERT(ichdr.count < args->geo->blksize / 8); 2167 ASSERT(ichdr.count < args->geo->blksize / 8);
2091 2168
@@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue(
2190 int valuelen; 2267 int valuelen;
2191 2268
2192 leaf = bp->b_addr; 2269 leaf = bp->b_addr;
2193 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2270 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2194 ASSERT(ichdr.count < args->geo->blksize / 8); 2271 ASSERT(ichdr.count < args->geo->blksize / 8);
2195 ASSERT(args->index < ichdr.count); 2272 ASSERT(args->index < ichdr.count);
2196 2273
@@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash(
2391{ 2468{
2392 struct xfs_attr3_icleaf_hdr ichdr; 2469 struct xfs_attr3_icleaf_hdr ichdr;
2393 struct xfs_attr_leaf_entry *entries; 2470 struct xfs_attr_leaf_entry *entries;
2471 struct xfs_mount *mp = bp->b_target->bt_mount;
2394 2472
2395 xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); 2473 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
2396 entries = xfs_attr3_leaf_entryp(bp->b_addr); 2474 entries = xfs_attr3_leaf_entryp(bp->b_addr);
2397 if (count) 2475 if (count)
2398 *count = ichdr.count; 2476 *count = ichdr.count;
@@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag(
2486 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); 2564 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
2487 2565
2488#ifdef DEBUG 2566#ifdef DEBUG
2489 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2567 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2490 ASSERT(args->index < ichdr.count); 2568 ASSERT(args->index < ichdr.count);
2491 ASSERT(args->index >= 0); 2569 ASSERT(args->index >= 0);
2492 2570
@@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag(
2550 2628
2551 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2552#ifdef DEBUG 2630#ifdef DEBUG
2553 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2631 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2554 ASSERT(args->index < ichdr.count); 2632 ASSERT(args->index < ichdr.count);
2555 ASSERT(args->index >= 0); 2633 ASSERT(args->index >= 0);
2556#endif 2634#endif
@@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags(
2629 entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; 2707 entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
2630 2708
2631#ifdef DEBUG 2709#ifdef DEBUG
2632 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); 2710 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
2633 ASSERT(args->index < ichdr1.count); 2711 ASSERT(args->index < ichdr1.count);
2634 ASSERT(args->index >= 0); 2712 ASSERT(args->index >= 0);
2635 2713
2636 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); 2714 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
2637 ASSERT(args->index2 < ichdr2.count); 2715 ASSERT(args->index2 < ichdr2.count);
2638 ASSERT(args->index2 >= 0); 2716 ASSERT(args->index2 >= 0);
2639 2717
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..025c4b820c03 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
100int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, 100int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
101 xfs_dablk_t bno, xfs_daddr_t mappedbno, 101 xfs_dablk_t bno, xfs_daddr_t mappedbno,
102 struct xfs_buf **bpp); 102 struct xfs_buf **bpp);
103void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, 103void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
104 struct xfs_attr3_icleaf_hdr *to,
104 struct xfs_attr_leafblock *from); 105 struct xfs_attr_leafblock *from);
105void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, 106void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
107 struct xfs_attr_leafblock *to,
106 struct xfs_attr3_icleaf_hdr *from); 108 struct xfs_attr3_icleaf_hdr *from);
107 109
108#endif /* __XFS_ATTR_LEAF_H__ */ 110#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 61ec015dca16..aeffeaaac0ec 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset(
244 } 244 }
245} 245}
246 246
247/*
248 * Debug/sanity checking code
249 */
250
251STATIC int
252xfs_bmap_sanity_check(
253 struct xfs_mount *mp,
254 struct xfs_buf *bp,
255 int level)
256{
257 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
258
259 if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
260 block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
261 return 0;
262
263 if (be16_to_cpu(block->bb_level) != level ||
264 be16_to_cpu(block->bb_numrecs) == 0 ||
265 be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
266 return 0;
267
268 return 1;
269}
270
271#ifdef DEBUG 247#ifdef DEBUG
272STATIC struct xfs_buf * 248STATIC struct xfs_buf *
273xfs_bmap_get_bp( 249xfs_bmap_get_bp(
@@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents(
410 goto error_norelse; 386 goto error_norelse;
411 } 387 }
412 block = XFS_BUF_TO_BLOCK(bp); 388 block = XFS_BUF_TO_BLOCK(bp);
413 XFS_WANT_CORRUPTED_GOTO(
414 xfs_bmap_sanity_check(mp, bp, level),
415 error0);
416 if (level == 0) 389 if (level == 0)
417 break; 390 break;
418 391
@@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents(
424 xfs_check_block(block, mp, 0, 0); 397 xfs_check_block(block, mp, 0, 0);
425 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); 398 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
426 bno = be64_to_cpu(*pp); 399 bno = be64_to_cpu(*pp);
427 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 400 XFS_WANT_CORRUPTED_GOTO(mp,
401 XFS_FSB_SANITY_CHECK(mp, bno), error0);
428 if (bp_release) { 402 if (bp_release) {
429 bp_release = 0; 403 bp_release = 0;
430 xfs_trans_brelse(NULL, bp); 404 xfs_trans_brelse(NULL, bp);
@@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree(
1029 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) 1003 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
1030 goto error0; 1004 goto error0;
1031 /* must be at least one entry */ 1005 /* must be at least one entry */
1032 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); 1006 XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
1033 if ((error = xfs_btree_new_iroot(cur, flags, &stat))) 1007 if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
1034 goto error0; 1008 goto error0;
1035 if (stat == 0) { 1009 if (stat == 0) {
@@ -1311,14 +1285,12 @@ xfs_bmap_read_extents(
1311 if (error) 1285 if (error)
1312 return error; 1286 return error;
1313 block = XFS_BUF_TO_BLOCK(bp); 1287 block = XFS_BUF_TO_BLOCK(bp);
1314 XFS_WANT_CORRUPTED_GOTO(
1315 xfs_bmap_sanity_check(mp, bp, level),
1316 error0);
1317 if (level == 0) 1288 if (level == 0)
1318 break; 1289 break;
1319 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); 1290 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
1320 bno = be64_to_cpu(*pp); 1291 bno = be64_to_cpu(*pp);
1321 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 1292 XFS_WANT_CORRUPTED_GOTO(mp,
1293 XFS_FSB_SANITY_CHECK(mp, bno), error0);
1322 xfs_trans_brelse(tp, bp); 1294 xfs_trans_brelse(tp, bp);
1323 } 1295 }
1324 /* 1296 /*
@@ -1345,9 +1317,6 @@ xfs_bmap_read_extents(
1345 XFS_ERRLEVEL_LOW, ip->i_mount, block); 1317 XFS_ERRLEVEL_LOW, ip->i_mount, block);
1346 goto error0; 1318 goto error0;
1347 } 1319 }
1348 XFS_WANT_CORRUPTED_GOTO(
1349 xfs_bmap_sanity_check(mp, bp, 0),
1350 error0);
1351 /* 1320 /*
1352 * Read-ahead the next leaf block, if any. 1321 * Read-ahead the next leaf block, if any.
1353 */ 1322 */
@@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real(
1755 xfs_filblks_t temp=0; /* value for da_new calculations */ 1724 xfs_filblks_t temp=0; /* value for da_new calculations */
1756 xfs_filblks_t temp2=0;/* value for da_new calculations */ 1725 xfs_filblks_t temp2=0;/* value for da_new calculations */
1757 int tmp_rval; /* partial logging flags */ 1726 int tmp_rval; /* partial logging flags */
1727 struct xfs_mount *mp;
1758 1728
1729 mp = bma->tp ? bma->tp->t_mountp : NULL;
1759 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); 1730 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
1760 1731
1761 ASSERT(bma->idx >= 0); 1732 ASSERT(bma->idx >= 0);
@@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real(
1866 RIGHT.br_blockcount, &i); 1837 RIGHT.br_blockcount, &i);
1867 if (error) 1838 if (error)
1868 goto done; 1839 goto done;
1869 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1840 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1870 error = xfs_btree_delete(bma->cur, &i); 1841 error = xfs_btree_delete(bma->cur, &i);
1871 if (error) 1842 if (error)
1872 goto done; 1843 goto done;
1873 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1844 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1874 error = xfs_btree_decrement(bma->cur, 0, &i); 1845 error = xfs_btree_decrement(bma->cur, 0, &i);
1875 if (error) 1846 if (error)
1876 goto done; 1847 goto done;
1877 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1848 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1878 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1849 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1879 LEFT.br_startblock, 1850 LEFT.br_startblock,
1880 LEFT.br_blockcount + 1851 LEFT.br_blockcount +
@@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real(
1907 &i); 1878 &i);
1908 if (error) 1879 if (error)
1909 goto done; 1880 goto done;
1910 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1881 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1911 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1882 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1912 LEFT.br_startblock, 1883 LEFT.br_startblock,
1913 LEFT.br_blockcount + 1884 LEFT.br_blockcount +
@@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real(
1938 RIGHT.br_blockcount, &i); 1909 RIGHT.br_blockcount, &i);
1939 if (error) 1910 if (error)
1940 goto done; 1911 goto done;
1941 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1912 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1942 error = xfs_bmbt_update(bma->cur, PREV.br_startoff, 1913 error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
1943 new->br_startblock, 1914 new->br_startblock,
1944 PREV.br_blockcount + 1915 PREV.br_blockcount +
@@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real(
1968 &i); 1939 &i);
1969 if (error) 1940 if (error)
1970 goto done; 1941 goto done;
1971 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1942 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
1972 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 1943 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
1973 error = xfs_btree_insert(bma->cur, &i); 1944 error = xfs_btree_insert(bma->cur, &i);
1974 if (error) 1945 if (error)
1975 goto done; 1946 goto done;
1976 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1947 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1977 } 1948 }
1978 break; 1949 break;
1979 1950
@@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real(
2001 &i); 1972 &i);
2002 if (error) 1973 if (error)
2003 goto done; 1974 goto done;
2004 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1975 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2005 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1976 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
2006 LEFT.br_startblock, 1977 LEFT.br_startblock,
2007 LEFT.br_blockcount + 1978 LEFT.br_blockcount +
@@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real(
2038 &i); 2009 &i);
2039 if (error) 2010 if (error)
2040 goto done; 2011 goto done;
2041 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2012 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2042 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2013 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2043 error = xfs_btree_insert(bma->cur, &i); 2014 error = xfs_btree_insert(bma->cur, &i);
2044 if (error) 2015 if (error)
2045 goto done; 2016 goto done;
2046 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2017 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2047 } 2018 }
2048 2019
2049 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2020 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real(
2084 RIGHT.br_blockcount, &i); 2055 RIGHT.br_blockcount, &i);
2085 if (error) 2056 if (error)
2086 goto done; 2057 goto done;
2087 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2058 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2088 error = xfs_bmbt_update(bma->cur, new->br_startoff, 2059 error = xfs_bmbt_update(bma->cur, new->br_startoff,
2089 new->br_startblock, 2060 new->br_startblock,
2090 new->br_blockcount + 2061 new->br_blockcount +
@@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real(
2122 &i); 2093 &i);
2123 if (error) 2094 if (error)
2124 goto done; 2095 goto done;
2125 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2096 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2126 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2097 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2127 error = xfs_btree_insert(bma->cur, &i); 2098 error = xfs_btree_insert(bma->cur, &i);
2128 if (error) 2099 if (error)
2129 goto done; 2100 goto done;
2130 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2101 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2131 } 2102 }
2132 2103
2133 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2104 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real(
2191 &i); 2162 &i);
2192 if (error) 2163 if (error)
2193 goto done; 2164 goto done;
2194 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2165 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2195 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2166 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2196 error = xfs_btree_insert(bma->cur, &i); 2167 error = xfs_btree_insert(bma->cur, &i);
2197 if (error) 2168 if (error)
2198 goto done; 2169 goto done;
2199 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2170 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2200 } 2171 }
2201 2172
2202 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2173 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real(
2212 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 2183 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
2213 (bma->cur ? bma->cur->bc_private.b.allocated : 0)); 2184 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
2214 if (diff > 0) { 2185 if (diff > 0) {
2215 error = xfs_icsb_modify_counters(bma->ip->i_mount, 2186 error = xfs_mod_fdblocks(bma->ip->i_mount,
2216 XFS_SBS_FDBLOCKS, 2187 -((int64_t)diff), false);
2217 -((int64_t)diff), 0);
2218 ASSERT(!error); 2188 ASSERT(!error);
2219 if (error) 2189 if (error)
2220 goto done; 2190 goto done;
@@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real(
2265 temp += bma->cur->bc_private.b.allocated; 2235 temp += bma->cur->bc_private.b.allocated;
2266 ASSERT(temp <= da_old); 2236 ASSERT(temp <= da_old);
2267 if (temp < da_old) 2237 if (temp < da_old)
2268 xfs_icsb_modify_counters(bma->ip->i_mount, 2238 xfs_mod_fdblocks(bma->ip->i_mount,
2269 XFS_SBS_FDBLOCKS, 2239 (int64_t)(da_old - temp), false);
2270 (int64_t)(da_old - temp), 0);
2271 } 2240 }
2272 2241
2273 /* clear out the allocated field, done with it now in any case. */ 2242 /* clear out the allocated field, done with it now in any case. */
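The two counter hunks above are an interface swap, not a logic change: the generic per-counter call becomes the dedicated free-block helper, with the trailing boolean standing in for the old reserve-pool argument. Side by side, as the hunks show:

/* old: generic per-counter interface, counter selected by enum */
error = xfs_icsb_modify_counters(bma->ip->i_mount,
                                 XFS_SBS_FDBLOCKS,
                                 -((int64_t)diff), 0);

/*
 * new: dedicated free-block helper; 'false' means don't dip into
 * the reserve pool, matching the old trailing 0
 */
error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false);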
@@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real(
2309 /* left is 0, right is 1, prev is 2 */ 2278 /* left is 0, right is 1, prev is 2 */
2310 int rval=0; /* return value (logging flags) */ 2279 int rval=0; /* return value (logging flags) */
2311 int state = 0;/* state bits, accessed thru macros */ 2280 int state = 0;/* state bits, accessed thru macros */
2281 struct xfs_mount *mp = tp->t_mountp;
2312 2282
2313 *logflagsp = 0; 2283 *logflagsp = 0;
2314 2284
@@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real(
2421 RIGHT.br_startblock, 2391 RIGHT.br_startblock,
2422 RIGHT.br_blockcount, &i))) 2392 RIGHT.br_blockcount, &i)))
2423 goto done; 2393 goto done;
2424 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2394 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2425 if ((error = xfs_btree_delete(cur, &i))) 2395 if ((error = xfs_btree_delete(cur, &i)))
2426 goto done; 2396 goto done;
2427 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2397 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2428 if ((error = xfs_btree_decrement(cur, 0, &i))) 2398 if ((error = xfs_btree_decrement(cur, 0, &i)))
2429 goto done; 2399 goto done;
2430 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2400 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2431 if ((error = xfs_btree_delete(cur, &i))) 2401 if ((error = xfs_btree_delete(cur, &i)))
2432 goto done; 2402 goto done;
2433 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2403 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2434 if ((error = xfs_btree_decrement(cur, 0, &i))) 2404 if ((error = xfs_btree_decrement(cur, 0, &i)))
2435 goto done; 2405 goto done;
2436 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2406 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2437 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2407 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2438 LEFT.br_startblock, 2408 LEFT.br_startblock,
2439 LEFT.br_blockcount + PREV.br_blockcount + 2409 LEFT.br_blockcount + PREV.br_blockcount +
@@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real(
2464 PREV.br_startblock, PREV.br_blockcount, 2434 PREV.br_startblock, PREV.br_blockcount,
2465 &i))) 2435 &i)))
2466 goto done; 2436 goto done;
2467 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2437 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2468 if ((error = xfs_btree_delete(cur, &i))) 2438 if ((error = xfs_btree_delete(cur, &i)))
2469 goto done; 2439 goto done;
2470 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2440 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2471 if ((error = xfs_btree_decrement(cur, 0, &i))) 2441 if ((error = xfs_btree_decrement(cur, 0, &i)))
2472 goto done; 2442 goto done;
2473 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2443 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2474 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2444 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2475 LEFT.br_startblock, 2445 LEFT.br_startblock,
2476 LEFT.br_blockcount + PREV.br_blockcount, 2446 LEFT.br_blockcount + PREV.br_blockcount,
@@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real(
2499 RIGHT.br_startblock, 2469 RIGHT.br_startblock,
2500 RIGHT.br_blockcount, &i))) 2470 RIGHT.br_blockcount, &i)))
2501 goto done; 2471 goto done;
2502 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2472 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2503 if ((error = xfs_btree_delete(cur, &i))) 2473 if ((error = xfs_btree_delete(cur, &i)))
2504 goto done; 2474 goto done;
2505 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2475 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2506 if ((error = xfs_btree_decrement(cur, 0, &i))) 2476 if ((error = xfs_btree_decrement(cur, 0, &i)))
2507 goto done; 2477 goto done;
2508 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2478 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2509 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2479 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2510 new->br_startblock, 2480 new->br_startblock,
2511 new->br_blockcount + RIGHT.br_blockcount, 2481 new->br_blockcount + RIGHT.br_blockcount,
@@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real(
2532 new->br_startblock, new->br_blockcount, 2502 new->br_startblock, new->br_blockcount,
2533 &i))) 2503 &i)))
2534 goto done; 2504 goto done;
2535 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2505 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2536 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2506 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2537 new->br_startblock, new->br_blockcount, 2507 new->br_startblock, new->br_blockcount,
2538 newext))) 2508 newext)))
@@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real(
2569 PREV.br_startblock, PREV.br_blockcount, 2539 PREV.br_startblock, PREV.br_blockcount,
2570 &i))) 2540 &i)))
2571 goto done; 2541 goto done;
2572 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2542 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2573 if ((error = xfs_bmbt_update(cur, 2543 if ((error = xfs_bmbt_update(cur,
2574 PREV.br_startoff + new->br_blockcount, 2544 PREV.br_startoff + new->br_blockcount,
2575 PREV.br_startblock + new->br_blockcount, 2545 PREV.br_startblock + new->br_blockcount,
@@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real(
2611 PREV.br_startblock, PREV.br_blockcount, 2581 PREV.br_startblock, PREV.br_blockcount,
2612 &i))) 2582 &i)))
2613 goto done; 2583 goto done;
2614 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2584 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2615 if ((error = xfs_bmbt_update(cur, 2585 if ((error = xfs_bmbt_update(cur,
2616 PREV.br_startoff + new->br_blockcount, 2586 PREV.br_startoff + new->br_blockcount,
2617 PREV.br_startblock + new->br_blockcount, 2587 PREV.br_startblock + new->br_blockcount,
@@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real(
2621 cur->bc_rec.b = *new; 2591 cur->bc_rec.b = *new;
2622 if ((error = xfs_btree_insert(cur, &i))) 2592 if ((error = xfs_btree_insert(cur, &i)))
2623 goto done; 2593 goto done;
2624 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2594 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2625 } 2595 }
2626 break; 2596 break;
2627 2597
@@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real(
2651 PREV.br_startblock, 2621 PREV.br_startblock,
2652 PREV.br_blockcount, &i))) 2622 PREV.br_blockcount, &i)))
2653 goto done; 2623 goto done;
2654 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2624 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2655 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2625 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2656 PREV.br_startblock, 2626 PREV.br_startblock,
2657 PREV.br_blockcount - new->br_blockcount, 2627 PREV.br_blockcount - new->br_blockcount,
@@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real(
2689 PREV.br_startblock, PREV.br_blockcount, 2659 PREV.br_startblock, PREV.br_blockcount,
2690 &i))) 2660 &i)))
2691 goto done; 2661 goto done;
2692 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2662 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2693 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2663 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2694 PREV.br_startblock, 2664 PREV.br_startblock,
2695 PREV.br_blockcount - new->br_blockcount, 2665 PREV.br_blockcount - new->br_blockcount,
@@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real(
2699 new->br_startblock, new->br_blockcount, 2669 new->br_startblock, new->br_blockcount,
2700 &i))) 2670 &i)))
2701 goto done; 2671 goto done;
2702 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2672 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2703 cur->bc_rec.b.br_state = XFS_EXT_NORM; 2673 cur->bc_rec.b.br_state = XFS_EXT_NORM;
2704 if ((error = xfs_btree_insert(cur, &i))) 2674 if ((error = xfs_btree_insert(cur, &i)))
2705 goto done; 2675 goto done;
2706 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2676 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2707 } 2677 }
2708 break; 2678 break;
2709 2679
@@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real(
2737 PREV.br_startblock, PREV.br_blockcount, 2707 PREV.br_startblock, PREV.br_blockcount,
2738 &i))) 2708 &i)))
2739 goto done; 2709 goto done;
2740 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2710 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2741 /* new right extent - oldext */ 2711 /* new right extent - oldext */
2742 if ((error = xfs_bmbt_update(cur, r[1].br_startoff, 2712 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
2743 r[1].br_startblock, r[1].br_blockcount, 2713 r[1].br_startblock, r[1].br_blockcount,
@@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real(
2749 new->br_startoff - PREV.br_startoff; 2719 new->br_startoff - PREV.br_startoff;
2750 if ((error = xfs_btree_insert(cur, &i))) 2720 if ((error = xfs_btree_insert(cur, &i)))
2751 goto done; 2721 goto done;
2752 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2722 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2753 /* 2723 /*
2754 * Reset the cursor to the position of the new extent 2724 * Reset the cursor to the position of the new extent
2755 * we are about to insert as we can't trust it after 2725 * we are about to insert as we can't trust it after
@@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real(
2759 new->br_startblock, new->br_blockcount, 2729 new->br_startblock, new->br_blockcount,
2760 &i))) 2730 &i)))
2761 goto done; 2731 goto done;
2762 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2732 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2763 /* new middle extent - newext */ 2733 /* new middle extent - newext */
2764 cur->bc_rec.b.br_state = new->br_state; 2734 cur->bc_rec.b.br_state = new->br_state;
2765 if ((error = xfs_btree_insert(cur, &i))) 2735 if ((error = xfs_btree_insert(cur, &i)))
2766 goto done; 2736 goto done;
2767 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2737 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2768 } 2738 }
2769 break; 2739 break;
2770 2740
@@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay(
2944 } 2914 }
2945 if (oldlen != newlen) { 2915 if (oldlen != newlen) {
2946 ASSERT(oldlen > newlen); 2916 ASSERT(oldlen > newlen);
2947 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 2917 xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
2948 (int64_t)(oldlen - newlen), 0); 2918 false);
2949 /* 2919 /*
2950 * Nothing to do for disk quota accounting here. 2920 * Nothing to do for disk quota accounting here.
2951 */ 2921 */
@@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real(
2968 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 2938 xfs_bmbt_irec_t right; /* right neighbor extent entry */
2969 int rval=0; /* return value (logging flags) */ 2939 int rval=0; /* return value (logging flags) */
2970 int state; /* state bits, accessed thru macros */ 2940 int state; /* state bits, accessed thru macros */
2941 struct xfs_mount *mp;
2971 2942
2943 mp = bma->tp ? bma->tp->t_mountp : NULL;
2972 ifp = XFS_IFORK_PTR(bma->ip, whichfork); 2944 ifp = XFS_IFORK_PTR(bma->ip, whichfork);
2973 2945
2974 ASSERT(bma->idx >= 0); 2946 ASSERT(bma->idx >= 0);
@@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real(
3056 &i); 3028 &i);
3057 if (error) 3029 if (error)
3058 goto done; 3030 goto done;
3059 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3031 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3060 error = xfs_btree_delete(bma->cur, &i); 3032 error = xfs_btree_delete(bma->cur, &i);
3061 if (error) 3033 if (error)
3062 goto done; 3034 goto done;
3063 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3035 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3064 error = xfs_btree_decrement(bma->cur, 0, &i); 3036 error = xfs_btree_decrement(bma->cur, 0, &i);
3065 if (error) 3037 if (error)
3066 goto done; 3038 goto done;
3067 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3039 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3068 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3040 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3069 left.br_startblock, 3041 left.br_startblock,
3070 left.br_blockcount + 3042 left.br_blockcount +
@@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real(
3097 &i); 3069 &i);
3098 if (error) 3070 if (error)
3099 goto done; 3071 goto done;
3100 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3072 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3101 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3073 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3102 left.br_startblock, 3074 left.br_startblock,
3103 left.br_blockcount + 3075 left.br_blockcount +
@@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real(
3131 right.br_blockcount, &i); 3103 right.br_blockcount, &i);
3132 if (error) 3104 if (error)
3133 goto done; 3105 goto done;
3134 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3106 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3135 error = xfs_bmbt_update(bma->cur, new->br_startoff, 3107 error = xfs_bmbt_update(bma->cur, new->br_startoff,
3136 new->br_startblock, 3108 new->br_startblock,
3137 new->br_blockcount + 3109 new->br_blockcount +
@@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real(
3161 new->br_blockcount, &i); 3133 new->br_blockcount, &i);
3162 if (error) 3134 if (error)
3163 goto done; 3135 goto done;
3164 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 3136 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
3165 bma->cur->bc_rec.b.br_state = new->br_state; 3137 bma->cur->bc_rec.b.br_state = new->br_state;
3166 error = xfs_btree_insert(bma->cur, &i); 3138 error = xfs_btree_insert(bma->cur, &i);
3167 if (error) 3139 if (error)
3168 goto done; 3140 goto done;
3169 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3141 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3170 } 3142 }
3171 break; 3143 break;
3172 } 3144 }
@@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc(
4160 ASSERT(indlen > 0); 4132 ASSERT(indlen > 0);
4161 4133
4162 if (rt) { 4134 if (rt) {
4163 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 4135 error = xfs_mod_frextents(mp, -((int64_t)extsz));
4164 -((int64_t)extsz), 0);
4165 } else { 4136 } else {
4166 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4137 error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
4167 -((int64_t)alen), 0);
4168 } 4138 }
4169 4139
4170 if (error) 4140 if (error)
4171 goto out_unreserve_quota; 4141 goto out_unreserve_quota;
4172 4142
4173 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4143 error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
4174 -((int64_t)indlen), 0);
4175 if (error) 4144 if (error)
4176 goto out_unreserve_blocks; 4145 goto out_unreserve_blocks;
4177 4146
@@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc(
4198 4167
4199out_unreserve_blocks: 4168out_unreserve_blocks:
4200 if (rt) 4169 if (rt)
4201 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); 4170 xfs_mod_frextents(mp, extsz);
4202 else 4171 else
4203 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); 4172 xfs_mod_fdblocks(mp, alen, false);
4204out_unreserve_quota: 4173out_unreserve_quota:
4205 if (XFS_IS_QUOTA_ON(mp)) 4174 if (XFS_IS_QUOTA_ON(mp))
4206 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? 4175 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
@@ -4801,7 +4770,7 @@ xfs_bmap_del_extent(
4801 got.br_startblock, got.br_blockcount, 4770 got.br_startblock, got.br_blockcount,
4802 &i))) 4771 &i)))
4803 goto done; 4772 goto done;
4804 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4773 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4805 } 4774 }
4806 da_old = da_new = 0; 4775 da_old = da_new = 0;
4807 } else { 4776 } else {
@@ -4835,7 +4804,7 @@ xfs_bmap_del_extent(
4835 } 4804 }
4836 if ((error = xfs_btree_delete(cur, &i))) 4805 if ((error = xfs_btree_delete(cur, &i)))
4837 goto done; 4806 goto done;
4838 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4807 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4839 break; 4808 break;
4840 4809
4841 case 2: 4810 case 2:
@@ -4935,7 +4904,8 @@ xfs_bmap_del_extent(
4935 got.br_startblock, 4904 got.br_startblock,
4936 temp, &i))) 4905 temp, &i)))
4937 goto done; 4906 goto done;
4938 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4907 XFS_WANT_CORRUPTED_GOTO(mp,
4908 i == 1, done);
4939 /* 4909 /*
4940 * Update the btree record back 4910 * Update the btree record back
4941 * to the original value. 4911 * to the original value.
@@ -4956,7 +4926,7 @@ xfs_bmap_del_extent(
4956 error = -ENOSPC; 4926 error = -ENOSPC;
4957 goto done; 4927 goto done;
4958 } 4928 }
4959 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4929 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4960 } else 4930 } else
4961 flags |= xfs_ilog_fext(whichfork); 4931 flags |= xfs_ilog_fext(whichfork);
4962 XFS_IFORK_NEXT_SET(ip, whichfork, 4932 XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5012,10 +4982,8 @@ xfs_bmap_del_extent(
5012 * Nothing to do for disk quota accounting here. 4982 * Nothing to do for disk quota accounting here.
5013 */ 4983 */
5014 ASSERT(da_old >= da_new); 4984 ASSERT(da_old >= da_new);
5015 if (da_old > da_new) { 4985 if (da_old > da_new)
5016 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4986 xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
5017 (int64_t)(da_old - da_new), 0);
5018 }
5019done: 4987done:
5020 *logflagsp = flags; 4988 *logflagsp = flags;
5021 return error; 4989 return error;
@@ -5284,14 +5252,13 @@ xfs_bunmapi(
5284 5252
5285 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5253 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5286 do_div(rtexts, mp->m_sb.sb_rextsize); 5254 do_div(rtexts, mp->m_sb.sb_rextsize);
5287 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5255 xfs_mod_frextents(mp, (int64_t)rtexts);
5288 (int64_t)rtexts, 0);
5289 (void)xfs_trans_reserve_quota_nblks(NULL, 5256 (void)xfs_trans_reserve_quota_nblks(NULL,
5290 ip, -((long)del.br_blockcount), 0, 5257 ip, -((long)del.br_blockcount), 0,
5291 XFS_QMOPT_RES_RTBLKS); 5258 XFS_QMOPT_RES_RTBLKS);
5292 } else { 5259 } else {
5293 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5260 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
5294 (int64_t)del.br_blockcount, 0); 5261 false);
5295 (void)xfs_trans_reserve_quota_nblks(NULL, 5262 (void)xfs_trans_reserve_quota_nblks(NULL,
5296 ip, -((long)del.br_blockcount), 0, 5263 ip, -((long)del.br_blockcount), 0,
5297 XFS_QMOPT_RES_REGBLKS); 5264 XFS_QMOPT_RES_REGBLKS);
@@ -5453,6 +5420,7 @@ xfs_bmse_merge(
5453 struct xfs_bmbt_irec left; 5420 struct xfs_bmbt_irec left;
5454 xfs_filblks_t blockcount; 5421 xfs_filblks_t blockcount;
5455 int error, i; 5422 int error, i;
5423 struct xfs_mount *mp = ip->i_mount;
5456 5424
5457 xfs_bmbt_get_all(gotp, &got); 5425 xfs_bmbt_get_all(gotp, &got);
5458 xfs_bmbt_get_all(leftp, &left); 5426 xfs_bmbt_get_all(leftp, &left);
@@ -5487,19 +5455,19 @@ xfs_bmse_merge(
5487 got.br_blockcount, &i); 5455 got.br_blockcount, &i);
5488 if (error) 5456 if (error)
5489 return error; 5457 return error;
5490 XFS_WANT_CORRUPTED_RETURN(i == 1); 5458 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5491 5459
5492 error = xfs_btree_delete(cur, &i); 5460 error = xfs_btree_delete(cur, &i);
5493 if (error) 5461 if (error)
5494 return error; 5462 return error;
5495 XFS_WANT_CORRUPTED_RETURN(i == 1); 5463 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5496 5464
5497 /* lookup and update size of the previous extent */ 5465 /* lookup and update size of the previous extent */
5498 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, 5466 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5499 left.br_blockcount, &i); 5467 left.br_blockcount, &i);
5500 if (error) 5468 if (error)
5501 return error; 5469 return error;
5502 XFS_WANT_CORRUPTED_RETURN(i == 1); 5470 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5503 5471
5504 left.br_blockcount = blockcount; 5472 left.br_blockcount = blockcount;
5505 5473
@@ -5518,50 +5486,92 @@ xfs_bmse_shift_one(
5518 int *current_ext, 5486 int *current_ext,
5519 struct xfs_bmbt_rec_host *gotp, 5487 struct xfs_bmbt_rec_host *gotp,
5520 struct xfs_btree_cur *cur, 5488 struct xfs_btree_cur *cur,
5521 int *logflags) 5489 int *logflags,
5490 enum shift_direction direction)
5522{ 5491{
5523 struct xfs_ifork *ifp; 5492 struct xfs_ifork *ifp;
5493 struct xfs_mount *mp;
5524 xfs_fileoff_t startoff; 5494 xfs_fileoff_t startoff;
5525 struct xfs_bmbt_rec_host *leftp; 5495 struct xfs_bmbt_rec_host *adj_irecp;
5526 struct xfs_bmbt_irec got; 5496 struct xfs_bmbt_irec got;
5527 struct xfs_bmbt_irec left; 5497 struct xfs_bmbt_irec adj_irec;
5528 int error; 5498 int error;
5529 int i; 5499 int i;
5500 int total_extents;
5530 5501
5502 mp = ip->i_mount;
5531 ifp = XFS_IFORK_PTR(ip, whichfork); 5503 ifp = XFS_IFORK_PTR(ip, whichfork);
5504 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5532 5505
5533 xfs_bmbt_get_all(gotp, &got); 5506 xfs_bmbt_get_all(gotp, &got);
5534 startoff = got.br_startoff - offset_shift_fsb;
5535 5507
5536 /* delalloc extents should be prevented by caller */ 5508 /* delalloc extents should be prevented by caller */
5537 XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); 5509 XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
5538 5510
5539 /* 5511 if (direction == SHIFT_LEFT) {
5540 * Check for merge if we've got an extent to the left, otherwise make 5512 startoff = got.br_startoff - offset_shift_fsb;
5541 * sure there's enough room at the start of the file for the shift. 5513
5542 */ 5514 /*
5543 if (*current_ext) { 5515 * Check for merge if we've got an extent to the left,
5544 /* grab the left extent and check for a large enough hole */ 5516 * otherwise make sure there's enough room at the start
5545 leftp = xfs_iext_get_ext(ifp, *current_ext - 1); 5517 * of the file for the shift.
5546 xfs_bmbt_get_all(leftp, &left); 5518 */
5519 if (!*current_ext) {
5520 if (got.br_startoff < offset_shift_fsb)
5521 return -EINVAL;
5522 goto update_current_ext;
5523 }
5524 /*
5525 * grab the left extent and check for a large
5526 * enough hole.
5527 */
5528 adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
5529 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5547 5530
5548 if (startoff < left.br_startoff + left.br_blockcount) 5531 if (startoff <
5532 adj_irec.br_startoff + adj_irec.br_blockcount)
5549 return -EINVAL; 5533 return -EINVAL;
5550 5534
5551 /* check whether to merge the extent or shift it down */ 5535 /* check whether to merge the extent or shift it down */
5552 if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { 5536 if (xfs_bmse_can_merge(&adj_irec, &got,
5537 offset_shift_fsb)) {
5553 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, 5538 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
5554 *current_ext, gotp, leftp, cur, 5539 *current_ext, gotp, adj_irecp,
5555 logflags); 5540 cur, logflags);
5556 } 5541 }
5557 } else if (got.br_startoff < offset_shift_fsb) 5542 } else {
5558 return -EINVAL; 5543 startoff = got.br_startoff + offset_shift_fsb;
5559 5544 /* nothing to move if this is the last extent */
5545 if (*current_ext >= (total_extents - 1))
5546 goto update_current_ext;
5547 /*
5548 * If this is not the last extent in the file, make sure there
5549 * is enough room between the current extent and the next one
5550 * to accommodate the shift.
5551 */
5552 adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
5553 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5554 if (startoff + got.br_blockcount > adj_irec.br_startoff)
5555 return -EINVAL;
5556 /*
5557 * Unlike a left shift (which involves a hole punch),
5558 * a right shift does not modify extent neighbors
5559 * in any way. We should never find mergeable extents
5560 * in this scenario. Check anyway and warn if we
5561 * encounter two extents that could be one.
5562 */
5563 if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
5564 WARN_ON_ONCE(1);
5565 }
5560 /* 5566 /*
5561 * Increment the extent index for the next iteration, update the start 5567 * Increment the extent index for the next iteration, update the start
5562 * offset of the in-core extent and update the btree if applicable. 5568 * offset of the in-core extent and update the btree if applicable.
5563 */ 5569 */
5564 (*current_ext)++; 5570update_current_ext:
5571 if (direction == SHIFT_LEFT)
5572 (*current_ext)++;
5573 else
5574 (*current_ext)--;
5565 xfs_bmbt_set_startoff(gotp, startoff); 5575 xfs_bmbt_set_startoff(gotp, startoff);
5566 *logflags |= XFS_ILOG_CORE; 5576 *logflags |= XFS_ILOG_CORE;
5567 if (!cur) { 5577 if (!cur) {
@@ -5573,18 +5583,18 @@ xfs_bmse_shift_one(
5573 got.br_blockcount, &i); 5583 got.br_blockcount, &i);
5574 if (error) 5584 if (error)
5575 return error; 5585 return error;
5576 XFS_WANT_CORRUPTED_RETURN(i == 1); 5586 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5577 5587
5578 got.br_startoff = startoff; 5588 got.br_startoff = startoff;
5579 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, 5589 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5580 got.br_blockcount, got.br_state); 5590 got.br_blockcount, got.br_state);
5581} 5591}
5582 5592
5583/* 5593/*
5584 * Shift extent records to the left to cover a hole. 5594 * Shift extent records to the left/right to cover/create a hole.
5585 * 5595 *
5586 * The maximum number of extents to be shifted in a single operation is 5596 * The maximum number of extents to be shifted in a single operation is
5587 * @num_exts. @start_fsb specifies the file offset to start the shift and the 5597 * @num_exts. @stop_fsb specifies the file offset at which to stop the shift and the
5588 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb 5598 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
5589 * is the length by which each extent is shifted. If there is no hole to shift 5599 * is the length by which each extent is shifted. If there is no hole to shift
5590 * the extents into, this will be considered invalid operation and we abort 5600 * the extents into, this will be considered invalid operation and we abort
@@ -5594,12 +5604,13 @@ int
5594xfs_bmap_shift_extents( 5604xfs_bmap_shift_extents(
5595 struct xfs_trans *tp, 5605 struct xfs_trans *tp,
5596 struct xfs_inode *ip, 5606 struct xfs_inode *ip,
5597 xfs_fileoff_t start_fsb, 5607 xfs_fileoff_t *next_fsb,
5598 xfs_fileoff_t offset_shift_fsb, 5608 xfs_fileoff_t offset_shift_fsb,
5599 int *done, 5609 int *done,
5600 xfs_fileoff_t *next_fsb, 5610 xfs_fileoff_t stop_fsb,
5601 xfs_fsblock_t *firstblock, 5611 xfs_fsblock_t *firstblock,
5602 struct xfs_bmap_free *flist, 5612 struct xfs_bmap_free *flist,
5613 enum shift_direction direction,
5603 int num_exts) 5614 int num_exts)
5604{ 5615{
5605 struct xfs_btree_cur *cur = NULL; 5616 struct xfs_btree_cur *cur = NULL;
@@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents(
5609 struct xfs_ifork *ifp; 5620 struct xfs_ifork *ifp;
5610 xfs_extnum_t nexts = 0; 5621 xfs_extnum_t nexts = 0;
5611 xfs_extnum_t current_ext; 5622 xfs_extnum_t current_ext;
5623 xfs_extnum_t total_extents;
5624 xfs_extnum_t stop_extent;
5612 int error = 0; 5625 int error = 0;
5613 int whichfork = XFS_DATA_FORK; 5626 int whichfork = XFS_DATA_FORK;
5614 int logflags = 0; 5627 int logflags = 0;
5615 int total_extents;
5616 5628
5617 if (unlikely(XFS_TEST_ERROR( 5629 if (unlikely(XFS_TEST_ERROR(
5618 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 5630 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents(
5628 5640
5629 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 5641 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
5630 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 5642 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5643 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
5644 ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
5631 5645
5632 ifp = XFS_IFORK_PTR(ip, whichfork); 5646 ifp = XFS_IFORK_PTR(ip, whichfork);
5633 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 5647 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5645,43 +5659,83 @@ xfs_bmap_shift_extents(
5645 } 5659 }
5646 5660
5647 /* 5661 /*
5662 * There may be delalloc extents in the data fork before the range we
5663 * are collapsing out, so we cannot use the count of real extents here.
5664 * Instead we have to calculate it from the incore fork.
5665 */
5666 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5667 if (total_extents == 0) {
5668 *done = 1;
5669 goto del_cursor;
5670 }
5671
5672 /*
5673 * On the first right shift, we need to initialize next_fsb
5674 */
5675 if (*next_fsb == NULLFSBLOCK) {
5676 gotp = xfs_iext_get_ext(ifp, total_extents - 1);
5677 xfs_bmbt_get_all(gotp, &got);
5678 *next_fsb = got.br_startoff;
5679 if (stop_fsb > *next_fsb) {
5680 *done = 1;
5681 goto del_cursor;
5682 }
5683 }
5684
5685 /* Look up the extent index at which we have to stop */
5686 if (direction == SHIFT_RIGHT) {
5687 gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
5688 /* Make stop_extent exclusive of shift range */
5689 stop_extent--;
5690 } else
5691 stop_extent = total_extents;
5692
5693 /*
5648 * Look up the extent index for the fsb where we start shifting. We can 5694 * Look up the extent index for the fsb where we start shifting. We can
5649 * henceforth iterate with current_ext as extent list changes are locked 5695 * henceforth iterate with current_ext as extent list changes are locked
5650 * out via ilock. 5696 * out via ilock.
5651 * 5697 *
5652 * gotp can be null in 2 cases: 1) if there are no extents or 2) 5698 * gotp can be null in 2 cases: 1) if there are no extents or 2)
5653 * start_fsb lies in a hole beyond which there are no extents. Either 5699 * *next_fsb lies in a hole beyond which there are no extents. Either
5654 * way, we are done. 5700 * way, we are done.
5655 */ 5701 */
5656 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext); 5702 gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
5657 if (!gotp) { 5703 if (!gotp) {
5658 *done = 1; 5704 *done = 1;
5659 goto del_cursor; 5705 goto del_cursor;
5660 } 5706 }
5661 5707
5662 /* 5708 /* some sanity checking before we finally start shifting extents */
5663 * There may be delalloc extents in the data fork before the range we 5709 if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
5664 * are collapsing out, so we cannot use the count of real extents here. 5710 (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
5665 * Instead we have to calculate it from the incore fork. 5711 error = -EIO;
5666 */ 5712 goto del_cursor;
5667 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5713 }
5668 while (nexts++ < num_exts && current_ext < total_extents) { 5714
5715 while (nexts++ < num_exts) {
5669 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, 5716 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
5670 &current_ext, gotp, cur, &logflags); 5717 &current_ext, gotp, cur, &logflags,
5718 direction);
5671 if (error) 5719 if (error)
5672 goto del_cursor; 5720 goto del_cursor;
5721 /*
5722 * If there was an extent merge during the shift, the extent
5723 * count can change. Update the total and grab the next record.
5724 */
5725 if (direction == SHIFT_LEFT) {
5726 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5727 stop_extent = total_extents;
5728 }
5673 5729
5674 /* update total extent count and grab the next record */ 5730 if (current_ext == stop_extent) {
5675 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5731 *done = 1;
5676 if (current_ext >= total_extents) 5732 *next_fsb = NULLFSBLOCK;
5677 break; 5733 break;
5734 }
5678 gotp = xfs_iext_get_ext(ifp, current_ext); 5735 gotp = xfs_iext_get_ext(ifp, current_ext);
5679 } 5736 }
5680 5737
5681 /* Check if we are done */ 5738 if (!*done) {
5682 if (current_ext == total_extents) {
5683 *done = 1;
5684 } else if (next_fsb) {
5685 xfs_bmbt_get_all(gotp, &got); 5739 xfs_bmbt_get_all(gotp, &got);
5686 *next_fsb = got.br_startoff; 5740 *next_fsb = got.br_startoff;
5687 } 5741 }
@@ -5696,3 +5750,189 @@ del_cursor:
5696 5750
5697 return error; 5751 return error;
5698} 5752}
5753
5754/*
5755 * Splits an extent into two extents at the split_fsb block, such that
5756 * split_fsb becomes the first block of the new extent. @current_ext is
5757 * the target extent to be split. @split_fsb is the block where the
5758 * extent is split. If it lies in a hole or at the start of an extent, return 0.
5759 */
5760STATIC int
5761xfs_bmap_split_extent_at(
5762 struct xfs_trans *tp,
5763 struct xfs_inode *ip,
5764 xfs_fileoff_t split_fsb,
5765 xfs_fsblock_t *firstfsb,
5766 struct xfs_bmap_free *free_list)
5767{
5768 int whichfork = XFS_DATA_FORK;
5769 struct xfs_btree_cur *cur = NULL;
5770 struct xfs_bmbt_rec_host *gotp;
5771 struct xfs_bmbt_irec got;
5772 struct xfs_bmbt_irec new; /* split extent */
5773 struct xfs_mount *mp = ip->i_mount;
5774 struct xfs_ifork *ifp;
5775 xfs_fsblock_t gotblkcnt; /* new block count for got */
5776 xfs_extnum_t current_ext;
5777 int error = 0;
5778 int logflags = 0;
5779 int i = 0;
5780
5781 if (unlikely(XFS_TEST_ERROR(
5782 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5783 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5784 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5785 XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
5786 XFS_ERRLEVEL_LOW, mp);
5787 return -EFSCORRUPTED;
5788 }
5789
5790 if (XFS_FORCED_SHUTDOWN(mp))
5791 return -EIO;
5792
5793 ifp = XFS_IFORK_PTR(ip, whichfork);
5794 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5795 /* Read in all the extents */
5796 error = xfs_iread_extents(tp, ip, whichfork);
5797 if (error)
5798 return error;
5799 }
5800
5801 /*
5802 * gotp can be null in 2 cases: 1) if there are no extents
5803 * or 2) split_fsb lies in a hole beyond which there are
5804 * no extents. Either way, we are done.
5805 */
5806 gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
5807 if (!gotp)
5808 return 0;
5809
5810 xfs_bmbt_get_all(gotp, &got);
5811
5812 /*
5813 * Check if split_fsb lies in a hole or at the start boundary offset
5814 * of the extent.
5815 */
5816 if (got.br_startoff >= split_fsb)
5817 return 0;
5818
5819 gotblkcnt = split_fsb - got.br_startoff;
5820 new.br_startoff = split_fsb;
5821 new.br_startblock = got.br_startblock + gotblkcnt;
5822 new.br_blockcount = got.br_blockcount - gotblkcnt;
5823 new.br_state = got.br_state;
5824
5825 if (ifp->if_flags & XFS_IFBROOT) {
5826 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5827 cur->bc_private.b.firstblock = *firstfsb;
5828 cur->bc_private.b.flist = free_list;
5829 cur->bc_private.b.flags = 0;
5830 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5831 got.br_startblock,
5832 got.br_blockcount,
5833 &i);
5834 if (error)
5835 goto del_cursor;
5836 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5837 }
5838
5839 xfs_bmbt_set_blockcount(gotp, gotblkcnt);
5840 got.br_blockcount = gotblkcnt;
5841
5842 logflags = XFS_ILOG_CORE;
5843 if (cur) {
5844 error = xfs_bmbt_update(cur, got.br_startoff,
5845 got.br_startblock,
5846 got.br_blockcount,
5847 got.br_state);
5848 if (error)
5849 goto del_cursor;
5850 } else
5851 logflags |= XFS_ILOG_DEXT;
5852
5853 /* Add new extent */
5854 current_ext++;
5855 xfs_iext_insert(ip, current_ext, 1, &new, 0);
5856 XFS_IFORK_NEXT_SET(ip, whichfork,
5857 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
5858
5859 if (cur) {
5860 error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
5861 new.br_startblock, new.br_blockcount,
5862 &i);
5863 if (error)
5864 goto del_cursor;
5865 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
5866 cur->bc_rec.b.br_state = new.br_state;
5867
5868 error = xfs_btree_insert(cur, &i);
5869 if (error)
5870 goto del_cursor;
5871 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5872 }
5873
5874 /*
5875 * Convert to a btree if necessary.
5876 */
5877 if (xfs_bmap_needs_btree(ip, whichfork)) {
5878 int tmp_logflags; /* partial log flag return val */
5879
5880 ASSERT(cur == NULL);
5881 error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
5882 &cur, 0, &tmp_logflags, whichfork);
5883 logflags |= tmp_logflags;
5884 }
5885
5886del_cursor:
5887 if (cur) {
5888 cur->bc_private.b.allocated = 0;
5889 xfs_btree_del_cursor(cur,
5890 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5891 }
5892
5893 if (logflags)
5894 xfs_trans_log_inode(tp, ip, logflags);
5895 return error;
5896}
5897
5898int
5899xfs_bmap_split_extent(
5900 struct xfs_inode *ip,
5901 xfs_fileoff_t split_fsb)
5902{
5903 struct xfs_mount *mp = ip->i_mount;
5904 struct xfs_trans *tp;
5905 struct xfs_bmap_free free_list;
5906 xfs_fsblock_t firstfsb;
5907 int committed;
5908 int error;
5909
5910 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
5911 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
5912 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
5913 if (error) {
5914 xfs_trans_cancel(tp, 0);
5915 return error;
5916 }
5917
5918 xfs_ilock(ip, XFS_ILOCK_EXCL);
5919 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
5920
5921 xfs_bmap_init(&free_list, &firstfsb);
5922
5923 error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
5924 &firstfsb, &free_list);
5925 if (error)
5926 goto out;
5927
5928 error = xfs_bmap_finish(&tp, &free_list, &committed);
5929 if (error)
5930 goto out;
5931
5932 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
5933
5934
5935out:
5936 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
5937 return error;
5938}
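
The xfs_bmap.c portion of this diff does three related things: it threads the xfs_mount through the XFS_WANT_CORRUPTED_GOTO/RETURN corruption checks, it moves the in-core free-space accounting onto the new xfs_mod_fdblocks()/xfs_mod_frextents() helpers, and it generalizes the extent-shift machinery (xfs_bmse_shift_one/xfs_bmap_shift_extents) to walk right as well as left, adding xfs_bmap_split_extent() so an insert-range operation can first split the extent straddling the insertion point. A minimal standalone sketch of the direction-dependent neighbor check follows, with simplified types rather than the kernel's xfs_bmbt_irec and a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

enum shift_direction { SHIFT_LEFT = 0, SHIFT_RIGHT };

struct extent {
	uint64_t startoff;	/* file offset of the first block */
	uint64_t blockcount;	/* length in filesystem blocks */
};

/*
 * can_shift() distills the bounds checks in xfs_bmse_shift_one(): an
 * extent may only move into the hole on the side it is shifting
 * toward. 'adj' is the neighbor on that side, or NULL when the extent
 * is the first (left shift) or last (right shift) in the file. The
 * kernel additionally merges a mergeable left neighbor on a left
 * shift, which this sketch omits.
 */
static bool can_shift(const struct extent *got, const struct extent *adj,
		      uint64_t shift, enum shift_direction dir)
{
	if (dir == SHIFT_LEFT) {
		if (got->startoff < shift)
			return false;	/* would move past offset zero */
		if (adj && got->startoff - shift <
			   adj->startoff + adj->blockcount)
			return false;	/* would land on the left neighbor */
	} else {
		if (adj && got->startoff + got->blockcount + shift >
			   adj->startoff)
			return false;	/* would land on the right neighbor */
	}
	return true;
}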
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b9d8a499d2c4..6aaa0c1c7200 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
166 */ 166 */
167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
168 168
169enum shift_direction {
170 SHIFT_LEFT = 0,
171 SHIFT_RIGHT,
172};
173
169#ifdef DEBUG 174#ifdef DEBUG
170void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 175void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
171 int whichfork, unsigned long caller_ip); 176 int whichfork, unsigned long caller_ip);
@@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
211 xfs_extnum_t num); 216 xfs_extnum_t num);
212uint xfs_default_attroffset(struct xfs_inode *ip); 217uint xfs_default_attroffset(struct xfs_inode *ip);
213int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, 218int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
214 xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, 219 xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
215 int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, 220 int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
216 struct xfs_bmap_free *flist, int num_exts); 221 struct xfs_bmap_free *flist, enum shift_direction direction,
222 int num_exts);
223int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
217 224
218#endif /* __XFS_BMAP_H__ */ 225#endif /* __XFS_BMAP_H__ */
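
Header-side view of the same rework: the shift entry point now takes the resume cursor (next_fsb) by reference, an explicit stop_fsb bound, and a shift_direction. Below is a hedged sketch of the caller loop this prototype implies; the real callers are the collapse-range/insert-range paths in fs/xfs/xfs_bmap_util.c, and transaction setup, locking, and the finish/commit steps are compressed into comments. For SHIFT_RIGHT the first call would pass *next_fsb == NULLFSBLOCK, which the new code takes as a cue to start from the last extent and walk down toward stop_fsb.

/* Sketch only -- collapse_range_sketch is a hypothetical name. */
static int collapse_range_sketch(
	struct xfs_trans	*tp,		/* assume caller set it up */
	struct xfs_inode	*ip,
	xfs_fileoff_t		start_fsb,	/* first offset to pull left */
	xfs_fileoff_t		shift_fsb)	/* distance to shift */
{
	struct xfs_bmap_free	free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		next_fsb = start_fsb;	/* in/out cursor */
	xfs_fileoff_t		stop_fsb = 0;	/* ignored on a left shift */
	int			done = 0;
	int			error = 0;

	while (!done && !error) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
					       &done, stop_fsb, &first_block,
					       &free_list, SHIFT_LEFT,
					       XFS_BMAP_MAX_SHIFT_EXTENTS);
		/* ... xfs_bmap_finish() and roll the transaction here ... */
	}
	return error;
}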
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 81cad433df85..c72283dd8d44 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -168,7 +168,7 @@ xfs_btree_check_lptr(
168 xfs_fsblock_t bno, /* btree block disk address */ 168 xfs_fsblock_t bno, /* btree block disk address */
169 int level) /* btree block level */ 169 int level) /* btree block level */
170{ 170{
171 XFS_WANT_CORRUPTED_RETURN( 171 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
172 level > 0 && 172 level > 0 &&
173 bno != NULLFSBLOCK && 173 bno != NULLFSBLOCK &&
174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
@@ -187,7 +187,7 @@ xfs_btree_check_sptr(
187{ 187{
188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; 188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
189 189
190 XFS_WANT_CORRUPTED_RETURN( 190 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
191 level > 0 && 191 level > 0 &&
192 bno != NULLAGBLOCK && 192 bno != NULLAGBLOCK &&
193 bno != 0 && 193 bno != 0 &&
@@ -1825,7 +1825,7 @@ xfs_btree_lookup(
1825 error = xfs_btree_increment(cur, 0, &i); 1825 error = xfs_btree_increment(cur, 0, &i);
1826 if (error) 1826 if (error)
1827 goto error0; 1827 goto error0;
1828 XFS_WANT_CORRUPTED_RETURN(i == 1); 1828 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830 *stat = 1; 1830 *stat = 1;
1831 return 0; 1831 return 0;
@@ -2285,7 +2285,7 @@ xfs_btree_rshift(
2285 if (error) 2285 if (error)
2286 goto error0; 2286 goto error0;
2287 i = xfs_btree_lastrec(tcur, level); 2287 i = xfs_btree_lastrec(tcur, level);
2288 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 2288 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289 2289
2290 error = xfs_btree_increment(tcur, level, &i); 2290 error = xfs_btree_increment(tcur, level, &i);
2291 if (error) 2291 if (error)
@@ -3138,7 +3138,7 @@ xfs_btree_insert(
3138 goto error0; 3138 goto error0;
3139 } 3139 }
3140 3140
3141 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3141 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142 level++; 3142 level++;
3143 3143
3144 /* 3144 /*
@@ -3582,15 +3582,15 @@ xfs_btree_delrec(
3582 * Actually any entry but the first would suffice. 3582 * Actually any entry but the first would suffice.
3583 */ 3583 */
3584 i = xfs_btree_lastrec(tcur, level); 3584 i = xfs_btree_lastrec(tcur, level);
3585 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3585 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586 3586
3587 error = xfs_btree_increment(tcur, level, &i); 3587 error = xfs_btree_increment(tcur, level, &i);
3588 if (error) 3588 if (error)
3589 goto error0; 3589 goto error0;
3590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3590 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591 3591
3592 i = xfs_btree_lastrec(tcur, level); 3592 i = xfs_btree_lastrec(tcur, level);
3593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3593 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594 3594
3595 /* Grab a pointer to the block. */ 3595 /* Grab a pointer to the block. */
3596 right = xfs_btree_get_block(tcur, level, &rbp); 3596 right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3634,12 +3634,12 @@ xfs_btree_delrec(
3634 rrecs = xfs_btree_get_numrecs(right); 3634 rrecs = xfs_btree_get_numrecs(right);
3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636 i = xfs_btree_firstrec(tcur, level); 3636 i = xfs_btree_firstrec(tcur, level);
3637 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3637 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638 3638
3639 error = xfs_btree_decrement(tcur, level, &i); 3639 error = xfs_btree_decrement(tcur, level, &i);
3640 if (error) 3640 if (error)
3641 goto error0; 3641 goto error0;
3642 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3642 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643 } 3643 }
3644 } 3644 }
3645 3645
@@ -3653,13 +3653,13 @@ xfs_btree_delrec(
3653 * previous block. 3653 * previous block.
3654 */ 3654 */
3655 i = xfs_btree_firstrec(tcur, level); 3655 i = xfs_btree_firstrec(tcur, level);
3656 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3656 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657 3657
3658 error = xfs_btree_decrement(tcur, level, &i); 3658 error = xfs_btree_decrement(tcur, level, &i);
3659 if (error) 3659 if (error)
3660 goto error0; 3660 goto error0;
3661 i = xfs_btree_firstrec(tcur, level); 3661 i = xfs_btree_firstrec(tcur, level);
3662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3662 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663 3663
3664 /* Grab a pointer to the block. */ 3664 /* Grab a pointer to the block. */
3665 left = xfs_btree_get_block(tcur, level, &lbp); 3665 left = xfs_btree_get_block(tcur, level, &lbp);
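
The change running through xfs_btree.c (and the directory, inode and bmap code in this series) is mechanical: XFS_WANT_CORRUPTED_GOTO() and XFS_WANT_CORRUPTED_RETURN() now take the xfs_mount as their first argument, so a failed in-core consistency check can be reported against a specific filesystem and fail with -EFSCORRUPTED on production kernels, instead of only tripping a DEBUG assert. A plausible shape for the updated macro; the authoritative definition lives in fs/xfs/xfs_linux.h, so treat this as a sketch:

/*
 * Assert in DEBUG builds; on production kernels report the corruption
 * against the mount and bail out instead of silently continuing.
 * Assumes a local 'int error' and a 'label' in the calling function,
 * as all the call sites in this diff do.
 */
#define XFS_WANT_CORRUPTED_GOTO(mp, expr, label)		\
do {								\
	int fs_is_ok = (expr);					\
	ASSERT(fs_is_ok);					\
	if (unlikely(!fs_is_ok)) {				\
		XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO",	\
				 XFS_ERRLEVEL_LOW, (mp));	\
		error = -EFSCORRUPTED;				\
		goto label;					\
	}							\
} while (0)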
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9cb0115c6bd1..2385f8cd08ab 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -538,12 +538,12 @@ xfs_da3_root_split(
538 oldroot = blk1->bp->b_addr; 538 oldroot = blk1->bp->b_addr;
539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || 539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { 540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
541 struct xfs_da3_icnode_hdr nodehdr; 541 struct xfs_da3_icnode_hdr icnodehdr;
542 542
543 dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); 543 dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
544 btree = dp->d_ops->node_tree_p(oldroot); 544 btree = dp->d_ops->node_tree_p(oldroot);
545 size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); 545 size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
546 level = nodehdr.level; 546 level = icnodehdr.level;
547 547
548 /* 548 /*
549 * we are about to copy oldroot to bp, so set up the type 549 * we are about to copy oldroot to bp, so set up the type
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..74bcbabfa523 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr {
725 __uint16_t magic; 725 __uint16_t magic;
726 __uint16_t count; 726 __uint16_t count;
727 __uint16_t usedbytes; 727 __uint16_t usedbytes;
728 __uint16_t firstused; 728 /*
729 * firstused is 32-bit here instead of 16-bit like the on-disk variant
730 * to support maximum fsb size of 64k without overflow issues throughout
731 * the attr code. Instead, the overflow condition is handled on
732 * conversion to/from disk.
733 */
734 __uint32_t firstused;
729 __u8 holes; 735 __u8 holes;
730 struct { 736 struct {
731 __uint16_t base; 737 __uint16_t base;
@@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr {
734}; 740};
735 741
736/* 742/*
743 * Special value to represent fs block size in the leaf header firstused field.
744 * Only used when block size overflows the 2-bytes available on disk.
745 */
746#define XFS_ATTR3_LEAF_NULLOFF 0
747
748/*
737 * Flags used in the leaf_entry[i].flags field. 749 * Flags used in the leaf_entry[i].flags field.
738 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified 750 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
739 * on the system call, they are "or"ed together for various operations. 751 * on the system call, they are "or"ed together for various operations.
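
The widened firstused field exists because a 64k filesystem block makes the "entirely free" value (65536) overflow the 16-bit on-disk field; the in-core header carries 32 bits, and the overflow is folded into the disk conversion using the XFS_ATTR3_LEAF_NULLOFF sentinel defined above. A hedged sketch of that conversion; the real helpers live in fs/xfs/libxfs/xfs_attr_leaf.c, and the function names below are illustrative:

#include <stdint.h>

/* Map the 32-bit in-core firstused to the 16-bit disk field and back.
 * Only the single overflowing value -- firstused equal to a 64k block
 * size -- needs the XFS_ATTR3_LEAF_NULLOFF (0) sentinel on disk. */
static uint16_t attr3_firstused_to_disk(uint32_t firstused,
					uint32_t blocksize)
{
	if (firstused == blocksize && blocksize > UINT16_MAX)
		return XFS_ATTR3_LEAF_NULLOFF;	/* 65536 won't fit */
	return (uint16_t)firstused;
}

static uint32_t attr3_firstused_from_disk(uint16_t disk_firstused,
					  uint32_t blocksize)
{
	if (disk_firstused == XFS_ATTR3_LEAF_NULLOFF)
		return blocksize;		/* recover the 64k value */
	return disk_firstused;
}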
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 5ff31be9b1cd..de1ea16f5748 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -89,7 +89,7 @@ __xfs_dir3_data_check(
89 * so just ensure that the count falls somewhere inside the 89 * so just ensure that the count falls somewhere inside the
90 * block right now. 90 * block right now.
91 */ 91 */
92 XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < 92 XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); 93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
94 break; 94 break;
95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -107,21 +107,21 @@ __xfs_dir3_data_check(
107 bf = ops->data_bestfree_p(hdr); 107 bf = ops->data_bestfree_p(hdr);
108 count = lastfree = freeseen = 0; 108 count = lastfree = freeseen = 0;
109 if (!bf[0].length) { 109 if (!bf[0].length) {
110 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); 110 XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
111 freeseen |= 1 << 0; 111 freeseen |= 1 << 0;
112 } 112 }
113 if (!bf[1].length) { 113 if (!bf[1].length) {
114 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); 114 XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
115 freeseen |= 1 << 1; 115 freeseen |= 1 << 1;
116 } 116 }
117 if (!bf[2].length) { 117 if (!bf[2].length) {
118 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); 118 XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
119 freeseen |= 1 << 2; 119 freeseen |= 1 << 2;
120 } 120 }
121 121
122 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= 122 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
123 be16_to_cpu(bf[1].length)); 123 be16_to_cpu(bf[1].length));
124 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= 124 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
125 be16_to_cpu(bf[2].length)); 125 be16_to_cpu(bf[2].length));
126 /* 126 /*
127 * Loop over the data/unused entries. 127 * Loop over the data/unused entries.
@@ -134,18 +134,18 @@ __xfs_dir3_data_check(
134 * doesn't need to be there. 134 * doesn't need to be there.
135 */ 135 */
136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
137 XFS_WANT_CORRUPTED_RETURN(lastfree == 0); 137 XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
138 XFS_WANT_CORRUPTED_RETURN( 138 XFS_WANT_CORRUPTED_RETURN(mp,
139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
140 (char *)dup - (char *)hdr); 140 (char *)dup - (char *)hdr);
141 dfp = xfs_dir2_data_freefind(hdr, bf, dup); 141 dfp = xfs_dir2_data_freefind(hdr, bf, dup);
142 if (dfp) { 142 if (dfp) {
143 i = (int)(dfp - bf); 143 i = (int)(dfp - bf);
144 XFS_WANT_CORRUPTED_RETURN( 144 XFS_WANT_CORRUPTED_RETURN(mp,
145 (freeseen & (1 << i)) == 0); 145 (freeseen & (1 << i)) == 0);
146 freeseen |= 1 << i; 146 freeseen |= 1 << i;
147 } else { 147 } else {
148 XFS_WANT_CORRUPTED_RETURN( 148 XFS_WANT_CORRUPTED_RETURN(mp,
149 be16_to_cpu(dup->length) <= 149 be16_to_cpu(dup->length) <=
150 be16_to_cpu(bf[2].length)); 150 be16_to_cpu(bf[2].length));
151 } 151 }
@@ -160,13 +160,13 @@ __xfs_dir3_data_check(
160 * The linear search is crude but this is DEBUG code. 160 * The linear search is crude but this is DEBUG code.
161 */ 161 */
162 dep = (xfs_dir2_data_entry_t *)p; 162 dep = (xfs_dir2_data_entry_t *)p;
163 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); 163 XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
164 XFS_WANT_CORRUPTED_RETURN( 164 XFS_WANT_CORRUPTED_RETURN(mp,
165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); 165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
166 XFS_WANT_CORRUPTED_RETURN( 166 XFS_WANT_CORRUPTED_RETURN(mp,
167 be16_to_cpu(*ops->data_entry_tag_p(dep)) == 167 be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
168 (char *)dep - (char *)hdr); 168 (char *)dep - (char *)hdr);
169 XFS_WANT_CORRUPTED_RETURN( 169 XFS_WANT_CORRUPTED_RETURN(mp,
170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); 170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
171 count++; 171 count++;
172 lastfree = 0; 172 lastfree = 0;
@@ -183,14 +183,15 @@ __xfs_dir3_data_check(
183 be32_to_cpu(lep[i].hashval) == hash) 183 be32_to_cpu(lep[i].hashval) == hash)
184 break; 184 break;
185 } 185 }
186 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); 186 XFS_WANT_CORRUPTED_RETURN(mp,
187 i < be32_to_cpu(btp->count));
187 } 188 }
188 p += ops->data_entsize(dep->namelen); 189 p += ops->data_entsize(dep->namelen);
189 } 190 }
190 /* 191 /*
191 * Need to have seen all the entries and all the bestfree slots. 192 * Need to have seen all the entries and all the bestfree slots.
192 */ 193 */
193 XFS_WANT_CORRUPTED_RETURN(freeseen == 7); 194 XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
194 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || 195 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
195 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { 196 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
196 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 197 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
@@ -198,13 +199,13 @@ __xfs_dir3_data_check(
198 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 199 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
199 stale++; 200 stale++;
200 if (i > 0) 201 if (i > 0)
201 XFS_WANT_CORRUPTED_RETURN( 202 XFS_WANT_CORRUPTED_RETURN(mp,
202 be32_to_cpu(lep[i].hashval) >= 203 be32_to_cpu(lep[i].hashval) >=
203 be32_to_cpu(lep[i - 1].hashval)); 204 be32_to_cpu(lep[i - 1].hashval));
204 } 205 }
205 XFS_WANT_CORRUPTED_RETURN(count == 206 XFS_WANT_CORRUPTED_RETURN(mp, count ==
206 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 207 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
207 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); 208 XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
208 } 209 }
209 return 0; 210 return 0;
210} 211}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8eb718979383..4daaa662337b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -264,68 +264,6 @@ typedef struct xfs_dsb {
264 /* must be padded to 64 bit alignment */ 264 /* must be padded to 64 bit alignment */
265} xfs_dsb_t; 265} xfs_dsb_t;
266 266
267/*
268 * Sequence number values for the fields.
269 */
270typedef enum {
271 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
272 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
273 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
274 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
275 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
276 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
277 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
278 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
279 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
280 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
281 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
282 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
283 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
284 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
285 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
286 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
287 XFS_SBS_FIELDCOUNT
288} xfs_sb_field_t;
289
290/*
291 * Mask values, defined based on the xfs_sb_field_t values.
292 * Only define the ones we're using.
293 */
294#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
295#define XFS_SB_UUID XFS_SB_MVAL(UUID)
296#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
297#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
298#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
299#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
300#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
301#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
302#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
303#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
304#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
305#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
306#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
307#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
308#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
309#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
310#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
311 XFS_SB_MVAL(BAD_FEATURES2))
312#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
313#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
314#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
315#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
316#define XFS_SB_CRC XFS_SB_MVAL(CRC)
317#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
318#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
319#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
320#define XFS_SB_MOD_BITS \
321 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
322 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
323 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
324 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
325 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
326 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
327 XFS_SB_PQUOTINO)
328
329 267
330/* 268/*
331 * Misc. Flags - warning - these will be cleared by xfs_repair unless 269 * Misc. Flags - warning - these will be cleared by xfs_repair unless
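
The deleted xfs_sb_field_t/XFS_SB_* block served the old xfs_mod_incore_sb()/xfs_icsb_modify_counters() interface, where a caller named a superblock field by index in order to modify it. This series moves icount, ifree and fdblocks onto generic per-cpu counters with dedicated helpers, which is why the earlier hunks replace those calls with xfs_mod_fdblocks()/xfs_mod_frextents(). A simplified, hedged sketch of the core of the new free-block helper; the real xfs_mod_fdblocks() in fs/xfs/xfs_mount.c additionally manages the reserved block pool and a batch size for large deltas:

/* Sketch: apply the delta to the per-cpu counter and undo it if a
 * subtraction would drive free blocks negative. The 'rsvd' flag
 * (permission to dip into the reserved pool) is part of the real
 * signature but ignored here. */
int xfs_mod_fdblocks_sketch(struct xfs_mount *mp, int64_t delta, bool rsvd)
{
	percpu_counter_add(&mp->m_fdblocks, delta);
	if (delta >= 0)
		return 0;

	/* percpu_counter_compare() sums the per-cpu deviations when the
	 * counter is near the comparison point, so this check is exact
	 * even though plain percpu reads are approximate. */
	if (percpu_counter_compare(&mp->m_fdblocks, 0) < 0) {
		percpu_counter_add(&mp->m_fdblocks, -delta);	/* undo */
		return -ENOSPC;
	}
	return 0;
}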
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 116ef1ddb3e3..07349a183a11 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc(
376 */ 376 */
377 newlen = args.mp->m_ialloc_inos; 377 newlen = args.mp->m_ialloc_inos;
378 if (args.mp->m_maxicount && 378 if (args.mp->m_maxicount &&
379 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 379 percpu_counter_read(&args.mp->m_icount) + newlen >
380 args.mp->m_maxicount)
380 return -ENOSPC; 381 return -ENOSPC;
381 args.minlen = args.maxlen = args.mp->m_ialloc_blks; 382 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
382 /* 383 /*
@@ -700,7 +701,7 @@ xfs_ialloc_next_rec(
700 error = xfs_inobt_get_rec(cur, rec, &i); 701 error = xfs_inobt_get_rec(cur, rec, &i);
701 if (error) 702 if (error)
702 return error; 703 return error;
703 XFS_WANT_CORRUPTED_RETURN(i == 1); 704 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
704 } 705 }
705 706
706 return 0; 707 return 0;
@@ -724,7 +725,7 @@ xfs_ialloc_get_rec(
724 error = xfs_inobt_get_rec(cur, rec, &i); 725 error = xfs_inobt_get_rec(cur, rec, &i);
725 if (error) 726 if (error)
726 return error; 727 return error;
727 XFS_WANT_CORRUPTED_RETURN(i == 1); 728 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
728 } 729 }
729 730
730 return 0; 731 return 0;
@@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt(
783 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); 784 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
784 if (error) 785 if (error)
785 goto error0; 786 goto error0;
786 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 787 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
787 788
788 error = xfs_inobt_get_rec(cur, &rec, &j); 789 error = xfs_inobt_get_rec(cur, &rec, &j);
789 if (error) 790 if (error)
790 goto error0; 791 goto error0;
791 XFS_WANT_CORRUPTED_GOTO(j == 1, error0); 792 XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
792 793
793 if (rec.ir_freecount > 0) { 794 if (rec.ir_freecount > 0) {
794 /* 795 /*
@@ -944,19 +945,19 @@ newino:
944 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 945 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
945 if (error) 946 if (error)
946 goto error0; 947 goto error0;
947 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 948 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
948 949
949 for (;;) { 950 for (;;) {
950 error = xfs_inobt_get_rec(cur, &rec, &i); 951 error = xfs_inobt_get_rec(cur, &rec, &i);
951 if (error) 952 if (error)
952 goto error0; 953 goto error0;
953 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 954 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
954 if (rec.ir_freecount > 0) 955 if (rec.ir_freecount > 0)
955 break; 956 break;
956 error = xfs_btree_increment(cur, 0, &i); 957 error = xfs_btree_increment(cur, 0, &i);
957 if (error) 958 if (error)
958 goto error0; 959 goto error0;
959 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 960 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
960 } 961 }
961 962
962alloc_inode: 963alloc_inode:
@@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near(
1016 error = xfs_inobt_get_rec(lcur, rec, &i); 1017 error = xfs_inobt_get_rec(lcur, rec, &i);
1017 if (error) 1018 if (error)
1018 return error; 1019 return error;
1019 XFS_WANT_CORRUPTED_RETURN(i == 1); 1020 XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
1020 1021
1021 /* 1022 /*
1022 * See if we've landed in the parent inode record. The finobt 1023 * See if we've landed in the parent inode record. The finobt
@@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near(
1039 error = xfs_inobt_get_rec(rcur, &rrec, &j); 1040 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1040 if (error) 1041 if (error)
1041 goto error_rcur; 1042 goto error_rcur;
1042 XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); 1043 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
1043 } 1044 }
1044 1045
1045 XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); 1046 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
1046 if (i == 1 && j == 1) { 1047 if (i == 1 && j == 1) {
1047 /* 1048 /*
1048 * Both the left and right records are valid. Choose the closer 1049 * Both the left and right records are valid. Choose the closer
@@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino(
1095 error = xfs_inobt_get_rec(cur, rec, &i); 1096 error = xfs_inobt_get_rec(cur, rec, &i);
1096 if (error) 1097 if (error)
1097 return error; 1098 return error;
1098 XFS_WANT_CORRUPTED_RETURN(i == 1); 1099 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1099 return 0; 1100 return 0;
1100 } 1101 }
1101 } 1102 }
@@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino(
1106 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 1107 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1107 if (error) 1108 if (error)
1108 return error; 1109 return error;
1109 XFS_WANT_CORRUPTED_RETURN(i == 1); 1110 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1110 1111
1111 error = xfs_inobt_get_rec(cur, rec, &i); 1112 error = xfs_inobt_get_rec(cur, rec, &i);
1112 if (error) 1113 if (error)
1113 return error; 1114 return error;
1114 XFS_WANT_CORRUPTED_RETURN(i == 1); 1115 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1115 1116
1116 return 0; 1117 return 0;
1117} 1118}
@@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt(
1133 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); 1134 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1134 if (error) 1135 if (error)
1135 return error; 1136 return error;
1136 XFS_WANT_CORRUPTED_RETURN(i == 1); 1137 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1137 1138
1138 error = xfs_inobt_get_rec(cur, &rec, &i); 1139 error = xfs_inobt_get_rec(cur, &rec, &i);
1139 if (error) 1140 if (error)
1140 return error; 1141 return error;
1141 XFS_WANT_CORRUPTED_RETURN(i == 1); 1142 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1142 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % 1143 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1143 XFS_INODES_PER_CHUNK) == 0); 1144 XFS_INODES_PER_CHUNK) == 0);
1144 1145
1145 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1146 rec.ir_free &= ~XFS_INOBT_MASK(offset);
1146 rec.ir_freecount--; 1147 rec.ir_freecount--;
1147 1148
1148 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && 1149 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
1149 (rec.ir_freecount == frec->ir_freecount)); 1150 (rec.ir_freecount == frec->ir_freecount));
1150 1151
1151 return xfs_inobt_update(cur, &rec); 1152 return xfs_inobt_update(cur, &rec);
@@ -1340,7 +1341,8 @@ xfs_dialloc(
1340 * inode. 1341 * inode.
1341 */ 1342 */
1342 if (mp->m_maxicount && 1343 if (mp->m_maxicount &&
1343 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { 1344 percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos >
1345 mp->m_maxicount) {
1344 noroom = 1; 1346 noroom = 1;
1345 okalloc = 0; 1347 okalloc = 0;
1346 } 1348 }
@@ -1475,14 +1477,14 @@ xfs_difree_inobt(
1475 __func__, error); 1477 __func__, error);
1476 goto error0; 1478 goto error0;
1477 } 1479 }
1478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1480 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1479 error = xfs_inobt_get_rec(cur, &rec, &i); 1481 error = xfs_inobt_get_rec(cur, &rec, &i);
1480 if (error) { 1482 if (error) {
1481 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", 1483 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1482 __func__, error); 1484 __func__, error);
1483 goto error0; 1485 goto error0;
1484 } 1486 }
1485 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1487 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1486 /* 1488 /*
1487 * Get the offset in the inode chunk. 1489 * Get the offset in the inode chunk.
1488 */ 1490 */
@@ -1592,7 +1594,7 @@ xfs_difree_finobt(
1592 * freed an inode in a previously fully allocated chunk. If not, 1594 * freed an inode in a previously fully allocated chunk. If not,
1593 * something is out of sync. 1595 * something is out of sync.
1594 */ 1596 */
1595 XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); 1597 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
1596 1598
1597 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 1599 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
1598 ibtrec->ir_free, &i); 1600 ibtrec->ir_free, &i);
@@ -1613,12 +1615,12 @@ xfs_difree_finobt(
1613 error = xfs_inobt_get_rec(cur, &rec, &i); 1615 error = xfs_inobt_get_rec(cur, &rec, &i);
1614 if (error) 1616 if (error)
1615 goto error; 1617 goto error;
1616 XFS_WANT_CORRUPTED_GOTO(i == 1, error); 1618 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
1617 1619
1618 rec.ir_free |= XFS_INOBT_MASK(offset); 1620 rec.ir_free |= XFS_INOBT_MASK(offset);
1619 rec.ir_freecount++; 1621 rec.ir_freecount++;
1620 1622
1621 XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && 1623 XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
1622 (rec.ir_freecount == ibtrec->ir_freecount), 1624 (rec.ir_freecount == ibtrec->ir_freecount),
1623 error); 1625 error);
1624 1626
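
Note: the inode allocation hunks above replace the locked read of sb_icount with percpu_counter_read(&mp->m_icount), a fast but approximate value; a slightly stale count only makes the noroom/okalloc heuristic conservative. A minimal userspace sketch of that pattern, with all names invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define NR_SHARDS 4	/* pretend 4 CPUs */

/* Toy analogue of a percpu counter: one shard per CPU. */
struct pcp_counter {
	int64_t shard[NR_SHARDS];
};

/*
 * Fast, approximate read: sum the shards without locking, so the result
 * may race with concurrent updates (like percpu_counter_read()).
 */
static int64_t pcp_read(const struct pcp_counter *c)
{
	int64_t sum = 0;

	for (int i = 0; i < NR_SHARDS; i++)
		sum += c->shard[i];
	return sum;
}

int main(void)
{
	struct pcp_counter icount = { .shard = { 1000, 2000, 500, 250 } };
	int64_t maxicount = 4000, ialloc_inos = 64;

	/* Heuristic limit check in the style of xfs_dialloc(): staleness
	 * here only makes the allocator a little conservative. */
	int noroom = pcp_read(&icount) + ialloc_inos > maxicount;

	printf("noroom = %d\n", noroom);
	return 0;
}
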
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b0a5fe95a3e2..dc4bfc5d88fc 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -111,14 +111,6 @@ xfs_mount_validate_sb(
111 bool check_inprogress, 111 bool check_inprogress,
112 bool check_version) 112 bool check_version)
113{ 113{
114
115 /*
116 * If the log device and data device have the
117 * same device number, the log is internal.
118 * Consequently, the sb_logstart should be non-zero. If
119 * we have a zero sb_logstart in this case, we may be trying to mount
120 * a volume filesystem in a non-volume manner.
121 */
122 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 114 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
123 xfs_warn(mp, "bad magic number"); 115 xfs_warn(mp, "bad magic number");
124 return -EWRONGFS; 116 return -EWRONGFS;
@@ -743,17 +735,15 @@ xfs_initialize_perag_data(
743 btree += pag->pagf_btreeblks; 735 btree += pag->pagf_btreeblks;
744 xfs_perag_put(pag); 736 xfs_perag_put(pag);
745 } 737 }
746 /* 738
747 * Overwrite incore superblock counters with just-read data 739 /* Overwrite incore superblock counters with just-read data */
748 */
749 spin_lock(&mp->m_sb_lock); 740 spin_lock(&mp->m_sb_lock);
750 sbp->sb_ifree = ifree; 741 sbp->sb_ifree = ifree;
751 sbp->sb_icount = ialloc; 742 sbp->sb_icount = ialloc;
752 sbp->sb_fdblocks = bfree + bfreelst + btree; 743 sbp->sb_fdblocks = bfree + bfreelst + btree;
753 spin_unlock(&mp->m_sb_lock); 744 spin_unlock(&mp->m_sb_lock);
754 745
755 /* Fixup the per-cpu counters as well. */ 746 xfs_reinit_percpu_counters(mp);
756 xfs_icsb_reinit_counters(mp);
757 747
758 return 0; 748 return 0;
759} 749}
@@ -771,6 +761,10 @@ xfs_log_sb(
771 struct xfs_mount *mp = tp->t_mountp; 761 struct xfs_mount *mp = tp->t_mountp;
772 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); 762 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
773 763
764 mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
765 mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
766 mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
767
774 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); 768 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
775 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); 769 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
776 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); 770 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
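
Note: xfs_log_sb() now folds the per-cpu counters back into the in-core superblock with percpu_counter_sum() before the buffer is logged, so the exact walk over every CPU's shard happens only at sync points. A rough sketch of that fold-before-persist step (types and field names below are stand-ins):

#include <stdio.h>
#include <stdint.h>

#define NR_SHARDS 4

struct shard_counter {
	int64_t shard[NR_SHARDS];
};

/* In-memory image of the superblock fields being persisted. */
struct disk_sb {
	int64_t sb_icount;
	int64_t sb_ifree;
	int64_t sb_fdblocks;
};

/* Exact fold of every shard: the expensive walk the kernel reserves for
 * sync points such as xfs_log_sb(). */
static int64_t counter_sum(const struct shard_counter *c)
{
	int64_t sum = 0;

	for (int i = 0; i < NR_SHARDS; i++)
		sum += c->shard[i];
	return sum;
}

int main(void)
{
	struct shard_counter icount = {{ 40, 20, 3, 1 }};
	struct shard_counter ifree = {{ 5, 1, 0, 2 }};
	struct shard_counter fdblocks = {{ 900, 100, 50, 25 }};
	struct disk_sb sb;

	/* Fold the lazy counters into the superblock image before writing. */
	sb.sb_icount = counter_sum(&icount);
	sb.sb_ifree = counter_sum(&ifree);
	sb.sb_fdblocks = counter_sum(&fdblocks);

	printf("icount=%lld ifree=%lld fdblocks=%lld\n",
	       (long long)sb.sb_icount, (long long)sb.sb_ifree,
	       (long long)sb.sb_fdblocks);
	return 0;
}
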
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1d8eef9cf0f5..a56960dd1684 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1232,6 +1232,117 @@ xfs_vm_releasepage(
1232 return try_to_free_buffers(page); 1232 return try_to_free_buffers(page);
1233} 1233}
1234 1234
1235/*
1236 * When we map a DIO buffer, we may need to attach an ioend that describes the
1237 * type of write IO we are doing. This passes to the completion function the
1238 * operations it needs to perform. If the mapping is for an overwrite wholly
1239 * within the EOF then we don't need an ioend and so we don't allocate one.
1240 * This avoids the unnecessary overhead of allocating and freeing ioends for
1241 * workloads that don't require transactions on IO completion.
1242 *
1243 * If we get multiple mappings in a single IO, we might be mapping different
1244 * types. But because the direct IO can only have a single private pointer, we
1245 * need to ensure that:
1246 *
1247 * a) i) the ioend spans the entire region of unwritten mappings; or
1248 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1249 * b) if it contains unwritten extents, it is *permanently* marked as such
1250 *
1251 * We could do this by chaining ioends like buffered IO does, but we only
1252 * actually get one IO completion callback from the direct IO, and that spans
1253 * the entire IO regardless of how many mappings and IOs are needed to complete
1254 * the DIO. There is only going to be one reference to the ioend and its life
 1255 * cycle is constrained by the DIO completion code. Hence we don't need
1256 * reference counting here.
1257 */
1258static void
1259xfs_map_direct(
1260 struct inode *inode,
1261 struct buffer_head *bh_result,
1262 struct xfs_bmbt_irec *imap,
1263 xfs_off_t offset)
1264{
1265 struct xfs_ioend *ioend;
1266 xfs_off_t size = bh_result->b_size;
1267 int type;
1268
1269 if (ISUNWRITTEN(imap))
1270 type = XFS_IO_UNWRITTEN;
1271 else
1272 type = XFS_IO_OVERWRITE;
1273
1274 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1275
1276 if (bh_result->b_private) {
1277 ioend = bh_result->b_private;
1278 ASSERT(ioend->io_size > 0);
1279 ASSERT(offset >= ioend->io_offset);
1280 if (offset + size > ioend->io_offset + ioend->io_size)
1281 ioend->io_size = offset - ioend->io_offset + size;
1282
1283 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1284 ioend->io_type = XFS_IO_UNWRITTEN;
1285
1286 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1287 ioend->io_size, ioend->io_type,
1288 imap);
1289 } else if (type == XFS_IO_UNWRITTEN ||
1290 offset + size > i_size_read(inode)) {
1291 ioend = xfs_alloc_ioend(inode, type);
1292 ioend->io_offset = offset;
1293 ioend->io_size = size;
1294
1295 bh_result->b_private = ioend;
1296 set_buffer_defer_completion(bh_result);
1297
1298 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1299 imap);
1300 } else {
1301 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1302 imap);
1303 }
1304}
1305
1306/*
 1307 * If this is O_DIRECT or the mpage code calling, tell them how large the
 1308 * mapping is, so that we can avoid repeated get_blocks calls.
1309 *
1310 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1311 * for blocks beyond EOF must be marked new so that sub block regions can be
1312 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1313 * was just allocated or is unwritten, otherwise the callers would overwrite
1314 * existing data with zeros. Hence we have to split the mapping into a range up
1315 * to and including EOF, and a second mapping for beyond EOF.
1316 */
1317static void
1318xfs_map_trim_size(
1319 struct inode *inode,
1320 sector_t iblock,
1321 struct buffer_head *bh_result,
1322 struct xfs_bmbt_irec *imap,
1323 xfs_off_t offset,
1324 ssize_t size)
1325{
1326 xfs_off_t mapping_size;
1327
1328 mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1329 mapping_size <<= inode->i_blkbits;
1330
1331 ASSERT(mapping_size > 0);
1332 if (mapping_size > size)
1333 mapping_size = size;
1334 if (offset < i_size_read(inode) &&
1335 offset + mapping_size >= i_size_read(inode)) {
1336 /* limit mapping to block that spans EOF */
1337 mapping_size = roundup_64(i_size_read(inode) - offset,
1338 1 << inode->i_blkbits);
1339 }
1340 if (mapping_size > LONG_MAX)
1341 mapping_size = LONG_MAX;
1342
1343 bh_result->b_size = mapping_size;
1344}
1345
1235STATIC int 1346STATIC int
1236__xfs_get_blocks( 1347__xfs_get_blocks(
1237 struct inode *inode, 1348 struct inode *inode,
@@ -1320,31 +1431,37 @@ __xfs_get_blocks(
1320 1431
1321 xfs_iunlock(ip, lockmode); 1432 xfs_iunlock(ip, lockmode);
1322 } 1433 }
1323 1434 trace_xfs_get_blocks_alloc(ip, offset, size,
1324 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); 1435 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1436 : XFS_IO_DELALLOC, &imap);
1325 } else if (nimaps) { 1437 } else if (nimaps) {
1326 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); 1438 trace_xfs_get_blocks_found(ip, offset, size,
1439 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1440 : XFS_IO_OVERWRITE, &imap);
1327 xfs_iunlock(ip, lockmode); 1441 xfs_iunlock(ip, lockmode);
1328 } else { 1442 } else {
1329 trace_xfs_get_blocks_notfound(ip, offset, size); 1443 trace_xfs_get_blocks_notfound(ip, offset, size);
1330 goto out_unlock; 1444 goto out_unlock;
1331 } 1445 }
1332 1446
1447 /* trim mapping down to size requested */
1448 if (direct || size > (1 << inode->i_blkbits))
1449 xfs_map_trim_size(inode, iblock, bh_result,
1450 &imap, offset, size);
1451
1452 /*
1453 * For unwritten extents do not report a disk address in the buffered
1454 * read case (treat as if we're reading into a hole).
1455 */
1333 if (imap.br_startblock != HOLESTARTBLOCK && 1456 if (imap.br_startblock != HOLESTARTBLOCK &&
1334 imap.br_startblock != DELAYSTARTBLOCK) { 1457 imap.br_startblock != DELAYSTARTBLOCK &&
1335 /* 1458 (create || !ISUNWRITTEN(&imap))) {
1336 * For unwritten extents do not report a disk address on 1459 xfs_map_buffer(inode, bh_result, &imap, offset);
1337 * the read case (treat as if we're reading into a hole). 1460 if (ISUNWRITTEN(&imap))
1338 */
1339 if (create || !ISUNWRITTEN(&imap))
1340 xfs_map_buffer(inode, bh_result, &imap, offset);
1341 if (create && ISUNWRITTEN(&imap)) {
1342 if (direct) {
1343 bh_result->b_private = inode;
1344 set_buffer_defer_completion(bh_result);
1345 }
1346 set_buffer_unwritten(bh_result); 1461 set_buffer_unwritten(bh_result);
1347 } 1462 /* direct IO needs special help */
1463 if (create && direct)
1464 xfs_map_direct(inode, bh_result, &imap, offset);
1348 } 1465 }
1349 1466
1350 /* 1467 /*
@@ -1377,39 +1494,6 @@ __xfs_get_blocks(
1377 } 1494 }
1378 } 1495 }
1379 1496
1380 /*
1381 * If this is O_DIRECT or the mpage code calling tell them how large
1382 * the mapping is, so that we can avoid repeated get_blocks calls.
1383 *
1384 * If the mapping spans EOF, then we have to break the mapping up as the
1385 * mapping for blocks beyond EOF must be marked new so that sub block
1386 * regions can be correctly zeroed. We can't do this for mappings within
1387 * EOF unless the mapping was just allocated or is unwritten, otherwise
1388 * the callers would overwrite existing data with zeros. Hence we have
1389 * to split the mapping into a range up to and including EOF, and a
1390 * second mapping for beyond EOF.
1391 */
1392 if (direct || size > (1 << inode->i_blkbits)) {
1393 xfs_off_t mapping_size;
1394
1395 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1396 mapping_size <<= inode->i_blkbits;
1397
1398 ASSERT(mapping_size > 0);
1399 if (mapping_size > size)
1400 mapping_size = size;
1401 if (offset < i_size_read(inode) &&
1402 offset + mapping_size >= i_size_read(inode)) {
1403 /* limit mapping to block that spans EOF */
1404 mapping_size = roundup_64(i_size_read(inode) - offset,
1405 1 << inode->i_blkbits);
1406 }
1407 if (mapping_size > LONG_MAX)
1408 mapping_size = LONG_MAX;
1409
1410 bh_result->b_size = mapping_size;
1411 }
1412
1413 return 0; 1497 return 0;
1414 1498
1415out_unlock: 1499out_unlock:
@@ -1440,9 +1524,11 @@ xfs_get_blocks_direct(
1440/* 1524/*
1441 * Complete a direct I/O write request. 1525 * Complete a direct I/O write request.
1442 * 1526 *
1443 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1527 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1444 * need to issue a transaction to convert the range from unwritten to written 1528 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1445 * extents. 1529 * wholly within the EOF and so there is nothing for us to do. Note that in this
1530 * case the completion can be called in interrupt context, whereas if we have an
1531 * ioend we will always be called in task context (i.e. from a workqueue).
1446 */ 1532 */
1447STATIC void 1533STATIC void
1448xfs_end_io_direct_write( 1534xfs_end_io_direct_write(
@@ -1454,43 +1540,71 @@ xfs_end_io_direct_write(
1454 struct inode *inode = file_inode(iocb->ki_filp); 1540 struct inode *inode = file_inode(iocb->ki_filp);
1455 struct xfs_inode *ip = XFS_I(inode); 1541 struct xfs_inode *ip = XFS_I(inode);
1456 struct xfs_mount *mp = ip->i_mount; 1542 struct xfs_mount *mp = ip->i_mount;
1543 struct xfs_ioend *ioend = private;
1457 1544
1458 if (XFS_FORCED_SHUTDOWN(mp)) 1545 trace_xfs_gbmap_direct_endio(ip, offset, size,
1546 ioend ? ioend->io_type : 0, NULL);
1547
1548 if (!ioend) {
1549 ASSERT(offset + size <= i_size_read(inode));
1459 return; 1550 return;
1551 }
1552
1553 if (XFS_FORCED_SHUTDOWN(mp))
1554 goto out_end_io;
1460 1555
1461 /* 1556 /*
 1462 * While the generic direct I/O code updates the inode size, it does 1557 * DIO completion end_io functions are only called on writes if more
 1463 * so only after the end_io handler is called, which means our 1558 * than 0 bytes were written.
1464 * end_io handler thinks the on-disk size is outside the in-core
1465 * size. To prevent this just update it a little bit earlier here.
1466 */ 1559 */
1560 ASSERT(size > 0);
1561
1562 /*
1563 * The ioend only maps whole blocks, while the IO may be sector aligned.
1564 * Hence the ioend offset/size may not match the IO offset/size exactly.
1565 * Because we don't map overwrites within EOF into the ioend, the offset
1566 * may not match, but only if the endio spans EOF. Either way, write
1567 * the IO sizes into the ioend so that completion processing does the
1568 * right thing.
1569 */
1570 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1571 ioend->io_size = size;
1572 ioend->io_offset = offset;
1573
1574 /*
1575 * The ioend tells us whether we are doing unwritten extent conversion
1576 * or an append transaction that updates the on-disk file size. These
1577 * cases are the only cases where we should *potentially* be needing
1578 * to update the VFS inode size.
1579 *
1580 * We need to update the in-core inode size here so that we don't end up
1581 * with the on-disk inode size being outside the in-core inode size. We
1582 * have no other method of updating EOF for AIO, so always do it here
1583 * if necessary.
1584 *
1585 * We need to lock the test/set EOF update as we can be racing with
1586 * other IO completions here to update the EOF. Failing to serialise
1587 * here can result in EOF moving backwards and Bad Things Happen when
1588 * that occurs.
1589 */
1590 spin_lock(&ip->i_flags_lock);
1467 if (offset + size > i_size_read(inode)) 1591 if (offset + size > i_size_read(inode))
1468 i_size_write(inode, offset + size); 1592 i_size_write(inode, offset + size);
1593 spin_unlock(&ip->i_flags_lock);
1469 1594
1470 /* 1595 /*
1471 * For direct I/O we do not know if we need to allocate blocks or not, 1596 * If we are doing an append IO that needs to update the EOF on disk,
1472 * so we can't preallocate an append transaction, as that results in 1597 * do the transaction reserve now so we can use common end io
1473 * nested reservations and log space deadlocks. Hence allocate the 1598 * processing. Stashing the error (if there is one) in the ioend will
1474 * transaction here. While this is sub-optimal and can block IO 1599 * result in the ioend processing passing on the error if it is
1475 * completion for some time, we're stuck with doing it this way until 1600 * possible as we can't return it from here.
1476 * we can pass the ioend to the direct IO allocation callbacks and
1477 * avoid nesting that way.
1478 */ 1601 */
1479 if (private && size > 0) { 1602 if (ioend->io_type == XFS_IO_OVERWRITE)
1480 xfs_iomap_write_unwritten(ip, offset, size); 1603 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1481 } else if (offset + size > ip->i_d.di_size) {
1482 struct xfs_trans *tp;
1483 int error;
1484
1485 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1486 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1487 if (error) {
1488 xfs_trans_cancel(tp, 0);
1489 return;
1490 }
1491 1604
1492 xfs_setfilesize(ip, tp, offset, size); 1605out_end_io:
1493 } 1606 xfs_end_io(&ioend->io_work);
1607 return;
1494} 1608}
1495 1609
1496STATIC ssize_t 1610STATIC ssize_t
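
Note: xfs_map_direct() keeps a single ioend alive across all the mappings of one direct IO and simply widens it, which is why no reference counting is needed. The size update is plain span arithmetic; a self-contained sketch of the extension rule used above (struct span is an invented stand-in for struct xfs_ioend):

#include <assert.h>
#include <stdio.h>

/* Invented stand-in for struct xfs_ioend: one [offset, offset+size) span. */
struct span {
	long long offset;
	long long size;
};

/*
 * Mirror of the update in xfs_map_direct(): widen the span so it still
 * covers a new mapping at (offset, size); offsets only ever grow.
 */
static void span_extend(struct span *s, long long offset, long long size)
{
	assert(size > 0 && offset >= s->offset);
	if (offset + size > s->offset + s->size)
		s->size = offset - s->offset + size;
}

int main(void)
{
	struct span ioend = { .offset = 0, .size = 4096 };

	span_extend(&ioend, 4096, 8192);	/* second mapping of the DIO */
	printf("span: [%lld, %lld)\n", ioend.offset,
	       ioend.offset + ioend.size);
	return 0;
}
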
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 83af4c149635..f9c1c64782d3 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive(
132 int size; 132 int size;
133 int tmp; 133 int tmp;
134 int i; 134 int i;
135 struct xfs_mount *mp = bp->b_target->bt_mount;
135 136
136 leaf = bp->b_addr; 137 leaf = bp->b_addr;
137 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 138 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
138 139
139 /* 140 /*
140 * Count the number of "remote" value extents. 141 * Count the number of "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a43d370d2c58..65fb37a18e92 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
225 int error, i; 225 int error, i;
226 struct xfs_buf *bp; 226 struct xfs_buf *bp;
227 struct xfs_inode *dp = context->dp; 227 struct xfs_inode *dp = context->dp;
228 struct xfs_mount *mp = dp->i_mount;
228 229
229 trace_xfs_attr_node_list(context); 230 trace_xfs_attr_node_list(context);
230 231
@@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
256 case XFS_ATTR_LEAF_MAGIC: 257 case XFS_ATTR_LEAF_MAGIC:
257 case XFS_ATTR3_LEAF_MAGIC: 258 case XFS_ATTR3_LEAF_MAGIC:
258 leaf = bp->b_addr; 259 leaf = bp->b_addr;
259 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 260 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
261 &leafhdr, leaf);
260 entries = xfs_attr3_leaf_entryp(leaf); 262 entries = xfs_attr3_leaf_entryp(leaf);
261 if (cursor->hashval > be32_to_cpu( 263 if (cursor->hashval > be32_to_cpu(
262 entries[leafhdr.count - 1].hashval)) { 264 entries[leafhdr.count - 1].hashval)) {
@@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
340 xfs_trans_brelse(NULL, bp); 342 xfs_trans_brelse(NULL, bp);
341 return error; 343 return error;
342 } 344 }
343 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 345 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
344 if (context->seen_enough || leafhdr.forw == 0) 346 if (context->seen_enough || leafhdr.forw == 0)
345 break; 347 break;
346 cursor->blkno = leafhdr.forw; 348 cursor->blkno = leafhdr.forw;
@@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int(
368 struct xfs_attr_leaf_entry *entry; 370 struct xfs_attr_leaf_entry *entry;
369 int retval; 371 int retval;
370 int i; 372 int i;
373 struct xfs_mount *mp = context->dp->i_mount;
371 374
372 trace_xfs_attr_list_leaf(context); 375 trace_xfs_attr_list_leaf(context);
373 376
374 leaf = bp->b_addr; 377 leaf = bp->b_addr;
375 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 378 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
376 entries = xfs_attr3_leaf_entryp(leaf); 379 entries = xfs_attr3_leaf_entryp(leaf);
377 380
378 cursor = context->cursor; 381 cursor = context->cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 22a5dcb70b32..a52bbd3abc7d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1376,22 +1376,19 @@ out:
1376} 1376}
1377 1377
1378/* 1378/*
1379 * xfs_collapse_file_space() 1379 * @next_fsb will keep track of the extent currently undergoing shift.
1380 * This routine frees disk space and shift extent for the given file. 1380 * @stop_fsb will keep track of the extent at which we have to stop.
1381 * The first thing we do is to free data blocks in the specified range 1381 * If we are shifting left, we will start with block (offset + len) and
 1382 * by calling xfs_free_file_space(). It would also sync dirty data 1382 * shift each extent until the last extent.
 1383 * and invalidate page cache over the region on which collapse range 1383 * If we are shifting right, we will start with the last extent inside file space
1384 * is working. And Shift extent records to the left to cover a hole. 1384 * and continue until we reach the block corresponding to offset.
1385 * RETURNS:
1386 * 0 on success
1387 * errno on error
1388 *
1389 */ 1385 */
1390int 1386static int
1391xfs_collapse_file_space( 1387xfs_shift_file_space(
1392 struct xfs_inode *ip, 1388 struct xfs_inode *ip,
1393 xfs_off_t offset, 1389 xfs_off_t offset,
1394 xfs_off_t len) 1390 xfs_off_t len,
1391 enum shift_direction direction)
1395{ 1392{
1396 int done = 0; 1393 int done = 0;
1397 struct xfs_mount *mp = ip->i_mount; 1394 struct xfs_mount *mp = ip->i_mount;
@@ -1400,21 +1397,26 @@ xfs_collapse_file_space(
1400 struct xfs_bmap_free free_list; 1397 struct xfs_bmap_free free_list;
1401 xfs_fsblock_t first_block; 1398 xfs_fsblock_t first_block;
1402 int committed; 1399 int committed;
1403 xfs_fileoff_t start_fsb; 1400 xfs_fileoff_t stop_fsb;
1404 xfs_fileoff_t next_fsb; 1401 xfs_fileoff_t next_fsb;
1405 xfs_fileoff_t shift_fsb; 1402 xfs_fileoff_t shift_fsb;
1406 1403
1407 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1404 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1408 1405
1409 trace_xfs_collapse_file_space(ip); 1406 if (direction == SHIFT_LEFT) {
1407 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1408 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1409 } else {
1410 /*
1411 * If right shift, delegate the work of initialization of
1412 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1413 */
1414 next_fsb = NULLFSBLOCK;
1415 stop_fsb = XFS_B_TO_FSB(mp, offset);
1416 }
1410 1417
1411 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1412 shift_fsb = XFS_B_TO_FSB(mp, len); 1418 shift_fsb = XFS_B_TO_FSB(mp, len);
1413 1419
1414 error = xfs_free_file_space(ip, offset, len);
1415 if (error)
1416 return error;
1417
1418 /* 1420 /*
1419 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation 1421 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1420 * into the accessible region of the file. 1422 * into the accessible region of the file.
@@ -1427,20 +1429,28 @@ xfs_collapse_file_space(
1427 1429
1428 /* 1430 /*
1429 * Writeback and invalidate cache for the remainder of the file as we're 1431 * Writeback and invalidate cache for the remainder of the file as we're
1430 * about to shift down every extent from the collapse range to EOF. The 1432 * about to shift down every extent from offset to EOF.
1431 * free of the collapse range above might have already done some of
1432 * this, but we shouldn't rely on it to do anything outside of the range
1433 * that was freed.
1434 */ 1433 */
1435 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1434 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1436 offset + len, -1); 1435 offset, -1);
1437 if (error) 1436 if (error)
1438 return error; 1437 return error;
1439 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 1438 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1440 (offset + len) >> PAGE_CACHE_SHIFT, -1); 1439 offset >> PAGE_CACHE_SHIFT, -1);
1441 if (error) 1440 if (error)
1442 return error; 1441 return error;
1443 1442
1443 /*
 1444 * The extent shifting code works on extent granularity. So, if
 1445 * stop_fsb is not the starting block of an extent, we need to split
1446 * the extent at stop_fsb.
1447 */
1448 if (direction == SHIFT_RIGHT) {
1449 error = xfs_bmap_split_extent(ip, stop_fsb);
1450 if (error)
1451 return error;
1452 }
1453
1444 while (!error && !done) { 1454 while (!error && !done) {
1445 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1455 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1446 /* 1456 /*
@@ -1464,7 +1474,7 @@ xfs_collapse_file_space(
1464 if (error) 1474 if (error)
1465 goto out; 1475 goto out;
1466 1476
1467 xfs_trans_ijoin(tp, ip, 0); 1477 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1468 1478
1469 xfs_bmap_init(&free_list, &first_block); 1479 xfs_bmap_init(&free_list, &first_block);
1470 1480
@@ -1472,10 +1482,9 @@ xfs_collapse_file_space(
1472 * We are using the write transaction in which max 2 bmbt 1482 * We are using the write transaction in which max 2 bmbt
1473 * updates are allowed 1483 * updates are allowed
1474 */ 1484 */
1475 start_fsb = next_fsb; 1485 error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1476 error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, 1486 &done, stop_fsb, &first_block, &free_list,
1477 &done, &next_fsb, &first_block, &free_list, 1487 direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1478 XFS_BMAP_MAX_SHIFT_EXTENTS);
1479 if (error) 1488 if (error)
1480 goto out; 1489 goto out;
1481 1490
@@ -1484,18 +1493,70 @@ xfs_collapse_file_space(
1484 goto out; 1493 goto out;
1485 1494
1486 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1495 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1487 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1488 } 1496 }
1489 1497
1490 return error; 1498 return error;
1491 1499
1492out: 1500out:
1493 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1501 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1494 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1495 return error; 1502 return error;
1496} 1503}
1497 1504
1498/* 1505/*
1506 * xfs_collapse_file_space()
1507 * This routine frees disk space and shift extent for the given file.
1508 * The first thing we do is to free data blocks in the specified range
1509 * by calling xfs_free_file_space(). It would also sync dirty data
1510 * and invalidate page cache over the region on which collapse range
1511 * is working. And Shift extent records to the left to cover a hole.
1512 * RETURNS:
1513 * 0 on success
1514 * errno on error
1515 *
1516 */
1517int
1518xfs_collapse_file_space(
1519 struct xfs_inode *ip,
1520 xfs_off_t offset,
1521 xfs_off_t len)
1522{
1523 int error;
1524
1525 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1526 trace_xfs_collapse_file_space(ip);
1527
1528 error = xfs_free_file_space(ip, offset, len);
1529 if (error)
1530 return error;
1531
1532 return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1533}
1534
1535/*
1536 * xfs_insert_file_space()
1537 * This routine create hole space by shifting extents for the given file.
1538 * The first thing we do is to sync dirty data and invalidate page cache
1539 * over the region on which insert range is working. And split an extent
1540 * to two extents at given offset by calling xfs_bmap_split_extent.
1541 * And shift all extent records which are laying between [offset,
1542 * last allocated extent] to the right to reserve hole range.
1543 * RETURNS:
1544 * 0 on success
1545 * errno on error
1546 */
1547int
1548xfs_insert_file_space(
1549 struct xfs_inode *ip,
1550 loff_t offset,
1551 loff_t len)
1552{
1553 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1554 trace_xfs_insert_file_space(ip);
1555
1556 return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1557}
1558
1559/*
1499 * We need to check that the format of the data fork in the temporary inode is 1560 * We need to check that the format of the data fork in the temporary inode is
1500 * valid for the target inode before doing the swap. This is not a problem with 1561 * valid for the target inode before doing the swap. This is not a problem with
1501 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1562 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1660,6 @@ xfs_swap_extent_flush(
1599 /* Verify O_DIRECT for ftmp */ 1660 /* Verify O_DIRECT for ftmp */
1600 if (VFS_I(ip)->i_mapping->nrpages) 1661 if (VFS_I(ip)->i_mapping->nrpages)
1601 return -EINVAL; 1662 return -EINVAL;
1602
1603 /*
1604 * Don't try to swap extents on mmap()d files because we can't lock
1605 * out races against page faults safely.
1606 */
1607 if (mapping_mapped(VFS_I(ip)->i_mapping))
1608 return -EBUSY;
1609 return 0; 1663 return 0;
1610} 1664}
1611 1665
@@ -1633,13 +1687,14 @@ xfs_swap_extents(
1633 } 1687 }
1634 1688
1635 /* 1689 /*
1636 * Lock up the inodes against other IO and truncate to begin with. 1690 * Lock the inodes against other IO, page faults and truncate to
1637 * Then we can ensure the inodes are flushed and have no page cache 1691 * begin with. Then we can ensure the inodes are flushed and have no
1638 * safely. Once we have done this we can take the ilocks and do the rest 1692 * page cache safely. Once we have done this we can take the ilocks and
1639 * of the checks. 1693 * do the rest of the checks.
1640 */ 1694 */
1641 lock_flags = XFS_IOLOCK_EXCL; 1695 lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1642 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1696 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1697 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1643 1698
1644 /* Verify that both files have the same format */ 1699 /* Verify that both files have the same format */
1645 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1700 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1721,16 @@ xfs_swap_extents(
1666 xfs_trans_cancel(tp, 0); 1721 xfs_trans_cancel(tp, 0);
1667 goto out_unlock; 1722 goto out_unlock;
1668 } 1723 }
1724
1725 /*
 1726 * Lock and join the inodes to the transaction so that transaction commit
1727 * or cancel will unlock the inodes from this point onwards.
1728 */
1669 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 1729 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1670 lock_flags |= XFS_ILOCK_EXCL; 1730 lock_flags |= XFS_ILOCK_EXCL;
1731 xfs_trans_ijoin(tp, ip, lock_flags);
1732 xfs_trans_ijoin(tp, tip, lock_flags);
1733
1671 1734
1672 /* Verify all data are being swapped */ 1735 /* Verify all data are being swapped */
1673 if (sxp->sx_offset != 0 || 1736 if (sxp->sx_offset != 0 ||
@@ -1720,9 +1783,6 @@ xfs_swap_extents(
1720 goto out_trans_cancel; 1783 goto out_trans_cancel;
1721 } 1784 }
1722 1785
1723 xfs_trans_ijoin(tp, ip, lock_flags);
1724 xfs_trans_ijoin(tp, tip, lock_flags);
1725
1726 /* 1786 /*
1727 * Before we've swapped the forks, lets set the owners of the forks 1787 * Before we've swapped the forks, lets set the owners of the forks
1728 * appropriately. We have to do this as we are demand paging the btree 1788 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1916,5 @@ out_unlock:
1856 1916
1857out_trans_cancel: 1917out_trans_cancel:
1858 xfs_trans_cancel(tp, 0); 1918 xfs_trans_cancel(tp, 0);
1859 goto out_unlock; 1919 goto out;
1860} 1920}
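
Note: xfs_shift_file_space() derives its start and stop blocks from the shift direction before entering the transaction loop: collapse walks left from (offset + len) to EOF, insert walks right from the last extent down to offset. A small sketch of just that selection, with the byte-to-block conversion simplified to an assumed 4k block size:

#include <stdio.h>

enum shift_direction { SHIFT_LEFT, SHIFT_RIGHT };

#define FSB_SHIFT	12		/* assume 4k filesystem blocks */
#define B_TO_FSB(b)	((b) >> FSB_SHIFT)
#define NULLFSBLOCK	(-1LL)

int main(void)
{
	long long offset = 1 << 20, len = 1 << 16, isize = 8 << 20;
	long long next_fsb, stop_fsb;
	enum shift_direction direction = SHIFT_LEFT;

	if (direction == SHIFT_LEFT) {
		/* Collapse: start just past the hole, stop at current EOF. */
		next_fsb = B_TO_FSB(offset + len);
		stop_fsb = B_TO_FSB(isize);
	} else {
		/* Insert: the real code leaves next_fsb for
		 * xfs_bmap_shift_extents() to initialise with the ilock
		 * held; the walk stops at the block for offset. */
		next_fsb = NULLFSBLOCK;
		stop_fsb = B_TO_FSB(offset);
	}
	printf("next_fsb=%lld stop_fsb=%lld\n", next_fsb, stop_fsb);
	return 0;
}
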
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 736429a72a12..af97d9a1dfb4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
63 xfs_off_t len); 63 xfs_off_t len);
64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, 64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
65 xfs_off_t len); 65 xfs_off_t len);
66int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
67 xfs_off_t len);
66 68
67/* EOF block manipulation functions */ 69/* EOF block manipulation functions */
68bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 70bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 507d96a57ac7..092d652bc03d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -537,9 +537,9 @@ xfs_buf_item_push(
537 537
538 /* has a previous flush failed due to IO errors? */ 538 /* has a previous flush failed due to IO errors? */
539 if ((bp->b_flags & XBF_WRITE_FAIL) && 539 if ((bp->b_flags & XBF_WRITE_FAIL) &&
540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { 540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
541 xfs_warn(bp->b_target->bt_mount, 541 xfs_warn(bp->b_target->bt_mount,
542"Detected failing async write on buffer block 0x%llx. Retrying async write.", 542"Failing async write on buffer block 0x%llx. Retrying async write.",
543 (long long)bp->b_bn); 543 (long long)bp->b_bn);
544 } 544 }
545 545
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 799e5a2d334d..e85a9519a5ae 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -84,7 +84,7 @@ xfs_trim_extents(
84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
85 if (error) 85 if (error)
86 goto out_del_cursor; 86 goto out_del_cursor;
87 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); 87 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); 88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
89 89
90 /* 90 /*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 3ee186ac1093..338e50bbfd1e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -131,7 +131,7 @@ xfs_error_report(
131{ 131{
132 if (level <= xfs_error_level) { 132 if (level <= xfs_error_level) {
133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
134 "Internal error %s at line %d of file %s. Caller %pF", 134 "Internal error %s at line %d of file %s. Caller %pS",
135 tag, linenum, filename, ra); 135 tag, linenum, filename, ra);
136 136
137 xfs_stack_trace(); 137 xfs_stack_trace();
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 279a76e52791..c0394ed126fc 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
40/* 40/*
41 * Macros to set EFSCORRUPTED & return/branch. 41 * Macros to set EFSCORRUPTED & return/branch.
42 */ 42 */
43#define XFS_WANT_CORRUPTED_GOTO(x,l) \ 43#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
44 { \ 44 { \
45 int fs_is_ok = (x); \ 45 int fs_is_ok = (x); \
46 ASSERT(fs_is_ok); \ 46 ASSERT(fs_is_ok); \
47 if (unlikely(!fs_is_ok)) { \ 47 if (unlikely(!fs_is_ok)) { \
48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ 48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
49 XFS_ERRLEVEL_LOW, NULL); \ 49 XFS_ERRLEVEL_LOW, mp); \
50 error = -EFSCORRUPTED; \ 50 error = -EFSCORRUPTED; \
51 goto l; \ 51 goto l; \
52 } \ 52 } \
53 } 53 }
54 54
55#define XFS_WANT_CORRUPTED_RETURN(x) \ 55#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
56 { \ 56 { \
57 int fs_is_ok = (x); \ 57 int fs_is_ok = (x); \
58 ASSERT(fs_is_ok); \ 58 ASSERT(fs_is_ok); \
59 if (unlikely(!fs_is_ok)) { \ 59 if (unlikely(!fs_is_ok)) { \
60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ 60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
61 XFS_ERRLEVEL_LOW, NULL); \ 61 XFS_ERRLEVEL_LOW, mp); \
62 return -EFSCORRUPTED; \ 62 return -EFSCORRUPTED; \
63 } \ 63 } \
64 } 64 }
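
Note: the reworked macros keep their original contract — the caller must have an int error variable and, for the GOTO form, a label in scope — and only gain the mount pointer for the report. A compilable sketch of the same macro shape outside the kernel (the report function and error value are stand-ins):

#include <stdio.h>

#define EFSCORRUPTED 117	/* stand-in value */

struct mount {
	const char *name;
};

static void error_report(const char *tag, const struct mount *mp)
{
	fprintf(stderr, "%s: internal error %s\n", mp ? mp->name : "?", tag);
}

/* Same shape as XFS_WANT_CORRUPTED_GOTO: evaluate once, report with the
 * mount context, set the caller's 'error' and branch to its label. */
#define WANT_CORRUPTED_GOTO(mp, x, l)			\
	do {						\
		int fs_is_ok = (x);			\
		if (!fs_is_ok) {			\
			error_report("corruption", (mp));\
			error = -EFSCORRUPTED;		\
			goto l;				\
		}					\
	} while (0)

static int check_rec(const struct mount *mp, int nrecs)
{
	int error = 0;

	WANT_CORRUPTED_GOTO(mp, nrecs == 1, out);
	return 0;
out:
	return error;
}

int main(void)
{
	struct mount mp = { "demo" };

	printf("ok=%d bad=%d\n", check_rec(&mp, 1), check_rec(&mp, 2));
	return 0;
}
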
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f12ad0a8585..8121e75352ee 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart:
559 if (error <= 0) 559 if (error <= 0)
560 return error; 560 return error;
561 561
562 error = xfs_break_layouts(inode, iolock); 562 error = xfs_break_layouts(inode, iolock, true);
563 if (error) 563 if (error)
564 return error; 564 return error;
565 565
@@ -569,21 +569,42 @@ restart:
569 * write. If zeroing is needed and we are currently holding the 569 * write. If zeroing is needed and we are currently holding the
570 * iolock shared, we need to update it to exclusive which implies 570 * iolock shared, we need to update it to exclusive which implies
571 * having to redo all checks before. 571 * having to redo all checks before.
572 *
573 * We need to serialise against EOF updates that occur in IO
574 * completions here. We want to make sure that nobody is changing the
575 * size while we do this check until we have placed an IO barrier (i.e.
576 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
577 * The spinlock effectively forms a memory barrier once we have the
578 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
579 * and hence be able to correctly determine if we need to run zeroing.
572 */ 580 */
581 spin_lock(&ip->i_flags_lock);
573 if (iocb->ki_pos > i_size_read(inode)) { 582 if (iocb->ki_pos > i_size_read(inode)) {
574 bool zero = false; 583 bool zero = false;
575 584
585 spin_unlock(&ip->i_flags_lock);
576 if (*iolock == XFS_IOLOCK_SHARED) { 586 if (*iolock == XFS_IOLOCK_SHARED) {
577 xfs_rw_iunlock(ip, *iolock); 587 xfs_rw_iunlock(ip, *iolock);
578 *iolock = XFS_IOLOCK_EXCL; 588 *iolock = XFS_IOLOCK_EXCL;
579 xfs_rw_ilock(ip, *iolock); 589 xfs_rw_ilock(ip, *iolock);
580 iov_iter_reexpand(from, count); 590 iov_iter_reexpand(from, count);
591
592 /*
593 * We now have an IO submission barrier in place, but
594 * AIO can do EOF updates during IO completion and hence
595 * we now need to wait for all of them to drain. Non-AIO
596 * DIO will have drained before we are given the
597 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
598 * no-op.
599 */
600 inode_dio_wait(inode);
581 goto restart; 601 goto restart;
582 } 602 }
583 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); 603 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
584 if (error) 604 if (error)
585 return error; 605 return error;
586 } 606 } else
607 spin_unlock(&ip->i_flags_lock);
587 608
588 /* 609 /*
589 * Updating the timestamps will grab the ilock again from 610 * Updating the timestamps will grab the ilock again from
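
Note: the hunk above samples EOF under ip->i_flags_lock and, if zeroing is needed while only the shared iolock is held, upgrades to exclusive and redoes every check. Reduced to its shape this is a check/upgrade/retry loop; a pseudocode-level sketch using pthread primitives as stand-ins for the XFS locks:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;
static long long i_size = 4096;

/* Mirrors the restart pattern above: sample EOF under the spinlock; if
 * the write starts beyond it with only the shared lock held, upgrade to
 * exclusive and redo every check from the top. */
static void write_checks(long long pos, int *excl)
{
restart:
	pthread_mutex_lock(&flags_lock);
	if (pos > i_size) {
		pthread_mutex_unlock(&flags_lock);
		if (!*excl) {
			pthread_rwlock_unlock(&iolock);
			pthread_rwlock_wrlock(&iolock);
			*excl = 1;
			/* the kernel also drains in-flight AIO here */
			goto restart;
		}
		/* zero the range [i_size, pos) before allowing the write */
	} else {
		pthread_mutex_unlock(&flags_lock);
	}
}

int main(void)
{
	int excl = 0;

	pthread_rwlock_rdlock(&iolock);
	write_checks(8192, &excl);
	pthread_rwlock_unlock(&iolock);
	printf("excl=%d\n", excl);
	return 0;
}
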
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write(
645 int iolock; 666 int iolock;
646 size_t count = iov_iter_count(from); 667 size_t count = iov_iter_count(from);
647 loff_t pos = iocb->ki_pos; 668 loff_t pos = iocb->ki_pos;
669 loff_t end;
670 struct iov_iter data;
648 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 671 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
649 mp->m_rtdev_targp : mp->m_ddev_targp; 672 mp->m_rtdev_targp : mp->m_ddev_targp;
650 673
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write(
685 goto out; 708 goto out;
686 count = iov_iter_count(from); 709 count = iov_iter_count(from);
687 pos = iocb->ki_pos; 710 pos = iocb->ki_pos;
711 end = pos + count - 1;
688 712
689 if (mapping->nrpages) { 713 if (mapping->nrpages) {
690 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 714 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
691 pos, pos + count - 1); 715 pos, end);
692 if (ret) 716 if (ret)
693 goto out; 717 goto out;
694 /* 718 /*
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write(
698 */ 722 */
699 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 723 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
700 pos >> PAGE_CACHE_SHIFT, 724 pos >> PAGE_CACHE_SHIFT,
701 (pos + count - 1) >> PAGE_CACHE_SHIFT); 725 end >> PAGE_CACHE_SHIFT);
702 WARN_ON_ONCE(ret); 726 WARN_ON_ONCE(ret);
703 ret = 0; 727 ret = 0;
704 } 728 }
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write(
715 } 739 }
716 740
717 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 741 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
718 ret = generic_file_direct_write(iocb, from, pos);
719 742
743 data = *from;
744 ret = mapping->a_ops->direct_IO(iocb, &data, pos);
745
746 /* see generic_file_direct_write() for why this is necessary */
747 if (mapping->nrpages) {
748 invalidate_inode_pages2_range(mapping,
749 pos >> PAGE_CACHE_SHIFT,
750 end >> PAGE_CACHE_SHIFT);
751 }
752
753 if (ret > 0) {
754 pos += ret;
755 iov_iter_advance(from, ret);
756 iocb->ki_pos = pos;
757 }
720out: 758out:
721 xfs_rw_iunlock(ip, iolock); 759 xfs_rw_iunlock(ip, iolock);
722 760
@@ -822,6 +860,11 @@ xfs_file_write_iter(
822 return ret; 860 return ret;
823} 861}
824 862
863#define XFS_FALLOC_FL_SUPPORTED \
864 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
865 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
866 FALLOC_FL_INSERT_RANGE)
867
825STATIC long 868STATIC long
826xfs_file_fallocate( 869xfs_file_fallocate(
827 struct file *file, 870 struct file *file,
@@ -835,18 +878,21 @@ xfs_file_fallocate(
835 enum xfs_prealloc_flags flags = 0; 878 enum xfs_prealloc_flags flags = 0;
836 uint iolock = XFS_IOLOCK_EXCL; 879 uint iolock = XFS_IOLOCK_EXCL;
837 loff_t new_size = 0; 880 loff_t new_size = 0;
881 bool do_file_insert = 0;
838 882
839 if (!S_ISREG(inode->i_mode)) 883 if (!S_ISREG(inode->i_mode))
840 return -EINVAL; 884 return -EINVAL;
841 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 885 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
842 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
843 return -EOPNOTSUPP; 886 return -EOPNOTSUPP;
844 887
845 xfs_ilock(ip, iolock); 888 xfs_ilock(ip, iolock);
846 error = xfs_break_layouts(inode, &iolock); 889 error = xfs_break_layouts(inode, &iolock, false);
847 if (error) 890 if (error)
848 goto out_unlock; 891 goto out_unlock;
849 892
893 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
894 iolock |= XFS_MMAPLOCK_EXCL;
895
850 if (mode & FALLOC_FL_PUNCH_HOLE) { 896 if (mode & FALLOC_FL_PUNCH_HOLE) {
851 error = xfs_free_file_space(ip, offset, len); 897 error = xfs_free_file_space(ip, offset, len);
852 if (error) 898 if (error)
@@ -873,6 +919,27 @@ xfs_file_fallocate(
873 error = xfs_collapse_file_space(ip, offset, len); 919 error = xfs_collapse_file_space(ip, offset, len);
874 if (error) 920 if (error)
875 goto out_unlock; 921 goto out_unlock;
922 } else if (mode & FALLOC_FL_INSERT_RANGE) {
923 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
924
925 new_size = i_size_read(inode) + len;
926 if (offset & blksize_mask || len & blksize_mask) {
927 error = -EINVAL;
928 goto out_unlock;
929 }
930
931 /* check the new inode size does not wrap through zero */
932 if (new_size > inode->i_sb->s_maxbytes) {
933 error = -EFBIG;
934 goto out_unlock;
935 }
936
937 /* Offset should be less than i_size */
938 if (offset >= i_size_read(inode)) {
939 error = -EINVAL;
940 goto out_unlock;
941 }
942 do_file_insert = 1;
876 } else { 943 } else {
877 flags |= XFS_PREALLOC_SET; 944 flags |= XFS_PREALLOC_SET;
878 945
@@ -907,8 +974,19 @@ xfs_file_fallocate(
907 iattr.ia_valid = ATTR_SIZE; 974 iattr.ia_valid = ATTR_SIZE;
908 iattr.ia_size = new_size; 975 iattr.ia_size = new_size;
909 error = xfs_setattr_size(ip, &iattr); 976 error = xfs_setattr_size(ip, &iattr);
977 if (error)
978 goto out_unlock;
910 } 979 }
911 980
981 /*
982 * Perform hole insertion now that the file size has been
983 * updated so that if we crash during the operation we don't
 984 * leave shifted extents past EOF and hence lose access to
 985 * the data contained within them.
986 */
987 if (do_file_insert)
988 error = xfs_insert_file_space(ip, offset, len);
989
912out_unlock: 990out_unlock:
913 xfs_iunlock(ip, iolock); 991 xfs_iunlock(ip, iolock);
914 return error; 992 return error;
@@ -997,20 +1075,6 @@ xfs_file_mmap(
997} 1075}
998 1076
999/* 1077/*
1000 * mmap()d file has taken write protection fault and is being made
1001 * writable. We can set the page state up correctly for a writable
1002 * page, which means we can do correct delalloc accounting (ENOSPC
1003 * checking!) and unwritten extent mapping.
1004 */
1005STATIC int
1006xfs_vm_page_mkwrite(
1007 struct vm_area_struct *vma,
1008 struct vm_fault *vmf)
1009{
1010 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
1011}
1012
1013/*
1014 * This type is designed to indicate the type of offset we would like 1078 * This type is designed to indicate the type of offset we would like
1015 * to search from page cache for xfs_seek_hole_data(). 1079 * to search from page cache for xfs_seek_hole_data().
1016 */ 1080 */
@@ -1385,6 +1449,55 @@ xfs_file_llseek(
1385 } 1449 }
1386} 1450}
1387 1451
1452/*
1453 * Locking for serialisation of IO during page faults. This results in a lock
1454 * ordering of:
1455 *
1456 * mmap_sem (MM)
1457 * i_mmap_lock (XFS - truncate serialisation)
1458 * page_lock (MM)
1459 * i_lock (XFS - extent map serialisation)
1460 */
1461STATIC int
1462xfs_filemap_fault(
1463 struct vm_area_struct *vma,
1464 struct vm_fault *vmf)
1465{
1466 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1467 int error;
1468
1469 trace_xfs_filemap_fault(ip);
1470
1471 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1472 error = filemap_fault(vma, vmf);
1473 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1474
1475 return error;
1476}
1477
1478/*
1479 * mmap()d file has taken write protection fault and is being made writable. We
1480 * can set the page state up correctly for a writable page, which means we can
1481 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1482 * mapping.
1483 */
1484STATIC int
1485xfs_filemap_page_mkwrite(
1486 struct vm_area_struct *vma,
1487 struct vm_fault *vmf)
1488{
1489 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1490 int error;
1491
1492 trace_xfs_filemap_page_mkwrite(ip);
1493
1494 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1495 error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
1496 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1497
1498 return error;
1499}
1500
1388const struct file_operations xfs_file_operations = { 1501const struct file_operations xfs_file_operations = {
1389 .llseek = xfs_file_llseek, 1502 .llseek = xfs_file_llseek,
1390 .read_iter = xfs_file_read_iter, 1503 .read_iter = xfs_file_read_iter,
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = {
1415}; 1528};
1416 1529
1417static const struct vm_operations_struct xfs_file_vm_ops = { 1530static const struct vm_operations_struct xfs_file_vm_ops = {
1418 .fault = filemap_fault, 1531 .fault = xfs_filemap_fault,
1419 .map_pages = filemap_map_pages, 1532 .map_pages = filemap_map_pages,
1420 .page_mkwrite = xfs_vm_page_mkwrite, 1533 .page_mkwrite = xfs_filemap_page_mkwrite,
1421}; 1534};
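
Note: with FALLOC_FL_INSERT_RANGE accepted by xfs_file_fallocate(), userspace can open a hole inside a file without rewriting the tail. A minimal usage sketch — the offset and length must be block aligned and the offset must sit below EOF, exactly as the new checks above require:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s file offset len\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * Shift everything from offset to EOF right by len, leaving a
	 * block-aligned hole. Fails with EINVAL if offset/len are not
	 * aligned or offset is at or beyond EOF, matching the checks in
	 * xfs_file_fallocate().
	 */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE,
		      atoll(argv[2]), atoll(argv[3])) < 0)
		perror("fallocate(FALLOC_FL_INSERT_RANGE)");
	close(fd);
	return 0;
}
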
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a2e86e8a0fea..8f9f854376c6 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -322,7 +322,7 @@ xfs_filestream_lookup_ag(
322 322
323 pip = xfs_filestream_get_parent(ip); 323 pip = xfs_filestream_get_parent(ip);
324 if (!pip) 324 if (!pip)
325 goto out; 325 return NULLAGNUMBER;
326 326
327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); 327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
328 if (mru) { 328 if (mru) {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 74efe5b760dc..cb7e8a29dfb6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -637,12 +637,13 @@ xfs_fs_counts(
637 xfs_mount_t *mp, 637 xfs_mount_t *mp,
638 xfs_fsop_counts_t *cnt) 638 xfs_fsop_counts_t *cnt)
639{ 639{
640 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 640 cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
641 cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
642 cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
643 XFS_ALLOC_SET_ASIDE(mp);
644
641 spin_lock(&mp->m_sb_lock); 645 spin_lock(&mp->m_sb_lock);
642 cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
643 cnt->freertx = mp->m_sb.sb_frextents; 646 cnt->freertx = mp->m_sb.sb_frextents;
644 cnt->freeino = mp->m_sb.sb_ifree;
645 cnt->allocino = mp->m_sb.sb_icount;
646 spin_unlock(&mp->m_sb_lock); 647 spin_unlock(&mp->m_sb_lock);
647 return 0; 648 return 0;
648} 649}
@@ -692,14 +693,9 @@ xfs_reserve_blocks(
692 * what to do. This means that the amount of free space can 693 * what to do. This means that the amount of free space can
693 * change while we do this, so we need to retry if we end up 694 * change while we do this, so we need to retry if we end up
694 * trying to reserve more space than is available. 695 * trying to reserve more space than is available.
695 *
696 * We also use the xfs_mod_incore_sb() interface so that we
697 * don't have to care about whether per cpu counter are
698 * enabled, disabled or even compiled in....
699 */ 696 */
700retry: 697retry:
701 spin_lock(&mp->m_sb_lock); 698 spin_lock(&mp->m_sb_lock);
702 xfs_icsb_sync_counters_locked(mp, 0);
703 699
704 /* 700 /*
705 * If our previous reservation was larger than the current value, 701 * If our previous reservation was larger than the current value,
@@ -716,7 +712,8 @@ retry:
716 } else { 712 } else {
717 __int64_t free; 713 __int64_t free;
718 714
719 free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 715 free = percpu_counter_sum(&mp->m_fdblocks) -
716 XFS_ALLOC_SET_ASIDE(mp);
720 if (!free) 717 if (!free)
721 goto out; /* ENOSPC and fdblks_delta = 0 */ 718 goto out; /* ENOSPC and fdblks_delta = 0 */
722 719
@@ -755,8 +752,7 @@ out:
755 * the extra reserve blocks from the reserve..... 752 * the extra reserve blocks from the reserve.....
756 */ 753 */
757 int error; 754 int error;
758 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 755 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
759 fdblks_delta, 0);
760 if (error == -ENOSPC) 756 if (error == -ENOSPC)
761 goto retry; 757 goto retry;
762 } 758 }
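
Note: xfs_reserve_blocks() computes free space from an exact counter sum and simply retries if the subsequent modification races to ENOSPC. Stripped of locking detail, that is an optimistic reserve loop; the counter and modify helper below are stand-ins for percpu_counter_sum() and xfs_mod_fdblocks():

#include <errno.h>
#include <stdio.h>

static long long fdblocks = 1000;	/* free block counter */
static long long set_aside = 100;	/* blocks never handed out */

/* Stand-in for xfs_mod_fdblocks(): refuse to go below zero. */
static int mod_fdblocks(long long delta)
{
	if (fdblocks + delta < 0)
		return -ENOSPC;
	fdblocks += delta;
	return 0;
}

int main(void)
{
	long long request = 2000, resblks = 0;
	long long free, delta;

retry:
	/* Exact read, like percpu_counter_sum() under m_sb_lock. */
	free = fdblocks - set_aside;
	delta = request - resblks;
	if (delta > free)
		delta = free;		/* take what is available */
	if (delta > 0 && mod_fdblocks(-delta) == -ENOSPC)
		goto retry;		/* raced with another allocator */
	resblks += delta;

	printf("reserved %lld of %lld blocks\n", resblks, request);
	return 0;
}
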
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9771b7ef62ed..76a9f2783282 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -439,11 +439,11 @@ again:
439 *ipp = ip; 439 *ipp = ip;
440 440
441 /* 441 /*
 442 * If we have a real type for an on-disk inode, we can set ops(&unlock) 442 * If we have a real type for an on-disk inode, we can set up the inode
443 * now. If it's a new inode being created, xfs_ialloc will handle it. 443 * now. If it's a new inode being created, xfs_ialloc will handle it.
444 */ 444 */
445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
446 xfs_setup_inode(ip); 446 xfs_setup_existing_inode(ip);
447 return 0; 447 return 0;
448 448
449out_error_or_again: 449out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6163767aa856..d6ebc85192b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
117} 117}
118 118
119/* 119/*
 120 * The xfs inode contains 2 locks: a multi-reader lock called the 120 * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock and
121 * i_iolock and a multi-reader lock called the i_lock. This routine 121 * the i_lock. This routine allows various combinations of the locks to be
122 * allows either or both of the locks to be obtained. 122 * obtained.
123 * 123 *
124 * The 2 locks should always be ordered so that the IO lock is 124 * The 3 locks should always be ordered so that the IO lock is obtained first,
125 * obtained first in order to prevent deadlock. 125 * the mmap lock second and the ilock last in order to prevent deadlock.
126 * 126 *
127 * ip -- the inode being locked 127 * Basic locking order:
128 * lock_flags -- this parameter indicates the inode's locks 128 *
129 * to be locked. It can be: 129 * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
130 * XFS_IOLOCK_SHARED, 130 *
131 * XFS_IOLOCK_EXCL, 131 * mmap_sem locking order:
132 * XFS_ILOCK_SHARED, 132 *
133 * XFS_ILOCK_EXCL, 133 * i_iolock -> page lock -> mmap_sem
134 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, 134 * mmap_sem -> i_mmap_lock -> page_lock
135 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, 135 *
 136 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, 136 * The difference in mmap_sem locking order means that we cannot hold the
 137 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL 137 * i_mmap_lock over syscall based read(2)/write(2) IO. These IO paths can
138 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
139 * in get_user_pages() to map the user pages into the kernel address space for
140 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
141 * page faults already hold the mmap_sem.
142 *
143 * Hence to serialise fully against both syscall and mmap based IO, we need to
144 * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
145 * taken in places where we need to invalidate the page cache in a race
146 * free manner (e.g. truncate, hole punch and other extent manipulation
147 * functions).
138 */ 148 */
139void 149void
140xfs_ilock( 150xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
150 */ 160 */
151 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 161 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
152 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 162 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
163 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
164 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
153 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 165 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
154 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 166 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
155 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 167 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
159 else if (lock_flags & XFS_IOLOCK_SHARED) 171 else if (lock_flags & XFS_IOLOCK_SHARED)
160 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); 172 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
161 173
174 if (lock_flags & XFS_MMAPLOCK_EXCL)
175 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
176 else if (lock_flags & XFS_MMAPLOCK_SHARED)
177 mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
178
162 if (lock_flags & XFS_ILOCK_EXCL) 179 if (lock_flags & XFS_ILOCK_EXCL)
163 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 180 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
164 else if (lock_flags & XFS_ILOCK_SHARED) 181 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
191 */ 208 */
192 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 209 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
193 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 210 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
211 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
212 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
194 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 213 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
195 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 214 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
196 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 215 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
202 if (!mrtryaccess(&ip->i_iolock)) 221 if (!mrtryaccess(&ip->i_iolock))
203 goto out; 222 goto out;
204 } 223 }
224
225 if (lock_flags & XFS_MMAPLOCK_EXCL) {
226 if (!mrtryupdate(&ip->i_mmaplock))
227 goto out_undo_iolock;
228 } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
229 if (!mrtryaccess(&ip->i_mmaplock))
230 goto out_undo_iolock;
231 }
232
205 if (lock_flags & XFS_ILOCK_EXCL) { 233 if (lock_flags & XFS_ILOCK_EXCL) {
206 if (!mrtryupdate(&ip->i_lock)) 234 if (!mrtryupdate(&ip->i_lock))
207 goto out_undo_iolock; 235 goto out_undo_mmaplock;
208 } else if (lock_flags & XFS_ILOCK_SHARED) { 236 } else if (lock_flags & XFS_ILOCK_SHARED) {
209 if (!mrtryaccess(&ip->i_lock)) 237 if (!mrtryaccess(&ip->i_lock))
210 goto out_undo_iolock; 238 goto out_undo_mmaplock;
211 } 239 }
212 return 1; 240 return 1;
213 241
214 out_undo_iolock: 242out_undo_mmaplock:
243 if (lock_flags & XFS_MMAPLOCK_EXCL)
244 mrunlock_excl(&ip->i_mmaplock);
245 else if (lock_flags & XFS_MMAPLOCK_SHARED)
246 mrunlock_shared(&ip->i_mmaplock);
247out_undo_iolock:
215 if (lock_flags & XFS_IOLOCK_EXCL) 248 if (lock_flags & XFS_IOLOCK_EXCL)
216 mrunlock_excl(&ip->i_iolock); 249 mrunlock_excl(&ip->i_iolock);
217 else if (lock_flags & XFS_IOLOCK_SHARED) 250 else if (lock_flags & XFS_IOLOCK_SHARED)
218 mrunlock_shared(&ip->i_iolock); 251 mrunlock_shared(&ip->i_iolock);
219 out: 252out:
220 return 0; 253 return 0;
221} 254}
222 255
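
The same trylock-and-unwind shape as xfs_ilock_nowait() above, sketched with pthread rwlocks (illustrative names, not the kernel implementation): each lock is tried in order, and a failure releases only what was already taken, in reverse order.

#include <pthread.h>
#include <stdbool.h>

static bool ilock_nowait(pthread_rwlock_t *iolock,
			 pthread_rwlock_t *mmaplock,
			 pthread_rwlock_t *ilock)
{
	if (pthread_rwlock_trywrlock(iolock))
		return false;			/* nothing held yet */
	if (pthread_rwlock_trywrlock(mmaplock))
		goto out_undo_iolock;
	if (pthread_rwlock_trywrlock(ilock))
		goto out_undo_mmaplock;
	return true;				/* all three held */

out_undo_mmaplock:
	pthread_rwlock_unlock(mmaplock);
out_undo_iolock:
	pthread_rwlock_unlock(iolock);
	return false;
}

int main(void)
{
	pthread_rwlock_t io = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t mmap_lk = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t meta = PTHREAD_RWLOCK_INITIALIZER;

	return ilock_nowait(&io, &mmap_lk, &meta) ? 0 : 1;
}
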
@@ -244,6 +277,8 @@ xfs_iunlock(
244 */ 277 */
245 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 278 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
246 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 279 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
280 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
281 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
247 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 282 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
248 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 283 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
249 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 284 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
254 else if (lock_flags & XFS_IOLOCK_SHARED) 289 else if (lock_flags & XFS_IOLOCK_SHARED)
255 mrunlock_shared(&ip->i_iolock); 290 mrunlock_shared(&ip->i_iolock);
256 291
292 if (lock_flags & XFS_MMAPLOCK_EXCL)
293 mrunlock_excl(&ip->i_mmaplock);
294 else if (lock_flags & XFS_MMAPLOCK_SHARED)
295 mrunlock_shared(&ip->i_mmaplock);
296
257 if (lock_flags & XFS_ILOCK_EXCL) 297 if (lock_flags & XFS_ILOCK_EXCL)
258 mrunlock_excl(&ip->i_lock); 298 mrunlock_excl(&ip->i_lock);
259 else if (lock_flags & XFS_ILOCK_SHARED) 299 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
271 xfs_inode_t *ip, 311 xfs_inode_t *ip,
272 uint lock_flags) 312 uint lock_flags)
273{ 313{
274 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); 314 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
275 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); 315 ASSERT((lock_flags &
316 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
276 317
277 if (lock_flags & XFS_ILOCK_EXCL) 318 if (lock_flags & XFS_ILOCK_EXCL)
278 mrdemote(&ip->i_lock); 319 mrdemote(&ip->i_lock);
320 if (lock_flags & XFS_MMAPLOCK_EXCL)
321 mrdemote(&ip->i_mmaplock);
279 if (lock_flags & XFS_IOLOCK_EXCL) 322 if (lock_flags & XFS_IOLOCK_EXCL)
280 mrdemote(&ip->i_iolock); 323 mrdemote(&ip->i_iolock);
281 324
@@ -294,6 +337,12 @@ xfs_isilocked(
294 return rwsem_is_locked(&ip->i_lock.mr_lock); 337 return rwsem_is_locked(&ip->i_lock.mr_lock);
295 } 338 }
296 339
340 if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
341 if (!(lock_flags & XFS_MMAPLOCK_SHARED))
342 return !!ip->i_mmaplock.mr_writer;
343 return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
344 }
345
297 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { 346 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
298 if (!(lock_flags & XFS_IOLOCK_SHARED)) 347 if (!(lock_flags & XFS_IOLOCK_SHARED))
299 return !!ip->i_iolock.mr_writer; 348 return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
314#endif 363#endif
315 364
316/* 365/*
317 * Bump the subclass so xfs_lock_inodes() acquires each lock with 366 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
318 * a different value 367 * value. This shouldn't be called for page fault locking, but we also need to
368 * ensure we don't overrun the number of lockdep subclasses for the iolock or
369 * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
319 */ 370 */
320static inline int 371static inline int
321xfs_lock_inumorder(int lock_mode, int subclass) 372xfs_lock_inumorder(int lock_mode, int subclass)
322{ 373{
323 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 374 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
375 ASSERT(subclass + XFS_LOCK_INUMORDER <
376 (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
324 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; 377 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
378 }
379
380 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
381 ASSERT(subclass + XFS_LOCK_INUMORDER <
382 (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
383 lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
384 XFS_MMAPLOCK_SHIFT;
385 }
386
325 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) 387 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
326 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; 388 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
327 389
@@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass)
329} 391}
330 392
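
A stand-alone model of the subclass packing done by xfs_lock_inumorder() (shift constants copied from xfs_inode.h below; the encoder is simplified in that it stamps the subclass into all three fields regardless of which lock modes are set):

#include <assert.h>
#include <stdio.h>

#define IOLOCK_SHIFT	16
#define MMAPLOCK_SHIFT	20
#define ILOCK_SHIFT	24

static unsigned int lock_inumorder(unsigned int lock_mode, int subclass)
{
	/* the subclass must fit in the 4 bits between adjacent shifts */
	assert(subclass >= 0 &&
	       subclass < (1 << (MMAPLOCK_SHIFT - IOLOCK_SHIFT)));

	lock_mode |= (unsigned int)subclass << IOLOCK_SHIFT;
	lock_mode |= (unsigned int)subclass << MMAPLOCK_SHIFT;
	lock_mode |= (unsigned int)subclass << ILOCK_SHIFT;
	return lock_mode;
}

int main(void)
{
	/* inodes 0..2 of a multi-inode lock get distinct lockdep fields */
	for (int i = 0; i < 3; i++)
		printf("flags for inode %d: 0x%08x\n",
		       i, lock_inumorder(0, i));
	return 0;
}
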
331/* 393/*
332 * The following routine will lock n inodes in exclusive mode. 394 * The following routine will lock n inodes in exclusive mode. We assume the
333 * We assume the caller calls us with the inodes in i_ino order. 395 * caller calls us with the inodes in i_ino order.
334 * 396 *
335 * We need to detect deadlock where an inode that we lock 397 * We need to detect deadlock where an inode that we lock is in the AIL and we
336 * is in the AIL and we start waiting for another inode that is locked 398 * start waiting for another inode that is locked by a thread in a long running
337 * by a thread in a long running transaction (such as truncate). This can 399 * transaction (such as truncate). This can result in deadlock since the long
338 * result in deadlock since the long running trans might need to wait 400 * running trans might need to wait for the inode we just locked in order to
339 * for the inode we just locked in order to push the tail and free space 401 * push the tail and free space in the log.
340 * in the log.
341 */ 402 */
342void 403void
343xfs_lock_inodes( 404xfs_lock_inodes(
@@ -348,30 +409,27 @@ xfs_lock_inodes(
348 int attempts = 0, i, j, try_lock; 409 int attempts = 0, i, j, try_lock;
349 xfs_log_item_t *lp; 410 xfs_log_item_t *lp;
350 411
 351 ASSERT(ips && (inodes >= 2)); /* we need at least two */ 412 /* currently supports between 2 and 5 inodes */
 413 ASSERT(ips && inodes >= 2 && inodes <= 5);
352 414
353 try_lock = 0; 415 try_lock = 0;
354 i = 0; 416 i = 0;
355
356again: 417again:
357 for (; i < inodes; i++) { 418 for (; i < inodes; i++) {
358 ASSERT(ips[i]); 419 ASSERT(ips[i]);
359 420
360 if (i && (ips[i] == ips[i-1])) /* Already locked */ 421 if (i && (ips[i] == ips[i - 1])) /* Already locked */
361 continue; 422 continue;
362 423
363 /* 424 /*
364 * If try_lock is not set yet, make sure all locked inodes 425 * If try_lock is not set yet, make sure all locked inodes are
365 * are not in the AIL. 426 * not in the AIL. If any are, set try_lock to be used later.
366 * If any are, set try_lock to be used later.
367 */ 427 */
368
369 if (!try_lock) { 428 if (!try_lock) {
370 for (j = (i - 1); j >= 0 && !try_lock; j--) { 429 for (j = (i - 1); j >= 0 && !try_lock; j--) {
371 lp = (xfs_log_item_t *)ips[j]->i_itemp; 430 lp = (xfs_log_item_t *)ips[j]->i_itemp;
372 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { 431 if (lp && (lp->li_flags & XFS_LI_IN_AIL))
373 try_lock++; 432 try_lock++;
374 }
375 } 433 }
376 } 434 }
377 435
@@ -381,51 +439,42 @@ again:
381 * we can't get any, we must release all we have 439 * we can't get any, we must release all we have
382 * and try again. 440 * and try again.
383 */ 441 */
442 if (!try_lock) {
443 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
444 continue;
445 }
446
447 /* try_lock means we have an inode locked that is in the AIL. */
448 ASSERT(i != 0);
449 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
450 continue;
384 451
385 if (try_lock) { 452 /*
386 /* try_lock must be 0 if i is 0. */ 453 * Unlock all previous guys and try again. xfs_iunlock will try
454 * to push the tail if the inode is in the AIL.
455 */
456 attempts++;
457 for (j = i - 1; j >= 0; j--) {
387 /* 458 /*
388 * try_lock means we have an inode locked 459 * Check to see if we've already unlocked this one. Not
389 * that is in the AIL. 460 * the first one going back, and the inode ptr is the
461 * same.
390 */ 462 */
391 ASSERT(i != 0); 463 if (j != (i - 1) && ips[j] == ips[j + 1])
392 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { 464 continue;
393 attempts++; 465
394 466 xfs_iunlock(ips[j], lock_mode);
395 /* 467 }
396 * Unlock all previous guys and try again.
397 * xfs_iunlock will try to push the tail
398 * if the inode is in the AIL.
399 */
400
401 for(j = i - 1; j >= 0; j--) {
402
403 /*
404 * Check to see if we've already
405 * unlocked this one.
406 * Not the first one going back,
407 * and the inode ptr is the same.
408 */
409 if ((j != (i - 1)) && ips[j] ==
410 ips[j+1])
411 continue;
412
413 xfs_iunlock(ips[j], lock_mode);
414 }
415 468
416 if ((attempts % 5) == 0) { 469 if ((attempts % 5) == 0) {
417 delay(1); /* Don't just spin the CPU */ 470 delay(1); /* Don't just spin the CPU */
418#ifdef DEBUG 471#ifdef DEBUG
419 xfs_lock_delays++; 472 xfs_lock_delays++;
420#endif 473#endif
421 }
422 i = 0;
423 try_lock = 0;
424 goto again;
425 }
426 } else {
427 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
428 } 474 }
475 i = 0;
476 try_lock = 0;
477 goto again;
429 } 478 }
430 479
431#ifdef DEBUG 480#ifdef DEBUG
@@ -440,10 +489,10 @@ again:
440} 489}
441 490
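
The retry loop above, reduced to a user-space sketch: plain mutexes stand in for inodes (array order stands in for i_ino order), and in_ail() is a stubbed-out predicate for "pinned by a long running transaction". Illustrative only.

#include <pthread.h>
#include <unistd.h>

static int in_ail(int idx)
{
	(void)idx;
	return 0;	/* stub: pretend nothing is pinned */
}

static void lock_all(pthread_mutex_t **locks, int n)
{
	int i = 0, j, try_lock = 0, attempts = 0;
again:
	for (; i < n; i++) {
		if (i && locks[i] == locks[i - 1])	/* already locked */
			continue;

		/* once anything already locked is "in the AIL", switch
		 * to trylock so we can back out instead of deadlocking */
		if (!try_lock)
			for (j = i - 1; j >= 0 && !try_lock; j--)
				if (in_ail(j))
					try_lock = 1;

		if (!try_lock) {
			pthread_mutex_lock(locks[i]);
			continue;
		}
		if (pthread_mutex_trylock(locks[i]) == 0)
			continue;

		/* failed: drop everything (skipping duplicates) and retry */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			if (j != i - 1 && locks[j] == locks[j + 1])
				continue;
			pthread_mutex_unlock(locks[j]);
		}
		if ((attempts % 5) == 0)
			usleep(1000);	/* don't just spin the CPU */
		i = 0;
		try_lock = 0;
		goto again;
	}
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t *locks[] = { &a, &a, &b };	/* note the dup */

	lock_all(locks, 3);
	return 0;
}
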
442/* 491/*
443 * xfs_lock_two_inodes() can only be used to lock one type of lock 492 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
444 * at a time - the iolock or the ilock, but not both at once. If 493 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
445 * we lock both at once, lockdep will report false positives saying 494 * lock more than one at a time, lockdep will report false positives saying we
446 * we have violated locking orders. 495 * have violated locking orders.
447 */ 496 */
448void 497void
449xfs_lock_two_inodes( 498xfs_lock_two_inodes(
@@ -455,8 +504,12 @@ xfs_lock_two_inodes(
455 int attempts = 0; 504 int attempts = 0;
456 xfs_log_item_t *lp; 505 xfs_log_item_t *lp;
457 506
458 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 507 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
459 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); 508 ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
509 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
510 } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
511 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
512
460 ASSERT(ip0->i_ino != ip1->i_ino); 513 ASSERT(ip0->i_ino != ip1->i_ino);
461 514
462 if (ip0->i_ino > ip1->i_ino) { 515 if (ip0->i_ino > ip1->i_ino) {
@@ -818,7 +871,7 @@ xfs_ialloc(
818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 871 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
819 xfs_trans_log_inode(tp, ip, flags); 872 xfs_trans_log_inode(tp, ip, flags);
820 873
821 /* now that we have an i_mode we can setup inode ops and unlock */ 874 /* now that we have an i_mode we can setup the inode structure */
822 xfs_setup_inode(ip); 875 xfs_setup_inode(ip);
823 876
824 *ipp = ip; 877 *ipp = ip;
@@ -1235,12 +1288,14 @@ xfs_create(
1235 xfs_trans_cancel(tp, cancel_flags); 1288 xfs_trans_cancel(tp, cancel_flags);
1236 out_release_inode: 1289 out_release_inode:
1237 /* 1290 /*
1238 * Wait until after the current transaction is aborted to 1291 * Wait until after the current transaction is aborted to finish the
1239 * release the inode. This prevents recursive transactions 1292 * setup of the inode and release the inode. This prevents recursive
1240 * and deadlocks from xfs_inactive. 1293 * transactions and deadlocks from xfs_inactive.
1241 */ 1294 */
1242 if (ip) 1295 if (ip) {
1296 xfs_finish_inode_setup(ip);
1243 IRELE(ip); 1297 IRELE(ip);
1298 }
1244 1299
1245 xfs_qm_dqrele(udqp); 1300 xfs_qm_dqrele(udqp);
1246 xfs_qm_dqrele(gdqp); 1301 xfs_qm_dqrele(gdqp);
@@ -1345,12 +1400,14 @@ xfs_create_tmpfile(
1345 xfs_trans_cancel(tp, cancel_flags); 1400 xfs_trans_cancel(tp, cancel_flags);
1346 out_release_inode: 1401 out_release_inode:
1347 /* 1402 /*
1348 * Wait until after the current transaction is aborted to 1403 * Wait until after the current transaction is aborted to finish the
1349 * release the inode. This prevents recursive transactions 1404 * setup of the inode and release the inode. This prevents recursive
1350 * and deadlocks from xfs_inactive. 1405 * transactions and deadlocks from xfs_inactive.
1351 */ 1406 */
1352 if (ip) 1407 if (ip) {
1408 xfs_finish_inode_setup(ip);
1353 IRELE(ip); 1409 IRELE(ip);
1410 }
1354 1411
1355 xfs_qm_dqrele(udqp); 1412 xfs_qm_dqrele(udqp);
1356 xfs_qm_dqrele(gdqp); 1413 xfs_qm_dqrele(gdqp);
@@ -2611,19 +2668,22 @@ xfs_remove(
2611/* 2668/*
2612 * Enter all inodes for a rename transaction into a sorted array. 2669 * Enter all inodes for a rename transaction into a sorted array.
2613 */ 2670 */
2671#define __XFS_SORT_INODES 5
2614STATIC void 2672STATIC void
2615xfs_sort_for_rename( 2673xfs_sort_for_rename(
2616 xfs_inode_t *dp1, /* in: old (source) directory inode */ 2674 struct xfs_inode *dp1, /* in: old (source) directory inode */
2617 xfs_inode_t *dp2, /* in: new (target) directory inode */ 2675 struct xfs_inode *dp2, /* in: new (target) directory inode */
2618 xfs_inode_t *ip1, /* in: inode of old entry */ 2676 struct xfs_inode *ip1, /* in: inode of old entry */
2619 xfs_inode_t *ip2, /* in: inode of new entry, if it 2677 struct xfs_inode *ip2, /* in: inode of new entry */
2620 already exists, NULL otherwise. */ 2678 struct xfs_inode *wip, /* in: whiteout inode */
2621 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ 2679 struct xfs_inode **i_tab,/* out: sorted array of inodes */
2622 int *num_inodes) /* out: number of inodes in array */ 2680 int *num_inodes) /* in/out: inodes in array */
2623{ 2681{
2624 xfs_inode_t *temp;
2625 int i, j; 2682 int i, j;
2626 2683
2684 ASSERT(*num_inodes == __XFS_SORT_INODES);
2685 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2686
2627 /* 2687 /*
2628 * i_tab contains a list of pointers to inodes. We initialize 2688 * i_tab contains a list of pointers to inodes. We initialize
2629 * the table here & we'll sort it. We will then use it to 2689 * the table here & we'll sort it. We will then use it to
@@ -2631,25 +2691,24 @@ xfs_sort_for_rename(
2631 * 2691 *
2632 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2692 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2633 */ 2693 */
2634 i_tab[0] = dp1; 2694 i = 0;
2635 i_tab[1] = dp2; 2695 i_tab[i++] = dp1;
2636 i_tab[2] = ip1; 2696 i_tab[i++] = dp2;
2637 if (ip2) { 2697 i_tab[i++] = ip1;
2638 *num_inodes = 4; 2698 if (ip2)
2639 i_tab[3] = ip2; 2699 i_tab[i++] = ip2;
2640 } else { 2700 if (wip)
2641 *num_inodes = 3; 2701 i_tab[i++] = wip;
2642 i_tab[3] = NULL; 2702 *num_inodes = i;
2643 }
2644 2703
2645 /* 2704 /*
2646 * Sort the elements via bubble sort. (Remember, there are at 2705 * Sort the elements via bubble sort. (Remember, there are at
2647 * most 4 elements to sort, so this is adequate.) 2706 * most 5 elements to sort, so this is adequate.)
2648 */ 2707 */
2649 for (i = 0; i < *num_inodes; i++) { 2708 for (i = 0; i < *num_inodes; i++) {
2650 for (j = 1; j < *num_inodes; j++) { 2709 for (j = 1; j < *num_inodes; j++) {
2651 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2710 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2652 temp = i_tab[j]; 2711 struct xfs_inode *temp = i_tab[j];
2653 i_tab[j] = i_tab[j-1]; 2712 i_tab[j] = i_tab[j-1];
2654 i_tab[j-1] = temp; 2713 i_tab[j-1] = temp;
2655 } 2714 }
@@ -2657,6 +2716,31 @@ xfs_sort_for_rename(
2657 } 2716 }
2658} 2717}
2659 2718
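
The sort itself, extracted into a runnable sketch (hypothetical struct inode; duplicates are allowed in the table and are skipped later at lock time, exactly as in xfs_lock_inodes()):

#include <stdio.h>

struct inode { unsigned long long i_ino; };

/* Pack up to 5 inode pointers (duplicates allowed, e.g. source dir ==
 * target dir) and bubble sort by inode number so they can be locked
 * in ascending order. */
static void sort_for_lock(struct inode **tab, int n)
{
	for (int i = 0; i < n; i++)
		for (int j = 1; j < n; j++)
			if (tab[j]->i_ino < tab[j - 1]->i_ino) {
				struct inode *tmp = tab[j];
				tab[j] = tab[j - 1];
				tab[j - 1] = tmp;
			}
}

int main(void)
{
	struct inode a = { 42 }, b = { 7 }, c = { 99 };
	struct inode *tab[] = { &a, &b, &a, &c };	/* note the dup */

	sort_for_lock(tab, 4);
	for (int i = 0; i < 4; i++)
		printf("%llu\n", tab[i]->i_ino);
	return 0;
}
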
2719static int
2720xfs_finish_rename(
2721 struct xfs_trans *tp,
2722 struct xfs_bmap_free *free_list)
2723{
2724 int committed = 0;
2725 int error;
2726
2727 /*
2728 * If this is a synchronous mount, make sure that the rename transaction
2729 * goes to disk before returning to the user.
2730 */
2731 if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2732 xfs_trans_set_sync(tp);
2733
2734 error = xfs_bmap_finish(&tp, free_list, &committed);
2735 if (error) {
2736 xfs_bmap_cancel(free_list);
2737 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2738 return error;
2739 }
2740
2741 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2742}
2743
2660/* 2744/*
2661 * xfs_cross_rename() 2745 * xfs_cross_rename()
2662 * 2746 *
@@ -2685,14 +2769,14 @@ xfs_cross_rename(
2685 ip2->i_ino, 2769 ip2->i_ino,
2686 first_block, free_list, spaceres); 2770 first_block, free_list, spaceres);
2687 if (error) 2771 if (error)
2688 goto out; 2772 goto out_trans_abort;
2689 2773
2690 /* Swap inode number for dirent in second parent */ 2774 /* Swap inode number for dirent in second parent */
2691 error = xfs_dir_replace(tp, dp2, name2, 2775 error = xfs_dir_replace(tp, dp2, name2,
2692 ip1->i_ino, 2776 ip1->i_ino,
2693 first_block, free_list, spaceres); 2777 first_block, free_list, spaceres);
2694 if (error) 2778 if (error)
2695 goto out; 2779 goto out_trans_abort;
2696 2780
2697 /* 2781 /*
2698 * If we're renaming one or more directories across different parents, 2782 * If we're renaming one or more directories across different parents,
@@ -2707,16 +2791,16 @@ xfs_cross_rename(
2707 dp1->i_ino, first_block, 2791 dp1->i_ino, first_block,
2708 free_list, spaceres); 2792 free_list, spaceres);
2709 if (error) 2793 if (error)
2710 goto out; 2794 goto out_trans_abort;
2711 2795
2712 /* transfer ip2 ".." reference to dp1 */ 2796 /* transfer ip2 ".." reference to dp1 */
2713 if (!S_ISDIR(ip1->i_d.di_mode)) { 2797 if (!S_ISDIR(ip1->i_d.di_mode)) {
2714 error = xfs_droplink(tp, dp2); 2798 error = xfs_droplink(tp, dp2);
2715 if (error) 2799 if (error)
2716 goto out; 2800 goto out_trans_abort;
2717 error = xfs_bumplink(tp, dp1); 2801 error = xfs_bumplink(tp, dp1);
2718 if (error) 2802 if (error)
2719 goto out; 2803 goto out_trans_abort;
2720 } 2804 }
2721 2805
2722 /* 2806 /*
@@ -2734,16 +2818,16 @@ xfs_cross_rename(
2734 dp2->i_ino, first_block, 2818 dp2->i_ino, first_block,
2735 free_list, spaceres); 2819 free_list, spaceres);
2736 if (error) 2820 if (error)
2737 goto out; 2821 goto out_trans_abort;
2738 2822
2739 /* transfer ip1 ".." reference to dp2 */ 2823 /* transfer ip1 ".." reference to dp2 */
2740 if (!S_ISDIR(ip2->i_d.di_mode)) { 2824 if (!S_ISDIR(ip2->i_d.di_mode)) {
2741 error = xfs_droplink(tp, dp1); 2825 error = xfs_droplink(tp, dp1);
2742 if (error) 2826 if (error)
2743 goto out; 2827 goto out_trans_abort;
2744 error = xfs_bumplink(tp, dp2); 2828 error = xfs_bumplink(tp, dp2);
2745 if (error) 2829 if (error)
2746 goto out; 2830 goto out_trans_abort;
2747 } 2831 }
2748 2832
2749 /* 2833 /*
@@ -2771,66 +2855,108 @@ xfs_cross_rename(
2771 } 2855 }
2772 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2856 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2773 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2857 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2774out: 2858 return xfs_finish_rename(tp, free_list);
2859
2860out_trans_abort:
2861 xfs_bmap_cancel(free_list);
2862 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2775 return error; 2863 return error;
2776} 2864}
2777 2865
2778/* 2866/*
2867 * xfs_rename_alloc_whiteout()
2868 *
 2869 * Return a referenced, unlinked, unlocked inode that can be used as a
 2870 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
 2871 * crash between allocating the inode and linking it into the rename transaction,
2872 * recovery will free the inode and we won't leak it.
2873 */
2874static int
2875xfs_rename_alloc_whiteout(
2876 struct xfs_inode *dp,
2877 struct xfs_inode **wip)
2878{
2879 struct xfs_inode *tmpfile;
2880 int error;
2881
2882 error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
2883 if (error)
2884 return error;
2885
2886 /* Satisfy xfs_bumplink that this is a real tmpfile */
2887 xfs_finish_inode_setup(tmpfile);
2888 VFS_I(tmpfile)->i_state |= I_LINKABLE;
2889
2890 *wip = tmpfile;
2891 return 0;
2892}
2893
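
A user-space analogue of the same crash-safety idea (assuming Linux's O_TMPFILE and the /proc/self/fd linkat trick; the paths are purely illustrative): the inode exists with no directory entry until it is explicitly linked, so a crash in between leaks nothing.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* anonymous inode: no directory entry yet */
	int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	/* ... a crash here leaves no visible file behind ... */

	char proc[64];
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");
	close(fd);
	return 0;
}
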
2894/*
2779 * xfs_rename 2895 * xfs_rename
2780 */ 2896 */
2781int 2897int
2782xfs_rename( 2898xfs_rename(
2783 xfs_inode_t *src_dp, 2899 struct xfs_inode *src_dp,
2784 struct xfs_name *src_name, 2900 struct xfs_name *src_name,
2785 xfs_inode_t *src_ip, 2901 struct xfs_inode *src_ip,
2786 xfs_inode_t *target_dp, 2902 struct xfs_inode *target_dp,
2787 struct xfs_name *target_name, 2903 struct xfs_name *target_name,
2788 xfs_inode_t *target_ip, 2904 struct xfs_inode *target_ip,
2789 unsigned int flags) 2905 unsigned int flags)
2790{ 2906{
2791 xfs_trans_t *tp = NULL; 2907 struct xfs_mount *mp = src_dp->i_mount;
2792 xfs_mount_t *mp = src_dp->i_mount; 2908 struct xfs_trans *tp;
2793 int new_parent; /* moving to a new dir */ 2909 struct xfs_bmap_free free_list;
2794 int src_is_directory; /* src_name is a directory */ 2910 xfs_fsblock_t first_block;
2795 int error; 2911 struct xfs_inode *wip = NULL; /* whiteout inode */
2796 xfs_bmap_free_t free_list; 2912 struct xfs_inode *inodes[__XFS_SORT_INODES];
2797 xfs_fsblock_t first_block; 2913 int num_inodes = __XFS_SORT_INODES;
2798 int cancel_flags; 2914 bool new_parent = (src_dp != target_dp);
2799 int committed; 2915 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2800 xfs_inode_t *inodes[4]; 2916 int cancel_flags = 0;
2801 int spaceres; 2917 int spaceres;
2802 int num_inodes; 2918 int error;
2803 2919
2804 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2920 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2805 2921
2806 new_parent = (src_dp != target_dp); 2922 if ((flags & RENAME_EXCHANGE) && !target_ip)
2807 src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2923 return -EINVAL;
2808 2924
2809 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 2925 /*
2926 * If we are doing a whiteout operation, allocate the whiteout inode
2927 * we will be placing at the target and ensure the type is set
2928 * appropriately.
2929 */
2930 if (flags & RENAME_WHITEOUT) {
2931 ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
2932 error = xfs_rename_alloc_whiteout(target_dp, &wip);
2933 if (error)
2934 return error;
2935
 2936 /* setup source dirent info as whiteout */
2937 src_name->type = XFS_DIR3_FT_CHRDEV;
2938 }
2939
2940 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2810 inodes, &num_inodes); 2941 inodes, &num_inodes);
2811 2942
2812 xfs_bmap_init(&free_list, &first_block);
2813 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 2943 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2814 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2815 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2944 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2816 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2945 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2817 if (error == -ENOSPC) { 2946 if (error == -ENOSPC) {
2818 spaceres = 0; 2947 spaceres = 0;
2819 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2948 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2820 } 2949 }
2821 if (error) { 2950 if (error)
2822 xfs_trans_cancel(tp, 0); 2951 goto out_trans_cancel;
2823 goto std_return; 2952 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2824 }
2825 2953
2826 /* 2954 /*
2827 * Attach the dquots to the inodes 2955 * Attach the dquots to the inodes
2828 */ 2956 */
2829 error = xfs_qm_vop_rename_dqattach(inodes); 2957 error = xfs_qm_vop_rename_dqattach(inodes);
2830 if (error) { 2958 if (error)
2831 xfs_trans_cancel(tp, cancel_flags); 2959 goto out_trans_cancel;
2832 goto std_return;
2833 }
2834 2960
2835 /* 2961 /*
2836 * Lock all the participating inodes. Depending upon whether 2962 * Lock all the participating inodes. Depending upon whether
@@ -2851,6 +2977,8 @@ xfs_rename(
2851 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2977 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2852 if (target_ip) 2978 if (target_ip)
2853 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 2979 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2980 if (wip)
2981 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2854 2982
2855 /* 2983 /*
2856 * If we are using project inheritance, we only allow renames 2984 * If we are using project inheritance, we only allow renames
@@ -2860,24 +2988,16 @@ xfs_rename(
2860 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2988 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2861 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2989 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2862 error = -EXDEV; 2990 error = -EXDEV;
2863 goto error_return; 2991 goto out_trans_cancel;
2864 } 2992 }
2865 2993
2866 /* 2994 xfs_bmap_init(&free_list, &first_block);
2867 * Handle RENAME_EXCHANGE flags 2995
2868 */ 2996 /* RENAME_EXCHANGE is unique from here on. */
2869 if (flags & RENAME_EXCHANGE) { 2997 if (flags & RENAME_EXCHANGE)
2870 if (target_ip == NULL) { 2998 return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2871 error = -EINVAL; 2999 target_dp, target_name, target_ip,
2872 goto error_return; 3000 &free_list, &first_block, spaceres);
2873 }
2874 error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
2875 target_dp, target_name, target_ip,
2876 &free_list, &first_block, spaceres);
2877 if (error)
2878 goto abort_return;
2879 goto finish_rename;
2880 }
2881 3001
2882 /* 3002 /*
2883 * Set up the target. 3003 * Set up the target.
@@ -2890,7 +3010,7 @@ xfs_rename(
2890 if (!spaceres) { 3010 if (!spaceres) {
2891 error = xfs_dir_canenter(tp, target_dp, target_name); 3011 error = xfs_dir_canenter(tp, target_dp, target_name);
2892 if (error) 3012 if (error)
2893 goto error_return; 3013 goto out_trans_cancel;
2894 } 3014 }
2895 /* 3015 /*
2896 * If target does not exist and the rename crosses 3016 * If target does not exist and the rename crosses
@@ -2901,9 +3021,9 @@ xfs_rename(
2901 src_ip->i_ino, &first_block, 3021 src_ip->i_ino, &first_block,
2902 &free_list, spaceres); 3022 &free_list, spaceres);
2903 if (error == -ENOSPC) 3023 if (error == -ENOSPC)
2904 goto error_return; 3024 goto out_bmap_cancel;
2905 if (error) 3025 if (error)
2906 goto abort_return; 3026 goto out_trans_abort;
2907 3027
2908 xfs_trans_ichgtime(tp, target_dp, 3028 xfs_trans_ichgtime(tp, target_dp,
2909 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3029 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2911,7 +3031,7 @@ xfs_rename(
2911 if (new_parent && src_is_directory) { 3031 if (new_parent && src_is_directory) {
2912 error = xfs_bumplink(tp, target_dp); 3032 error = xfs_bumplink(tp, target_dp);
2913 if (error) 3033 if (error)
2914 goto abort_return; 3034 goto out_trans_abort;
2915 } 3035 }
2916 } else { /* target_ip != NULL */ 3036 } else { /* target_ip != NULL */
2917 /* 3037 /*
@@ -2926,7 +3046,7 @@ xfs_rename(
2926 if (!(xfs_dir_isempty(target_ip)) || 3046 if (!(xfs_dir_isempty(target_ip)) ||
2927 (target_ip->i_d.di_nlink > 2)) { 3047 (target_ip->i_d.di_nlink > 2)) {
2928 error = -EEXIST; 3048 error = -EEXIST;
2929 goto error_return; 3049 goto out_trans_cancel;
2930 } 3050 }
2931 } 3051 }
2932 3052
@@ -2943,7 +3063,7 @@ xfs_rename(
2943 src_ip->i_ino, 3063 src_ip->i_ino,
2944 &first_block, &free_list, spaceres); 3064 &first_block, &free_list, spaceres);
2945 if (error) 3065 if (error)
2946 goto abort_return; 3066 goto out_trans_abort;
2947 3067
2948 xfs_trans_ichgtime(tp, target_dp, 3068 xfs_trans_ichgtime(tp, target_dp,
2949 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3069 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2954,7 +3074,7 @@ xfs_rename(
2954 */ 3074 */
2955 error = xfs_droplink(tp, target_ip); 3075 error = xfs_droplink(tp, target_ip);
2956 if (error) 3076 if (error)
2957 goto abort_return; 3077 goto out_trans_abort;
2958 3078
2959 if (src_is_directory) { 3079 if (src_is_directory) {
2960 /* 3080 /*
@@ -2962,7 +3082,7 @@ xfs_rename(
2962 */ 3082 */
2963 error = xfs_droplink(tp, target_ip); 3083 error = xfs_droplink(tp, target_ip);
2964 if (error) 3084 if (error)
2965 goto abort_return; 3085 goto out_trans_abort;
2966 } 3086 }
2967 } /* target_ip != NULL */ 3087 } /* target_ip != NULL */
2968 3088
@@ -2979,7 +3099,7 @@ xfs_rename(
2979 &first_block, &free_list, spaceres); 3099 &first_block, &free_list, spaceres);
2980 ASSERT(error != -EEXIST); 3100 ASSERT(error != -EEXIST);
2981 if (error) 3101 if (error)
2982 goto abort_return; 3102 goto out_trans_abort;
2983 } 3103 }
2984 3104
2985 /* 3105 /*
@@ -3005,49 +3125,67 @@ xfs_rename(
3005 */ 3125 */
3006 error = xfs_droplink(tp, src_dp); 3126 error = xfs_droplink(tp, src_dp);
3007 if (error) 3127 if (error)
3008 goto abort_return; 3128 goto out_trans_abort;
3009 } 3129 }
3010 3130
3011 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3131 /*
3132 * For whiteouts, we only need to update the source dirent with the
3133 * inode number of the whiteout inode rather than removing it
3134 * altogether.
3135 */
3136 if (wip) {
3137 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3012 &first_block, &free_list, spaceres); 3138 &first_block, &free_list, spaceres);
3139 } else
3140 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3141 &first_block, &free_list, spaceres);
3013 if (error) 3142 if (error)
3014 goto abort_return; 3143 goto out_trans_abort;
3015
3016 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3017 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3018 if (new_parent)
3019 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3020 3144
3021finish_rename:
3022 /* 3145 /*
3023 * If this is a synchronous mount, make sure that the 3146 * For whiteouts, we need to bump the link count on the whiteout inode.
3024 * rename transaction goes to disk before returning to 3147 * This means that failures all the way up to this point leave the inode
3025 * the user. 3148 * on the unlinked list and so cleanup is a simple matter of dropping
3149 * the remaining reference to it. If we fail here after bumping the link
3150 * count, we're shutting down the filesystem so we'll never see the
3151 * intermediate state on disk.
3026 */ 3152 */
3027 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 3153 if (wip) {
3028 xfs_trans_set_sync(tp); 3154 ASSERT(wip->i_d.di_nlink == 0);
3029 } 3155 error = xfs_bumplink(tp, wip);
3156 if (error)
3157 goto out_trans_abort;
3158 error = xfs_iunlink_remove(tp, wip);
3159 if (error)
3160 goto out_trans_abort;
3161 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3030 3162
3031 error = xfs_bmap_finish(&tp, &free_list, &committed); 3163 /*
3032 if (error) { 3164 * Now we have a real link, clear the "I'm a tmpfile" state
3033 xfs_bmap_cancel(&free_list); 3165 * flag from the inode so it doesn't accidentally get misused in
3034 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | 3166 * future.
3035 XFS_TRANS_ABORT)); 3167 */
3036 goto std_return; 3168 VFS_I(wip)->i_state &= ~I_LINKABLE;
3037 } 3169 }
3038 3170
3039 /* 3171 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3040 * trans_commit will unlock src_ip, target_ip & decrement 3172 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3041 * the vnode references. 3173 if (new_parent)
3042 */ 3174 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3043 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3044 3175
3045 abort_return: 3176 error = xfs_finish_rename(tp, &free_list);
3177 if (wip)
3178 IRELE(wip);
3179 return error;
3180
3181out_trans_abort:
3046 cancel_flags |= XFS_TRANS_ABORT; 3182 cancel_flags |= XFS_TRANS_ABORT;
3047 error_return: 3183out_bmap_cancel:
3048 xfs_bmap_cancel(&free_list); 3184 xfs_bmap_cancel(&free_list);
3185out_trans_cancel:
3049 xfs_trans_cancel(tp, cancel_flags); 3186 xfs_trans_cancel(tp, cancel_flags);
3050 std_return: 3187 if (wip)
3188 IRELE(wip);
3051 return error; 3189 return error;
3052} 3190}
3053 3191
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a1cd55f3f351..8f22d20368d8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
56 struct xfs_inode_log_item *i_itemp; /* logging information */ 56 struct xfs_inode_log_item *i_itemp; /* logging information */
57 mrlock_t i_lock; /* inode lock */ 57 mrlock_t i_lock; /* inode lock */
58 mrlock_t i_iolock; /* inode IO lock */ 58 mrlock_t i_iolock; /* inode IO lock */
59 mrlock_t i_mmaplock; /* inode mmap IO lock */
59 atomic_t i_pincount; /* inode pin count */ 60 atomic_t i_pincount; /* inode pin count */
60 spinlock_t i_flags_lock; /* inode i_flags lock */ 61 spinlock_t i_flags_lock; /* inode i_flags lock */
61 /* Miscellaneous state. */ 62 /* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
263#define XFS_IOLOCK_SHARED (1<<1) 264#define XFS_IOLOCK_SHARED (1<<1)
264#define XFS_ILOCK_EXCL (1<<2) 265#define XFS_ILOCK_EXCL (1<<2)
265#define XFS_ILOCK_SHARED (1<<3) 266#define XFS_ILOCK_SHARED (1<<3)
267#define XFS_MMAPLOCK_EXCL (1<<4)
268#define XFS_MMAPLOCK_SHARED (1<<5)
266 269
267#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ 270#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
268 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) 271 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
272 | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
269 273
270#define XFS_LOCK_FLAGS \ 274#define XFS_LOCK_FLAGS \
271 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ 275 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
272 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ 276 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
273 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ 277 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
274 { XFS_ILOCK_SHARED, "ILOCK_SHARED" } 278 { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
279 { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
280 { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
275 281
276 282
277/* 283/*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
302#define XFS_IOLOCK_SHIFT 16 308#define XFS_IOLOCK_SHIFT 16
303#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 309#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
304 310
311#define XFS_MMAPLOCK_SHIFT 20
312
305#define XFS_ILOCK_SHIFT 24 313#define XFS_ILOCK_SHIFT 24
306#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 314#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
307#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) 315#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
308#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) 316#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
309 317
310#define XFS_IOLOCK_DEP_MASK 0x00ff0000 318#define XFS_IOLOCK_DEP_MASK 0x000f0000
319#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
311#define XFS_ILOCK_DEP_MASK 0xff000000 320#define XFS_ILOCK_DEP_MASK 0xff000000
312#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) 321#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
322 XFS_MMAPLOCK_DEP_MASK | \
323 XFS_ILOCK_DEP_MASK)
313 324
314#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 325#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \
315#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 326 >> XFS_IOLOCK_SHIFT)
327#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
328 >> XFS_MMAPLOCK_SHIFT)
329#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
330 >> XFS_ILOCK_SHIFT)
316 331
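
A quick stand-alone check of the non-overlapping dep-field layout the masks above define (mask values copied from the header; the program is illustrative only):

#include <stdio.h>

#define IOLOCK_DEP_MASK		0x000f0000u	/* bits 16-19 */
#define MMAPLOCK_DEP_MASK	0x00f00000u	/* bits 20-23 */
#define ILOCK_DEP_MASK		0xff000000u	/* bits 24-31 */

int main(void)
{
	unsigned int flags = (3u << 16) | (2u << 20) | (5u << 24);

	printf("iolock dep   %u\n", (flags & IOLOCK_DEP_MASK) >> 16);
	printf("mmaplock dep %u\n", (flags & MMAPLOCK_DEP_MASK) >> 20);
	printf("ilock dep    %u\n", (flags & ILOCK_DEP_MASK) >> 24);
	/* the three fields never overlap */
	printf("overlap      0x%x\n",
	       IOLOCK_DEP_MASK & MMAPLOCK_DEP_MASK & ILOCK_DEP_MASK);
	return 0;
}
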
317/* 332/*
318 * For multiple groups support: if S_ISGID bit is set in the parent 333 * For multiple groups support: if S_ISGID bit is set in the parent
@@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
391int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 406int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
392 407
393 408
409/* from xfs_iops.c */
410/*
411 * When setting up a newly allocated inode, we need to call
412 * xfs_finish_inode_setup() once the inode is fully instantiated at
413 * the VFS level to prevent the rest of the world seeing the inode
414 * before we've completed instantiation. Otherwise we can do it
415 * the moment the inode lookup is complete.
416 */
417extern void xfs_setup_inode(struct xfs_inode *ip);
418static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
419{
420 xfs_iflags_clear(ip, XFS_INEW);
421 barrier();
422 unlock_new_inode(VFS_I(ip));
423}
424
425static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
426{
427 xfs_setup_inode(ip);
428 xfs_finish_inode_setup(ip);
429}
430
394#define IHOLD(ip) \ 431#define IHOLD(ip) \
395do { \ 432do { \
396 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 433 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
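
The ordering that xfs_finish_inode_setup() above relies on - the XFS_INEW clear must be visible before the VFS inode is unlocked - can be modelled in portable C11, with a release store standing in for barrier() + unlock_new_inode() (a loose sketch, not the kernel's memory model):

#include <stdatomic.h>
#include <stdbool.h>

#define INEW	0x1u

struct fake_inode {
	atomic_uint	flags;		/* stand-in for ip->i_flags */
	atomic_bool	locked;		/* stand-in for the VFS I_NEW lock */
};

static void finish_inode_setup(struct fake_inode *ip)
{
	atomic_fetch_and_explicit(&ip->flags, ~INEW, memory_order_relaxed);
	/* release: the flag clear is visible before the unlock is seen */
	atomic_store_explicit(&ip->locked, false, memory_order_release);
}

int main(void)
{
	struct fake_inode ip = { INEW, true };

	finish_inode_setup(&ip);
	return atomic_load(&ip.flags) & INEW;	/* exits 0 */
}
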
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae45eb3..5f4a396f5186 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -631,7 +631,7 @@ xfs_ioc_space(
631 631
632 if (filp->f_flags & O_DSYNC) 632 if (filp->f_flags & O_DSYNC)
633 flags |= XFS_PREALLOC_SYNC; 633 flags |= XFS_PREALLOC_SYNC;
634 if (ioflags & XFS_IO_INVIS) 634 if (ioflags & XFS_IO_INVIS)
635 flags |= XFS_PREALLOC_INVISIBLE; 635 flags |= XFS_PREALLOC_INVISIBLE;
636 636
637 error = mnt_want_write_file(filp); 637 error = mnt_want_write_file(filp);
@@ -639,10 +639,13 @@ xfs_ioc_space(
639 return error; 639 return error;
640 640
641 xfs_ilock(ip, iolock); 641 xfs_ilock(ip, iolock);
642 error = xfs_break_layouts(inode, &iolock); 642 error = xfs_break_layouts(inode, &iolock, false);
643 if (error) 643 if (error)
644 goto out_unlock; 644 goto out_unlock;
645 645
646 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
647 iolock |= XFS_MMAPLOCK_EXCL;
648
646 switch (bf->l_whence) { 649 switch (bf->l_whence) {
647 case 0: /*SEEK_SET*/ 650 case 0: /*SEEK_SET*/
648 break; 651 break;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccb1dd0d509e..38e633bad8c2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -460,8 +460,7 @@ xfs_iomap_prealloc_size(
460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), 460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
461 alloc_blocks); 461 alloc_blocks);
462 462
463 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 463 freesp = percpu_counter_read_positive(&mp->m_fdblocks);
464 freesp = mp->m_sb.sb_fdblocks;
465 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { 464 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
466 shift = 2; 465 shift = 2;
467 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) 466 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a90331422..2f1839e4dd1b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -187,6 +187,8 @@ xfs_generic_create(
187 else 187 else
188 d_instantiate(dentry, inode); 188 d_instantiate(dentry, inode);
189 189
190 xfs_finish_inode_setup(ip);
191
190 out_free_acl: 192 out_free_acl:
191 if (default_acl) 193 if (default_acl)
192 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
195 return error; 197 return error;
196 198
197 out_cleanup_inode: 199 out_cleanup_inode:
200 xfs_finish_inode_setup(ip);
198 if (!tmpfile) 201 if (!tmpfile)
199 xfs_cleanup_inode(dir, inode, dentry); 202 xfs_cleanup_inode(dir, inode, dentry);
200 iput(inode); 203 iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
367 goto out_cleanup_inode; 370 goto out_cleanup_inode;
368 371
369 d_instantiate(dentry, inode); 372 d_instantiate(dentry, inode);
373 xfs_finish_inode_setup(cip);
370 return 0; 374 return 0;
371 375
372 out_cleanup_inode: 376 out_cleanup_inode:
377 xfs_finish_inode_setup(cip);
373 xfs_cleanup_inode(dir, inode, dentry); 378 xfs_cleanup_inode(dir, inode, dentry);
374 iput(inode); 379 iput(inode);
375 out: 380 out:
@@ -389,7 +394,7 @@ xfs_vn_rename(
389 struct xfs_name oname; 394 struct xfs_name oname;
390 struct xfs_name nname; 395 struct xfs_name nname;
391 396
392 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 397 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
393 return -EINVAL; 398 return -EINVAL;
394 399
395 /* if we are exchanging files, we need to set i_mode of both files */ 400 /* if we are exchanging files, we need to set i_mode of both files */
@@ -766,6 +771,7 @@ xfs_setattr_size(
766 return error; 771 return error;
767 772
768 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 773 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
774 ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
769 ASSERT(S_ISREG(ip->i_d.di_mode)); 775 ASSERT(S_ISREG(ip->i_d.di_mode));
770 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 776 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
771 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 777 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -829,55 +835,27 @@ xfs_setattr_size(
829 inode_dio_wait(inode); 835 inode_dio_wait(inode);
830 836
831 /* 837 /*
832 * Do all the page cache truncate work outside the transaction context 838 * We've already locked out new page faults, so now we can safely remove
833 * as the "lock" order is page lock->log space reservation. i.e. 839 * pages from the page cache knowing they won't get refaulted until we
834 * locking pages inside the transaction can ABBA deadlock with 840 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
 835 * writeback. We have to do the VFS inode size update before we truncate 841 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
836 * the pagecache, however, to avoid racing with page faults beyond the 842 * PTEs on extending truncates and hence ensures sub-page block size
837 * new EOF they are not serialised against truncate operations except by 843 * filesystems are correctly handled, too.
838 * page locks and size updates.
839 * 844 *
840 * Hence we are in a situation where a truncate can fail with ENOMEM 845 * We have to do all the page cache truncate work outside the
841 * from xfs_trans_reserve(), but having already truncated the in-memory 846 * transaction context as the "lock" order is page lock->log space
842 * version of the file (i.e. made user visible changes). There's not 847 * reservation as defined by extent allocation in the writeback path.
843 * much we can do about this, except to hope that the caller sees ENOMEM 848 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
844 * and retries the truncate operation. 849 * having already truncated the in-memory version of the file (i.e. made
850 * user visible changes). There's not much we can do about this, except
851 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation.
845 */ 853 */
846 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 854 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
847 if (error) 855 if (error)
848 return error; 856 return error;
849 truncate_setsize(inode, newsize); 857 truncate_setsize(inode, newsize);
850 858
851 /*
852 * The "we can't serialise against page faults" pain gets worse.
853 *
854 * If the file is mapped then we have to clean the page at the old EOF
855 * when extending the file. Extending the file can expose changes the
856 * underlying page mapping (e.g. from beyond EOF to a hole or
857 * unwritten), and so on the next attempt to write to that page we need
858 * to remap it for write. i.e. we need .page_mkwrite() to be called.
859 * Hence we need to clean the page to clean the pte and so a new write
860 * fault will be triggered appropriately.
861 *
862 * If we do it before we change the inode size, then we can race with a
863 * page fault that maps the page with exactly the same problem. If we do
864 * it after we change the file size, then a new page fault can come in
865 * and allocate space before we've run the rest of the truncate
866 * transaction. That's kinda grotesque, but it's better than have data
867 * over a hole, and so that's the lesser evil that has been chosen here.
868 *
869 * The real solution, however, is to have some mechanism for locking out
870 * page faults while a truncate is in progress.
871 */
872 if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
873 error = filemap_write_and_wait_range(
874 VFS_I(ip)->i_mapping,
875 round_down(oldsize, PAGE_CACHE_SIZE),
876 round_up(oldsize, PAGE_CACHE_SIZE) - 1);
877 if (error)
878 return error;
879 }
880
881 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 859 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
882 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 860 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
883 if (error) 861 if (error)
@@ -975,9 +953,13 @@ xfs_vn_setattr(
975 uint iolock = XFS_IOLOCK_EXCL; 953 uint iolock = XFS_IOLOCK_EXCL;
976 954
977 xfs_ilock(ip, iolock); 955 xfs_ilock(ip, iolock);
978 error = xfs_break_layouts(dentry->d_inode, &iolock); 956 error = xfs_break_layouts(dentry->d_inode, &iolock, true);
979 if (!error) 957 if (!error) {
958 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
959 iolock |= XFS_MMAPLOCK_EXCL;
960
980 error = xfs_setattr_size(ip, iattr); 961 error = xfs_setattr_size(ip, iattr);
962 }
981 xfs_iunlock(ip, iolock); 963 xfs_iunlock(ip, iolock);
982 } else { 964 } else {
983 error = xfs_setattr_nonsize(ip, iattr, 0); 965 error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags(
1228} 1210}
1229 1211
1230/* 1212/*
1231 * Initialize the Linux inode, set up the operation vectors and 1213 * Initialize the Linux inode and set up the operation vectors.
1232 * unlock the inode.
1233 * 1214 *
 1234 * When reading existing inodes from disk this is called directly 1215 * When reading existing inodes from disk this is called directly from xfs_iget;
1235 * from xfs_iget, when creating a new inode it is called from 1216 * when creating a new inode it is called from xfs_ialloc after setting up the
1236 * xfs_ialloc after setting up the inode. 1217 * inode. These callers have different criteria for clearing XFS_INEW, so leave
1237 * 1218 * it up to the caller to deal with unlocking the inode appropriately.
1238 * We are always called with an uninitialised linux inode here.
1239 * We need to initialise the necessary fields and take a reference
1240 * on it.
1241 */ 1219 */
1242void 1220void
1243xfs_setup_inode( 1221xfs_setup_inode(
@@ -1324,9 +1302,4 @@ xfs_setup_inode(
1324 inode_has_no_xattr(inode); 1302 inode_has_no_xattr(inode);
1325 cache_no_acl(inode); 1303 cache_no_acl(inode);
1326 } 1304 }
1327
1328 xfs_iflags_clear(ip, XFS_INEW);
1329 barrier();
1330
1331 unlock_new_inode(inode);
1332} 1305}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ea7a98e9cb70..a0f84abb0d09 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
25 25
26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *);
29
30/* 28/*
31 * Internal setattr interfaces. 29 * Internal setattr interfaces.
32 */ 30 */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 82e314258f73..80429891dc9b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk(
229 error = xfs_inobt_get_rec(cur, irec, &stat); 229 error = xfs_inobt_get_rec(cur, irec, &stat);
230 if (error) 230 if (error)
231 return error; 231 return error;
232 XFS_WANT_CORRUPTED_RETURN(stat == 1); 232 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
233 233
234 /* Check if the record contains the inode in request */ 234 /* Check if the record contains the inode in request */
235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { 235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c31d2c2eadc4..7c7842c85a08 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t;
116#undef XFS_NATIVE_HOST 116#undef XFS_NATIVE_HOST
117#endif 117#endif
118 118
119/*
120 * Feature macros (disable/enable)
121 */
122#ifdef CONFIG_SMP
123#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
124#else
125#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
126#endif
127
128#define irix_sgid_inherit xfs_params.sgid_inherit.val 119#define irix_sgid_inherit xfs_params.sgid_inherit.val
129#define irix_symlink_mode xfs_params.symlink_mode.val 120#define irix_symlink_mode xfs_params.symlink_mode.val
130#define xfs_panic_mask xfs_params.panic_mask.val 121#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5a945fc3bdc..4f5784f85a5b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4463,10 +4463,10 @@ xlog_do_recover(
4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4465 ASSERT(xfs_sb_good_version(sbp)); 4465 ASSERT(xfs_sb_good_version(sbp));
4466 xfs_reinit_percpu_counters(log->l_mp);
4467
4466 xfs_buf_relse(bp); 4468 xfs_buf_relse(bp);
4467 4469
4468 /* We've re-read the superblock so re-initialize per-cpu counters */
4469 xfs_icsb_reinit_counters(log->l_mp);
4470 4470
4471 xlog_recover_check_summary(log); 4471 xlog_recover_check_summary(log);
4472 4472
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4fa80e63eea2..2ce7ee3b4ec1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,18 +43,6 @@
43#include "xfs_sysfs.h" 43#include "xfs_sysfs.h"
44 44
45 45
46#ifdef HAVE_PERCPU_SB
47STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
48 int);
49STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
50 int);
51STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
52#else
53
54#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
55#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
56#endif
57
58static DEFINE_MUTEX(xfs_uuid_table_mutex); 46static DEFINE_MUTEX(xfs_uuid_table_mutex);
59static int xfs_uuid_table_size; 47static int xfs_uuid_table_size;
60static uuid_t *xfs_uuid_table; 48static uuid_t *xfs_uuid_table;
@@ -347,8 +335,7 @@ reread:
347 goto reread; 335 goto reread;
348 } 336 }
349 337
350 /* Initialize per-cpu counters */ 338 xfs_reinit_percpu_counters(mp);
351 xfs_icsb_reinit_counters(mp);
352 339
353 /* no need to be quiet anymore, so reset the buf ops */ 340 /* no need to be quiet anymore, so reset the buf ops */
354 bp->b_ops = &xfs_sb_buf_ops; 341 bp->b_ops = &xfs_sb_buf_ops;
@@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1074 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1088 return 0; 1075 return 0;
1089 1076
1090 xfs_icsb_sync_counters(mp, 0);
1091
1092 /* 1077 /*
1093 * we don't need to do this if we are updating the superblock 1078 * we don't need to do this if we are updating the superblock
1094 * counters on every modification. 1079 * counters on every modification.
@@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp)
1099 return xfs_sync_sb(mp, true); 1084 return xfs_sync_sb(mp, true);
1100} 1085}
1101 1086
1102/* 1087int
1103 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply 1088xfs_mod_icount(
1104 * a delta to a specified field in the in-core superblock. Simply 1089 struct xfs_mount *mp,
1105 * switch on the field indicated and apply the delta to that field. 1090 int64_t delta)
1106 * Fields are not allowed to dip below zero, so if the delta would
1107 * do this do not apply it and return EINVAL.
1108 *
1109 * The m_sb_lock must be held when this routine is called.
1110 */
1111STATIC int
1112xfs_mod_incore_sb_unlocked(
1113 xfs_mount_t *mp,
1114 xfs_sb_field_t field,
1115 int64_t delta,
1116 int rsvd)
1117{ 1091{
1118 int scounter; /* short counter for 32 bit fields */ 1092 /* deltas are +/-64, hence the large batch size of 128. */
1119 long long lcounter; /* long counter for 64 bit fields */ 1093 __percpu_counter_add(&mp->m_icount, delta, 128);
1120 long long res_used, rem; 1094 if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
1121
1122 /*
1123 * With the in-core superblock spin lock held, switch
1124 * on the indicated field. Apply the delta to the
1125 * proper field. If the fields value would dip below
1126 * 0, then do not apply the delta and return EINVAL.
1127 */
1128 switch (field) {
1129 case XFS_SBS_ICOUNT:
1130 lcounter = (long long)mp->m_sb.sb_icount;
1131 lcounter += delta;
1132 if (lcounter < 0) {
1133 ASSERT(0);
1134 return -EINVAL;
1135 }
1136 mp->m_sb.sb_icount = lcounter;
1137 return 0;
1138 case XFS_SBS_IFREE:
1139 lcounter = (long long)mp->m_sb.sb_ifree;
1140 lcounter += delta;
1141 if (lcounter < 0) {
1142 ASSERT(0);
1143 return -EINVAL;
1144 }
1145 mp->m_sb.sb_ifree = lcounter;
1146 return 0;
1147 case XFS_SBS_FDBLOCKS:
1148 lcounter = (long long)
1149 mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1150 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1151
1152 if (delta > 0) { /* Putting blocks back */
1153 if (res_used > delta) {
1154 mp->m_resblks_avail += delta;
1155 } else {
1156 rem = delta - res_used;
1157 mp->m_resblks_avail = mp->m_resblks;
1158 lcounter += rem;
1159 }
1160 } else { /* Taking blocks away */
1161 lcounter += delta;
1162 if (lcounter >= 0) {
1163 mp->m_sb.sb_fdblocks = lcounter +
1164 XFS_ALLOC_SET_ASIDE(mp);
1165 return 0;
1166 }
1167
1168 /*
1169 * We are out of blocks, use any available reserved
1170 * blocks if were allowed to.
1171 */
1172 if (!rsvd)
1173 return -ENOSPC;
1174
1175 lcounter = (long long)mp->m_resblks_avail + delta;
1176 if (lcounter >= 0) {
1177 mp->m_resblks_avail = lcounter;
1178 return 0;
1179 }
1180 printk_once(KERN_WARNING
1181 "Filesystem \"%s\": reserve blocks depleted! "
1182 "Consider increasing reserve pool size.",
1183 mp->m_fsname);
1184 return -ENOSPC;
1185 }
1186
1187 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1188 return 0;
1189 case XFS_SBS_FREXTENTS:
1190 lcounter = (long long)mp->m_sb.sb_frextents;
1191 lcounter += delta;
1192 if (lcounter < 0) {
1193 return -ENOSPC;
1194 }
1195 mp->m_sb.sb_frextents = lcounter;
1196 return 0;
1197 case XFS_SBS_DBLOCKS:
1198 lcounter = (long long)mp->m_sb.sb_dblocks;
1199 lcounter += delta;
1200 if (lcounter < 0) {
1201 ASSERT(0);
1202 return -EINVAL;
1203 }
1204 mp->m_sb.sb_dblocks = lcounter;
1205 return 0;
1206 case XFS_SBS_AGCOUNT:
1207 scounter = mp->m_sb.sb_agcount;
1208 scounter += delta;
1209 if (scounter < 0) {
1210 ASSERT(0);
1211 return -EINVAL;
1212 }
1213 mp->m_sb.sb_agcount = scounter;
1214 return 0;
1215 case XFS_SBS_IMAX_PCT:
1216 scounter = mp->m_sb.sb_imax_pct;
1217 scounter += delta;
1218 if (scounter < 0) {
1219 ASSERT(0);
1220 return -EINVAL;
1221 }
1222 mp->m_sb.sb_imax_pct = scounter;
1223 return 0;
1224 case XFS_SBS_REXTSIZE:
1225 scounter = mp->m_sb.sb_rextsize;
1226 scounter += delta;
1227 if (scounter < 0) {
1228 ASSERT(0);
1229 return -EINVAL;
1230 }
1231 mp->m_sb.sb_rextsize = scounter;
1232 return 0;
1233 case XFS_SBS_RBMBLOCKS:
1234 scounter = mp->m_sb.sb_rbmblocks;
1235 scounter += delta;
1236 if (scounter < 0) {
1237 ASSERT(0);
1238 return -EINVAL;
1239 }
1240 mp->m_sb.sb_rbmblocks = scounter;
1241 return 0;
1242 case XFS_SBS_RBLOCKS:
1243 lcounter = (long long)mp->m_sb.sb_rblocks;
1244 lcounter += delta;
1245 if (lcounter < 0) {
1246 ASSERT(0);
1247 return -EINVAL;
1248 }
1249 mp->m_sb.sb_rblocks = lcounter;
1250 return 0;
1251 case XFS_SBS_REXTENTS:
1252 lcounter = (long long)mp->m_sb.sb_rextents;
1253 lcounter += delta;
1254 if (lcounter < 0) {
1255 ASSERT(0);
1256 return -EINVAL;
1257 }
1258 mp->m_sb.sb_rextents = lcounter;
1259 return 0;
1260 case XFS_SBS_REXTSLOG:
1261 scounter = mp->m_sb.sb_rextslog;
1262 scounter += delta;
1263 if (scounter < 0) {
1264 ASSERT(0);
1265 return -EINVAL;
1266 }
1267 mp->m_sb.sb_rextslog = scounter;
1268 return 0;
1269 default:
1270 ASSERT(0); 1095 ASSERT(0);
1096 percpu_counter_add(&mp->m_icount, -delta);
1271 return -EINVAL; 1097 return -EINVAL;
1272 } 1098 }
1099 return 0;
1273} 1100}
1274 1101
1275/*
1276 * xfs_mod_incore_sb() is used to change a field in the in-core
1277 * superblock structure by the specified delta. This modification
1278 * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
1279 * routine to do the work.
1280 */
1281int 1102int
1282xfs_mod_incore_sb( 1103xfs_mod_ifree(
1283 struct xfs_mount *mp, 1104 struct xfs_mount *mp,
1284 xfs_sb_field_t field, 1105 int64_t delta)
1285 int64_t delta,
1286 int rsvd)
1287{ 1106{
1288 int status; 1107 percpu_counter_add(&mp->m_ifree, delta);
1289 1108 if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
1290#ifdef HAVE_PERCPU_SB 1109 ASSERT(0);
1291 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); 1110 percpu_counter_add(&mp->m_ifree, -delta);
1292#endif 1111 return -EINVAL;
1293 spin_lock(&mp->m_sb_lock); 1112 }
1294 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1113 return 0;
1295 spin_unlock(&mp->m_sb_lock);
1296
1297 return status;
1298} 1114}
1299 1115
1300/*
1301 * Change more than one field in the in-core superblock structure at a time.
1302 *
1303 * The fields and changes to those fields are specified in the array of
1304 * xfs_mod_sb structures passed in. Either all of the specified deltas
1305 * will be applied or none of them will. If any modified field dips below 0,
1306 * then all modifications will be backed out and EINVAL will be returned.
1307 *
1308 * Note that this function may not be used for the superblock values that
1309 * are tracked with the in-memory per-cpu counters - a direct call to
1310 * xfs_icsb_modify_counters is required for these.
1311 */
1312int 1116int
1313xfs_mod_incore_sb_batch( 1117xfs_mod_fdblocks(
1314 struct xfs_mount *mp, 1118 struct xfs_mount *mp,
1315 xfs_mod_sb_t *msb, 1119 int64_t delta,
1316 uint nmsb, 1120 bool rsvd)
1317 int rsvd)
1318{ 1121{
1319 xfs_mod_sb_t *msbp; 1122 int64_t lcounter;
1320 int error = 0; 1123 long long res_used;
1124 s32 batch;
1125
1126 if (delta > 0) {
1127 /*
1128 * If the reserve pool is depleted, put blocks back into it
1129 * first. Most of the time the pool is full.
1130 */
1131 if (likely(mp->m_resblks == mp->m_resblks_avail)) {
1132 percpu_counter_add(&mp->m_fdblocks, delta);
1133 return 0;
1134 }
1135
1136 spin_lock(&mp->m_sb_lock);
1137 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1138
1139 if (res_used > delta) {
1140 mp->m_resblks_avail += delta;
1141 } else {
1142 delta -= res_used;
1143 mp->m_resblks_avail = mp->m_resblks;
1144 percpu_counter_add(&mp->m_fdblocks, delta);
1145 }
1146 spin_unlock(&mp->m_sb_lock);
1147 return 0;
1148 }
1321 1149
1322 /* 1150 /*
1323 * Loop through the array of mod structures and apply each individually. 1151 * Taking blocks away, need to be more accurate the closer we
1324 * If any fail, then back out all those which have already been applied. 1152 * are to zero.
1325 * Do all of this within the scope of the m_sb_lock so that all of the 1153 *
1326 * changes will be atomic. 1154 * batch size is set to a maximum of 1024 blocks - if we are
1155 * allocating or freeing extents larger than this then we aren't
1156 * going to be hammering the counter lock so a lock per update
1157 * is not a problem.
1158 *
1159 * If the counter has a value of less than 2 * max batch size,
1160 * then make everything serialise as we are really close to
1161 * ENOSPC.
1162 */
1163#define __BATCH 1024
1164 if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0)
1165 batch = 1;
1166 else
1167 batch = __BATCH;
1168#undef __BATCH
1169
1170 __percpu_counter_add(&mp->m_fdblocks, delta, batch);
1171 if (percpu_counter_compare(&mp->m_fdblocks,
1172 XFS_ALLOC_SET_ASIDE(mp)) >= 0) {
1173 /* we had space! */
1174 return 0;
1175 }
1176
1177 /*
1178 * take the sb lock for dipping into reserves before releasing the space
1179 * that took us to ENOSPC.
1327 */ 1180 */
1328 spin_lock(&mp->m_sb_lock); 1181 spin_lock(&mp->m_sb_lock);
1329 for (msbp = msb; msbp < (msb + nmsb); msbp++) { 1182 percpu_counter_add(&mp->m_fdblocks, -delta);
1330 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || 1183 if (!rsvd)
1331 msbp->msb_field > XFS_SBS_FDBLOCKS); 1184 goto fdblocks_enospc;
1332 1185
1333 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1186 lcounter = (long long)mp->m_resblks_avail + delta;
1334 msbp->msb_delta, rsvd); 1187 if (lcounter >= 0) {
1335 if (error) 1188 mp->m_resblks_avail = lcounter;
1336 goto unwind; 1189 spin_unlock(&mp->m_sb_lock);
1190 return 0;
1337 } 1191 }
1192 printk_once(KERN_WARNING
1193 "Filesystem \"%s\": reserve blocks depleted! "
1194 "Consider increasing reserve pool size.",
1195 mp->m_fsname);
1196fdblocks_enospc:
1338 spin_unlock(&mp->m_sb_lock); 1197 spin_unlock(&mp->m_sb_lock);
1339 return 0; 1198 return -ENOSPC;
1199}
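
The batch sizing above is the crux of the new scheme: far from ENOSPC a large batch keeps updates CPU-local, and near zero the batch drops to 1 so percpu_counter_compare() sees an exact value. Below is a minimal, compilable userspace sketch of the same idea; BATCH, counter_add() and the thread-local cache are illustrative stand-ins, not the kernel's percpu_counter API.

#include <stdatomic.h>
#include <stdio.h>

#define BATCH 1024

static _Atomic long long global;       /* shared total, cheap to read   */
static _Thread_local long long local;  /* this thread's unflushed delta */

static void counter_add(long long delta)
{
	long long batch = BATCH;

	/* near zero, flush every update so ENOSPC checks stay exact */
	if (atomic_load(&global) < 2 * BATCH)
		batch = 1;

	local += delta;
	if (local >= batch || local <= -batch) {
		atomic_fetch_add(&global, local);
		local = 0;
	}
}

int main(void)
{
	counter_add(4096);
	counter_add(-512);
	atomic_fetch_add(&global, local);	/* final flush */
	printf("global = %lld\n", (long long)atomic_load(&global));
	return 0;
}
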
1340 1200
1341unwind: 1201int
1342 while (--msbp >= msb) { 1202xfs_mod_frextents(
1343 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1203 struct xfs_mount *mp,
1344 -msbp->msb_delta, rsvd); 1204 int64_t delta)
1345 ASSERT(error == 0); 1205{
1346 } 1206 int64_t lcounter;
1207 int ret = 0;
1208
1209 spin_lock(&mp->m_sb_lock);
1210 lcounter = mp->m_sb.sb_frextents + delta;
1211 if (lcounter < 0)
1212 ret = -ENOSPC;
1213 else
1214 mp->m_sb.sb_frextents = lcounter;
1347 spin_unlock(&mp->m_sb_lock); 1215 spin_unlock(&mp->m_sb_lock);
1348 return error; 1216 return ret;
1349} 1217}
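
Taken together, these hunks replace the switch-driven xfs_mod_incore_sb() entry points with one typed helper per counter. The call-site conversion is mechanical, as the xfs_trans.c hunk further down shows; roughly:

	/* before: a field enum selects the counter, int flag for reserves */
	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
				  -((int64_t)blocks), rsvd);

	/* after: a dedicated helper, reserve-pool dip as a bool */
	error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
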
1350 1218
1351/* 1219/*
@@ -1407,573 +1275,3 @@ xfs_dev_is_read_only(
1407 } 1275 }
1408 return 0; 1276 return 0;
1409} 1277}
1410
1411#ifdef HAVE_PERCPU_SB
1412/*
1413 * Per-cpu incore superblock counters
1414 *
1415 * Simple concept, difficult implementation
1416 *
1417 * Basically, replace the incore superblock counters with a distributed per cpu
1418 * counter for contended fields (e.g. free block count).
1419 *
1420 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
1421 * hence needs to be accurately read when we are running low on space. Hence
1422 * there is a method to enable and disable the per-cpu counters based on how
1423 * much "stuff" is available in them.
1424 *
1425 * Basically, a counter is enabled if there is enough free resource to justify
1426 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
1427 * ENOSPC), then we disable the counters to synchronise all callers and
1428 * re-distribute the available resources.
1429 *
1430 * If, once we redistributed the available resources, we still get a failure,
1431 * we disable the per-cpu counter and go through the slow path.
1432 *
1433 * The slow path is the current xfs_mod_incore_sb() function. This means that
1434 * when we disable a per-cpu counter, we need to drain its resources back to
1435 * the global superblock. We do this after disabling the counter to prevent
1436 * more threads from queueing up on the counter.
1437 *
1438 * Essentially, this means that we still need a lock in the fast path to enable
1439 * synchronisation between the global counters and the per-cpu counters. This
1440 * is not a problem because the lock will be local to a CPU almost all the time
1441 * and have little contention except when we get to ENOSPC conditions.
1442 *
1443 * Basically, this lock becomes a barrier that enables us to lock out the fast
1444 * path while we do things like enabling and disabling counters and
1445 * synchronising the counters.
1446 *
1447 * Locking rules:
1448 *
1449 * 1. m_sb_lock before picking up per-cpu locks
1450 * 2. per-cpu locks always picked up via for_each_online_cpu() order
1451 * 3. accurate counter sync requires m_sb_lock + per cpu locks
1452 * 4. modifying per-cpu counters requires holding per-cpu lock
1453 * 5. modifying global counters requires holding m_sb_lock
1454 * 6. enabling or disabling a counter requires holding the m_sb_lock
1455 * and _none_ of the per-cpu locks.
1456 *
1457 * Disabled counters are only ever re-enabled by a balance operation
1458 * that results in more free resources per CPU than a given threshold.
1459 * To ensure counters don't remain disabled, they are rebalanced when
1460 * the global resource goes above a higher threshold (i.e. some hysteresis
1461 * is present to prevent thrashing).
1462 */
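
The enable/disable scheme described above boils down to a per-CPU share test with a floor. A toy, compilable model of the re-enable decision follows; the names and thresholds are illustrative, not the XFS constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* enable the per-cpu fast path only when each CPU's share of the
 * global count clears a threshold */
static bool counter_can_enable(uint64_t global, unsigned int ncpus,
			       uint64_t min_per_cpu)
{
	return ncpus && (global / ncpus) >= min_per_cpu;
}

int main(void)
{
	printf("%d\n", counter_can_enable(100000, 8, 512)); /* 1: enable   */
	printf("%d\n", counter_can_enable(3000, 8, 512));   /* 0: stay off */
	return 0;
}
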
1463
1464#ifdef CONFIG_HOTPLUG_CPU
1465/*
1466 * hot-plug CPU notifier support.
1467 *
1468 * We need a notifier per filesystem as we need to be able to identify
1469 * the filesystem to balance the counters out. This is achieved by
1470 * having a notifier block embedded in the xfs_mount_t and doing pointer
1471 * magic to get the mount pointer from the notifier block address.
1472 */
1473STATIC int
1474xfs_icsb_cpu_notify(
1475 struct notifier_block *nfb,
1476 unsigned long action,
1477 void *hcpu)
1478{
1479 xfs_icsb_cnts_t *cntp;
1480 xfs_mount_t *mp;
1481
1482 mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
1483 cntp = (xfs_icsb_cnts_t *)
1484 per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
1485 switch (action) {
1486 case CPU_UP_PREPARE:
1487 case CPU_UP_PREPARE_FROZEN:
1488 /* Easy Case - initialize the area and locks, and
1489 * then rebalance when online does everything else for us. */
1490 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1491 break;
1492 case CPU_ONLINE:
1493 case CPU_ONLINE_FROZEN:
1494 xfs_icsb_lock(mp);
1495 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1496 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1497 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1498 xfs_icsb_unlock(mp);
1499 break;
1500 case CPU_DEAD:
1501 case CPU_DEAD_FROZEN:
1502 /* Disable all the counters, then fold the dead cpu's
1503 * count into the total on the global superblock and
1504 * re-enable the counters. */
1505 xfs_icsb_lock(mp);
1506 spin_lock(&mp->m_sb_lock);
1507 xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
1508 xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
1509 xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
1510
1511 mp->m_sb.sb_icount += cntp->icsb_icount;
1512 mp->m_sb.sb_ifree += cntp->icsb_ifree;
1513 mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
1514
1515 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1516
1517 xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
1518 xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
1519 xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
1520 spin_unlock(&mp->m_sb_lock);
1521 xfs_icsb_unlock(mp);
1522 break;
1523 }
1524
1525 return NOTIFY_OK;
1526}
1527#endif /* CONFIG_HOTPLUG_CPU */
1528
1529int
1530xfs_icsb_init_counters(
1531 xfs_mount_t *mp)
1532{
1533 xfs_icsb_cnts_t *cntp;
1534 int i;
1535
1536 mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
1537 if (mp->m_sb_cnts == NULL)
1538 return -ENOMEM;
1539
1540 for_each_online_cpu(i) {
1541 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1542 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1543 }
1544
1545 mutex_init(&mp->m_icsb_mutex);
1546
1547 /*
1548 * start with all counters disabled so that the
1549 * initial balance kicks us off correctly
1550 */
1551 mp->m_icsb_counters = -1;
1552
1553#ifdef CONFIG_HOTPLUG_CPU
1554 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1555 mp->m_icsb_notifier.priority = 0;
1556 register_hotcpu_notifier(&mp->m_icsb_notifier);
1557#endif /* CONFIG_HOTPLUG_CPU */
1558
1559 return 0;
1560}
1561
1562void
1563xfs_icsb_reinit_counters(
1564 xfs_mount_t *mp)
1565{
1566 xfs_icsb_lock(mp);
1567 /*
1568 * start with all counters disabled so that the
1569 * initial balance kicks us off correctly
1570 */
1571 mp->m_icsb_counters = -1;
1572 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1573 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1574 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1575 xfs_icsb_unlock(mp);
1576}
1577
1578void
1579xfs_icsb_destroy_counters(
1580 xfs_mount_t *mp)
1581{
1582 if (mp->m_sb_cnts) {
1583 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1584 free_percpu(mp->m_sb_cnts);
1585 }
1586 mutex_destroy(&mp->m_icsb_mutex);
1587}
1588
1589STATIC void
1590xfs_icsb_lock_cntr(
1591 xfs_icsb_cnts_t *icsbp)
1592{
1593 while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
1594 ndelay(1000);
1595 }
1596}
1597
1598STATIC void
1599xfs_icsb_unlock_cntr(
1600 xfs_icsb_cnts_t *icsbp)
1601{
1602 clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
1603}
1604
1605
1606STATIC void
1607xfs_icsb_lock_all_counters(
1608 xfs_mount_t *mp)
1609{
1610 xfs_icsb_cnts_t *cntp;
1611 int i;
1612
1613 for_each_online_cpu(i) {
1614 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1615 xfs_icsb_lock_cntr(cntp);
1616 }
1617}
1618
1619STATIC void
1620xfs_icsb_unlock_all_counters(
1621 xfs_mount_t *mp)
1622{
1623 xfs_icsb_cnts_t *cntp;
1624 int i;
1625
1626 for_each_online_cpu(i) {
1627 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1628 xfs_icsb_unlock_cntr(cntp);
1629 }
1630}
1631
1632STATIC void
1633xfs_icsb_count(
1634 xfs_mount_t *mp,
1635 xfs_icsb_cnts_t *cnt,
1636 int flags)
1637{
1638 xfs_icsb_cnts_t *cntp;
1639 int i;
1640
1641 memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
1642
1643 if (!(flags & XFS_ICSB_LAZY_COUNT))
1644 xfs_icsb_lock_all_counters(mp);
1645
1646 for_each_online_cpu(i) {
1647 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1648 cnt->icsb_icount += cntp->icsb_icount;
1649 cnt->icsb_ifree += cntp->icsb_ifree;
1650 cnt->icsb_fdblocks += cntp->icsb_fdblocks;
1651 }
1652
1653 if (!(flags & XFS_ICSB_LAZY_COUNT))
1654 xfs_icsb_unlock_all_counters(mp);
1655}
1656
1657STATIC int
1658xfs_icsb_counter_disabled(
1659 xfs_mount_t *mp,
1660 xfs_sb_field_t field)
1661{
1662 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1663 return test_bit(field, &mp->m_icsb_counters);
1664}
1665
1666STATIC void
1667xfs_icsb_disable_counter(
1668 xfs_mount_t *mp,
1669 xfs_sb_field_t field)
1670{
1671 xfs_icsb_cnts_t cnt;
1672
1673 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1674
1675 /*
1676 * If we are already disabled, then there is nothing to do
1677 * here. We check before locking all the counters to avoid
1678 * the expensive lock operation when being called in the
1679 * slow path and the counter is already disabled. This is
1680 * safe because the only time we set or clear this state is under
1681 * the m_icsb_mutex.
1682 */
1683 if (xfs_icsb_counter_disabled(mp, field))
1684 return;
1685
1686 xfs_icsb_lock_all_counters(mp);
1687 if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
1688 /* drain back to superblock */
1689
1690 xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
1691 switch (field) {
1692 case XFS_SBS_ICOUNT:
1693 mp->m_sb.sb_icount = cnt.icsb_icount;
1694 break;
1695 case XFS_SBS_IFREE:
1696 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1697 break;
1698 case XFS_SBS_FDBLOCKS:
1699 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1700 break;
1701 default:
1702 BUG();
1703 }
1704 }
1705
1706 xfs_icsb_unlock_all_counters(mp);
1707}
1708
1709STATIC void
1710xfs_icsb_enable_counter(
1711 xfs_mount_t *mp,
1712 xfs_sb_field_t field,
1713 uint64_t count,
1714 uint64_t resid)
1715{
1716 xfs_icsb_cnts_t *cntp;
1717 int i;
1718
1719 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1720
1721 xfs_icsb_lock_all_counters(mp);
1722 for_each_online_cpu(i) {
1723 cntp = per_cpu_ptr(mp->m_sb_cnts, i);
1724 switch (field) {
1725 case XFS_SBS_ICOUNT:
1726 cntp->icsb_icount = count + resid;
1727 break;
1728 case XFS_SBS_IFREE:
1729 cntp->icsb_ifree = count + resid;
1730 break;
1731 case XFS_SBS_FDBLOCKS:
1732 cntp->icsb_fdblocks = count + resid;
1733 break;
1734 default:
1735 BUG();
1736 break;
1737 }
1738 resid = 0;
1739 }
1740 clear_bit(field, &mp->m_icsb_counters);
1741 xfs_icsb_unlock_all_counters(mp);
1742}
1743
1744void
1745xfs_icsb_sync_counters_locked(
1746 xfs_mount_t *mp,
1747 int flags)
1748{
1749 xfs_icsb_cnts_t cnt;
1750
1751 xfs_icsb_count(mp, &cnt, flags);
1752
1753 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
1754 mp->m_sb.sb_icount = cnt.icsb_icount;
1755 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
1756 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1757 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
1758 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1759}
1760
1761/*
1762 * Accurate update of per-cpu counters to incore superblock
1763 */
1764void
1765xfs_icsb_sync_counters(
1766 xfs_mount_t *mp,
1767 int flags)
1768{
1769 spin_lock(&mp->m_sb_lock);
1770 xfs_icsb_sync_counters_locked(mp, flags);
1771 spin_unlock(&mp->m_sb_lock);
1772}
1773
1774/*
1775 * Balance and enable/disable counters as necessary.
1776 *
1777 * Thresholds for re-enabling counters are somewhat magic. inode counts are
1778 * chosen to be the same as a single on-disk allocation chunk per CPU, and
1779 * free blocks is set far enough from zero that we aren't going to thrash when
1780 * we get near ENOSPC. We also need to supply a minimum we require per cpu to
1781 * prevent looping endlessly when xfs_alloc_space asks for more than will
1782 * be distributed to a single CPU but each CPU has enough blocks to be
1783 * reenabled.
1784 *
1785 * Note that we can be called when counters are already disabled.
1786 * xfs_icsb_disable_counter() optimises the counter locking in this case to
1787 * prevent locking every per-cpu counter needlessly.
1788 */
1789
1790#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
1791#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
1792 (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
1793STATIC void
1794xfs_icsb_balance_counter_locked(
1795 xfs_mount_t *mp,
1796 xfs_sb_field_t field,
1797 int min_per_cpu)
1798{
1799 uint64_t count, resid;
1800 int weight = num_online_cpus();
1801 uint64_t min = (uint64_t)min_per_cpu;
1802
1803 /* disable counter and sync counter */
1804 xfs_icsb_disable_counter(mp, field);
1805
1806 /* update counters - first CPU gets residual */
1807 switch (field) {
1808 case XFS_SBS_ICOUNT:
1809 count = mp->m_sb.sb_icount;
1810 resid = do_div(count, weight);
1811 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1812 return;
1813 break;
1814 case XFS_SBS_IFREE:
1815 count = mp->m_sb.sb_ifree;
1816 resid = do_div(count, weight);
1817 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1818 return;
1819 break;
1820 case XFS_SBS_FDBLOCKS:
1821 count = mp->m_sb.sb_fdblocks;
1822 resid = do_div(count, weight);
1823 if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
1824 return;
1825 break;
1826 default:
1827 BUG();
1828 count = resid = 0; /* quiet, gcc */
1829 break;
1830 }
1831
1832 xfs_icsb_enable_counter(mp, field, count, resid);
1833}
1834
1835STATIC void
1836xfs_icsb_balance_counter(
1837 xfs_mount_t *mp,
1838 xfs_sb_field_t fields,
1839 int min_per_cpu)
1840{
1841 spin_lock(&mp->m_sb_lock);
1842 xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
1843 spin_unlock(&mp->m_sb_lock);
1844}
1845
1846int
1847xfs_icsb_modify_counters(
1848 xfs_mount_t *mp,
1849 xfs_sb_field_t field,
1850 int64_t delta,
1851 int rsvd)
1852{
1853 xfs_icsb_cnts_t *icsbp;
1854 long long lcounter; /* long counter for 64 bit fields */
1855 int ret = 0;
1856
1857 might_sleep();
1858again:
1859 preempt_disable();
1860 icsbp = this_cpu_ptr(mp->m_sb_cnts);
1861
1862 /*
1863 * if the counter is disabled, go to slow path
1864 */
1865 if (unlikely(xfs_icsb_counter_disabled(mp, field)))
1866 goto slow_path;
1867 xfs_icsb_lock_cntr(icsbp);
1868 if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
1869 xfs_icsb_unlock_cntr(icsbp);
1870 goto slow_path;
1871 }
1872
1873 switch (field) {
1874 case XFS_SBS_ICOUNT:
1875 lcounter = icsbp->icsb_icount;
1876 lcounter += delta;
1877 if (unlikely(lcounter < 0))
1878 goto balance_counter;
1879 icsbp->icsb_icount = lcounter;
1880 break;
1881
1882 case XFS_SBS_IFREE:
1883 lcounter = icsbp->icsb_ifree;
1884 lcounter += delta;
1885 if (unlikely(lcounter < 0))
1886 goto balance_counter;
1887 icsbp->icsb_ifree = lcounter;
1888 break;
1889
1890 case XFS_SBS_FDBLOCKS:
1891 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
1892
1893 lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1894 lcounter += delta;
1895 if (unlikely(lcounter < 0))
1896 goto balance_counter;
1897 icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1898 break;
1899 default:
1900 BUG();
1901 break;
1902 }
1903 xfs_icsb_unlock_cntr(icsbp);
1904 preempt_enable();
1905 return 0;
1906
1907slow_path:
1908 preempt_enable();
1909
1910 /*
1911 * serialise with a mutex so we don't burn lots of cpu on
1912 * the superblock lock. We still need to hold the superblock
1913 * lock, however, when we modify the global structures.
1914 */
1915 xfs_icsb_lock(mp);
1916
1917 /*
1918 * Now running atomically.
1919 *
1920 * If the counter is enabled, someone has beaten us to rebalancing.
1921 * Drop the lock and try again in the fast path....
1922 */
1923 if (!(xfs_icsb_counter_disabled(mp, field))) {
1924 xfs_icsb_unlock(mp);
1925 goto again;
1926 }
1927
1928 /*
1929 * The counter is currently disabled. Because we are
1930 * running atomically here, we know a rebalance cannot
1931 * be in progress. Hence we can go straight to operating
1932 * on the global superblock. We do not call xfs_mod_incore_sb()
1933 * here even though we need to get the m_sb_lock. Doing so
1934 * will cause us to re-enter this function and deadlock.
1935 * Hence we get the m_sb_lock ourselves and then call
1936 * xfs_mod_incore_sb_unlocked() as the unlocked path operates
1937 * directly on the global counters.
1938 */
1939 spin_lock(&mp->m_sb_lock);
1940 ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1941 spin_unlock(&mp->m_sb_lock);
1942
1943 /*
1944 * Now that we've modified the global superblock, we
1945 * may be able to re-enable the distributed counters
1946 * (e.g. lots of space just got freed). After that
1947 * we are done.
1948 */
1949 if (ret != -ENOSPC)
1950 xfs_icsb_balance_counter(mp, field, 0);
1951 xfs_icsb_unlock(mp);
1952 return ret;
1953
1954balance_counter:
1955 xfs_icsb_unlock_cntr(icsbp);
1956 preempt_enable();
1957
1958 /*
1959 * We may have multiple threads here if multiple per-cpu
1960 * counters run dry at the same time. This will mean we can
1961 * do more balances than strictly necessary but it is not
1962 * the common slowpath case.
1963 */
1964 xfs_icsb_lock(mp);
1965
1966 /*
1967 * running atomically.
1968 *
1969 * This will leave the counter in the correct state for future
1970 * accesses. After the rebalance, we simply try again and our retry
1971 * will either succeed through the fast path or slow path without
1972 * another balance operation being required.
1973 */
1974 xfs_icsb_balance_counter(mp, field, delta);
1975 xfs_icsb_unlock(mp);
1976 goto again;
1977}
1978
1979#endif
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0d8abd6364d9..8c995a2ccb6f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21#ifdef __KERNEL__
22
23struct xlog; 21struct xlog;
24struct xfs_inode; 22struct xfs_inode;
25struct xfs_mru_cache; 23struct xfs_mru_cache;
@@ -29,44 +27,6 @@ struct xfs_quotainfo;
29struct xfs_dir_ops; 27struct xfs_dir_ops;
30struct xfs_da_geometry; 28struct xfs_da_geometry;
31 29
32#ifdef HAVE_PERCPU_SB
33
34/*
35 * Valid per-cpu incore superblock counters. Note that if you add new counters,
36 * you may need to define new counter disabled bit field descriptors as there
37 * are more possible fields in the superblock that can fit in a bitfield on a
38 * 32 bit platform. The XFS_SBS_* values for the current counters just
39 * fit.
40 */
41typedef struct xfs_icsb_cnts {
42 uint64_t icsb_fdblocks;
43 uint64_t icsb_ifree;
44 uint64_t icsb_icount;
45 unsigned long icsb_flags;
46} xfs_icsb_cnts_t;
47
48#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */
49
50#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
51
52extern int xfs_icsb_init_counters(struct xfs_mount *);
53extern void xfs_icsb_reinit_counters(struct xfs_mount *);
54extern void xfs_icsb_destroy_counters(struct xfs_mount *);
55extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
56extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
57extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
58 int64_t, int);
59
60#else
61#define xfs_icsb_init_counters(mp) (0)
62#define xfs_icsb_destroy_counters(mp) do { } while (0)
63#define xfs_icsb_reinit_counters(mp) do { } while (0)
64#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
65#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
66#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
67 xfs_mod_incore_sb(mp, field, delta, rsvd)
68#endif
69
70/* dynamic preallocation free space thresholds, 5% down to 1% */ 30/* dynamic preallocation free space thresholds, 5% down to 1% */
71enum { 31enum {
72 XFS_LOWSP_1_PCNT = 0, 32 XFS_LOWSP_1_PCNT = 0,
@@ -81,8 +41,13 @@ typedef struct xfs_mount {
81 struct super_block *m_super; 41 struct super_block *m_super;
82 xfs_tid_t m_tid; /* next unused tid for fs */ 42 xfs_tid_t m_tid; /* next unused tid for fs */
83 struct xfs_ail *m_ail; /* fs active log item list */ 43 struct xfs_ail *m_ail; /* fs active log item list */
84 xfs_sb_t m_sb; /* copy of fs superblock */ 44
45 struct xfs_sb m_sb; /* copy of fs superblock */
85 spinlock_t m_sb_lock; /* sb counter lock */ 46 spinlock_t m_sb_lock; /* sb counter lock */
47 struct percpu_counter m_icount; /* allocated inodes counter */
48 struct percpu_counter m_ifree; /* free inodes counter */
49 struct percpu_counter m_fdblocks; /* free block counter */
50
86 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 51 struct xfs_buf *m_sb_bp; /* buffer for superblock */
87 char *m_fsname; /* filesystem name */ 52 char *m_fsname; /* filesystem name */
88 int m_fsname_len; /* strlen of fs name */ 53 int m_fsname_len; /* strlen of fs name */
@@ -152,12 +117,6 @@ typedef struct xfs_mount {
152 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ 117 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
153 uint m_chsize; /* size of next field */ 118 uint m_chsize; /* size of next field */
154 atomic_t m_active_trans; /* number trans frozen */ 119 atomic_t m_active_trans; /* number trans frozen */
155#ifdef HAVE_PERCPU_SB
156 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
157 unsigned long m_icsb_counters; /* disabled per-cpu counters */
158 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
159 struct mutex m_icsb_mutex; /* balancer sync lock */
160#endif
161 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 120 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 121 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 122 struct delayed_work m_eofblocks_work; /* background eof blocks
@@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
301} 260}
302 261
303/* 262/*
304 * Per-cpu superblock locking functions
305 */
306#ifdef HAVE_PERCPU_SB
307static inline void
308xfs_icsb_lock(xfs_mount_t *mp)
309{
310 mutex_lock(&mp->m_icsb_mutex);
311}
312
313static inline void
314xfs_icsb_unlock(xfs_mount_t *mp)
315{
316 mutex_unlock(&mp->m_icsb_mutex);
317}
318#else
319#define xfs_icsb_lock(mp)
320#define xfs_icsb_unlock(mp)
321#endif
322
323/*
324 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
325 * xfs_growfs can specify a few fields which are more than int limit
326 */
327typedef struct xfs_mod_sb {
328 xfs_sb_field_t msb_field; /* Field to modify, see below */
329 int64_t msb_delta; /* Change to make to specified field */
330} xfs_mod_sb_t;
331
332/*
333 * Per-ag incore structure, copies of information in agf and agi, to improve the 263 * Per-ag incore structure, copies of information in agf and agi, to improve the
334 * performance of allocation group selection. 264 * performance of allocation group selection.
335 */ 265 */
@@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
383extern int xfs_mountfs(xfs_mount_t *mp); 313extern int xfs_mountfs(xfs_mount_t *mp);
384extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, 314extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
385 xfs_agnumber_t *maxagi); 315 xfs_agnumber_t *maxagi);
386
387extern void xfs_unmountfs(xfs_mount_t *); 316extern void xfs_unmountfs(xfs_mount_t *);
388extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 317
389extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 318extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
390 uint, int); 319extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
320extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
321 bool reserved);
322extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
323
391extern int xfs_mount_log_sb(xfs_mount_t *); 324extern int xfs_mount_log_sb(xfs_mount_t *);
392extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 325extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
393extern int xfs_readsb(xfs_mount_t *, int); 326extern int xfs_readsb(xfs_mount_t *, int);
@@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
399 332
400extern void xfs_set_low_space_thresholds(struct xfs_mount *); 333extern void xfs_set_low_space_thresholds(struct xfs_mount *);
401 334
402#endif /* __KERNEL__ */
403
404#endif /* __XFS_MOUNT_H__ */ 335#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 30ecca3037e3..f8a674d7f092 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -437,7 +437,7 @@ xfs_mru_cache_insert(
437 if (!mru || !mru->lists) 437 if (!mru || !mru->lists)
438 return -EINVAL; 438 return -EINVAL;
439 439
440 if (radix_tree_preload(GFP_KERNEL)) 440 if (radix_tree_preload(GFP_NOFS))
441 return -ENOMEM; 441 return -ENOMEM;
442 442
443 INIT_LIST_HEAD(&elem->list_node); 443 INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57ea760..981a657eca39 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@
31int 31int
32xfs_break_layouts( 32xfs_break_layouts(
33 struct inode *inode, 33 struct inode *inode,
34 uint *iolock) 34 uint *iolock,
35 bool with_imutex)
35{ 36{
36 struct xfs_inode *ip = XFS_I(inode); 37 struct xfs_inode *ip = XFS_I(inode);
37 int error; 38 int error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
40 41
41 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { 42 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
42 xfs_iunlock(ip, *iolock); 43 xfs_iunlock(ip, *iolock);
44 if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
45 mutex_unlock(&inode->i_mutex);
43 error = break_layout(inode, true); 46 error = break_layout(inode, true);
44 *iolock = XFS_IOLOCK_EXCL; 47 *iolock = XFS_IOLOCK_EXCL;
48 if (with_imutex)
49 mutex_lock(&inode->i_mutex);
45 xfs_ilock(ip, *iolock); 50 xfs_ilock(ip, *iolock);
46 } 51 }
47 52
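
The with_imutex handling encodes a standard ordering rule: never block waiting for the layout while holding i_mutex, and retake i_mutex before the iolock so the lock order is preserved. A compilable userspace sketch of that shape, with all names illustrative:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* i_mutex */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* iolock  */

static int layout_busy(void) { return 0; }  /* stands in for -EWOULDBLOCK     */
static void wait_for_layout(void) { }       /* stands in for break_layout(true) */

static void break_layouts_demo(void)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
	while (layout_busy()) {
		/* drop both locks before blocking... */
		pthread_mutex_unlock(&inner);
		pthread_mutex_unlock(&outer);
		wait_for_layout();
		/* ...then retake them in the canonical order */
		pthread_mutex_lock(&outer);
		pthread_mutex_lock(&inner);
	}
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	break_layouts_demo();
	return 0;
}
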
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce660f6..8147ac108820 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, 8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr); 9 struct iattr *iattr);
10 10
11int xfs_break_layouts(struct inode *inode, uint *iolock); 11int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
12#else 12#else
13static inline int xfs_break_layouts(struct inode *inode, uint *iolock) 13static inline int
14xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
14{ 15{
15 return 0; 16 return 0;
16} 17}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fbbb9e62e274..5538468c7f63 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
719 xfs_trans_t *tp; 719 xfs_trans_t *tp;
720 int error; 720 int error;
721 int committed; 721 int committed;
722 bool need_alloc = true;
722 723
723 *ip = NULL; 724 *ip = NULL;
724 /* 725 /*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
747 return error; 748 return error;
748 mp->m_sb.sb_gquotino = NULLFSINO; 749 mp->m_sb.sb_gquotino = NULLFSINO;
749 mp->m_sb.sb_pquotino = NULLFSINO; 750 mp->m_sb.sb_pquotino = NULLFSINO;
751 need_alloc = false;
750 } 752 }
751 } 753 }
752 754
@@ -758,7 +760,7 @@ xfs_qm_qino_alloc(
758 return error; 760 return error;
759 } 761 }
760 762
761 if (!*ip) { 763 if (need_alloc) {
762 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
763 &committed); 765 &committed);
764 if (error) { 766 if (error) {
@@ -794,11 +796,14 @@ xfs_qm_qino_alloc(
794 spin_unlock(&mp->m_sb_lock); 796 spin_unlock(&mp->m_sb_lock);
795 xfs_log_sb(tp); 797 xfs_log_sb(tp);
796 798
797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 799 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
800 if (error) {
801 ASSERT(XFS_FORCED_SHUTDOWN(mp));
798 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 802 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
799 return error;
800 } 803 }
801 return 0; 804 if (need_alloc)
805 xfs_finish_inode_setup(*ip);
806 return error;
802} 807}
803 808
804 809
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8fcc4ccc5c79..5f357ca97e76 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
112#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
113#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
114#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ 112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
115#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ 113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
116 114
@@ -361,28 +359,10 @@ xfs_parseargs(
361 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 359 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
362 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 360 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
363 mp->m_qflags &= ~XFS_GQUOTA_ENFD; 361 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
365 xfs_warn(mp,
366 "delaylog is the default now, option is deprecated.");
367 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
368 xfs_warn(mp,
369 "nodelaylog support has been removed, option is deprecated.");
370 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 362 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
371 mp->m_flags |= XFS_MOUNT_DISCARD; 363 mp->m_flags |= XFS_MOUNT_DISCARD;
372 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
373 mp->m_flags &= ~XFS_MOUNT_DISCARD; 365 mp->m_flags &= ~XFS_MOUNT_DISCARD;
374 } else if (!strcmp(this_char, "ihashsize")) {
375 xfs_warn(mp,
376 "ihashsize no longer used, option is deprecated.");
377 } else if (!strcmp(this_char, "osyncisdsync")) {
378 xfs_warn(mp,
379 "osyncisdsync has no effect, option is deprecated.");
380 } else if (!strcmp(this_char, "osyncisosync")) {
381 xfs_warn(mp,
382 "osyncisosync has no effect, option is deprecated.");
383 } else if (!strcmp(this_char, "irixsgid")) {
384 xfs_warn(mp,
385 "irixsgid is now a sysctl(2) variable, option is deprecated.");
386 } else { 366 } else {
387 xfs_warn(mp, "unknown mount option [%s].", this_char); 367 xfs_warn(mp, "unknown mount option [%s].", this_char);
388 return -EINVAL; 368 return -EINVAL;
@@ -986,6 +966,8 @@ xfs_fs_inode_init_once(
986 atomic_set(&ip->i_pincount, 0); 966 atomic_set(&ip->i_pincount, 0);
987 spin_lock_init(&ip->i_flags_lock); 967 spin_lock_init(&ip->i_flags_lock);
988 968
969 mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
970 "xfsino", ip->i_ino);
989 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 971 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
990 "xfsino", ip->i_ino); 972 "xfsino", ip->i_ino);
991} 973}
@@ -1033,23 +1015,6 @@ xfs_free_fsname(
1033 kfree(mp->m_logname); 1015 kfree(mp->m_logname);
1034} 1016}
1035 1017
1036STATIC void
1037xfs_fs_put_super(
1038 struct super_block *sb)
1039{
1040 struct xfs_mount *mp = XFS_M(sb);
1041
1042 xfs_filestream_unmount(mp);
1043 xfs_unmountfs(mp);
1044
1045 xfs_freesb(mp);
1046 xfs_icsb_destroy_counters(mp);
1047 xfs_destroy_mount_workqueues(mp);
1048 xfs_close_devices(mp);
1049 xfs_free_fsname(mp);
1050 kfree(mp);
1051}
1052
1053STATIC int 1018STATIC int
1054xfs_fs_sync_fs( 1019xfs_fs_sync_fs(
1055 struct super_block *sb, 1020 struct super_block *sb,
@@ -1085,6 +1050,9 @@ xfs_fs_statfs(
1085 xfs_sb_t *sbp = &mp->m_sb; 1050 xfs_sb_t *sbp = &mp->m_sb;
1086 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1051 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1087 __uint64_t fakeinos, id; 1052 __uint64_t fakeinos, id;
1053 __uint64_t icount;
1054 __uint64_t ifree;
1055 __uint64_t fdblocks;
1088 xfs_extlen_t lsize; 1056 xfs_extlen_t lsize;
1089 __int64_t ffree; 1057 __int64_t ffree;
1090 1058
@@ -1095,17 +1063,21 @@ xfs_fs_statfs(
1095 statp->f_fsid.val[0] = (u32)id; 1063 statp->f_fsid.val[0] = (u32)id;
1096 statp->f_fsid.val[1] = (u32)(id >> 32); 1064 statp->f_fsid.val[1] = (u32)(id >> 32);
1097 1065
1098 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 1066 icount = percpu_counter_sum(&mp->m_icount);
1067 ifree = percpu_counter_sum(&mp->m_ifree);
1068 fdblocks = percpu_counter_sum(&mp->m_fdblocks);
1099 1069
1100 spin_lock(&mp->m_sb_lock); 1070 spin_lock(&mp->m_sb_lock);
1101 statp->f_bsize = sbp->sb_blocksize; 1071 statp->f_bsize = sbp->sb_blocksize;
1102 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; 1072 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
1103 statp->f_blocks = sbp->sb_dblocks - lsize; 1073 statp->f_blocks = sbp->sb_dblocks - lsize;
1104 statp->f_bfree = statp->f_bavail = 1074 spin_unlock(&mp->m_sb_lock);
1105 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 1075
1076 statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1077 statp->f_bavail = statp->f_bfree;
1078
1106 fakeinos = statp->f_bfree << sbp->sb_inopblog; 1079 fakeinos = statp->f_bfree << sbp->sb_inopblog;
1107 statp->f_files = 1080 statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1108 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1109 if (mp->m_maxicount) 1081 if (mp->m_maxicount)
1110 statp->f_files = min_t(typeof(statp->f_files), 1082 statp->f_files = min_t(typeof(statp->f_files),
1111 statp->f_files, 1083 statp->f_files,
@@ -1117,10 +1089,9 @@ xfs_fs_statfs(
1117 sbp->sb_icount); 1089 sbp->sb_icount);
1118 1090
1119 /* make sure statp->f_ffree does not underflow */ 1091 /* make sure statp->f_ffree does not underflow */
1120 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1092 ffree = statp->f_files - (icount - ifree);
1121 statp->f_ffree = max_t(__int64_t, ffree, 0); 1093 statp->f_ffree = max_t(__int64_t, ffree, 0);
1122 1094
1123 spin_unlock(&mp->m_sb_lock);
1124 1095
1125 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1096 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1126 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == 1097 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
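
statfs now sums the percpu counters before taking m_sb_lock: percpu_counter_sum() folds in every CPU's unflushed delta for an exact value, whereas the cheap percpu_counter_read() can lag by up to the batch size per CPU. For contrast (kernel API, shown as a fragment):

	s64 approx = percpu_counter_read(&mp->m_icount); /* fast, may lag  */
	s64 exact  = percpu_counter_sum(&mp->m_icount);  /* folds all CPUs */
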
@@ -1256,6 +1227,12 @@ xfs_fs_remount(
1256 1227
1257 /* ro -> rw */ 1228 /* ro -> rw */
1258 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1229 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1230 if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
1231 xfs_warn(mp,
1232 "ro->rw transition prohibited on norecovery mount");
1233 return -EINVAL;
1234 }
1235
1259 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1236 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1260 1237
1261 /* 1238 /*
@@ -1401,6 +1378,51 @@ xfs_finish_flags(
1401 return 0; 1378 return 0;
1402} 1379}
1403 1380
1381static int
1382xfs_init_percpu_counters(
1383 struct xfs_mount *mp)
1384{
1385 int error;
1386
1387 error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
1388 if (error)
1389 return -ENOMEM;
1390
1391 error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
1392 if (error)
1393 goto free_icount;
1394
1395 error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
1396 if (error)
1397 goto free_ifree;
1398
1399 return 0;
1400
1401free_ifree:
1402 percpu_counter_destroy(&mp->m_ifree);
1403free_icount:
1404 percpu_counter_destroy(&mp->m_icount);
1405 return -ENOMEM;
1406}
1407
1408void
1409xfs_reinit_percpu_counters(
1410 struct xfs_mount *mp)
1411{
1412 percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
1413 percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
1414 percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
1415}
1416
1417static void
1418xfs_destroy_percpu_counters(
1419 struct xfs_mount *mp)
1420{
1421 percpu_counter_destroy(&mp->m_icount);
1422 percpu_counter_destroy(&mp->m_ifree);
1423 percpu_counter_destroy(&mp->m_fdblocks);
1424}
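
xfs_init_percpu_counters() uses the kernel's usual goto ladder so a failure at step N unwinds steps N-1 back to 1 in reverse order. The same shape in plain, compilable C, with toy resources standing in for the percpu counters:

#include <errno.h>
#include <stdlib.h>

struct counters { long *icount, *ifree, *fdblocks; };	/* toy resources */

static int counters_init(struct counters *cs)
{
	cs->icount = calloc(1, sizeof(*cs->icount));
	if (!cs->icount)
		return -ENOMEM;
	cs->ifree = calloc(1, sizeof(*cs->ifree));
	if (!cs->ifree)
		goto free_icount;
	cs->fdblocks = calloc(1, sizeof(*cs->fdblocks));
	if (!cs->fdblocks)
		goto free_ifree;
	return 0;

free_ifree:
	free(cs->ifree);
free_icount:
	free(cs->icount);
	return -ENOMEM;
}

static void counters_destroy(struct counters *cs)
{
	free(cs->fdblocks);
	free(cs->ifree);
	free(cs->icount);
}

int main(void)
{
	struct counters cs;

	if (counters_init(&cs))
		return 1;
	counters_destroy(&cs);
	return 0;
}
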
1425
1404STATIC int 1426STATIC int
1405xfs_fs_fill_super( 1427xfs_fs_fill_super(
1406 struct super_block *sb, 1428 struct super_block *sb,
@@ -1449,7 +1471,7 @@ xfs_fs_fill_super(
1449 if (error) 1471 if (error)
1450 goto out_close_devices; 1472 goto out_close_devices;
1451 1473
1452 error = xfs_icsb_init_counters(mp); 1474 error = xfs_init_percpu_counters(mp);
1453 if (error) 1475 if (error)
1454 goto out_destroy_workqueues; 1476 goto out_destroy_workqueues;
1455 1477
@@ -1507,7 +1529,7 @@ xfs_fs_fill_super(
1507 out_free_sb: 1529 out_free_sb:
1508 xfs_freesb(mp); 1530 xfs_freesb(mp);
1509 out_destroy_counters: 1531 out_destroy_counters:
1510 xfs_icsb_destroy_counters(mp); 1532 xfs_destroy_percpu_counters(mp);
1511out_destroy_workqueues: 1533out_destroy_workqueues:
1512 xfs_destroy_mount_workqueues(mp); 1534 xfs_destroy_mount_workqueues(mp);
1513 out_close_devices: 1535 out_close_devices:
@@ -1524,6 +1546,24 @@ out_destroy_workqueues:
1524 goto out_free_sb; 1546 goto out_free_sb;
1525} 1547}
1526 1548
1549STATIC void
1550xfs_fs_put_super(
1551 struct super_block *sb)
1552{
1553 struct xfs_mount *mp = XFS_M(sb);
1554
1555 xfs_notice(mp, "Unmounting Filesystem");
1556 xfs_filestream_unmount(mp);
1557 xfs_unmountfs(mp);
1558
1559 xfs_freesb(mp);
1560 xfs_destroy_percpu_counters(mp);
1561 xfs_destroy_mount_workqueues(mp);
1562 xfs_close_devices(mp);
1563 xfs_free_fsname(mp);
1564 kfree(mp);
1565}
1566
1527STATIC struct dentry * 1567STATIC struct dentry *
1528xfs_fs_mount( 1568xfs_fs_mount(
1529 struct file_system_type *fs_type, 1569 struct file_system_type *fs_type,
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2b830c2f322e..499058fea303 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations;
72extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
73extern const struct quotactl_ops xfs_quotactl_operations; 73extern const struct quotactl_ops xfs_quotactl_operations;
74 74
75extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
76
75#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 77#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
76 78
77#endif /* __XFS_SUPER_H__ */ 79#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 25791df6f638..3df411eadb86 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -177,7 +177,7 @@ xfs_symlink(
177 int pathlen; 177 int pathlen;
178 struct xfs_bmap_free free_list; 178 struct xfs_bmap_free free_list;
179 xfs_fsblock_t first_block; 179 xfs_fsblock_t first_block;
180 bool unlock_dp_on_error = false; 180 bool unlock_dp_on_error = false;
181 uint cancel_flags; 181 uint cancel_flags;
182 int committed; 182 int committed;
183 xfs_fileoff_t first_fsb; 183 xfs_fileoff_t first_fsb;
@@ -221,7 +221,7 @@ xfs_symlink(
221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
222 &udqp, &gdqp, &pdqp); 222 &udqp, &gdqp, &pdqp);
223 if (error) 223 if (error)
224 goto std_return; 224 return error;
225 225
226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
@@ -241,7 +241,7 @@ xfs_symlink(
241 } 241 }
242 if (error) { 242 if (error) {
243 cancel_flags = 0; 243 cancel_flags = 0;
244 goto error_return; 244 goto out_trans_cancel;
245 } 245 }
246 246
247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -252,7 +252,7 @@ xfs_symlink(
252 */ 252 */
253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
254 error = -EPERM; 254 error = -EPERM;
255 goto error_return; 255 goto out_trans_cancel;
256 } 256 }
257 257
258 /* 258 /*
@@ -261,7 +261,7 @@ xfs_symlink(
261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
262 pdqp, resblks, 1, 0); 262 pdqp, resblks, 1, 0);
263 if (error) 263 if (error)
264 goto error_return; 264 goto out_trans_cancel;
265 265
266 /* 266 /*
267 * Check for ability to enter directory entry, if no space reserved. 267 * Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +269,7 @@ xfs_symlink(
269 if (!resblks) { 269 if (!resblks) {
270 error = xfs_dir_canenter(tp, dp, link_name); 270 error = xfs_dir_canenter(tp, dp, link_name);
271 if (error) 271 if (error)
272 goto error_return; 272 goto out_trans_cancel;
273 } 273 }
274 /* 274 /*
275 * Initialize the bmap freelist prior to calling either 275 * Initialize the bmap freelist prior to calling either
@@ -282,15 +282,14 @@ xfs_symlink(
282 */ 282 */
283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
284 prid, resblks > 0, &ip, NULL); 284 prid, resblks > 0, &ip, NULL);
285 if (error) { 285 if (error)
286 if (error == -ENOSPC) 286 goto out_trans_cancel;
287 goto error_return;
288 goto error1;
289 }
290 287
291 /* 288 /*
292 * An error after we've joined dp to the transaction will result in the 289 * Now we join the directory inode to the transaction. We do not do it
293 * transaction cancel unlocking dp so don't do it explicitly in the 290 * earlier because xfs_dir_ialloc might commit the previous transaction
291 * (and release all the locks). An error from here on will result in
292 * the transaction cancel unlocking dp so don't do it explicitly in the
294 * error path. 293 * error path.
295 */ 294 */
296 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 295 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +329,7 @@ xfs_symlink(
330 XFS_BMAPI_METADATA, &first_block, resblks, 329 XFS_BMAPI_METADATA, &first_block, resblks,
331 mval, &nmaps, &free_list); 330 mval, &nmaps, &free_list);
332 if (error) 331 if (error)
333 goto error2; 332 goto out_bmap_cancel;
334 333
335 if (resblks) 334 if (resblks)
336 resblks -= fs_blocks; 335 resblks -= fs_blocks;
@@ -348,7 +347,7 @@ xfs_symlink(
348 BTOBB(byte_cnt), 0); 347 BTOBB(byte_cnt), 0);
349 if (!bp) { 348 if (!bp) {
350 error = -ENOMEM; 349 error = -ENOMEM;
351 goto error2; 350 goto out_bmap_cancel;
352 } 351 }
353 bp->b_ops = &xfs_symlink_buf_ops; 352 bp->b_ops = &xfs_symlink_buf_ops;
354 353
@@ -378,7 +377,7 @@ xfs_symlink(
378 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 377 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
379 &first_block, &free_list, resblks); 378 &first_block, &free_list, resblks);
380 if (error) 379 if (error)
381 goto error2; 380 goto out_bmap_cancel;
382 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 381 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
383 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 382 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
384 383
@@ -392,10 +391,13 @@ xfs_symlink(
392 } 391 }
393 392
394 error = xfs_bmap_finish(&tp, &free_list, &committed); 393 error = xfs_bmap_finish(&tp, &free_list, &committed);
395 if (error) { 394 if (error)
396 goto error2; 395 goto out_bmap_cancel;
397 } 396
398 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 397 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
398 if (error)
399 goto out_release_inode;
400
399 xfs_qm_dqrele(udqp); 401 xfs_qm_dqrele(udqp);
400 xfs_qm_dqrele(gdqp); 402 xfs_qm_dqrele(gdqp);
401 xfs_qm_dqrele(pdqp); 403 xfs_qm_dqrele(pdqp);
@@ -403,20 +405,28 @@ xfs_symlink(
403 *ipp = ip; 405 *ipp = ip;
404 return 0; 406 return 0;
405 407
406 error2: 408out_bmap_cancel:
407 IRELE(ip);
408 error1:
409 xfs_bmap_cancel(&free_list); 409 xfs_bmap_cancel(&free_list);
410 cancel_flags |= XFS_TRANS_ABORT; 410 cancel_flags |= XFS_TRANS_ABORT;
411 error_return: 411out_trans_cancel:
412 xfs_trans_cancel(tp, cancel_flags); 412 xfs_trans_cancel(tp, cancel_flags);
413out_release_inode:
414 /*
415 * Wait until after the current transaction is aborted to finish the
416 * setup of the inode and release the inode. This prevents recursive
417 * transactions and deadlocks from xfs_inactive.
418 */
419 if (ip) {
420 xfs_finish_inode_setup(ip);
421 IRELE(ip);
422 }
423
413 xfs_qm_dqrele(udqp); 424 xfs_qm_dqrele(udqp);
414 xfs_qm_dqrele(gdqp); 425 xfs_qm_dqrele(gdqp);
415 xfs_qm_dqrele(pdqp); 426 xfs_qm_dqrele(pdqp);
416 427
417 if (unlock_dp_on_error) 428 if (unlock_dp_on_error)
418 xfs_iunlock(dp, XFS_ILOCK_EXCL); 429 xfs_iunlock(dp, XFS_ILOCK_EXCL);
419 std_return:
420 return error; 430 return error;
421} 431}
422 432
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..615781bf4ee5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
115 __entry->refcount = refcount; 115 __entry->refcount = refcount;
116 __entry->caller_ip = caller_ip; 116 __entry->caller_ip = caller_ip;
117 ), 117 ),
118 TP_printk("dev %d:%d agno %u refcount %d caller %pf", 118 TP_printk("dev %d:%d agno %u refcount %d caller %ps",
119 MAJOR(__entry->dev), MINOR(__entry->dev), 119 MAJOR(__entry->dev), MINOR(__entry->dev),
120 __entry->agno, 120 __entry->agno,
121 __entry->refcount, 121 __entry->refcount,
@@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert,
239 __entry->caller_ip = caller_ip; 239 __entry->caller_ip = caller_ip;
240 ), 240 ),
241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
242 "offset %lld block %lld count %lld flag %d caller %pf", 242 "offset %lld block %lld count %lld flag %d caller %ps",
243 MAJOR(__entry->dev), MINOR(__entry->dev), 243 MAJOR(__entry->dev), MINOR(__entry->dev),
244 __entry->ino, 244 __entry->ino,
245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
283 __entry->caller_ip = caller_ip; 283 __entry->caller_ip = caller_ip;
284 ), 284 ),
285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
286 "offset %lld block %lld count %lld flag %d caller %pf", 286 "offset %lld block %lld count %lld flag %d caller %ps",
287 MAJOR(__entry->dev), MINOR(__entry->dev), 287 MAJOR(__entry->dev), MINOR(__entry->dev),
288 __entry->ino, 288 __entry->ino,
289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
329 __entry->caller_ip = caller_ip; 329 __entry->caller_ip = caller_ip;
330 ), 330 ),
331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " 331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
332 "lock %d flags %s caller %pf", 332 "lock %d flags %s caller %ps",
333 MAJOR(__entry->dev), MINOR(__entry->dev), 333 MAJOR(__entry->dev), MINOR(__entry->dev),
334 (unsigned long long)__entry->bno, 334 (unsigned long long)__entry->bno,
335 __entry->nblks, 335 __entry->nblks,
@@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
402 __entry->caller_ip = caller_ip; 402 __entry->caller_ip = caller_ip;
403 ), 403 ),
404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
405 "lock %d flags %s caller %pf", 405 "lock %d flags %s caller %ps",
406 MAJOR(__entry->dev), MINOR(__entry->dev), 406 MAJOR(__entry->dev), MINOR(__entry->dev),
407 (unsigned long long)__entry->bno, 407 (unsigned long long)__entry->bno,
408 __entry->buffer_length, 408 __entry->buffer_length,
@@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror,
447 __entry->caller_ip = caller_ip; 447 __entry->caller_ip = caller_ip;
448 ), 448 ),
449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
450 "lock %d error %d flags %s caller %pf", 450 "lock %d error %d flags %s caller %ps",
451 MAJOR(__entry->dev), MINOR(__entry->dev), 451 MAJOR(__entry->dev), MINOR(__entry->dev),
452 (unsigned long long)__entry->bno, 452 (unsigned long long)__entry->bno,
453 __entry->buffer_length, 453 __entry->buffer_length,
@@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class,
613 __entry->lock_flags = lock_flags; 613 __entry->lock_flags = lock_flags;
614 __entry->caller_ip = caller_ip; 614 __entry->caller_ip = caller_ip;
615 ), 615 ),
616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", 616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
617 MAJOR(__entry->dev), MINOR(__entry->dev), 617 MAJOR(__entry->dev), MINOR(__entry->dev),
618 __entry->ino, 618 __entry->ino,
619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), 619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
@@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
664DEFINE_INODE_EVENT(xfs_free_file_space); 664DEFINE_INODE_EVENT(xfs_free_file_space);
665DEFINE_INODE_EVENT(xfs_zero_file_space); 665DEFINE_INODE_EVENT(xfs_zero_file_space);
666DEFINE_INODE_EVENT(xfs_collapse_file_space); 666DEFINE_INODE_EVENT(xfs_collapse_file_space);
667DEFINE_INODE_EVENT(xfs_insert_file_space);
667DEFINE_INODE_EVENT(xfs_readdir); 668DEFINE_INODE_EVENT(xfs_readdir);
668#ifdef CONFIG_XFS_POSIX_ACL 669#ifdef CONFIG_XFS_POSIX_ACL
669DEFINE_INODE_EVENT(xfs_get_acl); 670DEFINE_INODE_EVENT(xfs_get_acl);
@@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
685DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); 686DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
686DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); 687DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
687 688
689DEFINE_INODE_EVENT(xfs_filemap_fault);
690DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
691
688DECLARE_EVENT_CLASS(xfs_iref_class, 692DECLARE_EVENT_CLASS(xfs_iref_class,
689 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 693 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
690 TP_ARGS(ip, caller_ip), 694 TP_ARGS(ip, caller_ip),
@@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
702 __entry->pincount = atomic_read(&ip->i_pincount); 706 __entry->pincount = atomic_read(&ip->i_pincount);
703 __entry->caller_ip = caller_ip; 707 __entry->caller_ip = caller_ip;
704 ), 708 ),
705 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", 709 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
706 MAJOR(__entry->dev), MINOR(__entry->dev), 710 MAJOR(__entry->dev), MINOR(__entry->dev),
707 __entry->ino, 711 __entry->ino,
708 __entry->count, 712 __entry->count,
@@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
1217DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); 1221DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1218DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1222DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1219DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1223DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1224DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
1225DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
1226DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
1227DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
1228DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
1220 1229
1221DECLARE_EVENT_CLASS(xfs_simple_io_class, 1230DECLARE_EVENT_CLASS(xfs_simple_io_class,
1222 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1231 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap,
 		__entry->flags = flags;
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
-		  "flags %s caller %pf",
+		  "flags %s caller %ps",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
@@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf,
 	),
 	TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
 		  "levels b %u c %u flfirst %u fllast %u flcount %u "
-		  "freeblks %u longest %u caller %pf",
+		  "freeblks %u longest %u caller %ps",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
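Every TP_printk change in this file is the same one-character fix: caller_ip is a plain return address saved with _RET_IP_, and vsprintf's %ps prints the symbol for a direct text address, while %pf expects a function descriptor and only dereferences correctly on descriptor ABIs such as ia64 and ppc64. A one-line illustration (an ordinary printk, not taken from the patch):

printk(KERN_DEBUG "caller %ps\n", (void *)_RET_IP_);	/* prints e.g. "caller xfs_ilock" */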
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index eb90cd59a0ec..220ef2c906b2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -173,7 +173,7 @@ xfs_trans_reserve(
 	uint			rtextents)
 {
 	int			error = 0;
-	int			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
 	/* Mark this thread as being in a transaction */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -184,8 +184,7 @@ xfs_trans_reserve(
 	 * fail if the count would go below zero.
	 */
 	if (blocks > 0) {
-		error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-						 -((int64_t)blocks), rsvd);
+		error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
 		if (error != 0) {
 			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 			return -ENOSPC;
@@ -236,8 +235,7 @@
 	 * fail if the count would go below zero.
	 */
 	if (rtextents > 0) {
-		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
-					  -((int64_t)rtextents), rsvd);
+		error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
 		if (error) {
 			error = -ENOSPC;
 			goto undo_log;
@@ -268,8 +266,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-					 (int64_t)blocks, rsvd);
+		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
 
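These three hunks replace the generic xfs_icsb_modify_counters()/xfs_mod_incore_sb() calls with per-counter helpers (xfs_mod_fdblocks(), xfs_mod_frextents()). A toy model of the contract as I read it (names, values, and the reserve-pool behaviour are illustrative, not the kernel implementation): the helper refuses to drive the counter negative, the reserve path passes a negative delta, and the undo path hands the same magnitude back as a positive delta.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static int64_t fdblocks = 1024;		/* toy stand-in for the free-block counter */

static int mod_fdblocks(int64_t delta, bool rsvd)
{
	int64_t floor = rsvd ? 0 : 64;	/* rsvd transactions may dip into the reserve pool */

	if (fdblocks + delta < floor)
		return -ENOSPC;		/* caller takes its undo path */
	fdblocks += delta;
	return 0;
}

/* reserve:  error = mod_fdblocks(-(int64_t)blocks, rsvd);
 * undo:             mod_fdblocks((int64_t)blocks, rsvd);	*/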
@@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas(
 			sizeof(sbp->sb_frextents) - 1);
 }
 
+STATIC int
+xfs_sb_mod8(
+	uint8_t			*field,
+	int8_t			delta)
+{
+	int8_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+	uint32_t		*field,
+	int32_t			delta)
+{
+	int32_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+	uint64_t		*field,
+	int64_t			delta)
+{
+	int64_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
 /*
  * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
  * and apply superblock counter changes to the in-core superblock. The
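The new xfs_sb_mod8/32/64 helpers come in three widths so the signed intermediate matches the width of the unsigned superblock field it guards against underflow; their callers (added below) hold m_sb_lock, which is why plain loads and stores suffice. A standalone model of the pattern, as a sketch rather than the kernel code:

#include <errno.h>
#include <stdint.h>

static int sb_mod32(uint32_t *field, int32_t delta)
{
	int32_t counter = *field;	/* signed copy at the field's own width */

	counter += delta;
	if (counter < 0)
		return -EINVAL;		/* would underflow; caller unwinds */
	*field = counter;
	return 0;
}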
@@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas(
  * applied to the in-core superblock. The idea is that that has already been
  * done.
  *
- * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
- * However, we have to ensure that we only modify each superblock field only
- * once because the application of the delta values may not be atomic. That can
- * lead to ENOSPC races occurring if we have two separate modifcations of the
- * free space counter to put back the entire reservation and then take away
- * what we used.
- *
  * If we are not logging superblock counters, then the inode allocated/free and
  * used block counts are not updated in the on disk superblock. In this case,
  * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
@@ -509,21 +547,15 @@
  */
 void
 xfs_trans_unreserve_and_mod_sb(
-	xfs_trans_t	*tp)
+	struct xfs_trans	*tp)
 {
-	xfs_mod_sb_t	msb[9];	/* If you add cases, add entries */
-	xfs_mod_sb_t	*msbp;
-	xfs_mount_t	*mp = tp->t_mountp;
-	/* REFERENCED */
-	int		error;
-	int		rsvd;
-	int64_t		blkdelta = 0;
-	int64_t		rtxdelta = 0;
-	int64_t		idelta = 0;
-	int64_t		ifreedelta = 0;
-
-	msbp = msb;
-	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int64_t			blkdelta = 0;
+	int64_t			rtxdelta = 0;
+	int64_t			idelta = 0;
+	int64_t			ifreedelta = 0;
+	int			error;
 
 	/* calculate deltas */
 	if (tp->t_blk_res > 0)
@@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb(
 
 	/* apply the per-cpu counters */
 	if (blkdelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-						 blkdelta, rsvd);
+		error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
 		if (error)
 			goto out;
 	}
 
 	if (idelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
-						 idelta, rsvd);
+		error = xfs_mod_icount(mp, idelta);
 		if (error)
 			goto out_undo_fdblocks;
 	}
 
 	if (ifreedelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
-						 ifreedelta, rsvd);
+		error = xfs_mod_ifree(mp, ifreedelta);
 		if (error)
 			goto out_undo_icount;
 	}
 
+	if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+		return;
+
 	/* apply remaining deltas */
-	if (rtxdelta != 0) {
-		msbp->msb_field = XFS_SBS_FREXTENTS;
-		msbp->msb_delta = rtxdelta;
-		msbp++;
+	spin_lock(&mp->m_sb_lock);
+	if (rtxdelta) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+		if (error)
+			goto out_undo_ifree;
 	}
 
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
-		if (tp->t_dblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_DBLOCKS;
-			msbp->msb_delta = tp->t_dblocks_delta;
-			msbp++;
-		}
-		if (tp->t_agcount_delta != 0) {
-			msbp->msb_field = XFS_SBS_AGCOUNT;
-			msbp->msb_delta = tp->t_agcount_delta;
-			msbp++;
-		}
-		if (tp->t_imaxpct_delta != 0) {
-			msbp->msb_field = XFS_SBS_IMAX_PCT;
-			msbp->msb_delta = tp->t_imaxpct_delta;
-			msbp++;
-		}
-		if (tp->t_rextsize_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTSIZE;
-			msbp->msb_delta = tp->t_rextsize_delta;
-			msbp++;
-		}
-		if (tp->t_rbmblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_RBMBLOCKS;
-			msbp->msb_delta = tp->t_rbmblocks_delta;
-			msbp++;
-		}
-		if (tp->t_rblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_RBLOCKS;
-			msbp->msb_delta = tp->t_rblocks_delta;
-			msbp++;
-		}
-		if (tp->t_rextents_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTENTS;
-			msbp->msb_delta = tp->t_rextents_delta;
-			msbp++;
-		}
-		if (tp->t_rextslog_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTSLOG;
-			msbp->msb_delta = tp->t_rextslog_delta;
-			msbp++;
-		}
+	if (tp->t_dblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+		if (error)
+			goto out_undo_frextents;
 	}
-
-	/*
-	 * If we need to change anything, do it.
-	 */
-	if (msbp > msb) {
-		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
-			(uint)(msbp - msb), rsvd);
+	if (tp->t_agcount_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
 		if (error)
-			goto out_undo_ifreecount;
+			goto out_undo_dblocks;
 	}
-
+	if (tp->t_imaxpct_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+		if (error)
+			goto out_undo_agcount;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+				     tp->t_rextsize_delta);
+		if (error)
+			goto out_undo_imaxpct;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+				     tp->t_rbmblocks_delta);
+		if (error)
+			goto out_undo_rextsize;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+		if (error)
+			goto out_undo_rbmblocks;
+	}
+	if (tp->t_rextents_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+				     tp->t_rextents_delta);
+		if (error)
+			goto out_undo_rblocks;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+				    tp->t_rextslog_delta);
+		if (error)
+			goto out_undo_rextents;
+	}
+	spin_unlock(&mp->m_sb_lock);
 	return;
 
-out_undo_ifreecount:
+out_undo_rextents:
+	if (tp->t_rextents_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+	if (tp->t_rblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+	if (tp->t_rbmblocks_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+	if (tp->t_rextsize_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+	if (tp->t_imaxpct_delta)
+		xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+	if (tp->t_agcount_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+	if (tp->t_dblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+	if (rtxdelta)
+		xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+	spin_unlock(&mp->m_sb_lock);
 	if (ifreedelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+		xfs_mod_ifree(mp, -ifreedelta);
 out_undo_icount:
 	if (idelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+		xfs_mod_icount(mp, -idelta);
 out_undo_fdblocks:
 	if (blkdelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+		xfs_mod_fdblocks(mp, -blkdelta, rsvd);
 out:
 	ASSERT(error == 0);
 	return;
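The rewritten xfs_trans_unreserve_and_mod_sb() is a textbook instance of the kernel's reverse-order unwind idiom: each successfully applied delta gains a label, and a failure jumps to the label that rolls back exactly the deltas applied so far, newest first. A self-contained sketch of the shape, with hypothetical step()/unstep() helpers rather than XFS code:

#include <errno.h>

static int step(int *x)    { if (*x < 0) return -EINVAL; (*x)++; return 0; }
static void unstep(int *x) { (*x)--; }

static int apply_three(int *a, int *b, int *c)
{
	int error;

	error = step(a);
	if (error)
		goto out;
	error = step(b);
	if (error)
		goto out_undo_a;
	error = step(c);
	if (error)
		goto out_undo_b;
	return 0;

out_undo_b:			/* roll back in reverse order */
	unstep(b);
out_undo_a:
	unstep(a);
out:
	return error;
}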