Merge branch 'for_3.0/pm-fixes' of ssh://master.kernel.org/pub/scm/linux/kernel/git/khilman/linux-omap-pm into fixes

author: Tony Lindgren <tony@atomide.com> 2011-06-13 10:40:25 -0400
committer: Tony Lindgren <tony@atomide.com> 2011-06-13 10:40:25 -0400
commit: c8e0bf95fc01d6e2ca585fe08010800b6c56e823 (patch)
tree: f901bdcb5b20e93261cf9cf324ebbcf3fd24ce58 /fs
parent: 9d5ae7cd6cb9ead43336fec1094184d1dc740fbd (diff)
parent: 345f79b3de7f6d651e4dba794af7c7303bdfd649 (diff)
125 files changed, 4779 insertions, 1421 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae29..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
 int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
 {
-        dentry_unhash(d);
        return v9fs_remove(i, d, 1);
 }
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct p9_fid *newdirfid;
        struct p9_wstat wstat;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        P9_DPRINTK(P9_DEBUG_VFS, "\n");
        retval = 0;
        old_inode = old_dentry->d_inode;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390c..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
                 dentry->d_inode->i_ino,
                 (int)dentry->d_name.len, dentry->d_name.name);
-        dentry_unhash(dentry);
        return affs_remove_header(dentry);
 }
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct buffer_head *bh = NULL;
        int retval;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
                 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
                 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e05160042..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
        _enter("{%x:%u},{%s}",
               dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
-        dentry_unhash(dentry);
        ret = -ENAMETOOLONG;
        if (dentry->d_name.len >= AFSNAMEMAX)
                goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct key *key;
        int ret;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        vnode = AFS_FS_I(old_dentry->d_inode);
        orig_dvnode = AFS_FS_I(old_dir);
        new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f2..caf2aa521e2b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
                        return -EPERM;
        }
+        if ((ia_valid & ATTR_MODE)) {
+                mode_t amode = attr->ia_mode;
+                /* Flag setting protected by i_mutex */
+                if (is_sxid(amode))
+                        inode->i_flags &= ~S_NOSEC;
+        }
        now = current_fs_time(inode->i_sb);
        attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddbc..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
        if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
                return -EACCES;
-        dentry_unhash(dentry);
        if (atomic_dec_and_test(&ino->count)) {
                p_ino = autofs4_dentry_ino(dentry->d_parent);
                if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b0483..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct bfs_sb_info *info;
        int error = -ENOENT;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        old_bh = new_bh = NULL;
        old_inode = old_dentry->d_inode;
        if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/bio.c b/fs/bio.c
index 840a0d755248..9bfade8a609b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 *      @offset: vec entry offset
 *
 *      Attempt to add a page to the bio_vec maplist. This can fail for a
- *      number of reasons, such as the bio being full or target block
+ *      number of reasons, such as the bio being full or target block device
- *      device limitations. The target block device must allow bio's
+ *      limitations. The target block device must allow bio's up to PAGE_SIZE,
- *      smaller than PAGE_SIZE, so it is always possible to add a single
+ *      so it is always possible to add a single page to an empty bio.
- *      page to an empty bio. This should only be used by REQ_PC bios.
+ *
+ *      This should only be used by REQ_PC bios.
 */
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
                    unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
 *      @offset: vec entry offset
 *
 *      Attempt to add a page to the bio_vec maplist. This can fail for a
- *      number of reasons, such as the bio being full or target block
+ *      number of reasons, such as the bio being full or target block device
- *      device limitations. The target block device must allow bio's
+ *      limitations. The target block device must allow bio's up to PAGE_SIZE,
- *      smaller than PAGE_SIZE, so it is always possible to add a single
+ *      so it is always possible to add a single page to an empty bio.
- *      page to an empty bio.
 */
 int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
                 unsigned int offset)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1f2b19978333..1a2421f908f0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
                 * individual writeable reference is too fragile given the
                 * way @mode is used in blkdev_get/put().
                 */
-                if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) &&
+                if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
-                    !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
+                    (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
                        bdev->bd_write_holder = true;
                        disk_block_events(disk);
                }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 93b1aa932014..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -121,9 +121,6 @@ struct btrfs_inode {
         */
        u64 index_cnt;
-        /* the start of block group preferred for allocations. */
-        u64 block_group;
        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged.  See tree-log.c for all the
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b0e18d986e0a..d84089349c82 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -43,8 +43,6 @@ struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-        if (path)
-                path->reada = 1;
        return path;
 }
@@ -1224,6 +1222,7 @@ static void reada_for_search(struct btrfs_root *root,
        u64 search;
        u64 target;
        u64 nread = 0;
+        u64 gen;
        int direction = path->reada;
        struct extent_buffer *eb;
        u32 nr;
@@ -1251,6 +1250,15 @@ static void reada_for_search(struct btrfs_root *root,
        nritems = btrfs_header_nritems(node);
        nr = slot;
        while (1) {
+                if (!node->map_token) {
+                        unsigned long offset = btrfs_node_key_ptr_offset(nr);
+                        map_private_extent_buffer(node, offset,
+                                                  sizeof(struct btrfs_key_ptr),
+                                                  &node->map_token,
+                                                  &node->kaddr,
+                                                  &node->map_start,
+                                                  &node->map_len, KM_USER1);
+                }
                if (direction < 0) {
                        if (nr == 0)
                                break;
@@ -1268,14 +1276,23 @@ static void reada_for_search(struct btrfs_root *root,
                search = btrfs_node_blockptr(node, nr);
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
-                        readahead_tree_block(root, search, blocksize,
+                        gen = btrfs_node_ptr_generation(node, nr);
-                                     btrfs_node_ptr_generation(node, nr));
+                        if (node->map_token) {
+                                unmap_extent_buffer(node, node->map_token,
+                                                    KM_USER1);
+                                node->map_token = NULL;
+                        }
+                        readahead_tree_block(root, search, blocksize, gen);
                        nread += blocksize;
                }
                nscan++;
                if ((nread > 65536 || nscan > 32))
                        break;
        }
+        if (node->map_token) {
+                unmap_extent_buffer(node, node->map_token, KM_USER1);
+                node->map_token = NULL;
+        }
 }
 /*
@@ -1648,9 +1665,6 @@ again:
                }
 cow_done:
                BUG_ON(!cow && ins_len);
-                if (level != btrfs_header_level(b))
-                        WARN_ON(1);
-                level = btrfs_header_level(b);
                p->nodes[level] = b;
                if (!p->skip_locking)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332323e19dd1..378b5b4443f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -930,7 +930,6 @@ struct btrfs_fs_info {
         * is required instead of the faster short fsync log commits
         */
        u64 last_trans_log_full_commit;
-        u64 open_ioctl_trans;
        unsigned long mount_opt:20;
        unsigned long compress_type:4;
        u64 max_inline;
@@ -947,7 +946,6 @@ struct btrfs_fs_info {
        struct super_block *sb;
        struct inode *btree_inode;
        struct backing_dev_info bdi;
-        struct mutex trans_mutex;
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
@@ -968,6 +966,7 @@ struct btrfs_fs_info {
        struct rw_semaphore subvol_sem;
        struct srcu_struct subvol_srcu;
+        spinlock_t trans_lock;
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
@@ -980,6 +979,7 @@ struct btrfs_fs_info {
        atomic_t async_submit_draining;
        atomic_t nr_async_bios;
        atomic_t async_delalloc_pages;
+        atomic_t open_ioctl_trans;
        /*
         * this is used by the balancing code to wait for all the pending
@@ -1044,6 +1044,7 @@ struct btrfs_fs_info {
        int closing;
        int log_root_recovering;
        int enospc_unlink;
+        int trans_no_join;
        u64 total_pinned;
@@ -1065,7 +1066,6 @@ struct btrfs_fs_info {
        struct reloc_control *reloc_ctl;
        spinlock_t delalloc_lock;
-        spinlock_t new_trans_lock;
        u64 delalloc_bytes;
        /* data_alloc_cluster is only used in ssd mode */
@@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
 #define BTRFS_MOUNT_ENOSPC_DEBUG         (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG         (1 << 16)
+#define BTRFS_MOUNT_INODE_MAP_CACHE     (1 << 17)
 #define btrfs_clear_opt(o, opt)         ((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)           ((o) |= BTRFS_MOUNT_##opt)
@@ -2238,6 +2239,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 void btrfs_block_rsv_release(struct btrfs_root *root,
                             struct btrfs_block_rsv *block_rsv,
                             u64 num_bytes);
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2350,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
                        struct extent_buffer *parent);
+static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
+{
+        /*
+         * Get synced with close_ctree()
+         */
+        smp_mb();
+        return fs_info->closing;
+}
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
                        struct btrfs_path *path,
@@ -2512,8 +2525,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 int btrfs_writepages(struct address_space *mapping,
                     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *new_root,
+                             struct btrfs_root *new_root, u64 new_dirid);
-                             u64 new_dirid, u64 alloc_hint);
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                         size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2524,7 +2536,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-void btrfs_dirty_inode(struct inode *inode);
+void btrfs_dirty_inode(struct inode *inode, int flags);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 01e29503a54b..6462c29d2d37 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
        INIT_LIST_HEAD(&head);
        next = item;
+        nitems = 0;
        /*
         * count the number of the continuous items that we can insert in batch
@@ -1129,7 +1130,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
        delayed_node = async_node->delayed_node;
        root = delayed_node->root;
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                goto free_path;
@@ -1572,8 +1573,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
-        btrfs_set_stack_inode_block_group(inode_item,
+        btrfs_set_stack_inode_block_group(inode_item, 0);
-                                          BTRFS_I(inode)->block_group);
        btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
                                     inode->i_atime.tv_sec);
@@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root, struct inode *inode)
 {
        struct btrfs_delayed_node *delayed_node;
-        int ret;
+        int ret = 0;
        delayed_node = btrfs_get_or_create_delayed_node(inode);
        if (IS_ERR(delayed_node))
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98b6a71decba..a203d363184d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1505,24 +1505,24 @@ static int transaction_kthread(void *arg)
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                cur = root->fs_info->running_transaction;
                if (!cur) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                        goto sleep;
                }
                now = get_seconds();
                if (!cur->blocked &&
                    (now < cur->start_time || now - cur->start_time < 30)) {
-                        spin_unlock(&root->fs_info->new_trans_lock);
+                        spin_unlock(&root->fs_info->trans_lock);
                        delay = HZ * 5;
                        goto sleep;
                }
                transid = cur->transid;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
                if (transid == trans->transid) {
                        ret = btrfs_commit_transaction(trans, root);
@@ -1613,7 +1613,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->ordered_operations);
        INIT_LIST_HEAD(&fs_info->caching_block_groups);
        spin_lock_init(&fs_info->delalloc_lock);
-        spin_lock_init(&fs_info->new_trans_lock);
+        spin_lock_init(&fs_info->trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
        spin_lock_init(&fs_info->delayed_iput_lock);
@@ -1645,6 +1645,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
+        fs_info->trans_no_join = 0;
        fs_info->thread_pool_size = min_t(unsigned long,
                                          num_online_cpus() + 2, 8);
@@ -1709,7 +1710,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        fs_info->do_barriers = 1;
-        mutex_init(&fs_info->trans_mutex);
        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->chunk_mutex);
@@ -2479,13 +2479,13 @@ int btrfs_commit_super(struct btrfs_root *root)
        down_write(&root->fs_info->cleanup_work_sem);
        up_write(&root->fs_info->cleanup_work_sem);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
        ret = btrfs_commit_transaction(trans, root);
        BUG_ON(ret);
        /* run commit again to drop the original snapshot */
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
        btrfs_commit_transaction(trans, root);
@@ -3024,10 +3024,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
        WARN_ON(1);
-        mutex_lock(&root->fs_info->trans_mutex);
        mutex_lock(&root->fs_info->transaction_kthread_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        list_splice_init(&root->fs_info->trans_list, &list);
+        root->fs_info->trans_no_join = 1;
+        spin_unlock(&root->fs_info->trans_lock);
        while (!list_empty(&list)) {
                t = list_entry(list.next, struct btrfs_transaction, list);
                if (!t)
@@ -3052,23 +3055,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                t->blocked = 0;
                if (waitqueue_active(&root->fs_info->transaction_wait))
                        wake_up(&root->fs_info->transaction_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
-                mutex_lock(&root->fs_info->trans_mutex);
                t->commit_done = 1;
                if (waitqueue_active(&t->commit_wait))
                        wake_up(&t->commit_wait);
-                mutex_unlock(&root->fs_info->trans_mutex);
-                mutex_lock(&root->fs_info->trans_mutex);
                btrfs_destroy_pending_snapshots(t);
                btrfs_destroy_delalloc_inodes(root);
-                spin_lock(&root->fs_info->new_trans_lock);
+                spin_lock(&root->fs_info->trans_lock);
                root->fs_info->running_transaction = NULL;
-                spin_unlock(&root->fs_info->new_trans_lock);
+                spin_unlock(&root->fs_info->trans_lock);
                btrfs_destroy_marked_extents(root, &t->dirty_pages,
                                             EXTENT_DIRTY);
@@ -3082,8 +3080,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
                kmem_cache_free(btrfs_transaction_cachep, t);
        }
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->trans_no_join = 0;
+        spin_unlock(&root->fs_info->trans_lock);
        mutex_unlock(&root->fs_info->transaction_kthread_mutex);
-        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 169bd62ce776..5b9b6b6df242 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -348,7 +348,7 @@ static int caching_kthread(void *data)
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
-        path->reada = 2;
+        path->reada = 1;
        key.objectid = last;
        key.offset = 0;
@@ -366,8 +366,7 @@ again:
        nritems = btrfs_header_nritems(leaf);
        while (1) {
-                smp_mb();
+                if (btrfs_fs_closing(fs_info) > 1) {
-                if (fs_info->closing > 1) {
                        last = (u64)-1;
                        break;
                }
@@ -379,15 +378,18 @@ again:
                        if (ret)
                                break;
-                        caching_ctl->progress = last;
+                        if (need_resched() ||
-                        btrfs_release_path(path);
+                            btrfs_next_leaf(extent_root, path)) {
-                        up_read(&fs_info->extent_commit_sem);
+                                caching_ctl->progress = last;
-                        mutex_unlock(&caching_ctl->mutex);
+                                btrfs_release_path(path);
-                        if (btrfs_transaction_in_commit(fs_info))
+                                up_read(&fs_info->extent_commit_sem);
-                                schedule_timeout(1);
+                                mutex_unlock(&caching_ctl->mutex);
-                        else
                                cond_resched();
-                        goto again;
+                                goto again;
+                        }
+                        leaf = path->nodes[0];
+                        nritems = btrfs_header_nritems(leaf);
+                        continue;
                }
                if (key.objectid < block_group->key.objectid) {
@@ -3065,7 +3067,7 @@ again:
                        spin_unlock(&data_sinfo->lock);
 alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
@@ -3091,9 +3093,10 @@ alloc:
                /* commit the current transaction and try again */
 commit_trans:
-                if (!committed && !root->fs_info->open_ioctl_trans) {
+                if (!committed &&
+                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        committed = 1;
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        ret = btrfs_commit_transaction(trans, root);
@@ -3472,7 +3475,7 @@ again:
                goto out;
        ret = -ENOSPC;
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                goto out;
        ret = btrfs_commit_transaction(trans, root);
@@ -3699,7 +3702,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
                if (trans)
                        return -EAGAIN;
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
                ret = btrfs_commit_transaction(trans, root);
                return 0;
@@ -3837,6 +3840,37 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
        WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }
+int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv)
+{
+        struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+        u64 num_bytes;
+        int ret;
+        /*
+         * Truncate should be freeing data, but give us 2 items just in case it
+         * needs to use some space.  We may want to be smarter about this in the
+         * future.
+         */
+        num_bytes = btrfs_calc_trans_metadata_size(root, 2);
+        /* We already have enough bytes, just return */
+        if (rsv->reserved >= num_bytes)
+                return 0;
+        num_bytes -= rsv->reserved;
+        /*
+         * You should have reserved enough space before hand to do this, so this
+         * should not fail.
+         */
+        ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
+        BUG_ON(ret);
+        return 0;
+}
 int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 int num_items)
@@ -3877,23 +3911,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
        /*
-         * one for deleting orphan item, one for updating inode and
+         * We need to hold space in order to delete our orphan item once we've
-         * two for calling btrfs_truncate_inode_items.
+         * added it, so this takes the reservation so we can release it later
-         *
+         * when we are truly done with the orphan item.
-         * btrfs_truncate_inode_items is a delete operation, it frees
-         * more space than it uses in most cases. So two units of
-         * metadata space should be enough for calling it many times.
-         * If all of the metadata space is used, we can commit
-         * transaction and use space it freed.
         */
-        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 void btrfs_orphan_release_metadata(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4);
+        u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
        btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
@@ -4987,6 +5016,15 @@ have_block_group:
                if (unlikely(block_group->ro))
                        goto loop;
+                spin_lock(&block_group->free_space_ctl->tree_lock);
+                if (cached &&
+                    block_group->free_space_ctl->free_space <
+                    num_bytes + empty_size) {
+                        spin_unlock(&block_group->free_space_ctl->tree_lock);
+                        goto loop;
+                }
+                spin_unlock(&block_group->free_space_ctl->tree_lock);
                /*
                 * Ok we want to try and use the cluster allocator, so lets look
                 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -5150,6 +5188,7 @@ checks:
                        btrfs_add_free_space(block_group, offset,
                                             search_start - offset);
                BUG_ON(offset > search_start);
+                btrfs_put_block_group(block_group);
                break;
 loop:
                failed_cluster_refill = false;
@@ -5242,14 +5281,7 @@ loop:
                ret = -ENOSPC;
        } else if (!ins->objectid) {
                ret = -ENOSPC;
-        }
+        } else if (ins->objectid) {
-        /* we found what we needed */
-        if (ins->objectid) {
-                if (!(data & BTRFS_BLOCK_GROUP_DATA))
-                        trans->block_group = block_group->key.objectid;
-                btrfs_put_block_group(block_group);
                ret = 0;
        }
@@ -6526,7 +6558,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        BUG_ON(cache->ro);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
        alloc_flags = update_block_group_flags(root, cache->flags);
@@ -6882,6 +6914,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
        if (cache_gen != 0 &&
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c5d9fbb92bc3..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1476,7 +1476,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
                        if (total_bytes >= max_bytes)
                                break;
                        if (!found) {
-                                *start = state->start;
+                                *start = max(cur_start, state->start);
                                found = 1;
                        }
                        last = state->end;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c6a22d783c35..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        if (!btrfs_test_opt(root, AUTO_DEFRAG))
                return 0;
-        if (root->fs_info->closing)
+        if (btrfs_fs_closing(root->fs_info))
                return 0;
        if (BTRFS_I(inode)->in_defrag)
@@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        if (!defrag)
                return -ENOMEM;
-        defrag->ino = inode->i_ino;
+        defrag->ino = btrfs_ino(inode);
        defrag->transid = transid;
        defrag->root = root->root_key.objectid;
@@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                first_ino = defrag->ino + 1;
                rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-                if (fs_info->closing)
+                if (btrfs_fs_closing(fs_info))
                        goto next_free;
                spin_unlock(&fs_info->defrag_inodes_lock);
@@ -1480,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
         * the current transaction, we can bail out now without any
         * syncing
         */
-        mutex_lock(&root->fs_info->trans_mutex);
+        smp_mb();
        if (BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
-                mutex_unlock(&root->fs_info->trans_mutex);
                goto out;
        }
-        mutex_unlock(&root->fs_info->trans_mutex);
        /*
         * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 70d45795d758..ad144736a5fd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
                return inode;
        spin_lock(&block_group->lock);
-        if (!root->fs_info->closing) {
+        if (!btrfs_fs_closing(root->fs_info)) {
                block_group->inode = igrab(inode);
                block_group->iref = 1;
        }
@@ -402,7 +402,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                spin_lock(&ctl->tree_lock);
                                ret = link_free_space(ctl, e);
                                spin_unlock(&ctl->tree_lock);
-                                BUG_ON(ret);
+                                if (ret) {
+                                        printk(KERN_ERR "Duplicate entries in "
+                                               "free space cache, dumping\n");
+                                        kunmap(page);
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        goto free_cache;
+                                }
                        } else {
                                e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
                                if (!e->bitmap) {
@@ -419,6 +426,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                                ctl->op->recalc_thresholds(ctl);
                                spin_unlock(&ctl->tree_lock);
                                list_add_tail(&e->list, &bitmaps);
+                                if (ret) {
+                                        printk(KERN_ERR "Duplicate entries in "
+                                               "free space cache, dumping\n");
+                                        kunmap(page);
+                                        unlock_page(page);
+                                        page_cache_release(page);
+                                        goto free_cache;
+                                }
                        }
                        num_entries--;
@@ -478,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
         * If we're unmounting then just return, since this does a search on the
         * normal root and not the commit root and we could deadlock.
         */
-        smp_mb();
+        if (btrfs_fs_closing(fs_info))
-        if (fs_info->closing)
                return 0;
        /*
@@ -575,10 +589,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
                PAGE_CACHE_SHIFT;
+        /* Since the first page has all of our checksums and our generation we
+         * need to calculate the offset into the page that we can start writing
+         * our entries.
+         */
+        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        filemap_write_and_wait(inode->i_mapping);
        btrfs_wait_ordered_range(inode, inode->i_size &
                                 ~(root->sectorsize - 1), (u64)-1);
+        /* make sure we don't overflow that first page */
+        if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
+                /* this is really the same as running out of space, where we also return 0 */
+                printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
+                ret = 0;
+                goto out_update;
+        }
        /* We need a checksum per page. */
        crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
        if (!crc)
@@ -590,12 +619,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                return -1;
        }
-        /* Since the first page has all of our checksums and our generation we
-         * need to calculate the offset into the page that we can start writing
-         * our entries.
-         */
-        first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
        /* Get the cluster for this block_group if it exists */
        if (block_group && !list_empty(&block_group->cluster_list))
                cluster = list_entry(block_group->cluster_list.next,
@@ -857,12 +880,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
        ret = 1;
 out_free:
+        kfree(checksums);
+        kfree(pages);
+out_update:
        if (ret != 1) {
                invalidate_inode_pages2_range(inode->i_mapping, 0, index);
                BTRFS_I(inode)->generation = 0;
        }
-        kfree(checksums);
-        kfree(pages);
        btrfs_update_inode(trans, root, inode);
        return ret;
 }
@@ -963,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
                         * logically.
                         */
                        if (bitmap) {
-                                WARN_ON(info->bitmap);
+                                if (info->bitmap) {
+                                        WARN_ON_ONCE(1);
+                                        return -EEXIST;
+                                }
                                p = &(*p)->rb_right;
                        } else {
-                                WARN_ON(!info->bitmap);
+                                if (!info->bitmap) {
+                                        WARN_ON_ONCE(1);
+                                        return -EEXIST;
+                                }
                                p = &(*p)->rb_left;
                        }
                }
@@ -2481,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
                return inode;
        spin_lock(&root->cache_lock);
-        if (!root->fs_info->closing)
+        if (!btrfs_fs_closing(root->fs_info))
                root->cache_inode = igrab(inode);
        spin_unlock(&root->cache_lock);
@@ -2504,12 +2535,14 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
        int ret = 0;
        u64 root_gen = btrfs_root_generation(&root->root_item);
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        /*
         * If we're unmounting then just return, since this does a search on the
         * normal root and not the commit root and we could deadlock.
         */
-        smp_mb();
+        if (btrfs_fs_closing(fs_info))
-        if (fs_info->closing)
                return 0;
        path = btrfs_alloc_path();
@@ -2543,6 +2576,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
        struct inode *inode;
        int ret;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode))
                return 0;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 3262cd17a12f..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,6 +38,9 @@ static int caching_kthread(void *data)
        int slot;
        int ret;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -59,8 +62,7 @@ again:
                goto out;
        while (1) {
-                smp_mb();
+                if (btrfs_fs_closing(fs_info))
-                if (fs_info->closing)
                        goto out;
                leaf = path->nodes[0];
@@ -141,6 +143,9 @@ static void start_caching(struct btrfs_root *root)
        int ret;
        u64 objectid;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
        spin_lock(&root->cache_lock);
        if (root->cached != BTRFS_CACHE_NO) {
                spin_unlock(&root->cache_lock);
@@ -178,6 +183,9 @@ static void start_caching(struct btrfs_root *root)
 int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
 {
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return btrfs_find_free_objectid(root, objectid);
 again:
        *objectid = btrfs_find_ino_for_alloc(root);
@@ -201,6 +209,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
 {
        struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
        struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
 again:
        if (root->cached == BTRFS_CACHE_FINISHED) {
                __btrfs_add_free_space(ctl, objectid, 1);
@@ -250,6 +262,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
        struct rb_node *n;
        u64 count;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return;
        while (1) {
                n = rb_first(rbroot);
                if (!n)
@@ -388,9 +403,24 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
        int prealloc;
        bool retry = false;
+        /* only fs tree and subvol/snap needs ino cache */
+        if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
+            (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
+             root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
+                return 0;
+        /* Don't save inode cache if we are deleting this root */
+        if (btrfs_root_refs(&root->root_item) == 0 &&
+            root != root->fs_info->tree_root)
+                return 0;
+        if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+                return 0;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 again:
        inode = lookup_free_ino_inode(root, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb51bb1fa44f..ebf95f7a44d6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -138,7 +138,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->leave_spinning = 1;
-        btrfs_set_trans_block_group(trans, inode);
        key.objectid = btrfs_ino(inode);
        key.offset = start;
@@ -426,9 +425,8 @@ again:
                }
        }
        if (start == 0) {
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
-                btrfs_set_trans_block_group(trans, inode);
                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                /* lets try to make an inline extent */
@@ -623,8 +621,9 @@ retry:
                            async_extent->start + async_extent->ram_size - 1,
                            GFP_NOFS);
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                BUG_ON(IS_ERR(trans));
+                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
@@ -793,9 +792,8 @@ static noinline int cow_file_range(struct inode *inode,
        int ret = 0;
        BUG_ON(is_free_space_inode(root, inode));
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -1077,10 +1075,12 @@ static noinline int run_delalloc_nocow(struct inode *inode,
        nolock = is_free_space_inode(root, inode);
        if (nolock)
-                trans = btrfs_join_transaction_nolock(root, 1);
+                trans = btrfs_join_transaction_nolock(root);
        else
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        cow_start = (u64)-1;
        cur_offset = start;
@@ -1519,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 {
        struct btrfs_ordered_sum *sum;
-        btrfs_set_trans_block_group(trans, inode);
        list_for_each_entry(sum, list, list) {
                btrfs_csum_file_blocks(trans,
                       BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1735,11 +1733,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
                        if (nolock)
-                                trans = btrfs_join_transaction_nolock(root, 1);
+                                trans = btrfs_join_transaction_nolock(root);
                        else
-                                trans = btrfs_join_transaction(root, 1);
+                                trans = btrfs_join_transaction(root);
                        BUG_ON(IS_ERR(trans));
-                        btrfs_set_trans_block_group(trans, inode);
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
@@ -1752,11 +1749,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                         0, &cached_state, GFP_NOFS);
        if (nolock)
-                trans = btrfs_join_transaction_nolock(root, 1);
+                trans = btrfs_join_transaction_nolock(root);
        else
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2431,7 +2427,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                        (u64)-1);
        if (root->orphan_block_rsv || root->orphan_item_inserted) {
-                trans = btrfs_join_transaction(root, 1);
+                trans = btrfs_join_transaction(root);
                if (!IS_ERR(trans))
                        btrfs_end_transaction(trans, root);
        }
@@ -2511,12 +2507,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key location;
        int maybe_acls;
-        u64 alloc_group_block;
        u32 rdev;
        int ret;
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        path->leave_spinning = 1;
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2526,6 +2522,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
        leaf = path->nodes[0];
        inode_item = btrfs_item_ptr(leaf, path->slots[0],
                                    struct btrfs_inode_item);
+        if (!leaf->map_token)
+                map_private_extent_buffer(leaf, (unsigned long)inode_item,
+                                          sizeof(struct btrfs_inode_item),
+                                          &leaf->map_token, &leaf->kaddr,
+                                          &leaf->map_start, &leaf->map_len,
+                                          KM_USER1);
        inode->i_mode = btrfs_inode_mode(leaf, inode_item);
        inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2555,8 +2557,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
        BTRFS_I(inode)->index_cnt = (u64)-1;
        BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
-        alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
        /*
         * try to precache a NULL acl entry for files that don't have
         * any xattrs or acls
@@ -2566,8 +2566,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
        if (!maybe_acls)
                cache_no_acl(inode);
-        BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
+        if (leaf->map_token) {
-                                                alloc_group_block, 0);
+                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
+                leaf->map_token = NULL;
+        }
        btrfs_free_path(path);
        inode_item = NULL;
@@ -2647,7 +2650,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_inode_transid(leaf, item, trans->transid);
        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-        btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
+        btrfs_set_inode_block_group(leaf, item, 0);
        if (leaf->map_token) {
                unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -3004,8 +3007,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -3094,8 +3095,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
                err = btrfs_unlink_subvol(trans, root, dir,
                                          BTRFS_I(inode)->location.objectid,
@@ -3514,7 +3513,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                err = PTR_ERR(trans);
                                break;
                        }
-                        btrfs_set_trans_block_group(trans, inode);
                        err = btrfs_drop_extents(trans, inode, cur_offset,
                                                 cur_offset + hole_size,
@@ -3650,7 +3648,6 @@ void btrfs_evict_inode(struct inode *inode)
        while (1) {
                trans = btrfs_start_transaction(root, 0);
                BUG_ON(IS_ERR(trans));
-                btrfs_set_trans_block_group(trans, inode);
                trans->block_rsv = root->orphan_block_rsv;
                ret = btrfs_block_rsv_check(trans, root,
@@ -4133,7 +4130,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        path->reada = 2;
+        path->reada = 1;
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                INIT_LIST_HEAD(&ins_list);
@@ -4268,18 +4266,16 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        if (BTRFS_I(inode)->dummy_inode)
                return 0;
-        smp_mb();
+        if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
-        if (root->fs_info->closing && is_free_space_inode(root, inode))
                nolock = true;
        if (wbc->sync_mode == WB_SYNC_ALL) {
                if (nolock)
-                        trans = btrfs_join_transaction_nolock(root, 1);
+                        trans = btrfs_join_transaction_nolock(root);
                else
-                        trans = btrfs_join_transaction(root, 1);
+                        trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        return PTR_ERR(trans);
-                btrfs_set_trans_block_group(trans, inode);
                if (nolock)
                        ret = btrfs_end_transaction_nolock(trans, root);
                else
@@ -4294,7 +4290,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 * FIXME, needs more benchmarking...there are no reasons other than performance
 * to keep or drop this code.
 */
-void btrfs_dirty_inode(struct inode *inode)
+void btrfs_dirty_inode(struct inode *inode, int flags)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
@@ -4303,9 +4299,8 @@ void btrfs_dirty_inode(struct inode *inode)
        if (BTRFS_I(inode)->dummy_inode)
                return;
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
-        btrfs_set_trans_block_group(trans, inode);
        ret = btrfs_update_inode(trans, root, inode);
        if (ret && ret == -ENOSPC) {
@@ -4319,7 +4314,6 @@ void btrfs_dirty_inode(struct inode *inode)
                                       PTR_ERR(trans));
                        return;
                }
-                btrfs_set_trans_block_group(trans, inode);
                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
@@ -4418,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     struct inode *dir,
                                     const char *name, int name_len,
-                                     u64 ref_objectid, u64 objectid,
+                                     u64 ref_objectid, u64 objectid, int mode,
-                                     u64 alloc_hint, int mode, u64 *index)
+                                     u64 *index)
 {
        struct inode *inode;
        struct btrfs_inode_item *inode_item;
@@ -4472,8 +4466,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
                owner = 0;
        else
                owner = 1;
-        BTRFS_I(inode)->block_group =
-                        btrfs_find_block_group(root, 0, alloc_hint, owner);
        key[0].objectid = objectid;
        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4629,15 +4621,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, mode, &index);
+                                mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -4649,7 +4639,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -4658,8 +4647,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                init_special_inode(inode, inode->i_mode, rdev);
                btrfs_update_inode(trans, root, inode);
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
@@ -4692,15 +4679,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, mode, &index);
+                                mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -4712,7 +4697,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -4723,8 +4707,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
@@ -4771,8 +4753,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        btrfs_inc_nlink(inode);
        inode->i_ctime = CURRENT_TIME;
-        btrfs_set_trans_block_group(trans, dir);
        ihold(inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -4781,7 +4761,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                drop_inode = 1;
        } else {
                struct dentry *parent = dget_parent(dentry);
-                btrfs_update_inode_block_group(trans, dir);
                err = btrfs_update_inode(trans, root, inode);
                BUG_ON(err);
                btrfs_log_new_name(trans, inode, NULL, parent);
@@ -4818,7 +4797,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        trans = btrfs_start_transaction(root, 5);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
@@ -4826,8 +4804,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, S_IFDIR | mode,
+                                S_IFDIR | mode, &index);
-                                &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_fail;
@@ -4841,7 +4818,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        inode->i_op = &btrfs_dir_inode_operations;
        inode->i_fop = &btrfs_dir_file_operations;
-        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, 0);
        err = btrfs_update_inode(trans, root, inode);
@@ -4855,8 +4831,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
        drop_on_err = 0;
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
 out_fail:
        nr = trans->blocks_used;
@@ -4989,7 +4963,15 @@ again:
        if (!path) {
                path = btrfs_alloc_path();
-                BUG_ON(!path);
+                if (!path) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                /*
+                 * Chances are we'll be called again, so go ahead and do
+                 * readahead
+                 */
+                path->reada = 1;
        }
        ret = btrfs_lookup_file_extent(trans, root, path,
@@ -5130,8 +5112,10 @@ again:
                                kunmap(page);
                                free_extent_map(em);
                                em = NULL;
                                btrfs_release_path(path);
-                                trans = btrfs_join_transaction(root, 1);
+                                trans = btrfs_join_transaction(root);
                                if (IS_ERR(trans))
                                        return ERR_CAST(trans);
                                goto again;
@@ -5375,7 +5359,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
                btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
        }
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return ERR_CAST(trans);
@@ -5611,7 +5595,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 * to make sure the current transaction stays open
                 * while we look for nocow cross refs
                 */
-                trans = btrfs_join_transaction(root, 0);
+                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans))
                        goto must_cow;
@@ -5750,7 +5734,7 @@ again:
        BUG_ON(!ordered);
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                err = -ENOMEM;
                goto out;
@@ -6500,6 +6484,7 @@ out:
 static int btrfs_truncate(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_block_rsv *rsv;
        int ret;
        int err = 0;
        struct btrfs_trans_handle *trans;
@@ -6513,28 +6498,80 @@ static int btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-        trans = btrfs_start_transaction(root, 5);
+        /*
-        if (IS_ERR(trans))
+         * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
-                return PTR_ERR(trans);
+         * 3 things going on here
+         *
+         * 1) We need to reserve space for our orphan item and the space to
+         * delete our orphan item.  Lord knows we don't want to have a dangling
+         * orphan item because we didn't reserve space to remove it.
+         *
+         * 2) We need to reserve space to update our inode.
+         *
+         * 3) We need to have something to cache all the space that is going to
+         * be freed up by the truncate operation, but also have some slack
+         * space reserved in case it uses space during the truncate (thank you
+         * very much snapshotting).
+         *
+         * And we need these to all be separate.  The fact is we can use a lot of
+         * space doing the truncate, and we have no earthly idea how much space
+         * we will use, so we need the truncate reservation to be separate so it
+         * doesn't end up using space reserved for updating the inode or
+         * removing the orphan item.  We also need to be able to stop the
+         * transaction and start a new one, which means we need to be able to
+         * update the inode several times, and we have no way of knowing how
+         * many times that will be, so we can't just reserve 1 item for the
+         * entirety of the operation, so that has to be done separately as well.
+         * Then there is the orphan item, which does indeed need to be held on
+         * to for the whole operation, and we need nobody to touch this reserved
+         * space except the orphan code.
+         *
+         * So that leaves us with
+         *
+         * 1) root->orphan_block_rsv - for the orphan deletion.
+         * 2) rsv - for the truncate reservation, which we will steal from the
+         * transaction reservation.
+         * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
+         * updating the inode.
+         */
+        rsv = btrfs_alloc_block_rsv(root);
+        if (!rsv)
+                return -ENOMEM;
+        btrfs_add_durable_block_rsv(root->fs_info, rsv);
+        trans = btrfs_start_transaction(root, 4);
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out;
+        }
-        btrfs_set_trans_block_group(trans, inode);
+        /*
+         * Reserve space for the truncate process.  Truncate should be adding
+         * space, but if there are snapshots it may end up using space.
+         */
+        ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+        BUG_ON(ret);
        ret = btrfs_orphan_add(trans, inode);
        if (ret) {
                btrfs_end_transaction(trans, root);
-                return ret;
+                goto out;
        }
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
-        /* Now start a transaction for the truncate */
+        /*
-        trans = btrfs_start_transaction(root, 0);
+         * Ok so we've already migrated our bytes over for the truncate, so here
-        if (IS_ERR(trans))
+         * just reserve the one slot we need for updating the inode.
-                return PTR_ERR(trans);
+         */
-        btrfs_set_trans_block_group(trans, inode);
+        trans = btrfs_start_transaction(root, 1);
-        trans->block_rsv = root->orphan_block_rsv;
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out;
+        }
+        trans->block_rsv = rsv;
        /*
         * setattr is responsible for setting the ordered_data_close flag,
@@ -6558,24 +6595,17 @@ static int btrfs_truncate(struct inode *inode)
        while (1) {
                if (!trans) {
-                        trans = btrfs_start_transaction(root, 0);
+                        trans = btrfs_start_transaction(root, 3);
-                        if (IS_ERR(trans))
+                        if (IS_ERR(trans)) {
-                                return PTR_ERR(trans);
+                                err = PTR_ERR(trans);
-                        btrfs_set_trans_block_group(trans, inode);
+                                goto out;
-                        trans->block_rsv = root->orphan_block_rsv;
+                        }
-                }
-                ret = btrfs_block_rsv_check(trans, root,
+                        ret = btrfs_truncate_reserve_metadata(trans, root,
-                                            root->orphan_block_rsv, 0, 5);
+                                                              rsv);
-                if (ret == -EAGAIN) {
+                        BUG_ON(ret);
-                        ret = btrfs_commit_transaction(trans, root);
-                        if (ret)
+                        trans->block_rsv = rsv;
-                                return ret;
-                        trans = NULL;
-                        continue;
-                } else if (ret) {
-                        err = ret;
-                        break;
                }
                ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6586,6 +6616,7 @@ static int btrfs_truncate(struct inode *inode)
                        break;
                }
+                trans->block_rsv = &root->fs_info->trans_block_rsv;
                ret = btrfs_update_inode(trans, root, inode);
                if (ret) {
                        err = ret;
@@ -6599,6 +6630,7 @@ static int btrfs_truncate(struct inode *inode)
        }
        if (ret == 0 && inode->i_nlink > 0) {
+                trans->block_rsv = root->orphan_block_rsv;
                ret = btrfs_orphan_del(trans, inode);
                if (ret)
                        err = ret;
@@ -6610,15 +6642,20 @@ static int btrfs_truncate(struct inode *inode)
                ret = btrfs_orphan_del(NULL, inode);
        }
+        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
        if (ret && !err)
                err = ret;
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
+        btrfs_btree_balance_dirty(root, nr);
+out:
+        btrfs_free_block_rsv(root, rsv);
        if (ret && !err)
                err = ret;
-        btrfs_btree_balance_dirty(root, nr);
        return err;
 }
@@ -6627,15 +6664,14 @@ static int btrfs_truncate(struct inode *inode)
 * create a new subvolume directory/inode (helper for the ioctl).
 */
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *new_root,
+                             struct btrfs_root *new_root, u64 new_dirid)
-                             u64 new_dirid, u64 alloc_hint)
 {
        struct inode *inode;
        int err;
        u64 index = 0;
        inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-                                new_dirid, alloc_hint, S_IFDIR | 0700, &index);
+                                new_dirid, S_IFDIR | 0700, &index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);
        inode->i_op = &btrfs_dir_inode_operations;
@@ -6748,21 +6784,6 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
-        if (root == root->fs_info->tree_root) {
-                struct btrfs_block_group_cache *block_group;
-                block_group = btrfs_lookup_block_group(root->fs_info,
-                                                BTRFS_I(inode)->block_group);
-                if (block_group && block_group->inode == inode) {
-                        spin_lock(&block_group->lock);
-                        block_group->inode = NULL;
-                        spin_unlock(&block_group->lock);
-                        btrfs_put_block_group(block_group);
-                } else if (block_group) {
-                        btrfs_put_block_group(block_group);
-                }
-        }
        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
@@ -6948,8 +6969,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                goto out_notrans;
        }
-        btrfs_set_trans_block_group(trans, new_dir);
        if (dest != root)
                btrfs_record_root_in_trans(trans, dest);
@@ -7131,16 +7150,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, dir);
        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir), objectid,
-                                BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
+                                S_IFLNK|S_IRWXUGO, &index);
-                                &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
@@ -7152,7 +7168,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
-        btrfs_set_trans_block_group(trans, inode);
        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
        if (err)
                drop_inode = 1;
@@ -7163,8 +7178,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                inode->i_op = &btrfs_file_inode_operations;
                BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
        }
-        btrfs_update_inode_block_group(trans, inode);
-        btrfs_update_inode_block_group(trans, dir);
        if (drop_inode)
                goto out_unlock;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85e818ce00c5..ac37040e426a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -243,7 +243,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
        }
-        trans = btrfs_join_transaction(root, 1);
+        trans = btrfs_join_transaction(root);
        BUG_ON(IS_ERR(trans));
        ret = btrfs_update_inode(trans, root, inode);
@@ -414,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        btrfs_record_root_in_trans(trans, new_root);
-        ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+        ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
-                                       BTRFS_I(dir)->block_group);
        /*
         * insert the directory item
         */
@@ -707,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root,
        struct btrfs_file_extent_item *extent;
        int type;
        int ret;
+        u64 ino = btrfs_ino(inode);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        min_key.objectid = inode->i_ino;
+        min_key.objectid = ino;
        min_key.type = BTRFS_EXTENT_DATA_KEY;
        min_key.offset = *off;
-        max_key.objectid = inode->i_ino;
+        max_key.objectid = ino;
        max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
@@ -727,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root,
                                           path, 0, newer_than);
                if (ret != 0)
                        goto none;
-                if (min_key.objectid != inode->i_ino)
+                if (min_key.objectid != ino)
                        goto none;
                if (min_key.type != BTRFS_EXTENT_DATA_KEY)
                        goto none;
@@ -2489,12 +2489,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
        if (ret)
                goto out;
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_inc(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans++;
-        mutex_unlock(&root->fs_info->trans_mutex);
        ret = -ENOMEM;
-        trans = btrfs_start_ioctl_transaction(root, 0);
+        trans = btrfs_start_ioctl_transaction(root);
        if (IS_ERR(trans))
                goto out_drop;
@@ -2502,9 +2500,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
        return 0;
 out_drop:
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_dec(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans--;
-        mutex_unlock(&root->fs_info->trans_mutex);
        mnt_drop_write(file->f_path.mnt);
 out:
        return ret;
@@ -2738,9 +2734,7 @@ long btrfs_ioctl_trans_end(struct file *file)
        btrfs_end_transaction(trans, root);
-        mutex_lock(&root->fs_info->trans_mutex);
+        atomic_dec(&root->fs_info->open_ioctl_trans);
-        root->fs_info->open_ioctl_trans--;
-        mutex_unlock(&root->fs_info->trans_mutex);
        mnt_drop_write(file->f_path.mnt);
        return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ca38eca70af0..b1ef27cc673b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
                err = -ENOMEM;
                goto out;
        }
+        path1->reada = 1;
+        path2->reada = 2;
        node = alloc_backref_node(cache);
        if (!node) {
@@ -1999,6 +2001,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
@@ -2139,10 +2142,10 @@ int prepare_to_merge(struct reloc_control *rc, int err)
        u64 num_bytes = 0;
        int ret;
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
        rc->merging_rsv_size += rc->nodes_relocated * 2;
-        mutex_unlock(&root->fs_info->trans_mutex);
+        spin_unlock(&root->fs_info->trans_lock);
 again:
        if (!err) {
                num_bytes = rc->merging_rsv_size;
@@ -2152,7 +2155,7 @@ again:
                        err = ret;
        }
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                if (!err)
                        btrfs_block_rsv_release(rc->extent_root,
@@ -2211,9 +2214,9 @@ int merge_reloc_roots(struct reloc_control *rc)
        int ret;
 again:
        root = rc->extent_root;
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        list_splice_init(&rc->reloc_roots, &reloc_roots);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        spin_unlock(&root->fs_info->trans_lock);
        while (!list_empty(&reloc_roots)) {
                found = 1;
@@ -3236,7 +3239,7 @@ truncate:
                goto out;
        }
-        trans = btrfs_join_transaction(root, 0);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                ret = PTR_ERR(trans);
@@ -3300,6 +3303,7 @@ static int find_data_references(struct reloc_control *rc,
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        root = read_fs_root(rc->extent_root->fs_info, ref_root);
        if (IS_ERR(root)) {
@@ -3586,17 +3590,17 @@ next:
 static void set_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        spin_lock(&fs_info->trans_lock);
        fs_info->reloc_ctl = rc;
-        mutex_unlock(&fs_info->trans_mutex);
+        spin_unlock(&fs_info->trans_lock);
 }
 static void unset_reloc_control(struct reloc_control *rc)
 {
        struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        spin_lock(&fs_info->trans_lock);
        fs_info->reloc_ctl = NULL;
-        mutex_unlock(&fs_info->trans_mutex);
+        spin_unlock(&fs_info->trans_lock);
 }
 static int check_extent_flags(u64 flags)
@@ -3645,7 +3649,7 @@ int prepare_to_relocate(struct reloc_control *rc)
        rc->create_reloc_tree = 1;
        set_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        BUG_ON(IS_ERR(trans));
        btrfs_commit_transaction(trans, rc->extent_root);
        return 0;
@@ -3668,6 +3672,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = 1;
        ret = prepare_to_relocate(rc);
        if (ret) {
@@ -3834,7 +3839,7 @@ restart:
        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
        /* get rid of pinned extents */
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans))
                err = PTR_ERR(trans);
        else
@@ -4093,6 +4098,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
+        path->reada = -1;
        key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4159,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        set_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans)) {
                unset_reloc_control(rc);
                err = PTR_ERR(trans);
@@ -4193,7 +4199,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        unset_reloc_control(rc);
-        trans = btrfs_join_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root);
        if (IS_ERR(trans))
                err = PTR_ERR(trans);
        else
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dfed0c27ac3..df50fd1eca8f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev)
        }
 }
+static void scrub_free_bio(struct bio *bio)
+{
+        int i;
+        struct page *last_page = NULL;
+        if (!bio)
+                return;
+        for (i = 0; i < bio->bi_vcnt; ++i) {
+                if (bio->bi_io_vec[i].bv_page == last_page)
+                        continue;
+                last_page = bio->bi_io_vec[i].bv_page;
+                __free_page(last_page);
+        }
+        bio_put(bio);
+}
 static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
 {
        int i;
-        int j;
-        struct page *last_page;
        if (!sdev)
                return;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
                struct scrub_bio *sbio = sdev->bios[i];
-                struct bio *bio;
                if (!sbio)
                        break;
-                bio = sbio->bio;
+                scrub_free_bio(sbio->bio);
-                if (bio) {
-                        last_page = NULL;
-                        for (j = 0; j < bio->bi_vcnt; ++j) {
-                                if (bio->bi_io_vec[j].bv_page == last_page)
-                                        continue;
-                                last_page = bio->bi_io_vec[j].bv_page;
-                                __free_page(last_page);
-                        }
-                        bio_put(bio);
-                }
                kfree(sbio);
        }
@@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 {
        struct scrub_dev *sdev;
        int             i;
-        int             j;
-        int             ret;
        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
        sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
@@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
                goto nomem;
        sdev->dev = dev;
        for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-                struct bio *bio;
                struct scrub_bio *sbio;
                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
                        goto nomem;
                sdev->bios[i] = sbio;
-                bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-                if (!bio)
-                        goto nomem;
                sbio->index = i;
                sbio->sdev = sdev;
-                sbio->bio = bio;
                sbio->count = 0;
                sbio->work.func = scrub_checksum;
-                bio->bi_private = sdev->bios[i];
-                bio->bi_end_io = scrub_bio_end_io;
-                bio->bi_sector = 0;
-                bio->bi_bdev = dev->bdev;
-                bio->bi_size = 0;
-                for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
-                        struct page *page;
-                        page = alloc_page(GFP_NOFS);
-                        if (!page)
-                                goto nomem;
-                        ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-                        if (!ret)
-                                goto nomem;
-                }
-                WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
                if (i != SCRUB_BIOS_PER_DEV-1)
                        sdev->bios[i]->next_free = i + 1;
@@ -369,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        int ret;
        DECLARE_COMPLETION_ONSTACK(complete);
-        /* we are going to wait on this IO */
-        rw |= REQ_SYNC;
        bio = bio_alloc(GFP_NOFS, 1);
        bio->bi_bdev = bdev;
        bio->bi_sector = sector;
@@ -380,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
        bio->bi_private = &complete;
        submit_bio(rw, bio);
+        /* this will also unplug the queue */
        wait_for_completion(&complete);
        ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -394,6 +371,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
        struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
        sbio->err = err;
+        sbio->bio = bio;
        btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
 }
@@ -453,6 +431,8 @@ static void scrub_checksum(struct btrfs_work *work)
        }
 out:
+        scrub_free_bio(sbio->bio);
+        sbio->bio = NULL;
        spin_lock(&sdev->list_lock);
        sbio->next_free = sdev->first_free;
        sdev->first_free = sbio->index;
@@ -583,25 +563,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
        struct scrub_bio *sbio;
+        struct bio *bio;
+        int i;
        if (sdev->curr == -1)
                return 0;
        sbio = sdev->bios[sdev->curr];
-        sbio->bio->bi_sector = sbio->physical >> 9;
+        bio = bio_alloc(GFP_NOFS, sbio->count);
-        sbio->bio->bi_size = sbio->count * PAGE_SIZE;
+        if (!bio)
-        sbio->bio->bi_next = NULL;
+                goto nomem;
-        sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-        sbio->bio->bi_comp_cpu = -1;
+        bio->bi_private = sbio;
-        sbio->bio->bi_bdev = sdev->dev->bdev;
+        bio->bi_end_io = scrub_bio_end_io;
+        bio->bi_bdev = sdev->dev->bdev;
+        bio->bi_sector = sbio->physical >> 9;
+        for (i = 0; i < sbio->count; ++i) {
+                struct page *page;
+                int ret;
+                page = alloc_page(GFP_NOFS);
+                if (!page)
+                        goto nomem;
+                ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+                if (!ret) {
+                        __free_page(page);
+                        goto nomem;
+                }
+        }
        sbio->err = 0;
        sdev->curr = -1;
        atomic_inc(&sdev->in_flight);
-        submit_bio(0, sbio->bio);
+        submit_bio(READ, bio);
        return 0;
+nomem:
+        scrub_free_bio(bio);
+        return -ENOMEM;
 }
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -633,7 +638,11 @@ again:
                sbio->logical = logical;
        } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
                   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-                scrub_submit(sdev);
+                int ret;
+                ret = scrub_submit(sdev);
+                if (ret)
+                        return ret;
                goto again;
        }
        sbio->spag[sbio->count].flags = flags;
@@ -645,8 +654,13 @@ again:
                memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
        }
        ++sbio->count;
-        if (sbio->count == SCRUB_PAGES_PER_BIO || force)
+        if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
-                scrub_submit(sdev);
+                int ret;
+                ret = scrub_submit(sdev);
+                if (ret)
+                        return ret;
+        }
        return 0;
 }
@@ -727,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_root *csum_root = fs_info->csum_root;
        struct btrfs_extent_item *extent;
+        struct blk_plug plug;
        u64 flags;
        int ret;
        int slot;
@@ -831,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
         * the scrub. This might currently (crc32) end up to be about 1MB
         */
        start_stripe = 0;
+        blk_start_plug(&plug);
 again:
        logical = base + offset + start_stripe * increment;
        for (i = start_stripe; i < nstripes; ++i) {
@@ -972,6 +988,7 @@ next:
        scrub_submit(sdev);
 out:
+        blk_finish_plug(&plug);
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
 }
@@ -1166,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
        int ret;
        struct btrfs_device *dev;
-        if (root->fs_info->closing)
+        if (btrfs_fs_closing(root->fs_info))
                return -EINVAL;
        /*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9b2e7e5bc3ef..117e74e3604b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -161,7 +161,8 @@ enum {
        Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
        Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
        Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err,
+        Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
+        Opt_inode_cache, Opt_err,
 };
 static match_table_t tokens = {
@@ -193,6 +194,7 @@ static match_table_t tokens = {
        {Opt_enospc_debug, "enospc_debug"},
        {Opt_subvolrootid, "subvolrootid=%d"},
        {Opt_defrag, "autodefrag"},
+        {Opt_inode_cache, "inode_cache"},
        {Opt_err, NULL},
 };
@@ -361,6 +363,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                        printk(KERN_INFO "btrfs: enabling disk space caching\n");
                        btrfs_set_opt(info->mount_opt, SPACE_CACHE);
                        break;
+                case Opt_inode_cache:
+                        printk(KERN_INFO "btrfs: enabling inode map caching\n");
+                        btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
+                        break;
                case Opt_clear_cache:
                        printk(KERN_INFO "btrfs: force clearing of disk cache\n");
                        btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc80f7156923..dd719662340e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
 {
        WARN_ON(atomic_read(&transaction->use_count) == 0);
        if (atomic_dec_and_test(&transaction->use_count)) {
+                BUG_ON(!list_empty(&transaction->list));
                memset(transaction, 0, sizeof(*transaction));
                kmem_cache_free(btrfs_transaction_cachep, transaction);
        }
@@ -49,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 /*
 * either allocate a new transaction or hop into the existing one
 */
-static noinline int join_transaction(struct btrfs_root *root)
+static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
        struct btrfs_transaction *cur_trans;
+        spin_lock(&root->fs_info->trans_lock);
+        if (root->fs_info->trans_no_join) {
+                if (!nofail) {
+                        spin_unlock(&root->fs_info->trans_lock);
+                        return -EBUSY;
+                }
+        }
        cur_trans = root->fs_info->running_transaction;
-        if (!cur_trans) {
+        if (cur_trans) {
-                cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
+                atomic_inc(&cur_trans->use_count);
-                                             GFP_NOFS);
-                if (!cur_trans)
-                        return -ENOMEM;
-                root->fs_info->generation++;
-                atomic_set(&cur_trans->num_writers, 1);
-                cur_trans->num_joined = 0;
-                cur_trans->transid = root->fs_info->generation;
-                init_waitqueue_head(&cur_trans->writer_wait);
-                init_waitqueue_head(&cur_trans->commit_wait);
-                cur_trans->in_commit = 0;
-                cur_trans->blocked = 0;
-                atomic_set(&cur_trans->use_count, 1);
-                cur_trans->commit_done = 0;
-                cur_trans->start_time = get_seconds();
-                cur_trans->delayed_refs.root = RB_ROOT;
-                cur_trans->delayed_refs.num_entries = 0;
-                cur_trans->delayed_refs.num_heads_ready = 0;
-                cur_trans->delayed_refs.num_heads = 0;
-                cur_trans->delayed_refs.flushing = 0;
-                cur_trans->delayed_refs.run_delayed_start = 0;
-                spin_lock_init(&cur_trans->delayed_refs.lock);
-                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-                extent_io_tree_init(&cur_trans->dirty_pages,
-                                     root->fs_info->btree_inode->i_mapping);
-                spin_lock(&root->fs_info->new_trans_lock);
-                root->fs_info->running_transaction = cur_trans;
-                spin_unlock(&root->fs_info->new_trans_lock);
-        } else {
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
+                spin_unlock(&root->fs_info->trans_lock);
+                return 0;
        }
+        spin_unlock(&root->fs_info->trans_lock);
+        cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
+        if (!cur_trans)
+                return -ENOMEM;
+        spin_lock(&root->fs_info->trans_lock);
+        if (root->fs_info->running_transaction) {
+                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+                cur_trans = root->fs_info->running_transaction;
+                atomic_inc(&cur_trans->use_count);
+                atomic_inc(&cur_trans->num_writers);
+                cur_trans->num_joined++;
+                spin_unlock(&root->fs_info->trans_lock);
+                return 0;
+        }
+        atomic_set(&cur_trans->num_writers, 1);
+        cur_trans->num_joined = 0;
+        init_waitqueue_head(&cur_trans->writer_wait);
+        init_waitqueue_head(&cur_trans->commit_wait);
+        cur_trans->in_commit = 0;
+        cur_trans->blocked = 0;
+        /*
+         * One for this trans handle, one so it will live on until we
+         * commit the transaction.
+         */
+        atomic_set(&cur_trans->use_count, 2);
+        cur_trans->commit_done = 0;
+        cur_trans->start_time = get_seconds();
+        cur_trans->delayed_refs.root = RB_ROOT;
+        cur_trans->delayed_refs.num_entries = 0;
+        cur_trans->delayed_refs.num_heads_ready = 0;
+        cur_trans->delayed_refs.num_heads = 0;
+        cur_trans->delayed_refs.flushing = 0;
+        cur_trans->delayed_refs.run_delayed_start = 0;
+        spin_lock_init(&cur_trans->commit_lock);
+        spin_lock_init(&cur_trans->delayed_refs.lock);
+        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
+        list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+        extent_io_tree_init(&cur_trans->dirty_pages,
+                             root->fs_info->btree_inode->i_mapping);
+        root->fs_info->generation++;
+        cur_trans->transid = root->fs_info->generation;
+        root->fs_info->running_transaction = cur_trans;
+        spin_unlock(&root->fs_info->trans_lock);
        return 0;
 }
@@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root)
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
-static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
-                                         struct btrfs_root *root)
+                               struct btrfs_root *root)
 {
        if (root->ref_cows && root->last_trans < trans->transid) {
                WARN_ON(root == root->fs_info->extent_root);
                WARN_ON(root->commit_root != root->node);
+                spin_lock(&root->fs_info->fs_roots_radix_lock);
+                if (root->last_trans == trans->transid) {
+                        spin_unlock(&root->fs_info->fs_roots_radix_lock);
+                        return 0;
+                }
+                root->last_trans = trans->transid;
                radix_tree_tag_set(&root->fs_info->fs_roots_radix,
                           (unsigned long)root->root_key.objectid,
                           BTRFS_ROOT_TRANS_TAG);
-                root->last_trans = trans->transid;
+                spin_unlock(&root->fs_info->fs_roots_radix_lock);
                btrfs_init_reloc_root(trans, root);
        }
        return 0;
 }
-int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root)
-{
-        if (!root->ref_cows)
-                return 0;
-        mutex_lock(&root->fs_info->trans_mutex);
-        if (root->last_trans == trans->transid) {
-                mutex_unlock(&root->fs_info->trans_mutex);
-                return 0;
-        }
-        record_root_in_trans(trans, root);
-        mutex_unlock(&root->fs_info->trans_mutex);
-        return 0;
-}
 /* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
@@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root)
 {
        struct btrfs_transaction *cur_trans;
+        spin_lock(&root->fs_info->trans_lock);
        cur_trans = root->fs_info->running_transaction;
        if (cur_trans && cur_trans->blocked) {
                DEFINE_WAIT(wait);
                atomic_inc(&cur_trans->use_count);
+                spin_unlock(&root->fs_info->trans_lock);
                while (1) {
                        prepare_to_wait(&root->fs_info->transaction_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);
                        if (!cur_trans->blocked)
                                break;
-                        mutex_unlock(&root->fs_info->trans_mutex);
                        schedule();
-                        mutex_lock(&root->fs_info->trans_mutex);
                }
                finish_wait(&root->fs_info->transaction_wait, &wait);
                put_transaction(cur_trans);
+        } else {
+                spin_unlock(&root->fs_info->trans_lock);
        }
 }
@@ -167,10 +185,16 @@ enum btrfs_trans_type {
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
-        if (!root->fs_info->log_root_recovering &&
+        if (root->fs_info->log_root_recovering)
-            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+                return 0;
-             type == TRANS_USERSPACE))
+        if (type == TRANS_USERSPACE)
+                return 1;
+        if (type == TRANS_START &&
+            !atomic_read(&root->fs_info->open_ioctl_trans))
                return 1;
        return 0;
 }
@@ -184,36 +208,44 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
                return ERR_PTR(-EROFS);
+        if (current->journal_info) {
+                WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+                h = current->journal_info;
+                h->use_count++;
+                h->orig_rsv = h->block_rsv;
+                h->block_rsv = NULL;
+                goto got_it;
+        }
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_lock(&root->fs_info->trans_mutex);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
-        ret = join_transaction(root);
+        do {
+                ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
+                if (ret == -EBUSY)
+                        wait_current_trans(root);
+        } while (ret == -EBUSY);
        if (ret < 0) {
                kmem_cache_free(btrfs_trans_handle_cachep, h);
-                if (type != TRANS_JOIN_NOLOCK)
-                        mutex_unlock(&root->fs_info->trans_mutex);
                return ERR_PTR(ret);
        }
        cur_trans = root->fs_info->running_transaction;
-        atomic_inc(&cur_trans->use_count);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_unlock(&root->fs_info->trans_mutex);
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
        h->blocks_used = 0;
-        h->block_group = 0;
        h->bytes_reserved = 0;
        h->delayed_ref_updates = 0;
+        h->use_count = 1;
        h->block_rsv = NULL;
+        h->orig_rsv = NULL;
        smp_mb();
        if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -241,11 +273,8 @@ again:
                }
        }
-        if (type != TRANS_JOIN_NOLOCK)
+got_it:
-                mutex_lock(&root->fs_info->trans_mutex);
+        btrfs_record_root_in_trans(h, root);
-        record_root_in_trans(h, root);
-        if (type != TRANS_JOIN_NOLOCK)
-                mutex_unlock(&root->fs_info->trans_mutex);
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
@@ -257,22 +286,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 {
        return start_transaction(root, num_items, TRANS_START);
 }
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
-                                                   int num_blocks)
 {
        return start_transaction(root, 0, TRANS_JOIN);
 }
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
-                                                          int num_blocks)
 {
        return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
 }
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
-                                                         int num_blocks)
 {
-        return start_transaction(r, 0, TRANS_USERSPACE);
+        return start_transaction(root, 0, TRANS_USERSPACE);
 }
 /* wait for a transaction commit to be fully complete */
@@ -280,17 +306,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
                                    struct btrfs_transaction *commit)
 {
        DEFINE_WAIT(wait);
-        mutex_lock(&root->fs_info->trans_mutex);
        while (!commit->commit_done) {
                prepare_to_wait(&commit->commit_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
                if (commit->commit_done)
                        break;
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
        }
-        mutex_unlock(&root->fs_info->trans_mutex);
        finish_wait(&commit->commit_wait, &wait);
        return 0;
 }
@@ -300,59 +322,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
        struct btrfs_transaction *cur_trans = NULL, *t;
        int ret;
-        mutex_lock(&root->fs_info->trans_mutex);
        ret = 0;
        if (transid) {
                if (transid <= root->fs_info->last_trans_committed)
-                        goto out_unlock;
+                        goto out;
                /* find specified transaction */
+                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry(t, &root->fs_info->trans_list, list) {
                        if (t->transid == transid) {
                                cur_trans = t;
+                                atomic_inc(&cur_trans->use_count);
                                break;
                        }
                        if (t->transid > transid)
                                break;
                }
+                spin_unlock(&root->fs_info->trans_lock);
                ret = -EINVAL;
                if (!cur_trans)
-                        goto out_unlock;  /* bad transid */
+                        goto out;  /* bad transid */
        } else {
                /* find newest transaction that is committing | committed */
+                spin_lock(&root->fs_info->trans_lock);
                list_for_each_entry_reverse(t, &root->fs_info->trans_list,
                                            list) {
                        if (t->in_commit) {
                                if (t->commit_done)
-                                        goto out_unlock;
+                                        break;  /* already committed; trans_lock is dropped below */
                                cur_trans = t;
+                                atomic_inc(&cur_trans->use_count);
                                break;
                        }
                }
+                spin_unlock(&root->fs_info->trans_lock);
                if (!cur_trans)
-                        goto out_unlock;  /* nothing committing|committed */
+                        goto out;  /* nothing committing|committed */
        }
-        atomic_inc(&cur_trans->use_count);
-        mutex_unlock(&root->fs_info->trans_mutex);
        wait_for_commit(root, cur_trans);
-        mutex_lock(&root->fs_info->trans_mutex);
        put_transaction(cur_trans);
        ret = 0;
-out_unlock:
+out:
-        mutex_unlock(&root->fs_info->trans_mutex);
        return ret;
 }
 void btrfs_throttle(struct btrfs_root *root)
 {
-        mutex_lock(&root->fs_info->trans_mutex);
+        if (!atomic_read(&root->fs_info->open_ioctl_trans))
-        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
-        mutex_unlock(&root->fs_info->trans_mutex);
 }
 static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -370,6 +389,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_transaction *cur_trans = trans->transaction;
        int updates;
+        smp_mb();
        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
                return 1;
@@ -388,6 +408,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;
+        if (--trans->use_count) {
+                trans->block_rsv = trans->orig_rsv;
+                return 0;
+        }
        while (count < 4) {
                unsigned long cur = trans->delayed_ref_updates;
                trans->delayed_ref_updates = 0;
@@ -410,9 +435,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans, root);
-        if (lock && !root->fs_info->open_ioctl_trans &&
+        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
-            should_end_transaction(trans, root))
+            should_end_transaction(trans, root)) {
                trans->transaction->blocked = 1;
+                smp_wmb();
+        }
        if (lock && cur_trans->blocked && !cur_trans->in_commit) {
                if (throttle)
@@ -703,9 +730,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 */
 int btrfs_add_dead_root(struct btrfs_root *root)
 {
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&root->fs_info->trans_lock);
        list_add(&root->root_list, &root->fs_info->dead_roots);
-        mutex_unlock(&root->fs_info->trans_mutex);
+        spin_unlock(&root->fs_info->trans_lock);
        return 0;
 }
@@ -721,6 +748,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
        int ret;
        int err = 0;
+        spin_lock(&fs_info->fs_roots_radix_lock);
        while (1) {
                ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
                                                 (void **)gang, 0,
@@ -733,6 +761,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        radix_tree_tag_clear(&fs_info->fs_roots_radix,
                                        (unsigned long)root->root_key.objectid,
                                        BTRFS_ROOT_TRANS_TAG);
+                        spin_unlock(&fs_info->fs_roots_radix_lock);
                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
@@ -753,10 +782,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        err = btrfs_update_root(trans, fs_info->tree_root,
                                                &root->root_key,
                                                &root->root_item);
+                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
                }
        }
+        spin_unlock(&fs_info->fs_roots_radix_lock);
        return err;
 }
@@ -786,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();
-                if (root->fs_info->closing || ret != -EAGAIN)
+                if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
@@ -851,7 +882,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        parent = dget_parent(dentry);
        parent_inode = parent->d_inode;
        parent_root = BTRFS_I(parent_inode)->root;
-        record_root_in_trans(trans, parent_root);
+        btrfs_record_root_in_trans(trans, parent_root);
        /*
         * insert the directory item
@@ -869,7 +900,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
-        record_root_in_trans(trans, root);
+        btrfs_record_root_in_trans(trans, root);
        btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
        memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
        btrfs_check_and_init_root_item(new_root_item);
@@ -967,20 +998,20 @@ static void update_super_roots(struct btrfs_root *root)
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
        int ret = 0;
-        spin_lock(&info->new_trans_lock);
+        spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->in_commit;
-        spin_unlock(&info->new_trans_lock);
+        spin_unlock(&info->trans_lock);
        return ret;
 }
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
        int ret = 0;
-        spin_lock(&info->new_trans_lock);
+        spin_lock(&info->trans_lock);
        if (info->running_transaction)
                ret = info->running_transaction->blocked;
-        spin_unlock(&info->new_trans_lock);
+        spin_unlock(&info->trans_lock);
        return ret;
 }
@@ -1004,9 +1035,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
        }
 }
@@ -1032,9 +1061,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
                                    &wait);
                        break;
                }
-                mutex_unlock(&root->fs_info->trans_mutex);
                schedule();
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&root->fs_info->transaction_wait,
                            &wait);
        }
@@ -1072,7 +1099,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        INIT_DELAYED_WORK(&ac->work, do_async_commit);
        ac->root = root;
-        ac->newtrans = btrfs_join_transaction(root, 0);
+        ac->newtrans = btrfs_join_transaction(root);
        if (IS_ERR(ac->newtrans)) {
                int err = PTR_ERR(ac->newtrans);
                kfree(ac);
@@ -1080,22 +1107,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
        }
        /* take transaction reference */
-        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans = trans->transaction;
        atomic_inc(&cur_trans->use_count);
-        mutex_unlock(&root->fs_info->trans_mutex);
        btrfs_end_transaction(trans, root);
        schedule_delayed_work(&ac->work, 0);
        /* wait for transaction to start and unblock */
-        mutex_lock(&root->fs_info->trans_mutex);
        if (wait_for_unblock)
                wait_current_trans_commit_start_and_unblock(root, cur_trans);
        else
                wait_current_trans_commit_start(root, cur_trans);
        put_transaction(cur_trans);
-        mutex_unlock(&root->fs_info->trans_mutex);
        return 0;
 }
@@ -1139,38 +1162,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
-        mutex_lock(&root->fs_info->trans_mutex);
+        spin_lock(&cur_trans->commit_lock);
        if (cur_trans->in_commit) {
+                spin_unlock(&cur_trans->commit_lock);
                atomic_inc(&cur_trans->use_count);
-                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
                ret = wait_for_commit(root, cur_trans);
                BUG_ON(ret);
-                mutex_lock(&root->fs_info->trans_mutex);
                put_transaction(cur_trans);
-                mutex_unlock(&root->fs_info->trans_mutex);
                return 0;
        }
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
+        spin_unlock(&cur_trans->commit_lock);
        wake_up(&root->fs_info->transaction_blocked_wait);
+        spin_lock(&root->fs_info->trans_lock);
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
                if (!prev_trans->commit_done) {
                        atomic_inc(&prev_trans->use_count);
-                        mutex_unlock(&root->fs_info->trans_mutex);
+                        spin_unlock(&root->fs_info->trans_lock);
                        wait_for_commit(root, prev_trans);
-                        mutex_lock(&root->fs_info->trans_mutex);
                        put_transaction(prev_trans);
+                } else {
+                        spin_unlock(&root->fs_info->trans_lock);
                }
+        } else {
+                spin_unlock(&root->fs_info->trans_lock);
        }
        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1178,12 +1204,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
                if (!list_empty(&trans->transaction->pending_snapshots))
                        snap_pending = 1;
                WARN_ON(cur_trans != trans->transaction);
-                mutex_unlock(&root->fs_info->trans_mutex);
                if (flush_on_commit || snap_pending) {
                        btrfs_start_delalloc_inodes(root, 1);
@@ -1206,14 +1232,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                prepare_to_wait(&cur_trans->writer_wait, &wait,
                                TASK_UNINTERRUPTIBLE);
-                smp_mb();
                if (atomic_read(&cur_trans->num_writers) > 1)
                        schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                else if (should_grow)
                        schedule_timeout(1);
-                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
+                spin_lock(&root->fs_info->trans_lock);
+                root->fs_info->trans_no_join = 1;
+                spin_unlock(&root->fs_info->trans_lock);
        } while (atomic_read(&cur_trans->num_writers) > 1 ||
                 (should_grow && cur_trans->num_joined != joined));
@@ -1258,9 +1285,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_prepare_extent_commit(trans, root);
        cur_trans = root->fs_info->running_transaction;
-        spin_lock(&root->fs_info->new_trans_lock);
-        root->fs_info->running_transaction = NULL;
-        spin_unlock(&root->fs_info->new_trans_lock);
        btrfs_set_root_node(&root->fs_info->tree_root->root_item,
                            root->fs_info->tree_root->node);
@@ -1281,10 +1305,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
               sizeof(root->fs_info->super_copy));
        trans->transaction->blocked = 0;
+        spin_lock(&root->fs_info->trans_lock);
+        root->fs_info->running_transaction = NULL;
+        root->fs_info->trans_no_join = 0;
+        spin_unlock(&root->fs_info->trans_lock);
        wake_up(&root->fs_info->transaction_wait);
-        mutex_unlock(&root->fs_info->trans_mutex);
        ret = btrfs_write_and_wait_transaction(trans, root);
        BUG_ON(ret);
        write_ctree_super(trans, root, 0);
@@ -1297,22 +1324,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_finish_extent_commit(trans, root);
-        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans->commit_done = 1;
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
+        spin_lock(&root->fs_info->trans_lock);
        list_del_init(&cur_trans->list);
+        spin_unlock(&root->fs_info->trans_lock);
        put_transaction(cur_trans);
        put_transaction(cur_trans);
        trace_btrfs_transaction_commit(root);
-        mutex_unlock(&root->fs_info->trans_mutex);
        btrfs_scrub_continue(root);
        if (current->journal_info == trans)
@@ -1334,9 +1360,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
        LIST_HEAD(list);
        struct btrfs_fs_info *fs_info = root->fs_info;
-        mutex_lock(&fs_info->trans_mutex);
+        spin_lock(&fs_info->trans_lock);
        list_splice_init(&fs_info->dead_roots, &list);
-        mutex_unlock(&fs_info->trans_mutex);
+        spin_unlock(&fs_info->trans_lock);
        while (!list_empty(&list)) {
                root = list_entry(list.next, struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 804c88639e5d..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,10 +28,12 @@ struct btrfs_transaction {
         * transaction can end
         */
        atomic_t num_writers;
+        atomic_t use_count;
        unsigned long num_joined;
+        spinlock_t commit_lock;
        int in_commit;
-        atomic_t use_count;
        int commit_done;
        int blocked;
        struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
        u64 transid;
-        u64 block_group;
        u64 bytes_reserved;
+        unsigned long use_count;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
        unsigned long delayed_ref_updates;
        struct btrfs_transaction *transaction;
        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_block_rsv *orig_rsv;
 };
 struct btrfs_pending_snapshot {
@@ -66,19 +69,6 @@ struct btrfs_pending_snapshot {
        struct list_head list;
 };
-static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
-                                               struct inode *inode)
-{
-        trans->block_group = BTRFS_I(inode)->block_group;
-}
-static inline void btrfs_update_inode_block_group(
-                                          struct btrfs_trans_handle *trans,
-                                          struct inode *inode)
-{
-        BTRFS_I(inode)->block_group = trans->block_group;
-}
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
                                              struct inode *inode)
 {
@@ -92,12 +82,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
                                                   int num_items);
-struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
-                                                  int num_blocks);
+struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
-struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
+struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
-                                                          int num_blocks);
-struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-                                                         int num_blocks);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c48214ef5c09..da541dfca2e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                BUG_ON(!new_device);
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
-                BUG_ON(!new_device->name);
+                BUG_ON(device->name && !new_device->name);
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f3107e4b4d56..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        btrfs_set_trans_block_group(trans, inode);
        ret = do_setxattr(trans, inode, name, value, size, flags);
        if (ret)
                goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc462..49c9aada0374 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                ret = -EAGAIN;
                goto out_unlock;
        }
+        wait_on_page_writeback(page);
        return 0;
 out_unlock:
        unlock_page(page);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b41..21de1d6d5849 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
 * Run idmap cache shrinker.
 */
 static int
-cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
+cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 {
+        int nr_to_scan = sc->nr_to_scan;
        int nr_del = 0;
        int nr_rem = 0;
        struct rb_root *root;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd5735..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
        int len = de->d_name.len;
        int error;
-        dentry_unhash(de);
        error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
        if (!error) {
                /* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
        int new_length = new_dentry->d_name.len;
        int error;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
                             coda_i2f(new_dir), old_length, new_length,
                             (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc5..9a37a9b6de3a 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct module *subsys_owner = NULL, *dead_item_owner = NULL;
        int ret;
-        dentry_unhash(dentry);
        if (dentry->d_parent == configfs_sb->s_root)
                return -EPERM;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c8091024..58609bde3b9f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
 }
 /**
- * contains_ecryptfs_marker - check for the ecryptfs marker
+ * ecryptfs_validate_marker - check for the ecryptfs marker
 * @data: The data block in which to check
 *
- * Returns one if marker found; zero if not found
+ * Returns zero if marker found; -EINVAL if not found
 */
-static int contains_ecryptfs_marker(char *data)
+static int ecryptfs_validate_marker(char *data)
 {
        u32 m_1, m_2;
        m_1 = get_unaligned_be32(data);
        m_2 = get_unaligned_be32(data + 4);
        if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
-                return 1;
+                return 0;
        ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
                        "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
                        MAGIC_ECRYPTFS_MARKER);
        ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
                        "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
-        return 0;
+        return -EINVAL;
 }
 struct ecryptfs_flag_map_elem {
@@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
        return rc;
 }
-int ecryptfs_read_and_validate_header_region(char *data,
+int ecryptfs_read_and_validate_header_region(struct inode *inode)
-                                             struct inode *ecryptfs_inode)
 {
-        struct ecryptfs_crypt_stat *crypt_stat =
+        u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
-                &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
+        u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
        int rc;
-        if (crypt_stat->extent_size == 0)
+        rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
-                crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE;
+                                 inode);
-        rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size,
+        if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
-                                 ecryptfs_inode);
+                return rc >= 0 ? -EINVAL : rc;
-        if (rc < 0) {
+        rc = ecryptfs_validate_marker(marker);
-                printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n",
+        if (!rc)
-                       __func__, rc);
+                ecryptfs_i_size_init(file_size, inode);
-                goto out;
-        }
-        if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
-                rc = -EINVAL;
-        } else
-                rc = 0;
-out:
        return rc;
 }
@@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
        (*written) = 6;
 }
-struct kmem_cache *ecryptfs_header_cache_1;
+struct kmem_cache *ecryptfs_header_cache;
-struct kmem_cache *ecryptfs_header_cache_2;
 /**
 * ecryptfs_write_headers_virt
@@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
        crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
                ecryptfs_dentry->d_sb)->mount_crypt_stat;
        offset = ECRYPTFS_FILE_SIZE_BYTES;
-        rc = contains_ecryptfs_marker(page_virt + offset);
+        rc = ecryptfs_validate_marker(page_virt + offset);
-        if (rc == 0) {
+        if (rc)
-                rc = -EINVAL;
                goto out;
-        }
        if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
                ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
        offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1567,20 +1556,21 @@ out:
        return rc;
 }
+/**
+ * ecryptfs_read_and_validate_xattr_region
+ * @dentry: eCryptfs dentry; its lower dentry is queried for the xattr
+ * @inode: eCryptfs inode whose size is set when the marker is valid
+ *
+ * Reads the first ECRYPTFS_SIZE_AND_MARKER_BYTES bytes of the
+ * ECRYPTFS_XATTR_NAME extended attribute from the lower dentry, validates
+ * the marker that follows the ECRYPTFS_FILE_SIZE_BYTES size field and, when
+ * the marker is valid, hands the buffer to ecryptfs_i_size_init().
+ *
+ * Returns zero on success; the negative error from ecryptfs_getxattr_lower()
+ * if the lookup fails; -EINVAL if fewer bytes than required were returned;
+ * otherwise the result of ecryptfs_validate_marker()
+ */
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
-                                            struct dentry *ecryptfs_dentry)
+                                            struct inode *inode)
 {
+        u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
+        u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
         int rc;
-        rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode);
+        rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
-        if (rc)
+                                     ECRYPTFS_XATTR_NAME, file_size,
-                goto out;
+                                     ECRYPTFS_SIZE_AND_MARKER_BYTES);
-        if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) {
+        /*
+         * ECRYPTFS_SIZE_AND_MARKER_BYTES is a size_t, so a negative rc must
+         * be rejected before the length comparison: compared directly it is
+         * promoted to a large unsigned value and slips past the short-read
+         * check, leaving file_size uninitialized.
+         */
+        if (rc < 0)
+                return rc;
-                printk(KERN_WARNING "Valid data found in [%s] xattr, but "
+        if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
+                return -EINVAL;
-                        "the marker is invalid\n", ECRYPTFS_XATTR_NAME);
+        rc = ecryptfs_validate_marker(marker);
-                rc = -EINVAL;
+        if (!rc)
-        }
+                ecryptfs_i_size_init(file_size, inode);
-out:
         return rc;
 }
@@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
        ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
                                                      mount_crypt_stat);
        /* Read the first page from the underlying file */
-        page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER);
+        page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
        if (!page_virt) {
                rc = -ENOMEM;
                printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 out:
        if (page_virt) {
                memset(page_virt, 0, PAGE_CACHE_SIZE);
-                kmem_cache_free(ecryptfs_header_cache_1, page_virt);
+                kmem_cache_free(ecryptfs_header_cache, page_virt);
        }
        return rc;
 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2c..43c7c43b06f5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
 #define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
 #define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8      /* 4*2 */
 #define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
+#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
+                                        + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
 #define ECRYPTFS_DEFAULT_CIPHER "aes"
 #define ECRYPTFS_DEFAULT_KEY_BYTES 16
 #define ECRYPTFS_DEFAULT_HASH "md5"
@@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
 extern struct kmem_cache *ecryptfs_dentry_info_cache;
 extern struct kmem_cache *ecryptfs_inode_info_cache;
 extern struct kmem_cache *ecryptfs_sb_info_cache;
-extern struct kmem_cache *ecryptfs_header_cache_1;
+extern struct kmem_cache *ecryptfs_header_cache;
-extern struct kmem_cache *ecryptfs_header_cache_2;
 extern struct kmem_cache *ecryptfs_xattr_cache;
 extern struct kmem_cache *ecryptfs_key_record_cache;
 extern struct kmem_cache *ecryptfs_key_sig_cache;
@@ -625,14 +626,9 @@ struct ecryptfs_open_req {
        struct list_head kthread_ctl_list;
 };
-#define ECRYPTFS_INTERPOSE_FLAG_D_ADD                 0x00000001
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
-int ecryptfs_interpose(struct dentry *hidden_dentry,
+                                 struct super_block *sb);
-                       struct dentry *this_dentry, struct super_block *sb,
-                       u32 flags);
 void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
-                                        struct dentry *lower_dentry,
-                                        struct inode *ecryptfs_dir_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
                                         size_t *decrypted_name_size,
                                         struct dentry *ecryptfs_dentry,
@@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
 void ecryptfs_write_crypt_stat_flags(char *page_virt,
                                     struct ecryptfs_crypt_stat *crypt_stat,
                                     size_t *written);
-int ecryptfs_read_and_validate_header_region(char *data,
+int ecryptfs_read_and_validate_header_region(struct inode *inode);
-                                             struct inode *ecryptfs_inode);
+int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
-int ecryptfs_read_and_validate_xattr_region(char *page_virt,
+                                            struct inode *inode);
-                                            struct dentry *ecryptfs_dentry);
 u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
 int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
 void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -679,9 +674,6 @@ int
 ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
                          unsigned char *src, struct dentry *ecryptfs_dentry);
 int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
 ssize_t
 ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
                        void *value, size_t size);
@@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
                             struct dentry *lower_dentry,
                             struct vfsmount *lower_mnt,
                             const struct cred *cred);
-int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry);
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
 void ecryptfs_put_lower_file(struct inode *inode);
 int
 ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78c..4ec9eb00a241 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                                      | ECRYPTFS_ENCRYPTED);
        }
        mutex_unlock(&crypt_stat->cs_mutex);
-        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+        rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to initialize "
                        "the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bc116b9ffcf2..7349ade17de6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir)
        dput(dir);
 }
+/*
+ * Match callback handed to iget5_locked(): reports whether @inode is the
+ * eCryptfs inode whose lower inode is @lower_inode.
+ *
+ * Returns 1 on a match, 0 otherwise
+ */
+static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
+{
+        if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
+                return 1;
+        return 0;
+}
+/*
+ * Initialisation callback handed to iget5_locked(): binds @inode to the
+ * lower inode passed in @opaque, copies the lower inode's attributes, size
+ * and inode number, and selects the inode/file/address-space operations
+ * according to the file type.
+ *
+ * Always returns 0
+ */
+static int ecryptfs_inode_set(struct inode *inode, void *opaque)
+{
+        struct inode *lower_inode = opaque;
+        ecryptfs_set_inode_lower(inode, lower_inode);
+        fsstack_copy_attr_all(inode, lower_inode);
+        /* i_size will be overwritten for encrypted regular files */
+        fsstack_copy_inode_size(inode, lower_inode);
+        inode->i_ino = lower_inode->i_ino;
+        inode->i_version++;
+        inode->i_mapping->a_ops = &ecryptfs_aops;
+        /* Inode operations: symlink, directory, or the default set */
+        if (S_ISLNK(inode->i_mode))
+                inode->i_op = &ecryptfs_symlink_iops;
+        else if (S_ISDIR(inode->i_mode))
+                inode->i_op = &ecryptfs_dir_iops;
+        else
+                inode->i_op = &ecryptfs_main_iops;
+        /* File operations: directory, special file, or the default set */
+        if (S_ISDIR(inode->i_mode))
+                inode->i_fop = &ecryptfs_dir_fops;
+        else if (special_file(inode->i_mode))
+                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+        else
+                inode->i_fop = &ecryptfs_main_fops;
+        return 0;
+}
+/*
+ * Look up, or create via iget5_locked(), the eCryptfs inode in @sb that is
+ * stacked on @lower_inode.
+ *
+ * The returned inode may still have I_NEW set; this function does not call
+ * unlock_new_inode(), so the caller has to.
+ *
+ * Returns the inode on success; ERR_PTR(-EXDEV) if @lower_inode does not
+ * belong to the lower superblock of @sb; ERR_PTR(-ESTALE) if igrab() fails;
+ * ERR_PTR(-EACCES) if iget5_locked() returns NULL
+ */
+static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
+                                          struct super_block *sb)
+{
+        struct inode *inode;
+        if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
+                return ERR_PTR(-EXDEV);
+        if (!igrab(lower_inode))
+                return ERR_PTR(-ESTALE);
+        inode = iget5_locked(sb, (unsigned long)lower_inode,
+                             ecryptfs_inode_test, ecryptfs_inode_set,
+                             lower_inode);
+        if (!inode) {
+                iput(lower_inode);
+                return ERR_PTR(-EACCES);
+        }
+        /*
+         * Only a freshly created inode keeps the reference taken by igrab()
+         * above; for an inode that already existed it is dropped again.
+         */
+        if (!(inode->i_state & I_NEW))
+                iput(lower_inode);
+        return inode;
+}
+/*
+ * Wrapper around __ecryptfs_get_inode() that also calls unlock_new_inode()
+ * when the returned inode has I_NEW set.
+ *
+ * Returns the inode on success or the ERR_PTR() value produced by
+ * __ecryptfs_get_inode()
+ */
+struct inode *ecryptfs_get_inode(struct inode *lower_inode,
+                                 struct super_block *sb)
+{
+        struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
+        if (!IS_ERR(inode) && (inode->i_state & I_NEW))
+                unlock_new_inode(inode);
+        return inode;
+}
+/**
+ * ecryptfs_interpose
+ * @lower_dentry: Existing dentry in the lower filesystem
+ * @dentry: ecryptfs' dentry
+ * @sb: ecryptfs's super_block
+ *
+ * Interposes upper and lower dentries: obtains the eCryptfs inode for
+ * @lower_dentry's inode through ecryptfs_get_inode() and attaches it to
+ * @dentry with d_instantiate().
+ *
+ * Returns zero on success; the error encoded in the ecryptfs_get_inode()
+ * result otherwise
+ */
+static int ecryptfs_interpose(struct dentry *lower_dentry,
+                              struct dentry *dentry, struct super_block *sb)
+{
+        struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+        d_instantiate(dentry, inode);
+        return 0;
+}
 /**
 * ecryptfs_create_underlying_file
 * @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
                goto out_lock;
        }
        rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-                                directory_inode->i_sb, 0);
+                                directory_inode->i_sb);
        if (rc) {
                ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
                goto out_lock;
@@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
                                "context; rc = [%d]\n", rc);
                goto out;
        }
-        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+        rc = ecryptfs_get_lower_file(ecryptfs_dentry,
+                                     ecryptfs_dentry->d_inode);
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to initialize "
                        "the lower file for the dentry with name "
@@ -215,102 +307,90 @@ out:
        return rc;
 }
+/*
+ * Try to initialise @inode's size from eCryptfs metadata: first from the
+ * header region of the lower file and, if that fails, from the xattr region
+ * reached through @dentry. When the xattr region validates,
+ * ECRYPTFS_METADATA_IN_XATTR is set in the inode's crypt_stat flags.
+ *
+ * Returns the error from ecryptfs_get_lower_file() if the lower file cannot
+ * be obtained; zero otherwise, whether or not either region validated
+ */
+static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
+{
+        struct ecryptfs_crypt_stat *crypt_stat;
+        int rc;
+        rc = ecryptfs_get_lower_file(dentry, inode);
+        if (rc) {
+                printk(KERN_ERR "%s: Error attempting to initialize "
+                        "the lower file for the dentry with name "
+                        "[%s]; rc = [%d]\n", __func__,
+                        dentry->d_name.name, rc);
+                return rc;
+        }
+        crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+        /* TODO: lock for crypt_stat comparison */
+        if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+                ecryptfs_set_default_sizes(crypt_stat);
+        rc = ecryptfs_read_and_validate_header_region(inode);
+        /* The lower file is released before the xattr fallback is tried */
+        ecryptfs_put_lower_file(inode);
+        if (rc) {
+                rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
+                if (!rc)
+                        crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
+        }
+        /* Must return 0 to allow non-eCryptfs files to be looked up, too */
+        return 0;
+}
 /**
- * ecryptfs_lookup_and_interpose_lower - Perform a lookup
+ * ecryptfs_lookup_interpose - Dentry interposition for a lookup
+ * @dentry: eCryptfs dentry being looked up
+ * @lower_dentry: corresponding dentry in the lower filesystem
+ * @dir_inode: eCryptfs inode of the directory the lookup happens in
+ *
+ * Allocates the eCryptfs dentry info, records @lower_dentry and the lower
+ * mount in it, and adds @dentry to the dcache: with no inode when
+ * @lower_dentry has none, otherwise with the eCryptfs inode stacked on the
+ * lower inode. For regular files ecryptfs_i_size_read() is called before
+ * the inode is unlocked and added.
+ *
+ * Returns zero on success; -ENOMEM if the dentry info cannot be allocated;
+ * the error from __ecryptfs_get_inode() or ecryptfs_i_size_read() otherwise
  */
-int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
+static int ecryptfs_lookup_interpose(struct dentry *dentry,
-                                        struct dentry *lower_dentry,
+                                     struct dentry *lower_dentry,
-                                        struct inode *ecryptfs_dir_inode)
+                                     struct inode *dir_inode)
 {
-        struct dentry *lower_dir_dentry;
+        struct inode *inode, *lower_inode = lower_dentry->d_inode;
+        struct ecryptfs_dentry_info *dentry_info;
         struct vfsmount *lower_mnt;
-        struct inode *lower_inode;
+        int rc = 0;
-        struct ecryptfs_crypt_stat *crypt_stat;
-        char *page_virt = NULL;
+        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
-        int put_lower = 0, rc = 0;
+        fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
-        lower_dir_dentry = lower_dentry->d_parent;
-        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-                                   ecryptfs_dentry->d_parent));
-        lower_inode = lower_dentry->d_inode;
-        fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
         BUG_ON(!lower_dentry->d_count);
-        ecryptfs_set_dentry_private(ecryptfs_dentry,
-                                    kmem_cache_alloc(ecryptfs_dentry_info_cache,
+        dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
-                                                     GFP_KERNEL));
+        ecryptfs_set_dentry_private(dentry, dentry_info);
-        if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) {
+        if (!dentry_info) {
-                rc = -ENOMEM;
                 printk(KERN_ERR "%s: Out of memory whilst attempting "
                        "to allocate ecryptfs_dentry_info struct\n",
                         __func__);
-                goto out_put;
+                /* Drop the lower dentry and mount references, unhash @dentry */
+                dput(lower_dentry);
+                mntput(lower_mnt);
+                d_drop(dentry);
+                return -ENOMEM;
         }
-        ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry);
+        ecryptfs_set_dentry_lower(dentry, lower_dentry);
-        ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt);
+        ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
         if (!lower_dentry->d_inode) {
                 /* We want to add because we couldn't find in lower */
-                d_add(ecryptfs_dentry, NULL);
+                d_add(dentry, NULL);
-                goto out;
+                return 0;
-        }
-        rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
-                                ecryptfs_dir_inode->i_sb,
-                                ECRYPTFS_INTERPOSE_FLAG_D_ADD);
-        if (rc) {
-                printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
-                       __func__, rc);
-                goto out;
-        }
-        if (S_ISDIR(lower_inode->i_mode))
-                goto out;
-        if (S_ISLNK(lower_inode->i_mode))
-                goto out;
-        if (special_file(lower_inode->i_mode))
-                goto out;
-        /* Released in this function */
-        page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
-        if (!page_virt) {
-                printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
-                       __func__);
-                rc = -ENOMEM;
-                goto out;
         }
-        rc = ecryptfs_get_lower_file(ecryptfs_dentry);
+        inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
-        if (rc) {
+        if (IS_ERR(inode)) {
-                printk(KERN_ERR "%s: Error attempting to initialize "
+                printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
-                        "the lower file for the dentry with name "
+                       __func__, PTR_ERR(inode));
-                        "[%s]; rc = [%d]\n", __func__,
+                return PTR_ERR(inode);
-                        ecryptfs_dentry->d_name.name, rc);
-                goto out_free_kmem;
         }
-        put_lower = 1;
+        /* Only regular files go through ecryptfs_i_size_read() */
+        if (S_ISREG(inode->i_mode)) {
-        crypt_stat = &ecryptfs_inode_to_private(
+                rc = ecryptfs_i_size_read(dentry, inode);
-                                        ecryptfs_dentry->d_inode)->crypt_stat;
-        /* TODO: lock for crypt_stat comparison */
-        if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-                        ecryptfs_set_default_sizes(crypt_stat);
-        rc = ecryptfs_read_and_validate_header_region(page_virt,
-                                                      ecryptfs_dentry->d_inode);
-        if (rc) {
-                memset(page_virt, 0, PAGE_CACHE_SIZE);
-                rc = ecryptfs_read_and_validate_xattr_region(page_virt,
-                                                             ecryptfs_dentry);
                 if (rc) {
-                        rc = 0;
+                        make_bad_inode(inode);
-                        goto out_free_kmem;
+                        return rc;
                 }
-                crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
         }
-        ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
-out_free_kmem:
+        /* The inode is unlocked only after its size has been initialised */
+        if (inode->i_state & I_NEW)
-        kmem_cache_free(ecryptfs_header_cache_2, page_virt);
+                unlock_new_inode(inode);
-        goto out;
+        d_add(dentry, inode);
-out_put:
-        dput(lower_dentry);
-        mntput(lower_mnt);
-        d_drop(ecryptfs_dentry);
-out:
-        if (put_lower)
-                ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
         return rc;
 }
@@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                goto out_d_drop;
        }
        if (lower_dentry->d_inode)
-                goto lookup_and_interpose;
+                goto interpose;
        mount_crypt_stat = &ecryptfs_superblock_to_private(
                                ecryptfs_dentry->d_sb)->mount_crypt_stat;
        if (!(mount_crypt_stat
            && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
-                goto lookup_and_interpose;
+                goto interpose;
        dput(lower_dentry);
        rc = ecryptfs_encrypt_and_encode_filename(
                &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                                encrypted_and_encoded_name);
                goto out_d_drop;
        }
-lookup_and_interpose:
+interpose:
-        rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
+        rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
-                                                 ecryptfs_dir_inode);
+                                       ecryptfs_dir_inode);
        goto out;
 out_d_drop:
        d_drop(ecryptfs_dentry);
@@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
                      lower_new_dentry);
        if (rc || !lower_new_dentry->d_inode)
                goto out_lock;
-        rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
+        rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
        if (rc)
                goto out_lock;
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
        kfree(encoded_symname);
        if (rc || !lower_dentry->d_inode)
                goto out_lock;
-        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out_lock;
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
        if (rc || !lower_dentry->d_inode)
                goto out;
-        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out;
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct dentry *lower_dir_dentry;
        int rc;
-        dentry_unhash(dentry);
        lower_dentry = ecryptfs_dentry_to_lower(dentry);
        dget(dentry);
        lower_dir_dentry = lock_parent(lower_dentry);
@@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
        if (rc || !lower_dentry->d_inode)
                goto out;
-        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0);
+        rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
        if (rc)
                goto out;
        fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct dentry *lower_new_dir_dentry;
        struct dentry *trap = NULL;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
        lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
        dget(lower_old_dentry);
@@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
                lower_ia->ia_valid &= ~ATTR_SIZE;
                return 0;
        }
-        rc = ecryptfs_get_lower_file(dentry);
+        rc = ecryptfs_get_lower_file(dentry, inode);
        if (rc)
                return rc;
        crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
                mount_crypt_stat = &ecryptfs_superblock_to_private(
                        dentry->d_sb)->mount_crypt_stat;
-                rc = ecryptfs_get_lower_file(dentry);
+                rc = ecryptfs_get_lower_file(dentry, inode);
                if (rc) {
                        mutex_unlock(&crypt_stat->cs_mutex);
                        goto out;
@@ -1084,21 +1159,6 @@ out:
        return rc;
 }
-int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
-{
-        if ((ecryptfs_inode_to_lower(inode)
-             == (struct inode *)candidate_lower_inode))
-                return 1;
-        else
-                return 0;
-}
-int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
-{
-        ecryptfs_init_inode(inode, (struct inode *)lower_inode);
-        return 0;
-}
 const struct inode_operations ecryptfs_symlink_iops = {
        .readlink = ecryptfs_readlink,
        .follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8e..9f1bb747d77d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
        return rc;
 }
-int ecryptfs_get_lower_file(struct dentry *dentry)
+int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
 {
-        struct ecryptfs_inode_info *inode_info =
+        struct ecryptfs_inode_info *inode_info;
-                ecryptfs_inode_to_private(dentry->d_inode);
        int count, rc = 0;
+        inode_info = ecryptfs_inode_to_private(inode);
        mutex_lock(&inode_info->lower_file_mutex);
        count = atomic_inc_return(&inode_info->lower_file_count);
        if (WARN_ON_ONCE(count < 1))
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
        }
 }
-static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
-                       struct super_block *sb)
-{
-        struct inode *inode;
-        int rc = 0;
-        if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
-                rc = -EXDEV;
-                goto out;
-        }
-        if (!igrab(lower_inode)) {
-                rc = -ESTALE;
-                goto out;
-        }
-        inode = iget5_locked(sb, (unsigned long)lower_inode,
-                             ecryptfs_inode_test, ecryptfs_inode_set,
-                             lower_inode);
-        if (!inode) {
-                rc = -EACCES;
-                iput(lower_inode);
-                goto out;
-        }
-        if (inode->i_state & I_NEW)
-                unlock_new_inode(inode);
-        else
-                iput(lower_inode);
-        if (S_ISLNK(lower_inode->i_mode))
-                inode->i_op = &ecryptfs_symlink_iops;
-        else if (S_ISDIR(lower_inode->i_mode))
-                inode->i_op = &ecryptfs_dir_iops;
-        if (S_ISDIR(lower_inode->i_mode))
-                inode->i_fop = &ecryptfs_dir_fops;
-        if (special_file(lower_inode->i_mode))
-                init_special_inode(inode, lower_inode->i_mode,
-                                   lower_inode->i_rdev);
-        fsstack_copy_attr_all(inode, lower_inode);
-        /* This size will be overwritten for real files w/ headers and
-         * other metadata */
-        fsstack_copy_inode_size(inode, lower_inode);
-        return inode;
-out:
-        return ERR_PTR(rc);
-}
-/**
- * ecryptfs_interpose
- * @lower_dentry: Existing dentry in the lower filesystem
- * @dentry: ecryptfs' dentry
- * @sb: ecryptfs's super_block
- * @flags: flags to govern behavior of interpose procedure
- *
- * Interposes upper and lower dentries.
- *
- * Returns zero on success; non-zero otherwise
- */
-int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
-                       struct super_block *sb, u32 flags)
-{
-        struct inode *lower_inode = lower_dentry->d_inode;
-        struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
-        if (IS_ERR(inode))
-                return PTR_ERR(inode);
-        if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
-                d_add(dentry, inode);
-        else
-                d_instantiate(dentry, inode);
-        return 0;
-}
 enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
       ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
       ecryptfs_opt_ecryptfs_key_bytes,
@@ -704,13 +635,8 @@ static struct ecryptfs_cache_info {
                .size = sizeof(struct ecryptfs_sb_info),
        },
        {
-                .cache = &ecryptfs_header_cache_1,
+                .cache = &ecryptfs_header_cache,
-                .name = "ecryptfs_headers_1",
+                .name = "ecryptfs_headers",
-                .size = PAGE_CACHE_SIZE,
-        },
-        {
-                .cache = &ecryptfs_header_cache_2,
-                .name = "ecryptfs_headers_2",
                .size = PAGE_CACHE_SIZE,
        },
        {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b6..dbd52d40df4c 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
 }
 /**
- * ecryptfs_init_inode
- * @inode: The ecryptfs inode
- *
- * Set up the ecryptfs inode.
- */
-void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
-{
-        ecryptfs_set_inode_lower(inode, lower_inode);
-        inode->i_ino = lower_inode->i_ino;
-        inode->i_version++;
-        inode->i_op = &ecryptfs_main_iops;
-        inode->i_fop = &ecryptfs_main_fops;
-        inode->i_mapping->a_ops = &ecryptfs_aops;
-}
-/**
 * ecryptfs_statfs
 * @sb: The ecryptfs super block
 * @buf: The struct kstatfs to fill in with stats
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c35..3451d23c3bae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
-void ext3_dirty_inode(struct inode *inode)
+void ext3_dirty_inode(struct inode *inode, int flags)
 {
        handle_t *current_handle = ext3_journal_current_handle();
        handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f90..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int  ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
 extern void ext4_evict_inode(struct inode *);
 extern void ext4_clear_inode(struct inode *);
 extern int  ext4_sync_inode(handle_t *, struct inode *);
-extern void ext4_dirty_inode(struct inode *);
+extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c64584..a5763e3505ba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
-void ext4_dirty_inode(struct inode *inode)
+void ext4_dirty_inode(struct inode *inode, int flags)
 {
        handle_t *handle;
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272e..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
        struct fat_slot_info sinfo;
        int err;
-        dentry_unhash(dentry);
        lock_super(sb);
        /*
         * Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
        old_inode = old_dentry->d_inode;
        new_inode = new_dentry->d_inode;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        err = fat_scan(old_dir, old_name, &old_sinfo);
        if (err) {
                err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36c..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
        struct fat_slot_info sinfo;
        int err;
-        dentry_unhash(dentry);
        lock_super(sb);
        err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        int err, is_dir, update_dotdot, corrupt = 0;
        struct super_block *sb = old_dir->i_sb;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
        old_inode = old_dentry->d_inode;
        new_inode = new_dentry->d_inode;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b5..0f015a0468de 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
- * This function *must* be atomic for the I_DIRTY_PAGES case -
- * set_page_dirty() is called under spinlock in several places.
- *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                if (sb->s_op->dirty_inode)
-                        sb->s_op->dirty_inode(inode);
+                        sb->s_op->dirty_inode(inode, flags);
        }
        /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcfa..d50160714595 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
        if (IS_ERR(req))
                return PTR_ERR(req);
-        dentry_unhash(entry);
        req->in.h.opcode = FUSE_RMDIR;
        req->in.h.nodeid = get_node_id(dir);
        req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
        struct fuse_conn *fc = get_fuse_conn(olddir);
        struct fuse_req *req = fuse_get_req(fc);
-        if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
-                dentry_unhash(newent);
        if (IS_ERR(req))
                return PTR_ERR(req);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c1..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        int res;
-        if (S_ISDIR(inode->i_mode))
-                dentry_unhash(dentry);
        if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
                return -ENOTEMPTY;
        res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        /* Unlink destination if it already exists */
        if (new_dentry->d_inode) {
-                if (S_ISDIR(new_dentry->d_inode->i_mode))
-                        dentry_unhash(new_dentry);
                res = hfs_remove(new_dir, new_dentry);
                if (res)
                        return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd0..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
        struct inode *inode = dentry->d_inode;
        int res;
-        dentry_unhash(dentry);
        if (inode->i_size != 2)
                return -ENOTEMPTY;
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
        /* Unlink destination if it already exists */
        if (new_dentry->d_inode) {
-                if (S_ISDIR(new_dentry->d_inode->i_mode)) {
+                if (S_ISDIR(new_dentry->d_inode->i_mode))
-                        dentry_unhash(new_dentry);
                        res = hfsplus_rmdir(new_dir, new_dentry);
-                } else {
+                else
                        res = hfsplus_unlink(new_dir, new_dentry);
-                }
                if (res)
                        return res;
        }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e6903..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
        char *file;
        int err;
-        dentry_unhash(dentry);
        if ((file = dentry_name(dentry)) == NULL)
                return -ENOMEM;
        err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
        char *from_name, *to_name;
        int err;
-        if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
-                dentry_unhash(to);
        if ((from_name = dentry_name(from)) == NULL)
                return -ENOMEM;
        if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c0867..acf95dab2aac 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
        int err;
        int r;
-        dentry_unhash(dentry);
        hpfs_adjust_length(name, &len);
        hpfs_lock(dir->i_sb);
        err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct fnode *fnode;
        int err;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        if ((err = hpfs_chk_name(new_name, &new_len))) return err;
        err = 0;
        hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a1..0f7e88a7803f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
 /*
- * linux/fs/inode.c
- *
 * (C) 1997 Linus Torvalds
+ * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/dcache.h>
@@ -27,10 +25,11 @@
 #include <linux/prefetch.h>
 #include <linux/ima.h>
 #include <linux/cred.h>
+#include <linux/buffer_head.h> /* for inode_has_buffers */
 #include "internal.h"
 /*
- * inode locking rules.
+ * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
 *   inode_hash_lock
 */
-/*
- * This is needed for the following functions:
- *  - inode_has_buffers
- *  - invalidate_bdev
- *
- * FIXME: remove all knowledge of the buffer layer from this file
- */
-#include <linux/buffer_head.h>
-/*
- * New inode.c implementation.
- *
- * This implementation has the basic premise of trying
- * to be extremely low-overhead and SMP-safe, yet be
- * simple enough to be "obviously correct".
- *
- * Famous last words.
- */
-/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
-/* #define INODE_PARANOIA 1 */
-/* #define INODE_DEBUG 1 */
-/*
- * Inode lookup is no longer as critical as it used to be:
- * most of the lookups are going to be through the dcache.
- */
-#define I_HASHBITS      i_hash_shift
-#define I_HASHMASK      i_hash_mask
 static unsigned int i_hash_mask __read_mostly;
 static unsigned int i_hash_shift __read_mostly;
 static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
-/*
- * Each inode can be on two separate lists. One is
- * the hash list of the inode, used for lookups. The
- * other linked list is the "type" list:
- *  "in_use" - valid inode, i_count > 0, i_nlink > 0
- *  "dirty"  - as "in_use" but also dirty
- *  "unused" - valid inode, i_count = 0
- *
- * A "dirty" list is maintained for each super block,
- * allowing for low-overhead inode sync() operations.
- */
 static LIST_HEAD(inode_lru);
 static DEFINE_SPINLOCK(inode_lru_lock);
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
        tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
                        L1_CACHE_BYTES;
-        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+        tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
-        return tmp & I_HASHMASK;
+        return tmp & i_hash_mask;
 }
 /**
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9a1e86fc1362..4bca6a2e5c07 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -605,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
        int ret;
        uint32_t now = get_seconds();
-        dentry_unhash(dentry);
        for (fd = f->dents ; fd; fd = fd->next) {
                if (fd->ino)
                        return -ENOTEMPTY;
@@ -782,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
        uint8_t type;
        uint32_t now;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        /* The VFS will check for us and prevent trying to rename a
         * file over a directory and vice versa, but if it's a directory,
         * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767eb..46ad619b6124 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
        return ERR_PTR(ret);
 }
-void jffs2_dirty_inode(struct inode *inode)
+void jffs2_dirty_inode(struct inode *inode, int flags)
 {
        struct iattr iattr;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e48..65c6c43ca482 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
 int jffs2_do_setattr (struct inode *, struct iattr *);
 struct inode *jffs2_iget(struct super_block *, unsigned long);
 void jffs2_evict_inode (struct inode *);
-void jffs2_dirty_inode(struct inode *inode);
+void jffs2_dirty_inode(struct inode *inode, int flags);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
                               struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209e..109655904bbc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
        dquot_drop(inode);
 }
-void jfs_dirty_inode(struct inode *inode)
+void jfs_dirty_inode(struct inode *inode, int flags)
 {
        static int noisy = 5;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07d..ec2fb8b945fc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
 extern int jfs_commit_inode(struct inode *, int);
 extern int jfs_write_inode(struct inode *, struct writeback_control *);
 extern void jfs_evict_inode(struct inode *);
-extern void jfs_dirty_inode(struct inode *);
+extern void jfs_dirty_inode(struct inode *, int);
 extern void jfs_truncate(struct inode *);
 extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf3..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
        jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
-        dentry_unhash(dentry);
        /* Init inode for quota operations. */
        dquot_initialize(dip);
        dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
                 new_dentry->d_name.name);
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        dquot_initialize(old_dir);
        dquot_initialize(new_dir);
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e94..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
-        dentry_unhash(dentry);
        if (!logfs_empty_dir(inode))
                return -ENOTEMPTY;
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
        loff_t pos;
        int err;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        /* 1. locate source dd */
        err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
        if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c4..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
        struct inode * inode = dentry->d_inode;
        int err = -ENOTEMPTY;
-        dentry_unhash(dentry);
        if (minix_empty_dir(inode)) {
                err = minix_unlink(dir, dentry);
                if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
        struct minix_dir_entry * old_de;
        int err = -ENOENT;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        old_de = minix_find_entry(old_dentry, &old_page);
        if (!old_de)
                goto out;
diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b221..e2e4e8d032ee 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
 }
 /*
- * Skip to top of mountpoint pile in rcuwalk mode.  We abort the rcu-walk if we
+ * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
- * meet a managed dentry and we're not walking to "..".  True is returned to
+ * we meet a managed dentry that would need blocking.
- * continue, false to abort.
 */
 static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
-                               struct inode **inode, bool reverse_transit)
+                               struct inode **inode)
 {
        for (;;) {
                struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                 * that wants to block transit.
                 */
                *inode = path->dentry->d_inode;
-                if (!reverse_transit &&
+                if (unlikely(managed_dentry_might_block(path->dentry)))
-                     unlikely(managed_dentry_might_block(path->dentry)))
                        return false;
                if (!d_mountpoint(path->dentry))
@@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
                path->dentry = mounted->mnt_root;
                nd->seq = read_seqcount_begin(&path->dentry->d_seq);
        }
-        if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-                return reverse_transit;
        return true;
 }
-static int follow_dotdot_rcu(struct nameidata *nd)
+/*
+ * Step nd->path onto whatever is mounted on top of it, repeatedly, for
+ * as long as the current dentry is a mountpoint and __lookup_mnt() finds
+ * a mount there.  Each step moves nd->path to the root of the found mount
+ * and resamples that dentry's d_seq into nd->seq.
+ *
+ * Unlike __follow_mount_rcu() this does not consult
+ * managed_dentry_might_block() and cannot fail; nd->inode is not touched
+ * here, the caller refreshes it afterwards.
+ */
+static void follow_mount_rcu(struct nameidata *nd)
 {
-        struct inode *inode = nd->inode;
+        while (d_mountpoint(nd->path.dentry)) {
+                struct vfsmount *mounted;
+                mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
+                if (!mounted)
+                        break;
+                nd->path.mnt = mounted;
+                nd->path.dentry = mounted->mnt_root;
+                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+        }
+}
+static int follow_dotdot_rcu(struct nameidata *nd)
+{
        set_root_rcu(nd);
        while (1) {
@@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                        seq = read_seqcount_begin(&parent->d_seq);
                        if (read_seqcount_retry(&old->d_seq, nd->seq))
                                goto failed;
-                        inode = parent->d_inode;
                        nd->path.dentry = parent;
                        nd->seq = seq;
                        break;
@@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
                if (!follow_up_rcu(&nd->path))
                        break;
                nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-                inode = nd->path.dentry->d_inode;
        }
-        __follow_mount_rcu(nd, &nd->path, &inode, true);
+        follow_mount_rcu(nd);
-        nd->inode = inode;
+        nd->inode = nd->path.dentry->d_inode;
        return 0;
 failed:
@@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                }
                path->mnt = mnt;
                path->dentry = dentry;
-                if (likely(__follow_mount_rcu(nd, path, inode, false)))
+                if (unlikely(!__follow_mount_rcu(nd, path, inode)))
-                        return 0;
+                        goto unlazy;
+                if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
+                        goto unlazy;
+                return 0;
 unlazy:
                if (unlazy_walk(nd, dentry))
                        return -ECHILD;
@@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
        if (error)
                goto out;
+        shrink_dcache_parent(dentry);
        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;
@@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
        if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
                goto out;
+        if (target)
+                shrink_dcache_parent(new_dentry);
        error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
        if (error)
                goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b06404..9c51f621e901 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
        DPRINTK("ncp_rmdir: removing %s/%s\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
+        /*
+         * fail with EBUSY if there are still references to this
+         * directory.
+         */
        dentry_unhash(dentry);
        error = -EBUSY;
        if (!d_unhashed(dentry))
                goto out;
@@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
                old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
                new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
+        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
+                /*
+                 * fail with EBUSY if there are still references to this
+                 * directory.
+                 */
                dentry_unhash(new_dentry);
+                error = -EBUSY;
+                if (!d_unhashed(new_dentry))
+                        goto out;
+        }
        ncp_age_dentry(server, old_dentry);
        ncp_age_dentry(server, new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6db..81515545ba75 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
 config PNFS_FILE_LAYOUT
        tristate
+config PNFS_OBJLAYOUT
+        tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
+        depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
+        help
+          Say M here if you want your pNFS client to support the Objects Layout Driver.
+          Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
+          upper level driver (SCSI_OSD_ULD).
+          If unsure, say N.
 config ROOT_NFS
        bool "Root file system on NFS"
        depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e3814..6a34f7dd0e6f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
                           delegation.o idmap.o \
                           callback.o callback_xdr.o callback_proc.o \
                           nfs4namespace.o
-nfs-$(CONFIG_NFS_V4_1)  += pnfs.o
+nfs-$(CONFIG_NFS_V4_1)  += pnfs.o pnfs_dev.o
 nfs-$(CONFIG_SYSCTL) += sysctl.o
 nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
 obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
 nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311b..b257383bb565 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
 extern void nfs4_cb_take_slot(struct nfs_client *clp);
+/* One decoded entry of a CB_NOTIFY_DEVICEID callback. */
+struct cb_devicenotifyitem {
+        /* NOTIFY_DEVICEID4_CHANGE or NOTIFY_DEVICEID4_DELETE */
+        uint32_t                cbd_notify_type;
+        /* layout type; matched against a server's pnfs_curr_ld->id */
+        uint32_t                cbd_layout_type;
+        /* device id handed to nfs4_delete_deviceid() */
+        struct nfs4_deviceid    cbd_dev_id;
+        /* trailing flag word; 0 when the decoder does not read one */
+        uint32_t                cbd_immediate;
+};
+/*
+ * Decoded argument list of a CB_NOTIFY_DEVICEID callback.  The array is
+ * allocated by decode_devicenotify_args() and freed by
+ * nfs4_callback_devicenotify().
+ */
+struct cb_devicenotifyargs {
+        /* number of successfully decoded entries in devs[] */
+        int                              ndevs;
+        /* heap-allocated array of ndevs entries */
+        struct cb_devicenotifyitem       *devs;
+};
+extern __be32 nfs4_callback_devicenotify(
+        struct cb_devicenotifyargs *args,
+        void *dummy, struct cb_process_state *cps);
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18e..d4d1954e9bb9 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
        spin_lock(&ino->i_lock);
        if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
            mark_matching_lsegs_invalid(lo, &free_me_list,
-                                        args->cbl_range.iomode))
+                                        &args->cbl_range))
                rv = NFS4ERR_DELAY;
        else
                rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
                ino = lo->plh_inode;
                spin_lock(&ino->i_lock);
                set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-                if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode))
+                if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
                        rv = NFS4ERR_DELAY;
                list_del_init(&lo->plh_bulk_recall);
                spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
        do_callback_layoutrecall(clp, &args);
 }
+/*
+ * Process a CB_NOTIFY_DEVICEID callback.
+ *
+ * For every decoded notification, find a server of this client whose
+ * layout driver id equals the item's layout type and call
+ * nfs4_delete_deviceid() on the notified device id.  Items whose layout
+ * type has no matching server are skipped.
+ *
+ * @args:  decoded notifications; args->devs is freed here on every path
+ * @dummy: unused
+ * @cps:   callback processing state; cps->clp is the owning client
+ *
+ * Returns 0, or NFS4ERR_OP_NOT_IN_SESSION (network byte order) when no
+ * client is attached to the callback state.
+ */
+__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
+                                  void *dummy, struct cb_process_state *cps)
+{
+        int i;
+        __be32 res = 0;
+        struct nfs_client *clp = cps->clp;
+        struct nfs_server *server = NULL;
+        dprintk("%s: -->\n", __func__);
+        if (!clp) {
+                res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
+                goto out;
+        }
+        for (i = 0; i < args->ndevs; i++) {
+                struct cb_devicenotifyitem *dev = &args->devs[i];
+                /* Reuse the previous match while the layout type repeats. */
+                if (!server ||
+                    server->pnfs_curr_ld->id != dev->cbd_layout_type) {
+                        struct nfs_server *pos;
+                        /*
+                         * list_for_each_entry_rcu() leaves its cursor on the
+                         * list head's container (not NULL) when nothing
+                         * matches, so search with a scratch cursor and only
+                         * publish a real match in 'server'.
+                         */
+                        server = NULL;
+                        rcu_read_lock();
+                        list_for_each_entry_rcu(pos, &clp->cl_superblocks, client_link) {
+                                if (pos->pnfs_curr_ld &&
+                                    pos->pnfs_curr_ld->id == dev->cbd_layout_type) {
+                                        server = pos;
+                                        break;
+                                }
+                        }
+                        rcu_read_unlock();
+                        if (!server) {
+                                dprintk("%s: layout type %u not found\n",
+                                        __func__, dev->cbd_layout_type);
+                                continue;
+                        }
+                }
+                if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
+                        dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
+                                "deleting instead\n", __func__);
+                nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+        }
+out:
+        kfree(args->devs);
+        /* Do not leave a dangling pointer behind in the argument buffer. */
+        args->devs = NULL;
+        dprintk("%s: exit with status = %u\n",
+                __func__, be32_to_cpu(res));
+        return res;
+}
 int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
 {
        if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c1..c6c86a77e043 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
 #if defined(CONFIG_NFS_V4_1)
 #define CB_OP_LAYOUTRECALL_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_DEVICENOTIFY_RES_MAXSZ    (CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_SEQUENCE_RES_MAXSZ        (CB_OP_HDR_RES_MAXSZ + \
                                        4 + 1 + 3)
 #define CB_OP_RECALLANY_RES_MAXSZ       (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
        return status;
 }
+/*
+ * Decode the arguments of a CB_NOTIFY_DEVICEID callback.
+ *
+ * @rqstp: the callback request (not used by the decoder)
+ * @xdr:   stream positioned at the start of the operation's arguments
+ * @args:  receives the heap-allocated array of decoded notifications
+ *
+ * Returns 0 on success, or an NFS4ERR_* status in network byte order:
+ * BADXDR if the stream is short, INVAL if an item is malformed, DELAY if
+ * the array cannot be allocated.  On success args->devs holds args->ndevs
+ * entries and is freed by nfs4_callback_devicenotify(); on every other
+ * path (including an empty list) args->devs is NULL and args->ndevs is 0.
+ */
+static
+__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
+                                struct xdr_stream *xdr,
+                                struct cb_devicenotifyargs *args)
+{
+        __be32 *p;
+        __be32 status = 0;
+        u32 tmp;
+        int n, i;
+        /* Never leave devs uninitialized: the process op kfree()s it. */
+        args->devs = NULL;
+        args->ndevs = 0;
+        /* Num of device notifications */
+        p = read_buf(xdr, sizeof(uint32_t));
+        if (unlikely(p == NULL)) {
+                status = htonl(NFS4ERR_BADXDR);
+                goto out;
+        }
+        n = ntohl(*p++);
+        if (n <= 0)
+                goto out;
+        /* n comes off the wire; kcalloc() rejects an overflowing n * size. */
+        args->devs = kcalloc(n, sizeof(*args->devs), GFP_KERNEL);
+        if (!args->devs) {
+                status = htonl(NFS4ERR_DELAY);
+                goto out;
+        }
+        /* Decode each dev notification */
+        for (i = 0; i < n; i++) {
+                struct cb_devicenotifyitem *dev = &args->devs[i];
+                /* bitmap size, notify type, opaque size, layout type, devid */
+                p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
+                if (unlikely(p == NULL)) {
+                        status = htonl(NFS4ERR_BADXDR);
+                        goto err;
+                }
+                tmp = ntohl(*p++);      /* bitmap size */
+                if (tmp != 1) {
+                        status = htonl(NFS4ERR_INVAL);
+                        goto err;
+                }
+                dev->cbd_notify_type = ntohl(*p++);
+                if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
+                    dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
+                        status = htonl(NFS4ERR_INVAL);
+                        goto err;
+                }
+                tmp = ntohl(*p++);      /* opaque size */
+                if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
+                     (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
+                    ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
+                     (tmp != NFS4_DEVICEID4_SIZE + 4))) {
+                        status = htonl(NFS4ERR_INVAL);
+                        goto err;
+                }
+                dev->cbd_layout_type = ntohl(*p++);
+                memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
+                p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
+                /*
+                 * Only a CHANGE notification carries the trailing
+                 * "immediate" word (its opaque is 4 bytes longer, see the
+                 * size check above), so key on the notify type, not on
+                 * the layout type.
+                 */
+                if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) {
+                        p = read_buf(xdr, sizeof(uint32_t));
+                        if (unlikely(p == NULL)) {
+                                status = htonl(NFS4ERR_BADXDR);
+                                goto err;
+                        }
+                        dev->cbd_immediate = ntohl(*p++);
+                } else {
+                        dev->cbd_immediate = 0;
+                }
+                args->ndevs++;
+                dprintk("%s: type %d layout 0x%x immediate %d\n",
+                        __func__, dev->cbd_notify_type, dev->cbd_layout_type,
+                        dev->cbd_immediate);
+        }
+out:
+        dprintk("%s: status %d ndevs %d\n",
+                __func__, ntohl(status), args->ndevs);
+        return status;
+err:
+        /* Hand back a clean, empty result so nothing is freed twice. */
+        kfree(args->devs);
+        args->devs = NULL;
+        args->ndevs = 0;
+        goto out;
+}
 static __be32 decode_sessionid(struct xdr_stream *xdr,
                                 struct nfs4_sessionid *sid)
 {
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        case OP_CB_RECALL_ANY:
        case OP_CB_RECALL_SLOT:
        case OP_CB_LAYOUTRECALL:
+        case OP_CB_NOTIFY_DEVICEID:
                *op = &callback_ops[op_nr];
                break;
-        case OP_CB_NOTIFY_DEVICEID:
        case OP_CB_NOTIFY:
        case OP_CB_PUSH_DELEG:
        case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
                        (callback_decode_arg_t)decode_layoutrecall_args,
                .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
        },
+        [OP_CB_NOTIFY_DEVICEID] = {
+                .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
+                .decode_args =
+                        (callback_decode_arg_t)decode_devicenotify_args,
+                .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
+        },
        [OP_CB_SEQUENCE] = {
                .process_op = (callback_process_op_t)nfs4_callback_sequence,
                .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d80..b3dc2b88b65b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
        if (clp->cl_machine_cred != NULL)
                put_rpccred(clp->cl_machine_cred);
+        nfs4_deviceid_purge_client(clp);
        kfree(clp->cl_hostname);
        kfree(clp);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2e..dd25c2aec375 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
 #include "delegation.h"
 #include "internal.h"
-static void nfs_do_free_delegation(struct nfs_delegation *delegation)
-{
-        kfree(delegation);
-}
-static void nfs_free_delegation_callback(struct rcu_head *head)
-{
-        struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
-        nfs_do_free_delegation(delegation);
-}
+/*
+ * Release a delegation: drop its credential reference (if any), then
+ * free the structure through its embedded rcu head via kfree_rcu().
+ */
 static void nfs_free_delegation(struct nfs_delegation *delegation)
 {
         if (delegation->cred) {
                 put_rpccred(delegation->cred);
                 delegation->cred = NULL;
         }
-        call_rcu(&delegation->rcu, nfs_free_delegation_callback);
+        kfree_rcu(delegation, rcu);
 }
 /**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a84..ededdbd0db38 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
                                struct page **xdr_pages, struct page *page, unsigned int buflen)
 {
        struct xdr_stream stream;
-        struct xdr_buf buf = {
+        struct xdr_buf buf;
-                .pages = xdr_pages,
-                .page_len = buflen,
-                .buflen = buflen,
-                .len = buflen,
-        };
        struct page *scratch;
        struct nfs_cache_array *array;
        unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
        if (scratch == NULL)
                return -ENOMEM;
-        xdr_init_decode(&stream, &buf, NULL);
+        xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
        do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5e..144f2a3c7185 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                                i_size_write(inode, new_isize);
                                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                        }
-                        dprintk("NFS: isize change on server for file %s/%ld\n",
+                        dprintk("NFS: isize change on server for file %s/%ld "
-                                        inode->i_sb->s_id, inode->i_ino);
+                                        "(%Ld to %Ld)\n",
+                                        inode->i_sb->s_id,
+                                        inode->i_ino,
+                                        (long long)cur_isize,
+                                        (long long)new_isize);
                }
        } else
                invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 */
 void nfs4_evict_inode(struct inode *inode)
 {
-        pnfs_destroy_layout(NFS_I(inode));
        truncate_inode_pages(&inode->i_data, 0);
        end_writeback(inode);
+        pnfs_return_layout(inode);
+        pnfs_destroy_layout(NFS_I(inode));
        /* If we are holding a delegation, return it! */
        nfs_inode_return_delegation_noreclaim(inode);
        /* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b5898..b9056cbe68d6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
 #endif
 /* nfs4proc.c */
+extern void __nfs4_read_done_cb(struct nfs_read_data *);
 extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
 extern int nfs4_init_client(struct nfs_client *clp,
                            const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386d..426908809c97 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
                        struct nfs4_deviceid *id,
                        gfp_t gfp_flags)
 {
+        struct nfs4_deviceid_node *d;
        struct nfs4_file_layout_dsaddr *dsaddr;
        int status = -EINVAL;
        struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        dprintk("--> %s\n", __func__);
        if (fl->pattern_offset > lgr->range.offset) {
-                dprintk("%s pattern_offset %lld to large\n",
+                dprintk("%s pattern_offset %lld too large\n",
                                __func__, fl->pattern_offset);
                goto out;
        }
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
        }
        /* find and reference the deviceid */
-        dsaddr = nfs4_fl_find_get_deviceid(id);
+        d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
-        if (dsaddr == NULL) {
+                                   NFS_SERVER(lo->plh_inode)->nfs_client, id);
+        if (d == NULL) {
                dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
                if (dsaddr == NULL)
                        goto out;
-        }
+        } else
+                dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
        fl->dsaddr = dsaddr;
        if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
                         gfp_t gfp_flags)
 {
        struct xdr_stream stream;
-        struct xdr_buf buf = {
+        struct xdr_buf buf;
-                .pages =  lgr->layoutp->pages,
-                .page_len =  lgr->layoutp->len,
-                .buflen =  lgr->layoutp->len,
-                .len = lgr->layoutp->len,
-        };
        struct page *scratch;
        __be32 *p;
        uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
        if (!scratch)
                return -ENOMEM;
-        xdr_init_decode(&stream, &buf, NULL);
+        xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
        /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
        memcpy(id, p, sizeof(*id));
        p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
-        print_deviceid(id);
+        nfs4_print_deviceid(id);
        nfl_util = be32_to_cpup(p++);
        if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
 /*
 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
 *
- * return 1 :  coalesce page
+ * return true  : coalesce page
- * return 0 :  don't coalesce page
+ * return false : don't coalesce page
 */
-int
+bool
 filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
                   struct nfs_page *req)
 {
        u64 p_stripe, r_stripe;
        u32 stripe_unit;
+        if (!pnfs_generic_pg_test(pgio, prev, req))
+                return 0;
        if (!pgio->pg_lseg)
                return 1;
        p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
        return -ENOMEM;
 }
+/*
+ * Adapter with the generic device-id-node signature: map @d back to the
+ * struct nfs4_file_layout_dsaddr that embeds it as its id_node member and
+ * pass that dsaddr to nfs4_fl_free_deviceid().
+ *
+ * NOTE(review): "deveiceid" in the name looks like a typo for "deviceid";
+ * the filelayout_type initializer refers to this exact spelling.
+ */
+static void
+filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+        nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
+}
 static struct pnfs_layoutdriver_type filelayout_type = {
        .id                     = LAYOUT_NFSV4_1_FILES,
        .name                   = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
        .commit_pagelist        = filelayout_commit_pagelist,
        .read_pagelist          = filelayout_read_pagelist,
        .write_pagelist         = filelayout_write_pagelist,
+        .free_deviceid_node     = filelayout_free_deveiceid_node,
 };
 static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43a..cebe01e3795e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
 #define NFS4_DEVICE_ID_NEG_ENTRY        0x00000001
 struct nfs4_file_layout_dsaddr {
-        struct hlist_node               node;
+        struct nfs4_deviceid_node       id_node;
-        struct nfs4_deviceid            deviceid;
-        atomic_t                        ref;
        unsigned long                   flags;
        u32                             stripe_count;
        u8                              *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
 nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
 extern void print_ds(struct nfs4_pnfs_ds *ds);
-extern void print_deviceid(struct nfs4_deviceid *dev_id);
 u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
 u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
 struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
                                        u32 ds_idx);
-extern struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
 extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
+extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
 struct nfs4_file_layout_dsaddr *
 get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af1395..3b7bf1377264 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
 /*
- * Device ID RCU cache. A device ID is unique per client ID and layout type.
- */
-#define NFS4_FL_DEVICE_ID_HASH_BITS     5
-#define NFS4_FL_DEVICE_ID_HASH_SIZE     (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
-#define NFS4_FL_DEVICE_ID_HASH_MASK     (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
-static inline u32
-nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
-{
-        unsigned char *cptr = (unsigned char *)id->data;
-        unsigned int nbytes = NFS4_DEVICEID4_SIZE;
-        u32 x = 0;
-        while (nbytes--) {
-                x *= 37;
-                x += *cptr++;
-        }
-        return x & NFS4_FL_DEVICE_ID_HASH_MASK;
-}
-static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
-static DEFINE_SPINLOCK(filelayout_deviceid_lock);
-/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
                ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
 }
-void
-print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
-{
-        int i;
-        ifdebug(FACILITY) {
-                printk("%s dsaddr->ds_num %d\n", __func__,
-                       dsaddr->ds_num);
-                for (i = 0; i < dsaddr->ds_num; i++)
-                        print_ds(dsaddr->ds_list[i]);
-        }
-}
-void print_deviceid(struct nfs4_deviceid *id)
-{
-        u32 *p = (u32 *)id;
-        dprintk("%s: device id= [%x%x%x%x]\n", __func__,
-                p[0], p[1], p[2], p[3]);
-}
 /* nfs4_ds_cache_lock is held */
 static struct nfs4_pnfs_ds *
 _data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
        kfree(ds);
 }
-static void
+void
 nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
        struct nfs4_pnfs_ds *ds;
        int i;
-        print_deviceid(&dsaddr->deviceid);
+        nfs4_print_deviceid(&dsaddr->id_node.deviceid);
        for (i = 0; i < dsaddr->ds_num; i++) {
                ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        u8 max_stripe_index;
        struct nfs4_file_layout_dsaddr *dsaddr = NULL;
        struct xdr_stream stream;
-        struct xdr_buf buf = {
+        struct xdr_buf buf;
-                .pages = pdev->pages,
-                .page_len = pdev->pglen,
-                .buflen = pdev->pglen,
-                .len = pdev->pglen,
-        };
        struct page *scratch;
        /* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        if (!scratch)
                goto out_err;
-        xdr_init_decode(&stream, &buf, NULL);
+        xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
        /* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
        dsaddr->stripe_indices = stripe_indices;
        stripe_indices = NULL;
        dsaddr->ds_num = num;
+        nfs4_init_deviceid_node(&dsaddr->id_node,
-        memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
+                                NFS_SERVER(ino)->pnfs_curr_ld,
+                                NFS_SERVER(ino)->nfs_client,
+                                &pdev->dev_id);
        for (i = 0; i < dsaddr->ds_num; i++) {
                int j;
@@ -505,8 +457,8 @@ out_err:
 static struct nfs4_file_layout_dsaddr *
 decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
 {
-        struct nfs4_file_layout_dsaddr *d, *new;
+        struct nfs4_deviceid_node *d;
-        long hash;
+        struct nfs4_file_layout_dsaddr *n, *new;
        new = decode_device(inode, dev, gfp_flags);
        if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
                return NULL;
        }
-        spin_lock(&filelayout_deviceid_lock);
+        d = nfs4_insert_deviceid_node(&new->id_node);
-        d = nfs4_fl_find_get_deviceid(&new->deviceid);
+        n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
-        if (d) {
+        if (n != new) {
-                spin_unlock(&filelayout_deviceid_lock);
                nfs4_fl_free_deviceid(new);
-                return d;
+                return n;
        }
-        INIT_HLIST_NODE(&new->node);
-        atomic_set(&new->ref, 1);
-        hash = nfs4_fl_deviceid_hash(&new->deviceid);
-        hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
-        spin_unlock(&filelayout_deviceid_lock);
        return new;
 }
@@ -600,35 +545,7 @@ out_free:
 void
 nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
 {
-        if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
+        nfs4_put_deviceid_node(&dsaddr->id_node);
-                hlist_del_rcu(&dsaddr->node);
-                spin_unlock(&filelayout_deviceid_lock);
-                synchronize_rcu();
-                nfs4_fl_free_deviceid(dsaddr);
-        }
-}
-struct nfs4_file_layout_dsaddr *
-nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
-{
-        struct nfs4_file_layout_dsaddr *d;
-        struct hlist_node *n;
-        long hash = nfs4_fl_deviceid_hash(id);
-        rcu_read_lock();
-        hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
-                if (!memcmp(&d->deviceid, id, sizeof(*id))) {
-                        if (!atomic_inc_not_zero(&d->ref))
-                                goto fail;
-                        rcu_read_unlock();
-                        return d;
-                }
-        }
-fail:
-        rcu_read_unlock();
-        return NULL;
 }
 /*
@@ -676,15 +593,15 @@ static void
 filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
                               int err, u32 ds_addr)
 {
-        u32 *p = (u32 *)&dsaddr->deviceid;
+        u32 *p = (u32 *)&dsaddr->id_node.deviceid;
        printk(KERN_ERR "NFS: data server %x connection error %d."
                " Deviceid [%x%x%x%x] marked out of use.\n",
                ds_addr, err, p[0], p[1], p[2], p[3]);
-        spin_lock(&filelayout_deviceid_lock);
+        spin_lock(&nfs4_ds_cache_lock);
        dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
-        spin_unlock(&filelayout_deviceid_lock);
+        spin_unlock(&nfs4_ds_cache_lock);
 }
 struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c3937..d2c4b59c896d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
                                break;
                        nfs4_schedule_stateid_recovery(server, state);
                        goto wait_on_recovery;
+                case -NFS4ERR_EXPIRED:
+                        if (state != NULL)
+                                nfs4_schedule_stateid_recovery(server, state);
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
-                case -NFS4ERR_EXPIRED:
                        nfs4_schedule_lease_recovery(clp);
                        goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
@@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
        struct nfs4_state *state = NULL;
        int status;
+        if (pnfs_ld_layoutret_on_setattr(inode))
+                pnfs_return_layout(inode);
        nfs_fattr_init(fattr);
        
        /* Search for an existing open(O_WRITE) file */
@@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
        return err;
 }
+/*
+ * Non-static read-completion helper: calls nfs_invalidate_atime() on
+ * data->inode and does nothing else.  It is split out of
+ * nfs4_read_done_cb() so the same step can be invoked without the
+ * error handling and lease renewal that nfs4_read_done_cb() performs.
+ */
+void __nfs4_read_done_cb(struct nfs_read_data *data)
+{
+        nfs_invalidate_atime(data->inode);
+}
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
 {
        struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
                return -EAGAIN;
        }
-        nfs_invalidate_atime(data->inode);
+        __nfs4_read_done_cb(data);
        if (task->tk_status > 0)
                renew_lease(server, data->timestamp);
        return 0;
@@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
        if (!nfs4_sequence_done(task, &data->res.seq_res))
                return -EAGAIN;
-        return data->read_done_cb(task, data);
+        return data->read_done_cb ? data->read_done_cb(task, data) :
+                                    nfs4_read_done_cb(task, data);
 }
 static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
 {
        if (!nfs4_sequence_done(task, &data->res.seq_res))
                return -EAGAIN;
-        return data->write_done_cb(task, data);
+        return data->write_done_cb ? data->write_done_cb(task, data) :
+                nfs4_write_done_cb(task, data);
 }
 /* Reset the the nfs_write_data to send the write to the MDS. */
@@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
                                break;
                        nfs4_schedule_stateid_recovery(server, state);
                        goto wait_on_recovery;
+                case -NFS4ERR_EXPIRED:
+                        if (state != NULL)
+                                nfs4_schedule_stateid_recovery(server, state);
                case -NFS4ERR_STALE_STATEID:
                case -NFS4ERR_STALE_CLIENTID:
-                case -NFS4ERR_EXPIRED:
                        nfs4_schedule_lease_recovery(clp);
                        goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
                        case -ESTALE:
                                goto out;
                        case -NFS4ERR_EXPIRED:
+                                nfs4_schedule_stateid_recovery(server, state);
                        case -NFS4ERR_STALE_CLIENTID:
                        case -NFS4ERR_STALE_STATEID:
                                nfs4_schedule_lease_recovery(server->nfs_client);
@@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
        return status;
 }
+/*
+ * rpc_call_prepare callback for LAYOUTRETURN.
+ *
+ * @task:     the RPC task being prepared
+ * @calldata: the struct nfs4_layoutreturn describing this call
+ */
+static void
+nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutreturn *lrp = calldata;
+        dprintk("--> %s\n", __func__);
+        /*
+         * Set up the SEQUENCE arguments on the client's session.  A non-zero
+         * return means the call must not be started from here, so return
+         * without calling rpc_call_start().
+         */
+        if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
+                                &lrp->res.seq_res, 0, task))
+                return;
+        rpc_call_start(task);
+}
+/*
+ * rpc_call_done callback for LAYOUTRETURN.
+ *
+ * @task:     the completed RPC task
+ * @calldata: the struct nfs4_layoutreturn describing this call
+ *
+ * Note that the trailing "<--" dprintk is only reached when neither of the
+ * two early returns below is taken.
+ */
+static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
+{
+        struct nfs4_layoutreturn *lrp = calldata;
+        struct nfs_server *server;
+        dprintk("--> %s\n", __func__);
+        /* Nothing more to do here if nfs4_sequence_done() returns 0. */
+        if (!nfs4_sequence_done(task, &lrp->res.seq_res))
+                return;
+        server = NFS_SERVER(lrp->args.inode);
+        /* No nfs4_state is passed (NULL); on -EAGAIN the RPC is restarted. */
+        if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+                nfs_restart_rpc(task, lrp->clp);
+                return;
+        }
+        if (task->tk_status == 0) {
+                struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
+                /*
+                 * If the reply carried a stateid, install it in the layout
+                 * header while holding the inode's i_lock.  Otherwise the
+                 * layout's segment list is required to be empty (BUG_ON).
+                 */
+                if (lrp->res.lrs_present) {
+                        spin_lock(&lo->plh_inode->i_lock);
+                        pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+                        spin_unlock(&lo->plh_inode->i_lock);
+                } else
+                        BUG_ON(!list_empty(&lo->plh_segs));
+        }
+        dprintk("<-- %s\n", __func__);
+}
+/*
+ * rpc_release callback for LAYOUTRETURN.
+ *
+ * @calldata: the struct nfs4_layoutreturn describing this call; it is
+ *            freed here with kfree() and must not be used afterwards.
+ */
+static void nfs4_layoutreturn_release(void *calldata)
+{
+        struct nfs4_layoutreturn *lrp = calldata;
+        dprintk("--> %s\n", __func__);
+        /*
+         * NOTE(review): presumably balances a layout-header reference taken
+         * by whoever issued the call -- confirm against the caller.
+         */
+        put_layout_hdr(NFS_I(lrp->args.inode)->layout);
+        kfree(calldata);
+        dprintk("<-- %s\n", __func__);
+}
+/*
+ * RPC callback table for LAYOUTRETURN; passed as callback_ops by
+ * nfs4_proc_layoutreturn().
+ */
+static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
+        .rpc_call_prepare = nfs4_layoutreturn_prepare,
+        .rpc_call_done = nfs4_layoutreturn_done,
+        .rpc_release = nfs4_layoutreturn_release,
+};
+/*
+ * Issue a LAYOUTRETURN RPC.
+ *
+ * @lrp: call state; supplies the client (lrp->clp), the XDR arguments
+ *       (lrp->args) and the result buffer (lrp->res), and is passed to the
+ *       nfs4_layoutreturn_call_ops callbacks as their calldata.
+ *
+ * Returns PTR_ERR() of the task if rpc_run_task() fails, otherwise the
+ * value of task->tk_status read after rpc_run_task() returns.
+ *
+ * Ownership: the rpc_release callback kfree()s @lrp, so the caller must not
+ * free it after a successful rpc_run_task().
+ * NOTE(review): whether the release callback also runs when rpc_run_task()
+ * returns an error is not visible here -- confirm before touching error paths.
+ */
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+{
+        struct rpc_task *task;
+        struct rpc_message msg = {
+                .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
+                .rpc_argp = &lrp->args,
+                .rpc_resp = &lrp->res,
+        };
+        struct rpc_task_setup task_setup_data = {
+                .rpc_client = lrp->clp->cl_rpcclient,
+                .rpc_message = &msg,
+                .callback_ops = &nfs4_layoutreturn_call_ops,
+                .callback_data = lrp,
+        };
+        int status;
+        dprintk("--> %s\n", __func__);
+        task = rpc_run_task(&task_setup_data);
+        if (IS_ERR(task))
+                return PTR_ERR(task);
+        /* Sample the status before dropping our reference to the task. */
+        status = task->tk_status;
+        dprintk("<-- %s status=%d\n", __func__, status);
+        rpc_put_task(task);
+        return status;
+}
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
 {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1f..e97dd219f84f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 #ifdef CONFIG_NFS_V4_1
 void nfs4_schedule_session_recovery(struct nfs4_session *session)
 {
-        nfs4_schedule_lease_recovery(session->clp);
+        struct nfs_client *clp = session->clp;
+        set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+        nfs4_schedule_lease_recovery(clp);
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
                status = nfs4_recovery_handle_error(clp, status);
                goto out;
        }
+        clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
        /* create_session negotiated new slot table */
        clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c46834..d869a5e5464b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
                                1 /* layoutupdate4 layout type */ + \
                                1 /* NULL filelayout layoutupdate4 payload */)
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
+#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
+                                encode_stateid_maxsz + \
+                                1 /* FIXME: opaque lrf_body always empty at the moment */)
+#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
+                                1 + decode_stateid_maxsz)
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz   0
 #define decode_sequence_maxsz   0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
                                decode_putfh_maxsz + \
                                decode_layoutcommit_maxsz + \
                                decode_getattr_maxsz)
+#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
+                                encode_sequence_maxsz + \
+                                encode_putfh_maxsz + \
+                                encode_layoutreturn_maxsz)
+#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
+                                decode_sequence_maxsz + \
+                                decode_putfh_maxsz + \
+                                decode_layoutreturn_maxsz)
 const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
                                      compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
+                    struct inode *inode,
                    const struct nfs4_layoutcommit_args *args,
                    struct compound_hdr *hdr)
 {
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
        dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
                NFS_SERVER(args->inode)->pnfs_curr_ld->id);
-        p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
+        p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
        *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
        /* Only whole file layouts */
        p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
        p = xdr_encode_hyper(p, args->lastbytewritten);
        *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
        *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
-        *p++ = cpu_to_be32(0); /* no file layout payload */
+        if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+                NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
+                        NFS_I(inode)->layout, xdr, args);
+        else {
+                p = reserve_space(xdr, 4);
+                *p = cpu_to_be32(0); /* no layout-type payload */
+        }
        hdr->nops++;
        hdr->replen += decode_layoutcommit_maxsz;
        return 0;
 }
+/*
+ * Append a LAYOUTRETURN operation to the compound being built in @xdr.
+ *
+ * @xdr:  stream to encode into
+ * @args: supplies the layout type, the inode and the layout stateid
+ * @hdr:  compound header; its operation count and expected reply length
+ *        are updated here
+ *
+ * As written, the request always uses reclaim 0, IOMODE_ANY and
+ * RETURN_FILE with offset 0 and length NFS4_MAX_UINT64.
+ */
+static void
+encode_layoutreturn(struct xdr_stream *xdr,
+                    const struct nfs4_layoutreturn_args *args,
+                    struct compound_hdr *hdr)
+{
+        __be32 *p;
+        /* 20 bytes = five 4-byte words: opcode, reclaim, type, iomode, return type */
+        p = reserve_space(xdr, 20);
+        *p++ = cpu_to_be32(OP_LAYOUTRETURN);
+        *p++ = cpu_to_be32(0);          /* reclaim. always 0 for now */
+        *p++ = cpu_to_be32(args->layout_type);
+        *p++ = cpu_to_be32(IOMODE_ANY);
+        *p = cpu_to_be32(RETURN_FILE);
+        /* 16 bytes = two 8-byte hypers (offset, length), then the stateid */
+        p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
+        p = xdr_encode_hyper(p, 0);
+        p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+        /* The stateid bytes are copied while holding the inode's i_lock. */
+        spin_lock(&args->inode->i_lock);
+        xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
+        spin_unlock(&args->inode->i_lock);
+        /*
+         * Let the layout driver encode its own payload if it provides an
+         * ->encode_layoutreturn hook; otherwise emit a single zero word.
+         */
+        if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
+                NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
+                        NFS_I(args->inode)->layout, xdr, args);
+        } else {
+                p = reserve_space(xdr, 4);
+                *p = cpu_to_be32(0);
+        }
+        hdr->nops++;
+        hdr->replen += decode_layoutreturn_maxsz;
+}
 #endif /* CONFIG_NFS_V4_1 */
 /*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
 /*
 *  Encode LAYOUTCOMMIT request
 */
-static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
+static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
-                                     struct xdr_stream *xdr,
+                                      struct xdr_stream *xdr,
-                                     struct nfs4_layoutcommit_args *args)
+                                      struct nfs4_layoutcommit_args *args)
 {
+        struct nfs4_layoutcommit_data *data =
+                container_of(args, struct nfs4_layoutcommit_data, args);
        struct compound_hdr hdr = {
                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
        };
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
        encode_compound_hdr(xdr, req, &hdr);
        encode_sequence(xdr, &args->seq_args, &hdr);
        encode_putfh(xdr, NFS_FH(args->inode), &hdr);
-        encode_layoutcommit(xdr, args, &hdr);
+        encode_layoutcommit(xdr, data->args.inode, args, &hdr);
        encode_getfattr(xdr, args->bitmask, &hdr);
        encode_nops(&hdr);
-        return 0;
+}
+/*
+ * Encode LAYOUTRETURN request
+ *
+ * Builds the compound in this order: compound header, SEQUENCE, PUTFH of
+ * the file handle of args->inode, LAYOUTRETURN.  encode_nops() is called
+ * last, after all operations have been added to @hdr.
+ *
+ * @req:  RPC request being encoded
+ * @xdr:  stream to encode into
+ * @args: LAYOUTRETURN arguments (sequence args, inode, stateid, ...)
+ */
+static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
+                                      struct xdr_stream *xdr,
+                                      struct nfs4_layoutreturn_args *args)
+{
+        struct compound_hdr hdr = {
+                .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+        };
+        encode_compound_hdr(xdr, req, &hdr);
+        encode_sequence(xdr, &args->seq_args, &hdr);
+        encode_putfh(xdr, NFS_FH(args->inode), &hdr);
+        encode_layoutreturn(xdr, args, &hdr);
+        encode_nops(&hdr);
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -5203,6 +5271,27 @@ out_overflow:
        return -EIO;
 }
+/*
+ * Decode the LAYOUTRETURN operation result from @xdr into @res.
+ *
+ * Sets res->lrs_present from the 4-byte flag that follows the op header
+ * and, only when that flag is non-zero, decodes res->stateid.
+ *
+ * Returns 0 on success, the non-zero status from decode_op_hdr() or
+ * decode_stateid() on failure, or -EIO if the flag word cannot be read
+ * from the stream.
+ */
+static int decode_layoutreturn(struct xdr_stream *xdr,
+                               struct nfs4_layoutreturn_res *res)
+{
+        __be32 *p;
+        int status;
+        status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
+        if (status)
+                return status;
+        p = xdr_inline_decode(xdr, 4);
+        if (unlikely(!p))
+                goto out_overflow;
+        res->lrs_present = be32_to_cpup(p);
+        /* status is still 0 here, so a reply without a stateid returns 0 */
+        if (res->lrs_present)
+                status = decode_stateid(xdr, &res->stateid);
+        return status;
+out_overflow:
+        print_overflow_msg(__func__, xdr);
+        return -EIO;
+}
 static int decode_layoutcommit(struct xdr_stream *xdr,
                               struct rpc_rqst *req,
                               struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
 }
 /*
+ * Decode LAYOUTRETURN response
+ */
+static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
+                                     struct xdr_stream *xdr,
+                                     struct nfs4_layoutreturn_res *res)
+{
+        struct compound_hdr hdr;
+        int status;
+        /*
+         * Decode in the same order the request was encoded: compound
+         * header, SEQUENCE, PUTFH, LAYOUTRETURN.  The first non-zero
+         * status stops decoding and is returned to the caller.
+         */
+        status = decode_compound_hdr(xdr, &hdr);
+        if (status)
+                goto out;
+        status = decode_sequence(xdr, &res->seq_res, rqstp);
+        if (status)
+                goto out;
+        status = decode_putfh(xdr);
+        if (status)
+                goto out;
+        status = decode_layoutreturn(xdr, res);
+out:
+        return status;
+}
+/*
 * Decode LAYOUTCOMMIT response
 */
 static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
        PROC(GETDEVICEINFO,     enc_getdeviceinfo,      dec_getdeviceinfo),
        PROC(LAYOUTGET,         enc_layoutget,          dec_layoutget),
        PROC(LAYOUTCOMMIT,      enc_layoutcommit,       dec_layoutcommit),
+        PROC(LAYOUTRETURN,      enc_layoutreturn,       dec_layoutreturn),
 #endif /* CONFIG_NFS_V4_1 */
 };
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf2..c4744e1d513c 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
 #define NFS_ROOT                "/tftpboot/%s"
 /* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS         "udp"
+#define NFS_DEF_OPTIONS         "vers=2,udp,rsize=4096,wsize=4096"
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Objects Layout Driver kernel module
+#
+objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
+obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..9cf208df1f25
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
+/*
+ *  pNFS Objects layout implementation over open-osd initiator library
+ *
+ *  Copyright (C) 2009 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/module.h>
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/* Cast for printing 64-bit values with %llx/%llu.  The argument is
+ * parenthesized so that a compound expression is converted as a whole. */
+#define _LLU(x) ((unsigned long long)(x))
+/* Number of bio_vec entries that fit in one page next to a struct bio.
+ * Used in this file as the cap on a per-device bio's size and to derive
+ * the segment's max_io_size. */
+enum { BIO_MAX_PAGES_KMALLOC =
+                (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
+};
+/* Device-id cache entry: a deviceid node paired with its osd device. */
+struct objio_dev_ent {
+        /* embedded node; container_of() on it recovers this entry */
+        struct nfs4_deviceid_node id_node;
+        /* released with osduld_put_device() when the entry is freed */
+        struct osd_dev *od;
+};
+/*
+ * Destructor for a device-id node: puts the osd device held by the
+ * enclosing objio_dev_ent and frees the entry.
+ */
+static void
+objio_free_deviceid_node(struct nfs4_deviceid_node *d)
+{
+        struct objio_dev_ent *entry;
+        entry = container_of(d, struct objio_dev_ent, id_node);
+        dprintk("%s: free od=%p\n", __func__, entry->od);
+        osduld_put_device(entry->od);
+        kfree(entry);
+}
+/*
+ * Find the cached entry for @d_id via nfs4_find_get_deviceid().
+ * Returns the enclosing objio_dev_ent, or NULL when no node was found.
+ */
+static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
+        const struct nfs4_deviceid *d_id)
+{
+        struct nfs4_deviceid_node *node;
+        node = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client,
+                                      d_id);
+        if (node == NULL)
+                return NULL;
+        return container_of(node, struct objio_dev_ent, id_node);
+}
+/*
+ * Create a cache entry for @od under @d_id and insert it.
+ * If the insert hands back a different node (another thread added the same
+ * device id first), the new entry is destroyed and the existing one is used.
+ * The returned entry has its reference count incremented.
+ * Returns NULL if the entry could not be allocated; @od is untouched then.
+ */
+static struct objio_dev_ent *
+_dev_list_add(const struct nfs_server *nfss,
+        const struct nfs4_deviceid *d_id, struct osd_dev *od,
+        gfp_t gfp_flags)
+{
+        struct nfs4_deviceid_node *inserted;
+        struct objio_dev_ent *winner;
+        struct objio_dev_ent *fresh;
+        fresh = kzalloc(sizeof(*fresh), gfp_flags);
+        if (fresh == NULL) {
+                dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+                return NULL;
+        }
+        dprintk("%s: Adding od=%p\n", __func__, od);
+        nfs4_init_deviceid_node(&fresh->id_node, nfss->pnfs_curr_ld,
+                                nfss->nfs_client, d_id);
+        fresh->od = od;
+        inserted = nfs4_insert_deviceid_node(&fresh->id_node);
+        winner = container_of(inserted, struct objio_dev_ent, id_node);
+        if (winner != fresh) {
+                dprintk("%s: Race with other n->od=%p\n", __func__, winner->od);
+                objio_free_deviceid_node(&fresh->id_node);
+        }
+        atomic_inc(&winner->id_node.ref);
+        return winner;
+}
+/* Per-component storage for the credential bytes copied out of the decoded
+ * layout by copy_single_comp(). */
+struct caps_buffers {
+        u8 caps_key[OSD_CRYPTO_KEYID_SIZE];     /* copy of oc_cap_key.cred */
+        u8 creds[OSD_CAP_LEN];                  /* copy of oc_cap.cred */
+};
+/*
+ * Driver-private layout segment.  Allocated by objio_alloc_lseg() as one
+ * block: this struct, then the ods[] array, then the comps array, then one
+ * caps_buffers per component.
+ */
+struct objio_segment {
+        struct pnfs_layout_segment lseg; /* embedded; see OBJIO_LSEG() */
+        struct pnfs_osd_object_cred *comps; /* points just past ods[] */
+        unsigned mirrors_p1;    /* odm_mirror_cnt + 1 */
+        unsigned stripe_unit;
+        unsigned group_width;   /* Data stripe_units without integrity comps */
+        u64 group_depth;        /* set to -1 when the map has no group_width */
+        unsigned group_count;
+        unsigned max_io_size;   /* cached limit checked by objio_pg_test() */
+        unsigned comps_index;   /* olo_comps_index of the decoded layout */
+        unsigned num_comps;     /* olo_num_comps of the decoded layout */
+        /* variable length */
+        struct objio_dev_ent *ods[];
+};
+/* Map a generic layout segment back to the objio_segment that embeds it. */
+static inline struct objio_segment *
+OBJIO_LSEG(struct pnfs_layout_segment *lseg)
+{
+        struct objio_segment *objio_seg;
+        objio_seg = container_of(lseg, struct objio_segment, lseg);
+        return objio_seg;
+}
+struct objio_state;
+/* Completion callback; its return value is the IO status in sync mode. */
+typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
+/*
+ * State of one read or write.  Allocated by objio_alloc_io_state() with
+ * room for num_comps per_dev slots followed by the ol_state.ioerrs array.
+ */
+struct objio_state {
+        /* Generic layer */
+        struct objlayout_io_state ol_state;
+        struct objio_segment *layout;
+        struct kref kref;       /* one reference per submitted osd request */
+        objio_done_fn done;     /* called when the last reference is dropped */
+        void *private;          /* the on-stack completion in sync mode */
+        unsigned long length;   /* bytes queued so far by _prepare_one_group */
+        unsigned numdevs; /* Actually used devs in this IO */
+        /* A per-device variable array of size numdevs */
+        struct _objio_per_comp {
+                struct bio *bio;
+                struct osd_request *or;
+                unsigned long length;   /* bytes queued for this device */
+                u64 offset;             /* start offset inside the object */
+                unsigned dev;           /* component/device index */
+        } per_dev[];
+};
+/*
+ * Resolve the device for layout component @comp.
+ *
+ * A cached entry is returned if one exists.  Otherwise send and wait for a
+ * get_device_info of the device, look it up with the osd_initiator library
+ * and add it to the cache.
+ *
+ * Returns a referenced entry, or an ERR_PTR() on failure; never NULL.
+ */
+static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
+                                struct objio_segment *objio_seg, unsigned comp,
+                                gfp_t gfp_flags)
+{
+        struct pnfs_osd_deviceaddr *deviceaddr;
+        struct nfs4_deviceid *d_id;
+        struct objio_dev_ent *ode;
+        struct osd_dev *od;
+        struct osd_dev_info odi;
+        int err;
+        d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
+        ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
+        if (ode)
+                return ode;
+        err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
+        if (unlikely(err)) {
+                dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
+                        __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
+                return ERR_PTR(err);
+        }
+        odi.systemid_len = deviceaddr->oda_systemid.len;
+        if (odi.systemid_len > sizeof(odi.systemid)) {
+                err = -EINVAL;
+                goto out;
+        } else if (odi.systemid_len)
+                memcpy(odi.systemid, deviceaddr->oda_systemid.data,
+                       odi.systemid_len);
+        odi.osdname_len  = deviceaddr->oda_osdname.len;
+        odi.osdname      = (u8 *)deviceaddr->oda_osdname.data;
+        if (!odi.osdname_len && !odi.systemid_len) {
+                dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
+                        __func__);
+                err = -ENODEV;
+                goto out;
+        }
+        od = osduld_info_lookup(&odi);
+        if (unlikely(IS_ERR(od))) {
+                err = PTR_ERR(od);
+                dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
+                goto out;
+        }
+        ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
+                            gfp_flags);
+        if (unlikely(!ode)) {
+                /* Allocation failed and _dev_list_add() did not take over
+                 * od: drop it here and report the failure instead of
+                 * returning NULL as if the lookup had succeeded. */
+                osduld_put_device(od);
+                err = -ENOMEM;
+        }
+out:
+        dprintk("%s: return=%d\n", __func__, err);
+        objlayout_put_deviceinfo(deviceaddr);
+        return err ? ERR_PTR(err) : ode;
+}
+/*
+ * Resolve every component of @objio_seg to a device entry and store it in
+ * objio_seg->ods[].  Slots are filled in order; on failure the slots already
+ * filled keep their references and the remaining ones are left untouched.
+ * Returns 0 on success or a negative errno.
+ */
+static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
+        struct objio_segment *objio_seg,
+        gfp_t gfp_flags)
+{
+        unsigned i;
+        int err;
+        /* lookup all devices */
+        for (i = 0; i < objio_seg->num_comps; i++) {
+                struct objio_dev_ent *ode;
+                ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
+                if (unlikely(IS_ERR(ode))) {
+                        err = PTR_ERR(ode);
+                        goto out;
+                }
+                /* A NULL entry must not be recorded as success: _io_od()
+                 * dereferences every slot without checking it. */
+                if (unlikely(!ode)) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                objio_seg->ods[i] = ode;
+        }
+        err = 0;
+out:
+        dprintk("%s: return=%d\n", __func__, err);
+        return err;
+}
+/*
+ * Validate the data map of a decoded layout before it is used for striping.
+ *
+ * Returns 0 when the map is usable, -ENOTSUPP for geometries this driver
+ * does not implement and -EINVAL for inconsistent maps.  The map is decoded
+ * from the server's reply, so every value that is later used as a divisor
+ * (mirror count + 1, stripe unit, group width/depth/count) is rejected here
+ * when it would be zero.
+ */
+static int _verify_data_map(struct pnfs_osd_layout *layout)
+{
+        struct pnfs_osd_data_map *data_map = &layout->olo_map;
+        unsigned mirrors_p1 = data_map->odm_mirror_cnt + 1;
+        unsigned data_comps;
+        u64 stripe_length;
+        u32 group_width;
+/* FIXME: Only raid0 for now. if not go through MDS */
+        if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
+                printk(KERN_ERR "Only RAID_0 for now\n");
+                return -ENOTSUPP;
+        }
+        /* mirror_cnt + 1 wraps to 0 for an all-ones count */
+        if (!mirrors_p1 || 0 != (data_map->odm_num_comps % mirrors_p1)) {
+                printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
+                          data_map->odm_num_comps, data_map->odm_mirror_cnt);
+                return -EINVAL;
+        }
+        data_comps = data_map->odm_num_comps / mirrors_p1;
+        if (data_map->odm_group_width) {
+                group_width = data_map->odm_group_width;
+                /* A zero depth makes the group size zero; a width larger
+                 * than the data components makes the group count zero. */
+                if (!data_map->odm_group_depth || group_width > data_comps) {
+                        printk(KERN_ERR "Data Map wrong, group_width=%u"
+                                  " group_depth=%llu data_comps=%u\n",
+                                  group_width,
+                                  _LLU(data_map->odm_group_depth), data_comps);
+                        return -EINVAL;
+                }
+        } else {
+                group_width = data_comps;
+        }
+        if (!group_width || !data_map->odm_stripe_unit) {
+                printk(KERN_ERR "Data Map wrong, group_width=%u"
+                          " stripe_unit=0x%llx\n",
+                          group_width, _LLU(data_map->odm_stripe_unit));
+                return -EINVAL;
+        }
+        stripe_length = (u64)data_map->odm_stripe_unit * group_width;
+        if (stripe_length >= (1ULL << 32)) {
+                printk(KERN_ERR "Total Stripe length(0x%llx)"
+                          " >= 32bit is not supported\n", _LLU(stripe_length));
+                return -ENOTSUPP;
+        }
+        if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
+                printk(KERN_ERR "Stripe Unit(0x%llx)"
+                          " must be Multples of PAGE_SIZE(0x%lx)\n",
+                          _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
+                return -ENOTSUPP;
+        }
+        return 0;
+}
+/*
+ * Copy one decoded component into the segment's own storage.
+ *
+ * @cur_comp receives a copy of @src_comp whose credential pointers are
+ * redirected to @caps_p, so they no longer reference the decode buffer.
+ * Only cred_len bytes are read from each source credential (clamped to the
+ * destination size); the rest of the destination is zero filled.
+ */
+static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
+                             struct pnfs_osd_object_cred *src_comp,
+                             struct caps_buffers *caps_p)
+{
+        size_t key_len = min_t(size_t, src_comp->oc_cap_key.cred_len,
+                               sizeof(caps_p->caps_key));
+        size_t cap_len = min_t(size_t, src_comp->oc_cap.cred_len,
+                               sizeof(caps_p->creds));
+        WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
+        WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
+        *cur_comp = *src_comp;
+        memset(caps_p, 0, sizeof(*caps_p));
+        memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, key_len);
+        cur_comp->oc_cap_key.cred = caps_p->caps_key;
+        cur_comp->oc_cap_key.cred_len = key_len;
+        memcpy(caps_p->creds, src_comp->oc_cap.cred, cap_len);
+        cur_comp->oc_cap.cred = caps_p->creds;
+        cur_comp->oc_cap.cred_len = cap_len;
+}
+/*
+ * Decode a layout from @xdr and build the driver's segment for it.
+ *
+ * The segment, its ods[] array, the component array and the per-component
+ * credential buffers are carved out of a single allocation.  On success
+ * *@outp points at the embedded generic segment and 0 is returned.  On a
+ * failure after the allocation *@outp is set to NULL, every device
+ * reference already taken is dropped and a negative errno is returned.
+ */
+int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+        struct pnfs_layout_hdr *pnfslay,
+        struct pnfs_layout_range *range,
+        struct xdr_stream *xdr,
+        gfp_t gfp_flags)
+{
+        struct objio_segment *objio_seg;
+        struct pnfs_osd_xdr_decode_layout_iter iter;
+        struct pnfs_osd_layout layout;
+        struct pnfs_osd_object_cred *cur_comp, src_comp;
+        struct caps_buffers *caps_p;
+        unsigned i;
+        int err;
+        err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
+        if (unlikely(err))
+                return err;
+        err = _verify_data_map(&layout);
+        if (unlikely(err))
+                return err;
+        objio_seg = kzalloc(sizeof(*objio_seg) +
+                            sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
+                            sizeof(*objio_seg->comps) * layout.olo_num_comps +
+                            sizeof(struct caps_buffers) * layout.olo_num_comps,
+                            gfp_flags);
+        if (!objio_seg)
+                return -ENOMEM;
+        /* comps follow ods[], the credential buffers follow comps */
+        objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
+        cur_comp = objio_seg->comps;
+        caps_p = (void *)(cur_comp + layout.olo_num_comps);
+        while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
+                copy_single_comp(cur_comp++, &src_comp, caps_p++);
+        if (unlikely(err))
+                goto err;
+        objio_seg->num_comps = layout.olo_num_comps;
+        objio_seg->comps_index = layout.olo_comps_index;
+        err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
+        if (err)
+                goto err;
+        objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
+        objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
+        if (layout.olo_map.odm_group_width) {
+                objio_seg->group_width = layout.olo_map.odm_group_width;
+                objio_seg->group_depth = layout.olo_map.odm_group_depth;
+                objio_seg->group_count = layout.olo_map.odm_num_comps /
+                                                objio_seg->mirrors_p1 /
+                                                objio_seg->group_width;
+        } else {
+                objio_seg->group_width = layout.olo_map.odm_num_comps /
+                                                objio_seg->mirrors_p1;
+                objio_seg->group_depth = -1;
+                objio_seg->group_count = 1;
+        }
+        /* Cache this calculation it will hit for every page */
+        objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
+                                  objio_seg->stripe_unit) *
+                                 objio_seg->group_width;
+        *outp = &objio_seg->lseg;
+        return 0;
+err:
+        /* Drop the references a partly successful device lookup took.
+         * ods[] is filled in order and the segment was zero allocated, so
+         * the first empty slot ends the used part; num_comps is still 0 if
+         * decoding failed before the lookup. */
+        for (i = 0; i < objio_seg->num_comps; i++) {
+                if (!objio_seg->ods[i])
+                        break;
+                nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
+        }
+        kfree(objio_seg);
+        dprintk("%s: Error: return %d\n", __func__, err);
+        *outp = NULL;
+        return err;
+}
+/*
+ * Release a segment built by objio_alloc_lseg(): put every device entry
+ * recorded in ods[] (stopping at the first empty slot), then free it.
+ */
+void objio_free_lseg(struct pnfs_layout_segment *lseg)
+{
+        struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+        struct objio_dev_ent **slot = objio_seg->ods;
+        struct objio_dev_ent **end = slot + objio_seg->num_comps;
+        for (; slot < end && *slot; slot++)
+                nfs4_put_deviceid_node(&(*slot)->id_node);
+        kfree(objio_seg);
+}
+/*
+ * Allocate the zeroed IO state for @lseg: the objio_state with one per_dev
+ * slot per component, followed in the same allocation by the ioerrs array.
+ * Stores the embedded generic state in *@outp; returns 0 or -ENOMEM.
+ */
+int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
+                         struct objlayout_io_state **outp,
+                         gfp_t gfp_flags)
+{
+        struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
+        unsigned comps = objio_seg->num_comps;
+        struct objio_state *ios;
+        const unsigned ios_bytes = sizeof(*ios) +
+                                   comps * sizeof(ios->per_dev[0]);
+        const unsigned ioerr_bytes = comps * sizeof(ios->ol_state.ioerrs[0]);
+        ios = kzalloc(ios_bytes + ioerr_bytes, gfp_flags);
+        if (unlikely(ios == NULL))
+                return -ENOMEM;
+        ios->layout = objio_seg;
+        /* the error array lives right behind the per_dev slots */
+        ios->ol_state.ioerrs = ((void *)ios) + ios_bytes;
+        ios->ol_state.num_comps = comps;
+        *outp = &ios->ol_state;
+        return 0;
+}
+/* Free the objio_state that embeds @ol_state. */
+void objio_free_io_state(struct objlayout_io_state *ol_state)
+{
+        kfree(container_of(ol_state, struct objio_state, ol_state));
+}
+/*
+ * Translate an osd error priority into the pNFS-OSD error code.
+ * OSD_ERR_PRI_CLEAR_PAGES is not expected here and triggers BUG_ON();
+ * unknown priorities warn and are reported as PNFS_OSD_ERR_EIO.
+ */
+enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
+{
+        enum pnfs_osd_errno pnfs_err = PNFS_OSD_ERR_EIO;
+        switch (oep) {
+        case OSD_ERR_PRI_NO_ERROR:
+                pnfs_err = (enum pnfs_osd_errno)0;
+                break;
+        case OSD_ERR_PRI_CLEAR_PAGES:
+                BUG_ON(1);
+                pnfs_err = 0;
+                break;
+        case OSD_ERR_PRI_RESOURCE:
+                pnfs_err = PNFS_OSD_ERR_RESOURCE;
+                break;
+        case OSD_ERR_PRI_BAD_CRED:
+                pnfs_err = PNFS_OSD_ERR_BAD_CRED;
+                break;
+        case OSD_ERR_PRI_NO_ACCESS:
+                pnfs_err = PNFS_OSD_ERR_NO_ACCESS;
+                break;
+        case OSD_ERR_PRI_UNREACHABLE:
+                pnfs_err = PNFS_OSD_ERR_UNREACHABLE;
+                break;
+        case OSD_ERR_PRI_NOT_FOUND:
+                pnfs_err = PNFS_OSD_ERR_NOT_FOUND;
+                break;
+        case OSD_ERR_PRI_NO_SPACE:
+                pnfs_err = PNFS_OSD_ERR_NO_SPACE;
+                break;
+        case OSD_ERR_PRI_EIO:
+                break;
+        default:
+                WARN_ON(1);
+                break;
+        }
+        return pnfs_err;
+}
+/* Zero the data of every segment of @bio: whole pages are cleared as a
+ * page, shorter segments only over their offset/length range. */
+static void _clear_bio(struct bio *bio)
+{
+        struct bio_vec *vec;
+        unsigned idx;
+        __bio_for_each_segment(vec, bio, idx, 0) {
+                unsigned len = vec->bv_len;
+                if (unlikely(len != PAGE_SIZE)) {
+                        zero_user(vec->bv_page, vec->bv_offset, len);
+                        continue;
+                }
+                clear_highpage(vec->bv_page);
+        }
+}
+/*
+ * Inspect the sense data of every request of this IO.
+ * A read that started past the end of the object is recovered by zeroing
+ * its pages.  Every other failure is recorded through
+ * objlayout_io_set_result().  Returns 0 if nothing failed, otherwise the
+ * return code of the failure with the highest error priority (the later
+ * one wins a tie).
+ */
+static int _io_check(struct objio_state *ios, bool is_write)
+{
+        enum osd_err_priority worst_pri = OSD_ERR_PRI_NO_ERROR;
+        int worst_ret = 0;
+        unsigned i;
+        for (i = 0; i < ios->numdevs; i++) {
+                struct _objio_per_comp *per_dev = &ios->per_dev[i];
+                struct osd_sense_info osi;
+                unsigned dev;
+                int ret;
+                if (per_dev->or == NULL)
+                        continue;
+                ret = osd_req_decode_sense(per_dev->or, &osi);
+                if (likely(ret == 0))
+                        continue;
+                if (osi.osd_err_pri == OSD_ERR_PRI_CLEAR_PAGES) {
+                        /* read started past the end of the file */
+                        BUG_ON(is_write);
+                        _clear_bio(per_dev->bio);
+                        dprintk("%s: start read offset passed end of file "
+                                "offset=0x%llx, length=0x%lx\n", __func__,
+                                _LLU(per_dev->offset),
+                                per_dev->length);
+                        continue; /* we recovered */
+                }
+                dev = per_dev->dev;
+                objlayout_io_set_result(&ios->ol_state, dev,
+                                        &ios->layout->comps[dev].oc_object_id,
+                                        osd_pri_2_pnfs_err(osi.osd_err_pri),
+                                        per_dev->offset,
+                                        per_dev->length,
+                                        is_write);
+                if (osi.osd_err_pri < worst_pri)
+                        continue;
+                worst_pri = osi.osd_err_pri;
+                worst_ret = ret;
+        }
+        return worst_ret;
+}
+/*
+ * Common IO state helpers.
+ */
+/* End the osd request and put the bio of every used per_dev slot, clearing
+ * the pointers so the slot is not released again. */
+static void _io_free(struct objio_state *ios)
+{
+        struct _objio_per_comp *per_dev = ios->per_dev;
+        struct _objio_per_comp *end = per_dev + ios->numdevs;
+        for (; per_dev < end; per_dev++) {
+                if (per_dev->or != NULL) {
+                        osd_end_request(per_dev->or);
+                        per_dev->or = NULL;
+                }
+                if (per_dev->bio != NULL) {
+                        bio_put(per_dev->bio);
+                        per_dev->bio = NULL;
+                }
+        }
+}
+/*
+ * Return the osd device for component index @dev.  @dev must lie inside
+ * [comps_index, comps_index + num_comps) of the IO's layout, else BUG.
+ */
+struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
+{
+        struct objio_segment *seg = ios->layout;
+        unsigned first = seg->comps_index;
+        unsigned limit = first + seg->num_comps;
+        BUG_ON(!(first <= dev && dev < limit));
+        return seg->ods[dev - first]->od;
+}
+/* Result of _calc_stripe_info() for one file offset. */
+struct _striping_info {
+        u64 obj_offset;         /* offset inside the device's object */
+        u64 group_length;       /* bytes from the offset to the end of its group */
+        unsigned dev;           /* device index, already scaled by mirrors_p1 */
+        unsigned unit_off;      /* file offset modulo stripe_unit */
+};
+/*
+ * Translate @file_offset into striping coordinates of the IO's layout and
+ * store them in @si: the offset inside the stripe unit, the offset inside
+ * the object, the (mirror-scaled) device index and the number of bytes
+ * left in the current group.
+ *
+ * NOTE(review): group_depth is (u64)-1 when the layout has no groups (see
+ * objio_alloc_lseg()), so T below relies on unsigned wrap-around; confirm
+ * that this is intended before touching the arithmetic.
+ */
+static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
+                              struct _striping_info *si)
+{
+        u32     stripe_unit = ios->layout->stripe_unit;
+        u32     group_width = ios->layout->group_width;
+        u64     group_depth = ios->layout->group_depth;
+        /* U: bytes in one stripe across the group's data devices */
+        u32     U = stripe_unit * group_width;
+        /* T: bytes in one group (group_depth stripes) */
+        u64     T = U * group_depth;
+        /* S: bytes in all group_count groups together */
+        u64     S = T * ios->layout->group_count;
+        /* M: how many whole S-sized spans precede the offset */
+        u64     M = div64_u64(file_offset, S);
+        /*
+        With L = file_offset:
+        G = (L - (M * S)) / T
+        H = (L - (M * S)) % T
+        */
+        /* despite the name, this is L - M * S (the offset inside S) */
+        u64     LmodU = file_offset - M * S;
+        u32     G = div64_u64(LmodU, T);
+        u64     H = LmodU - G * T;
+        /* N: stripe number inside the group */
+        u32     N = div_u64(H, U);
+        div_u64_rem(file_offset, stripe_unit, &si->unit_off);
+        si->obj_offset = si->unit_off + (N * stripe_unit) +
+                                  (M * group_depth * stripe_unit);
+        /* "H - (N * U)" is just "H % U" so it's bound to u32 */
+        si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+        si->dev *= ios->layout->mirrors_p1;
+        si->group_length = T - H;
+}
+/*
+ * Queue @cur_len bytes of the IO's page array on @per_dev.
+ *
+ * Pages are taken starting at index *@cur_pg, the first one from offset
+ * @pgbase; the device's bio is allocated on first use.  On success
+ * *@cur_pg is advanced past the consumed pages and 0 is returned.
+ * Returns -ENOMEM when the bio cannot be allocated or a page cannot be
+ * added; per_dev->length has already been increased by @cur_len then.
+ */
+static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
+                unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+                gfp_t gfp_flags)
+{
+        struct request_queue *q =
+                        osd_request_queue(_io_od(ios, per_dev->dev));
+        unsigned pg;
+        per_dev->length += cur_len;
+        if (per_dev->bio == NULL) {
+                unsigned stripes = ios->layout->num_comps /
+                                                     ios->layout->mirrors_p1;
+                unsigned pages_in_stripe = stripes *
+                                      (ios->layout->stripe_unit / PAGE_SIZE);
+                unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
+                                    stripes;
+                if (bio_size > BIO_MAX_PAGES_KMALLOC)
+                        bio_size = BIO_MAX_PAGES_KMALLOC;
+                per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
+                if (unlikely(per_dev->bio == NULL)) {
+                        dprintk("Faild to allocate BIO size=%u\n", bio_size);
+                        return -ENOMEM;
+                }
+        }
+        /* only the first page may start at a non-zero offset */
+        for (pg = *cur_pg; cur_len > 0; pgbase = 0, ++pg) {
+                unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
+                unsigned added_len;
+                BUG_ON(ios->ol_state.nr_pages <= pg);
+                cur_len -= pglen;
+                added_len = bio_add_pc_page(q, per_dev->bio,
+                                        ios->ol_state.pages[pg], pglen, pgbase);
+                if (unlikely(added_len != pglen))
+                        return -ENOMEM;
+        }
+        BUG_ON(cur_len);
+        *cur_pg = pg;
+        return 0;
+}
+/*
+ * Spread @length bytes, starting at the position described by @si, over
+ * the devices of one group, one stripe unit at a time.
+ *
+ * *@last_pg is the index of the next unused page of the IO and is updated
+ * on return.  ios->length grows by every chunk that was queued and
+ * ios->numdevs is updated to cover the highest per_dev slot touched.
+ * Returns 0 or the error from _add_stripe_unit().
+ */
+static int _prepare_one_group(struct objio_state *ios, u64 length,
+                              struct _striping_info *si, unsigned *last_pg,
+                              gfp_t gfp_flags)
+{
+        unsigned stripe_unit = ios->layout->stripe_unit;
+        unsigned mirrors_p1 = ios->layout->mirrors_p1;
+        unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
+        unsigned dev = si->dev;
+        /* first device index of the group that si->dev belongs to */
+        unsigned first_dev = dev - (dev % devs_in_group);
+        /* highest slot used so far, carried over from earlier groups */
+        unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
+        unsigned cur_pg = *last_pg;
+        int ret = 0;
+        while (length) {
+                /* per_dev[] is indexed by the device index itself */
+                struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+                unsigned cur_len, page_off = 0;
+                if (!per_dev->length) {
+                        /* first chunk for this device: fix its start offset */
+                        per_dev->dev = dev;
+                        if (dev < si->dev) {
+                                /* reached after wrapping: starts at the
+                                 * next stripe unit boundary */
+                                per_dev->offset = si->obj_offset + stripe_unit -
+                                                                   si->unit_off;
+                                cur_len = stripe_unit;
+                        } else if (dev == si->dev) {
+                                /* the device the IO starts on: may begin in
+                                 * the middle of a stripe unit */
+                                per_dev->offset = si->obj_offset;
+                                cur_len = stripe_unit - si->unit_off;
+                                page_off = si->unit_off & ~PAGE_MASK;
+                                BUG_ON(page_off &&
+                                      (page_off != ios->ol_state.pgbase));
+                        } else { /* dev > si->dev */
+                                /* starts at the beginning of the current unit */
+                                per_dev->offset = si->obj_offset - si->unit_off;
+                                cur_len = stripe_unit;
+                        }
+                        if (max_comp < dev)
+                                max_comp = dev;
+                } else {
+                        cur_len = stripe_unit;
+                }
+                /* the last chunk may be shorter than a stripe unit */
+                if (cur_len >= length)
+                        cur_len = length;
+                ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
+                                       cur_len, gfp_flags);
+                if (unlikely(ret))
+                        goto out;
+                /* step to the next data device, wrapping inside the group */
+                dev += mirrors_p1;
+                dev = (dev % devs_in_group) + first_dev;
+                length -= cur_len;
+                ios->length += cur_len;
+        }
+out:
+        ios->numdevs = max_comp + mirrors_p1;
+        *last_pg = cur_pg;
+        return ret;
+}
+/*
+ * Map the whole IO (ol_state.offset/count) onto per-device bios, one group
+ * at a time.  An error from the mapping is returned only if nothing was
+ * queued at all; once some bytes are queued the function returns 0 so the
+ * partial IO can go ahead.
+ */
+static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
+{
+        u64 remaining = ios->ol_state.count;
+        u64 pos = ios->ol_state.offset;
+        struct _striping_info si;
+        unsigned last_pg = 0;
+        int ret = 0;
+        for (; remaining;
+             pos += si.group_length, remaining -= si.group_length) {
+                _calc_stripe_info(ios, pos, &si);
+                if (si.group_length > remaining)
+                        si.group_length = remaining;
+                ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
+                if (unlikely(ret))
+                        break;
+        }
+        if (ios->length)
+                return 0;
+        return ret;
+}
+/* Done callback used in sync mode: signal the completion stored in
+ * ios->private. */
+static ssize_t _sync_done(struct objio_state *ios)
+{
+        complete((struct completion *)ios->private);
+        return 0;
+}
+/* kref release function: the last reference is gone, run the done callback. */
+static void _last_io(struct kref *kref)
+{
+        struct objio_state *ios;
+        ios = container_of(kref, struct objio_state, kref);
+        ios->done(ios);
+}
+/* Per-request completion: drop the reference taken for this request.
+ * @p is the objio_state passed at submission; @or is unused. */
+static void _done_io(struct osd_request *or, void *p)
+{
+        kref_put(&((struct objio_state *)p)->kref, _last_io);
+}
+/*
+ * Submit every prepared osd request of @ios asynchronously.
+ *
+ * ios->done runs once the last reference on ios->kref is dropped, i.e.
+ * after all requests completed and the submitter's own reference was put.
+ * In sync mode the done callback is replaced by _sync_done(), this
+ * function waits for it and then runs the original callback itself,
+ * returning its status.  In async mode it returns 0.
+ */
+static ssize_t _io_exec(struct objio_state *ios)
+{
+        DECLARE_COMPLETION_ONSTACK(wait);
+        ssize_t status = 0; /* sync status */
+        unsigned i;
+        objio_done_fn saved_done_fn = ios->done;
+        bool sync = ios->ol_state.sync;
+        if (sync) {
+                ios->done = _sync_done;
+                ios->private = &wait;
+        }
+        /* the submitter holds the initial reference until all are queued */
+        kref_init(&ios->kref);
+        for (i = 0; i < ios->numdevs; i++) {
+                struct osd_request *or = ios->per_dev[i].or;
+                if (!or)
+                        continue;
+                /* one reference per request, dropped in _done_io() */
+                kref_get(&ios->kref);
+                osd_execute_request_async(or, _done_io, ios);
+        }
+        /* drop the submitter's reference */
+        kref_put(&ios->kref, _last_io);
+        if (sync) {
+                wait_for_completion(&wait);
+                status = saved_done_fn(ios);
+        }
+        return status;
+}
+/*
+ * read
+ */
+/*
+ * Read completion: check the requests, release them and report the result
+ * through objlayout_read_done().  The status is the number of bytes queued
+ * on success, or the error from _io_check().
+ */
+static ssize_t _read_done(struct objio_state *ios)
+{
+        int ret = _io_check(ios, false);
+        ssize_t status = ret;
+        _io_free(ios);
+        if (likely(!ret))
+                status = ios->length;
+        objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
+        return status;
+}
+/*
+ * Build the osd read request for per_dev slot @cur_comp.
+ * The request is stored in the slot as soon as it is started, so a later
+ * failure is cleaned up by _io_free().  Returns 0 or a negative errno.
+ */
+static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+        struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+        unsigned dev = per_dev->dev;
+        struct pnfs_osd_object_cred *cred = &ios->layout->comps[dev];
+        struct osd_obj_id obj = {
+                .partition = cred->oc_object_id.oid_partition_id,
+                .id = cred->oc_object_id.oid_object_id,
+        };
+        struct osd_request *or;
+        int ret;
+        or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
+        if (unlikely(or == NULL))
+                return -ENOMEM;
+        per_dev->or = or;
+        osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
+        ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+        if (ret) {
+                dprintk("%s: Faild to osd_finalize_request() => %d\n",
+                        __func__, ret);
+                return ret;
+        }
+        dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+                __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+                per_dev->length);
+        return 0;
+}
+/*
+ * Build a read request for the first device of every mirror set that has
+ * data queued, then execute them.  On a build failure everything prepared
+ * so far is released and the error is returned.
+ */
+static ssize_t _read_exec(struct objio_state *ios)
+{
+        unsigned i;
+        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+                int ret;
+                if (ios->per_dev[i].length == 0)
+                        continue;
+                ret = _read_mirrors(ios, i);
+                if (unlikely(ret)) {
+                        _io_free(ios);
+                        return ret;
+                }
+        }
+        ios->done = _read_done;
+        return _io_exec(ios); /* In sync mode exec returns the io status */
+}
+/* Read entry point: map the pages onto the devices, then issue the reads. */
+ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
+{
+        struct objio_state *ios;
+        int ret;
+        ios = container_of(ol_state, struct objio_state, ol_state);
+        ret = _io_rw_pagelist(ios, GFP_KERNEL);
+        if (likely(!ret))
+                return _read_exec(ios);
+        return ret;
+}
+/*
+ * write
+ */
+/*
+ * Write completion: check the requests, release them and report the result
+ * through objlayout_write_done().  On success the data is marked
+ * NFS_FILE_SYNC and the status is the number of bytes queued; otherwise
+ * the status is the error from _io_check().
+ */
+static ssize_t _write_done(struct objio_state *ios)
+{
+        int ret = _io_check(ios, true);
+        ssize_t status = ret;
+        _io_free(ios);
+        if (likely(!ret)) {
+                /* FIXME: should be based on the OSD's persistence model
+                 * See OSD2r05 Section 4.13 Data persistence model */
+                ios->ol_state.committed = NFS_FILE_SYNC;
+                status = ios->length;
+        }
+        objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
+        return status;
+}
+/*
+ * Build the osd write requests for the mirror set starting at per_dev slot
+ * @cur_comp.  The first slot (the master) writes its own bio; every
+ * further mirror gets a clone of that bio plus the master's length and
+ * offset.  Each request is stored in its slot as soon as it is started, so
+ * a later failure is cleaned up by _io_free().
+ * Returns 0 or a negative errno.
+ */
+static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
+{
+        struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
+        unsigned end_comp = cur_comp + ios->layout->mirrors_p1;
+        unsigned dev = master_dev->dev;
+        int ret;
+        while (cur_comp < end_comp) {
+                struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
+                struct pnfs_osd_object_cred *cred =
+                                        &ios->layout->comps[dev];
+                struct osd_obj_id obj = {
+                        .partition = cred->oc_object_id.oid_partition_id,
+                        .id = cred->oc_object_id.oid_object_id,
+                };
+                struct osd_request *or;
+                struct bio *bio;
+                or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
+                if (unlikely(or == NULL))
+                        return -ENOMEM;
+                per_dev->or = or;
+                if (per_dev == master_dev) {
+                        bio = master_dev->bio;
+                        bio->bi_rw |= REQ_WRITE;
+                } else {
+                        bio = bio_kmalloc(GFP_NOFS,
+                                          master_dev->bio->bi_max_vecs);
+                        if (unlikely(bio == NULL)) {
+                                dprintk("Faild to allocate BIO size=%u\n",
+                                        master_dev->bio->bi_max_vecs);
+                                return -ENOMEM;
+                        }
+                        __bio_clone(bio, master_dev->bio);
+                        bio->bi_bdev = NULL;
+                        bio->bi_next = NULL;
+                        per_dev->bio = bio;
+                        per_dev->dev = dev;
+                        per_dev->length = master_dev->length;
+                        per_dev->offset =  master_dev->offset;
+                }
+                osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
+                ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
+                if (ret) {
+                        dprintk("%s: Faild to osd_finalize_request() => %d\n",
+                                __func__, ret);
+                        return ret;
+                }
+                dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
+                        __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
+                        per_dev->length);
+                ++cur_comp;
+                ++dev;
+        }
+        return ret;
+}
+/*
+ * Build the write requests of every mirror set that has data queued, then
+ * execute them.  On a build failure everything prepared so far is released
+ * and the error is returned.
+ */
+static ssize_t _write_exec(struct objio_state *ios)
+{
+        unsigned i;
+        for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
+                int ret;
+                if (ios->per_dev[i].length == 0)
+                        continue;
+                ret = _write_mirrors(ios, i);
+                if (unlikely(ret)) {
+                        _io_free(ios);
+                        return ret;
+                }
+        }
+        ios->done = _write_done;
+        return _io_exec(ios); /* In sync mode exec returns the io->status */
+}
+/* Write entry point: map the pages onto the devices, then issue the
+ * writes.  @stable is currently not used. */
+ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
+{
+        struct objio_state *ios;
+        int ret;
+        ios = container_of(ol_state, struct objio_state, ol_state);
+        /* TODO: ios->stable = stable; */
+        ret = _io_rw_pagelist(ios, GFP_NOFS);
+        if (likely(!ret))
+                return _write_exec(ios);
+        return ret;
+}
+/*
+ * Page-coalescing test: @req may join the descriptor only if the generic
+ * pNFS test accepts it and the resulting IO stays within the segment's
+ * cached max_io_size.
+ */
+static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+                          struct nfs_page *prev, struct nfs_page *req)
+{
+        struct objio_segment *seg;
+        if (!pnfs_generic_pg_test(pgio, prev, req))
+                return false;
+        seg = OBJIO_LSEG(pgio->pg_lseg);
+        return !(pgio->pg_count + req->wb_bytes > seg->max_io_size);
+}
+/*
+ * Layout driver descriptor handed to pnfs_register_layoutdriver().
+ * Most hooks are the objlayout_* functions; the page test and the
+ * device-id destructor are implemented in this file.
+ */
+static struct pnfs_layoutdriver_type objlayout_type = {
+        .id = LAYOUT_OSD2_OBJECTS,
+        .name = "LAYOUT_OSD2_OBJECTS",
+        .flags                   = PNFS_LAYOUTRET_ON_SETATTR,
+        .alloc_layout_hdr        = objlayout_alloc_layout_hdr,
+        .free_layout_hdr         = objlayout_free_layout_hdr,
+        .alloc_lseg              = objlayout_alloc_lseg,
+        .free_lseg               = objlayout_free_lseg,
+        .read_pagelist           = objlayout_read_pagelist,
+        .write_pagelist          = objlayout_write_pagelist,
+        .pg_test                 = objio_pg_test,
+        .free_deviceid_node      = objio_free_deviceid_node,
+        .encode_layoutcommit     = objlayout_encode_layoutcommit,
+        .encode_layoutreturn     = objlayout_encode_layoutreturn,
+};
+MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
+MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
+MODULE_LICENSE("GPL");
+/* Module init: register the layout driver and log the outcome. */
+static int __init
+objlayout_init(void)
+{
+        int ret = pnfs_register_layoutdriver(&objlayout_type);
+        if (!ret) {
+                printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
+                        __func__);
+                return 0;
+        }
+        printk(KERN_INFO
+                "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
+                __func__, ret);
+        return ret;
+}
+/* Module exit: unregister the layout driver, then log it. */
+static void __exit
+objlayout_exit(void)
+{
+        struct pnfs_layoutdriver_type *drv = &objlayout_type;
+        pnfs_unregister_layoutdriver(drv);
+        printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n", __func__);
+}
+module_init(objlayout_init);
+module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..dc3956c0de80
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
+/*
+ *  pNFS Objects layout driver high level definitions
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <scsi/osd_initiator.h>
+#include "objlayout.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * Create an objlayout layout structure for the given inode and return its
+ * embedded generic layout header.
+ *
+ * @inode is not examined here. The structure is zero-allocated with
+ * @gfp_flags and its spinlock and error list are initialised.
+ *
+ * Returns the embedded pnfs_layout_hdr, or NULL when the allocation fails.
+ */
+struct pnfs_layout_hdr *
+objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+        struct objlayout *objlay;
+        objlay = kzalloc(sizeof(*objlay), gfp_flags);
+        dprintk("%s: Return %p\n", __func__, objlay);
+        /* Never form &objlay->pnfs_layout from a NULL objlay: that only
+         * evaluates to NULL as long as pnfs_layout stays the first member.
+         */
+        if (!objlay)
+                return NULL;
+        spin_lock_init(&objlay->lock);
+        INIT_LIST_HEAD(&objlay->err_list);
+        return &objlay->pnfs_layout;
+}
+/*
+ * Release the objlayout that embeds the generic header @lo.
+ * Emits a warning if I/O error records are still queued on it.
+ */
+void
+objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        struct objlayout *objlay;
+        objlay = OBJLAYOUT(lo);
+        dprintk("%s: objlay %p\n", __func__, objlay);
+        WARN_ON(!list_empty(&objlay->err_list));
+        kfree(objlay);
+}
+/*
+ * Unmarshall the layout carried in @lgr and build a layout segment for
+ * @pnfslay.
+ *
+ * An xdr decode stream is set up over the pages of lgr->layoutp, with a
+ * temporary scratch page, and handed to objio_alloc_lseg(). The scratch
+ * page is released on every path.
+ *
+ * Returns the new segment, or ERR_PTR(-ENOMEM) when the scratch page cannot
+ * be allocated, or ERR_PTR() of the error returned by objio_alloc_lseg().
+ */
+struct pnfs_layout_segment *
+objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
+                     struct nfs4_layoutget_res *lgr,
+                     gfp_t gfp_flags)
+{
+        struct pnfs_layout_segment *lseg;
+        struct xdr_stream stream;
+        struct xdr_buf buf = {
+                .pages =  lgr->layoutp->pages,
+                .page_len =  lgr->layoutp->len,
+                .buflen =  lgr->layoutp->len,
+                .len = lgr->layoutp->len,
+        };
+        struct page *scratch;
+        int status;
+        dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
+        scratch = alloc_page(gfp_flags);
+        if (!scratch) {
+                status = -ENOMEM;
+                goto fail;
+        }
+        xdr_init_decode(&stream, &buf, NULL);
+        xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+        status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
+        if (likely(!status)) {
+                __free_page(scratch);
+                dprintk("%s: Return %p\n", __func__, lseg);
+                return lseg;
+        }
+        dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
+                status);
+        __free_page(scratch);
+fail:
+        dprintk("%s: Err Return=>%d\n", __func__, status);
+        return ERR_PTR(status);
+}
+/*
+ * Free a layout segment. A NULL @lseg is logged and otherwise ignored.
+ */
+void
+objlayout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+        dprintk("%s: freeing layout segment %p\n", __func__, lseg);
+        if (likely(lseg))
+                objio_free_lseg(lseg);
+}
+/*
+ * I/O Operations
+ */
+/* First offset past the range [start, start + len). When the sum wraps
+ * around u64 the result saturates to NFS4_MAX_UINT64.
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+        u64 sum = start + len;
+        if (sum < start)
+                return NFS4_MAX_UINT64;
+        return sum;
+}
+/* Offset of the last octet in the range [start, start + len).
+ * @len must be non-zero (BUG otherwise). When start + len wraps around u64
+ * the result is NFS4_MAX_UINT64.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+        u64 sum;
+        BUG_ON(!len);
+        sum = start + len;
+        if (sum <= start)
+                return NFS4_MAX_UINT64;
+        return sum - 1;
+}
+/*
+ * Allocate and fill the per-I/O state for one read or write.
+ *
+ * @pnfs_layout_type: not used by this function.
+ * @pages/@pgbase:    page vector and byte offset of the data within it;
+ *                    whole pages covered by @pgbase are skipped so that the
+ *                    stored pgbase is always below PAGE_SIZE.
+ * @offset/@count:    file range of the I/O. @offset must lie inside @lseg
+ *                    (BUG otherwise); @count is truncated to the end of
+ *                    @lseg.
+ * @lseg:             layout segment the I/O goes through.
+ * @rpcdata:          opaque pointer stored for the completion path.
+ * @gfp_flags:        allocation flags passed to objio_alloc_io_state().
+ *
+ * Returns the new state, or NULL if objio_alloc_io_state() failed.
+ */
+static struct objlayout_io_state *
+objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
+                        struct page **pages,
+                        unsigned pgbase,
+                        loff_t offset,
+                        size_t count,
+                        struct pnfs_layout_segment *lseg,
+                        void *rpcdata,
+                        gfp_t gfp_flags)
+{
+        struct objlayout_io_state *state;
+        u64 lseg_end_offset;
+        dprintk("%s: allocating io_state\n", __func__);
+        if (objio_alloc_io_state(lseg, &state, gfp_flags))
+                return NULL;
+        BUG_ON(offset < lseg->pls_range.offset);
+        lseg_end_offset = end_offset(lseg->pls_range.offset,
+                                     lseg->pls_range.length);
+        BUG_ON(offset >= lseg_end_offset);
+        if (offset + count > lseg_end_offset) {
+                count = lseg->pls_range.length -
+                                (offset - lseg->pls_range.offset);
+                /* count is a size_t: print it unsigned */
+                dprintk("%s: truncated count %Zu\n", __func__, count);
+        }
+        /* Use >= so that pgbase == PAGE_SIZE is normalised as well;
+         * otherwise the first page would contribute zero bytes and nr_pages
+         * below would be one too many.
+         */
+        if (pgbase >= PAGE_SIZE) {
+                pages += pgbase >> PAGE_SHIFT;
+                pgbase &= ~PAGE_MASK;
+        }
+        INIT_LIST_HEAD(&state->err_list);
+        state->lseg = lseg;
+        state->rpcdata = rpcdata;
+        state->pages = pages;
+        state->pgbase = pgbase;
+        state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        state->offset = offset;
+        state->count = count;
+        state->sync = 0;
+        return state;
+}
+/* Release an io_state through the io engine; a NULL @state is ignored. */
+static void
+objlayout_free_io_state(struct objlayout_io_state *state)
+{
+        dprintk("%s: freeing io_state\n", __func__);
+        if (likely(state))
+                objio_free_io_state(state);
+}
+/*
+ * I/O done common code
+ *
+ * A successful I/O (status >= 0) releases its io_state right away. A failed
+ * one marks the layout's delta-space accounting invalid and queues the
+ * io_state on the layout's err_list, where objlayout_encode_layoutreturn()
+ * later picks it up and frees it.
+ */
+static void
+objlayout_iodone(struct objlayout_io_state *state)
+{
+        dprintk("%s: state %p status\n", __func__, state);
+        if (likely(state->status >= 0)) {
+                objlayout_free_io_state(state);
+        } else {
+                struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+                spin_lock(&objlay->lock);
+                objlay->delta_space_valid = OBJ_DSU_INVALID;
+                /* list_add(new, head): the io_state is the new entry and the
+                 * layout's err_list is the head. With the arguments swapped
+                 * every further failure re-links the head into a fresh
+                 * one-element list and orphans the states queued before it.
+                 */
+                list_add(&state->err_list, &objlay->err_list);
+                spin_unlock(&objlay->lock);
+        }
+}
+/*
+ * objlayout_io_set_result - record the outcome of one component of an I/O.
+ *
+ * When @osd_error is non-zero the descriptor at @index in @state->ioerrs is
+ * filled with the component id, range, direction and error, to be reported
+ * later at layout-return. When @osd_error is zero only the descriptor's
+ * errno is cleared. @index must be below @state->num_comps (BUG otherwise).
+ */
+void
+objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
+                        struct pnfs_osd_objid *pooid, int osd_error,
+                        u64 offset, u64 length, bool is_write)
+{
+        struct pnfs_osd_ioerr *ioerr;
+        BUG_ON(index >= state->num_comps);
+        ioerr = &state->ioerrs[index];
+        if (!osd_error) {
+                /* User need not call if no error is reported */
+                ioerr->oer_errno = 0;
+                return;
+        }
+        ioerr->oer_component = *pooid;
+        ioerr->oer_comp_offset = offset;
+        ioerr->oer_comp_length = length;
+        ioerr->oer_iswrite = is_write;
+        ioerr->oer_errno = osd_error;
+        dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
+                "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
+                __func__, index, ioerr->oer_errno,
+                ioerr->oer_iswrite,
+                _DEVID_LO(&ioerr->oer_component.oid_device_id),
+                _DEVID_HI(&ioerr->oer_component.oid_device_id),
+                ioerr->oer_component.oid_partition_id,
+                ioerr->oer_component.oid_object_id,
+                ioerr->oer_comp_offset,
+                ioerr->oer_comp_length);
+}
+/* Work handler queued by objlayout_read_done() through schedule_work()
+ * for the non-sync case. It recovers the nfs_read_data that embeds the work
+ * item and calls pnfs_ld_read_done() on it. The deferral exists because the
+ * osd completion is called with ints-off from the block layer.
+ */
+static void _rpc_read_complete(struct work_struct *work)
+{
+        struct rpc_task *rpc;
+        dprintk("%s enter\n", __func__);
+        rpc = container_of(work, struct rpc_task, u.tk_work);
+        pnfs_ld_read_done(container_of(rpc, struct nfs_read_data, task));
+}
+/*
+ * Read completion, called by the io engine (caller not visible here).
+ *
+ * @state:  io_state of the finished read. objlayout_iodone() either frees
+ *          it or queues it on the layout's error list, so it must not be
+ *          touched afterwards.
+ * @status: stored as the rpc task status; when >= 0 it is also stored as
+ *          the byte count of the result, together with the eof flag.
+ * @sync:   when true pnfs_ld_read_done() is called directly, otherwise the
+ *          call is deferred to a work item via schedule_work().
+ */
+void
+objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
+{
+        int eof = state->eof;
+        struct nfs_read_data *rdata;
+        state->status = status;
+        /* status is an ssize_t: %ld is only right where long and ssize_t
+         * have the same width, %Zd is right everywhere.
+         */
+        dprintk("%s: Begin status=%Zd eof=%d\n", __func__, status, eof);
+        rdata = state->rpcdata;
+        rdata->task.tk_status = status;
+        if (status >= 0) {
+                rdata->res.count = status;
+                rdata->res.eof = eof;
+        }
+        objlayout_iodone(state);
+        /* must not use state after this point */
+        if (sync)
+                pnfs_ld_read_done(rdata);
+        else {
+                INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
+                schedule_work(&rdata->task.u.tk_work);
+        }
+}
+/*
+ * Perform sync or async reads.
+ *
+ * The request is clipped to the inode size: a read starting at or beyond
+ * it completes immediately with count 0 and eof set, without touching the
+ * io engine. Otherwise an io_state is built and passed to
+ * objio_read_pagelist(). The resulting status (or -ENOMEM when no io_state
+ * could be allocated) is stored in rdata->pnfs_error.
+ *
+ * Always returns PNFS_ATTEMPTED.
+ */
+enum pnfs_try_status
+objlayout_read_pagelist(struct nfs_read_data *rdata)
+{
+        loff_t offset = rdata->args.offset;
+        size_t count = rdata->args.count;
+        struct objlayout_io_state *state;
+        ssize_t status = 0;
+        loff_t eof;
+        dprintk("%s: Begin inode %p offset %llu count %d\n",
+                __func__, rdata->inode, offset, (int)count);
+        eof = i_size_read(rdata->inode);
+        if (unlikely(offset + count > eof)) {
+                if (offset < eof) {
+                        /* partially past the end: shorten the request */
+                        count = eof - offset;
+                } else {
+                        /* entirely past the end: nothing to read */
+                        rdata->res.count = 0;
+                        rdata->res.eof = 1;
+                        goto out;
+                }
+        }
+        state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
+                                         rdata->args.pages, rdata->args.pgbase,
+                                         offset, count,
+                                         rdata->lseg, rdata,
+                                         GFP_KERNEL);
+        if (likely(state)) {
+                state->eof = state->offset + state->count >= eof;
+                status = objio_read_pagelist(state);
+        } else {
+                status = -ENOMEM;
+        }
+ out:
+        dprintk("%s: Return status %Zd\n", __func__, status);
+        rdata->pnfs_error = status;
+        return PNFS_ATTEMPTED;
+}
+/* Work handler queued by objlayout_write_done() through schedule_work()
+ * for the non-sync case. It recovers the nfs_write_data that embeds the
+ * work item and calls pnfs_ld_write_done() on it. The deferral exists
+ * because the osd completion is called with ints-off from the block layer.
+ */
+static void _rpc_write_complete(struct work_struct *work)
+{
+        struct rpc_task *rpc;
+        dprintk("%s enter\n", __func__);
+        rpc = container_of(work, struct rpc_task, u.tk_work);
+        pnfs_ld_write_done(container_of(rpc, struct nfs_write_data, task));
+}
+/*
+ * Write completion, called by the io engine (caller not visible here).
+ *
+ * @status is stored as the rpc task status; when it is >= 0 it is also
+ * stored as the written byte count, and the io_state's committed level is
+ * copied into the write verifier. @state is handed to objlayout_iodone()
+ * and must not be used afterwards. With @sync the completion is reported
+ * directly, otherwise through a work item.
+ */
+void
+objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
+                     bool sync)
+{
+        struct nfs_write_data *wdata = state->rpcdata;
+        dprintk("%s: Begin\n", __func__);
+        state->status = status;
+        wdata->task.tk_status = status;
+        if (status < 0) {
+                dprintk("%s: Return status %d\n",
+                        __func__, wdata->task.tk_status);
+        } else {
+                wdata->res.count = status;
+                wdata->verf.committed = state->committed;
+                dprintk("%s: Return status %d committed %d\n",
+                        __func__, wdata->task.tk_status,
+                        wdata->verf.committed);
+        }
+        objlayout_iodone(state);
+        /* must not use state after this point */
+        if (!sync) {
+                INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
+                schedule_work(&wdata->task.u.tk_work);
+                return;
+        }
+        pnfs_ld_write_done(wdata);
+}
+/*
+ * Perform sync or async writes.
+ *
+ * Builds an io_state for the request described by @wdata and passes it to
+ * objio_write_pagelist(). FLUSH_SYNC in @how selects synchronous handling,
+ * FLUSH_STABLE is forwarded as the "stable" argument. The resulting status
+ * (or -ENOMEM when no io_state could be allocated) is stored in
+ * wdata->pnfs_error.
+ *
+ * Always returns PNFS_ATTEMPTED.
+ */
+enum pnfs_try_status
+objlayout_write_pagelist(struct nfs_write_data *wdata,
+                         int how)
+{
+        struct objlayout_io_state *state;
+        ssize_t status;
+        dprintk("%s: Begin inode %p offset %llu count %u\n",
+                __func__, wdata->inode, wdata->args.offset, wdata->args.count);
+        state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
+                                         wdata->args.pages,
+                                         wdata->args.pgbase,
+                                         wdata->args.offset,
+                                         wdata->args.count,
+                                         wdata->lseg, wdata,
+                                         GFP_NOFS);
+        if (likely(state)) {
+                state->sync = how & FLUSH_SYNC;
+                status = objio_write_pagelist(state, how & FLUSH_STABLE);
+        } else {
+                status = -ENOMEM;
+        }
+        dprintk("%s: Return status %Zd\n", __func__, status);
+        wdata->pnfs_error = status;
+        return PNFS_ATTEMPTED;
+}
+/*
+ * Encode the layout-type specific part of LAYOUTCOMMIT.
+ *
+ * Under the layout lock the accumulated delta-space accounting is
+ * snapshotted into a pnfs_osd_layoutupdate and then reset (used = 0,
+ * valid = OBJ_DSU_INIT); the ioerr flag reflects whether error records are
+ * queued. The update is encoded behind a 4-byte length word that is
+ * back-filled once the encoded size is known.
+ *
+ * @args is not used.
+ */
+void
+objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
+                              struct xdr_stream *xdr,
+                              const struct nfs4_layoutcommit_args *args)
+{
+        struct objlayout *objlay = OBJLAYOUT(pnfslay);
+        struct pnfs_osd_layoutupdate lou;
+        __be32 *start;
+        dprintk("%s: Begin\n", __func__);
+        spin_lock(&objlay->lock);
+        lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
+        lou.dsu_delta = objlay->delta_space_used;
+        objlay->delta_space_used = 0;
+        objlay->delta_space_valid = OBJ_DSU_INIT;
+        lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
+        spin_unlock(&objlay->lock);
+        start = xdr_reserve_space(xdr, 4);
+        /* start is written through below; fail loudly here rather than
+         * dereference NULL, as objlayout_encode_layoutreturn() does.
+         */
+        BUG_ON(!start);
+        BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
+        /* length in bytes of everything encoded after the length word */
+        *start = cpu_to_be32((xdr->p - start - 1) * 4);
+        dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
+                lou.dsu_delta, lou.olu_ioerr_flag);
+}
+/*
+ * Map a pNFS OSD error code to its OSD_ERR_PRI_* priority, which
+ * merge_ioerr() compares to decide which of two errors to keep.
+ * No error (0) maps to 0; an unknown code warns and is ranked like
+ * PNFS_OSD_ERR_EIO.
+ */
+static int
+err_prio(u32 oer_errno)
+{
+        switch (oer_errno) {
+        case 0:
+                return 0;
+        case PNFS_OSD_ERR_EIO:
+                return OSD_ERR_PRI_EIO;
+        case PNFS_OSD_ERR_NO_SPACE:
+                return OSD_ERR_PRI_NO_SPACE;
+        case PNFS_OSD_ERR_NOT_FOUND:
+                return OSD_ERR_PRI_NOT_FOUND;
+        case PNFS_OSD_ERR_UNREACHABLE:
+                return OSD_ERR_PRI_UNREACHABLE;
+        case PNFS_OSD_ERR_NO_ACCESS:
+                return OSD_ERR_PRI_NO_ACCESS;
+        case PNFS_OSD_ERR_BAD_CRED:
+                return OSD_ERR_PRI_BAD_CRED;
+        case PNFS_OSD_ERR_RESOURCE:
+                return OSD_ERR_PRI_RESOURCE;
+        default:
+                break;
+        }
+        /* unknown error code */
+        WARN_ON(1);
+        return OSD_ERR_PRI_EIO;
+}
+/*
+ * Fold @src_err into the accumulated descriptor @dest_err.
+ *
+ * - An empty accumulator (errno 0) takes a copy of @src_err with the device
+ *   id blanked.
+ * - Partition and object ids are kept only while all merged errors agree on
+ *   them, otherwise they are zeroed.
+ * - The byte range becomes the union of both ranges.
+ * - For the error code a write error takes precedence over a read error;
+ *   between two errors of the same direction the one with the higher
+ *   err_prio() wins.
+ */
+static void
+merge_ioerr(struct pnfs_osd_ioerr *dest_err,
+            const struct pnfs_osd_ioerr *src_err)
+{
+        u64 dest_end, src_end;
+        if (!dest_err->oer_errno) {
+                *dest_err = *src_err;
+                /* accumulated device must be blank */
+                memset(&dest_err->oer_component.oid_device_id, 0,
+                        sizeof(dest_err->oer_component.oid_device_id));
+                return;
+        }
+        if (dest_err->oer_component.oid_partition_id !=
+                                src_err->oer_component.oid_partition_id)
+                dest_err->oer_component.oid_partition_id = 0;
+        if (dest_err->oer_component.oid_object_id !=
+                                src_err->oer_component.oid_object_id)
+                dest_err->oer_component.oid_object_id = 0;
+        /* Take both end offsets before the start is lowered: computing
+         * dest_end from the new start and the old length would cut the
+         * tail off the accumulated range.
+         */
+        dest_end = end_offset(dest_err->oer_comp_offset,
+                              dest_err->oer_comp_length);
+        src_end =  end_offset(src_err->oer_comp_offset,
+                              src_err->oer_comp_length);
+        if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
+                dest_err->oer_comp_offset = src_err->oer_comp_offset;
+        if (dest_end < src_end)
+                dest_end = src_end;
+        dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
+        if (src_err->oer_iswrite == dest_err->oer_iswrite) {
+                /* same direction: only a higher priority replaces the
+                 * accumulated errno (a lower-priority write must not
+                 * override a higher-priority write)
+                 */
+                if (err_prio(src_err->oer_errno) >
+                    err_prio(dest_err->oer_errno))
+                        dest_err->oer_errno = src_err->oer_errno;
+        } else if (src_err->oer_iswrite) {
+                /* a write error supersedes an accumulated read error */
+                dest_err->oer_iswrite = true;
+                dest_err->oer_errno = src_err->oer_errno;
+        }
+}
+/*
+ * Collapse every error still queued on @objlay->err_list into a single
+ * descriptor and encode it at @p.
+ *
+ * Each queued io_state is walked, its non-zero error descriptors are logged
+ * and merged, and the io_state is then unlinked and freed, so the list is
+ * empty on return. The only caller visible here invokes this with
+ * objlay->lock held.
+ */
+static void
+encode_accumulated_error(struct objlayout *objlay, __be32 *p)
+{
+        struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
+        struct objlayout_io_state *cur, *next;
+        list_for_each_entry_safe(cur, next, &objlay->err_list, err_list) {
+                unsigned i = 0;
+                while (i < cur->num_comps) {
+                        struct pnfs_osd_ioerr *ioerr = &cur->ioerrs[i];
+                        if (ioerr->oer_errno) {
+                                printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
+                                        "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+                                        "offset=0x%llx length=0x%llx\n",
+                                        __func__, i, ioerr->oer_errno,
+                                        ioerr->oer_iswrite,
+                                        _DEVID_LO(&ioerr->oer_component.oid_device_id),
+                                        _DEVID_HI(&ioerr->oer_component.oid_device_id),
+                                        ioerr->oer_component.oid_partition_id,
+                                        ioerr->oer_component.oid_object_id,
+                                        ioerr->oer_comp_offset,
+                                        ioerr->oer_comp_length);
+                                merge_ioerr(&accumulated_err, ioerr);
+                        }
+                        i++;
+                }
+                list_del(&cur->err_list);
+                objlayout_free_io_state(cur);
+        }
+        pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
+}
+/*
+ * Encode the layout-type specific part of LAYOUTRETURN: the I/O error
+ * descriptors queued on the layout's err_list.
+ *
+ * A 4-byte length word is reserved first and back-filled at the end. Under
+ * the layout lock every queued io_state is walked; each non-zero error
+ * descriptor gets its own slot in @xdr, after which the io_state is
+ * unlinked and freed. If the stream runs out of space, the last slot that
+ * was obtained is overwritten with one descriptor that merges all errors
+ * still on the list (see encode_accumulated_error()).
+ *
+ * @args is not used.
+ */
+void
+objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
+                              struct xdr_stream *xdr,
+                              const struct nfs4_layoutreturn_args *args)
+{
+        struct objlayout *objlay = OBJLAYOUT(pnfslay);
+        struct objlayout_io_state *state, *tmp;
+        __be32 *start;
+        dprintk("%s: Begin\n", __func__);
+        /* placeholder for the opaque length, filled in at loop_done */
+        start = xdr_reserve_space(xdr, 4);
+        BUG_ON(!start);
+        spin_lock(&objlay->lock);
+        list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
+                /* NOTE(review): last_xdr is reset to NULL for every
+                 * io_state, so running out of space on the first error of a
+                 * later io_state hits the BUG_ON below even though slots
+                 * were encoded for earlier ones - confirm this is intended.
+                 */
+                __be32 *last_xdr = NULL, *p;
+                unsigned i;
+                int res = 0;
+                for (i = 0; i < state->num_comps; i++) {
+                        struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
+                        if (!ioerr->oer_errno)
+                                continue;
+                        dprintk("%s: err[%d]: errno=%d is_write=%d "
+                                "dev(%llx:%llx) par=0x%llx obj=0x%llx "
+                                "offset=0x%llx length=0x%llx\n",
+                                __func__, i, ioerr->oer_errno,
+                                ioerr->oer_iswrite,
+                                _DEVID_LO(&ioerr->oer_component.oid_device_id),
+                                _DEVID_HI(&ioerr->oer_component.oid_device_id),
+                                ioerr->oer_component.oid_partition_id,
+                                ioerr->oer_component.oid_object_id,
+                                ioerr->oer_comp_offset,
+                                ioerr->oer_comp_length);
+                        p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
+                        if (unlikely(!p)) {
+                                res = -E2BIG;
+                                break; /* accumulated_error */
+                        }
+                        last_xdr = p;
+                        pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
+                }
+                /* TODO: use xdr_write_pages */
+                if (unlikely(res)) {
+                        /* no space for even one error descriptor */
+                        BUG_ON(!last_xdr);
+                        /* we've encountered a situation with lots and lots of
+                         * errors and no space to encode them all. Use the last
+                         * available slot to report the union of all the
+                         * remaining errors.
+                         */
+                        encode_accumulated_error(objlay, last_xdr);
+                        /* the call above unlinked and freed every queued
+                         * io_state (including tmp), so leave the loop now
+                         */
+                        goto loop_done;
+                }
+                list_del(&state->err_list);
+                objlayout_free_io_state(state);
+        }
+loop_done:
+        spin_unlock(&objlay->lock);
+        /* length in bytes of everything encoded after the length word */
+        *start = cpu_to_be32((xdr->p - start - 1) * 4);
+        dprintk("%s: Return\n", __func__);
+}
+/*
+ * Get Device Info API for io engines
+ *
+ * Wrapper that objlayout_get_deviceinfo() allocates around the decoded
+ * device address. Callers only see the embedded @da;
+ * objlayout_put_deviceinfo() recovers the wrapper from it with
+ * container_of() and releases both the wrapper and @page.
+ */
+struct objlayout_deviceinfo {
+        struct page *page;      /* page the GETDEVICEINFO reply was read into */
+        struct pnfs_osd_deviceaddr da; /* This must be last */
+};
+/* Fetch and decode the device address of @d_id.
+ *
+ * Issues nfs4_proc_getdeviceinfo() for layout type LAYOUT_OSD2_OBJECTS into
+ * a freshly allocated page and decodes the reply with
+ * pnfs_osd_xdr_decode_deviceaddr().
+ *
+ * On success 0 is returned and *@deviceaddr points at the decoded address;
+ * the page stays allocated alongside it, so the caller must eventually call
+ * objlayout_put_deviceinfo(). On failure -ENOMEM or the error from
+ * nfs4_proc_getdeviceinfo() is returned and nothing is left allocated.
+ */
+int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+        struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+        gfp_t gfp_flags)
+{
+        struct objlayout_deviceinfo *odi;
+        struct pnfs_device pd;
+        struct page *page;
+        int err;
+        page = alloc_page(gfp_flags);
+        if (!page)
+                return -ENOMEM;
+        memcpy(&pd.dev_id, d_id, sizeof(*d_id));
+        pd.layout_type = LAYOUT_OSD2_OBJECTS;
+        /* single-page reply buffer */
+        pd.pages = &page;
+        pd.pgbase = 0;
+        pd.pglen = PAGE_SIZE;
+        pd.mincount = 0;
+        err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
+        dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
+        if (err)
+                goto err_out;
+        odi = kzalloc(sizeof(*odi), gfp_flags);
+        if (!odi) {
+                err = -ENOMEM;
+                goto err_out;
+        }
+        pnfs_osd_xdr_decode_deviceaddr(&odi->da, page_address(page));
+        /* keep the page; objlayout_put_deviceinfo() frees it */
+        odi->page = page;
+        *deviceaddr = &odi->da;
+        return 0;
+err_out:
+        __free_page(page);
+        return err;
+}
+/* Release a device address obtained from objlayout_get_deviceinfo():
+ * frees the page kept with it and then the enclosing wrapper.
+ */
+void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
+{
+        struct objlayout_deviceinfo *odi;
+        odi = container_of(deviceaddr, struct objlayout_deviceinfo, da);
+        __free_page(odi->page);
+        kfree(odi);
+}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
+/*
+ *  Data types and function declarations for interfacing with the
+ *  pNFS standard object layout driver.
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _OBJLAYOUT_H
+#define _OBJLAYOUT_H
+#include <linux/nfs_fs.h>
+#include <linux/pnfs_osd_xdr.h>
+#include "../pnfs.h"
+/*
+ * per-inode layout
+ *
+ * Wraps the generic pnfs_layout_hdr with the bookkeeping this driver needs
+ * for LAYOUTCOMMIT (delta space accounting) and LAYOUTRETURN (queued I/O
+ * errors). Use OBJLAYOUT() to get from the generic header to this struct.
+ */
+struct objlayout {
+        struct pnfs_layout_hdr pnfs_layout;
+         /* for layout_commit */
+        /* OBJ_DSU_INIT: nothing accumulated since the last layoutcommit;
+         * OBJ_DSU_VALID: delta_space_used holds an accumulated value;
+         * OBJ_DSU_INVALID: an I/O failed, delta_space_used is not reported.
+         */
+        enum osd_delta_space_valid_enum {
+                OBJ_DSU_INIT = 0,
+                OBJ_DSU_VALID,
+                OBJ_DSU_INVALID,
+        } delta_space_valid;
+        s64 delta_space_used;  /* consumed by write ops */
+         /* for layout_return */
+        /* lock is taken around updates of the delta_space_* fields and
+         * around changes to err_list
+         */
+        spinlock_t lock;
+        /* list head for the err_list members of failed objlayout_io_state */
+        struct list_head err_list;
+};
+/* Map a generic layout header back to the objlayout that embeds it. */
+static inline struct objlayout *
+OBJLAYOUT(struct pnfs_layout_hdr *lo)
+{
+        struct objlayout *objlay;
+        objlay = container_of(lo, struct objlayout, pnfs_layout);
+        return objlay;
+}
+/*
+ * per-I/O operation state
+ * embedded in objects provider io_state data structure
+ *
+ * The request fields are filled in by objlayout_alloc_io_state(); the
+ * fields marked "res" are read back by the read/write completion paths.
+ */
+struct objlayout_io_state {
+        struct pnfs_layout_segment *lseg;       /* segment the I/O uses */
+        struct page **pages;    /* first page holding data of this I/O */
+        unsigned pgbase;        /* byte offset of the data within pages[0] */
+        unsigned nr_pages;      /* pages spanned by pgbase + count */
+        unsigned long count;    /* byte count, clipped to the end of lseg */
+        loff_t offset;          /* file offset of the I/O */
+        bool sync;              /* set from FLUSH_SYNC on writes, else 0 */
+        void *rpcdata;          /* nfs_read_data or nfs_write_data of the request */
+        int status;             /* res */
+        int eof;                /* res */
+        int committed;          /* res */
+        /* Error reporting (layout_return) */
+        struct list_head err_list;      /* link for objlayout's err_list */
+        unsigned num_comps;
+        /* Pointer to array of error descriptors of size num_comps.
+         * It should contain as many entries as devices in the osd_layout
+         * that participate in the I/O. It is up to the io_engine to allocate
+         * needed space and set num_comps.
+         */
+        struct pnfs_osd_ioerr *ioerrs;
+};
+/*
+ * Raid engine I/O API
+ *
+ * Implemented by the io engine and called from objlayout.c.
+ */
+/* Decode a layout from @xdr into a new segment stored in *@outp; a non-zero
+ * return is treated as an error code by objlayout_alloc_lseg().
+ */
+extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
+        struct pnfs_layout_hdr *pnfslay,
+        struct pnfs_layout_range *range,
+        struct xdr_stream *xdr,
+        gfp_t gfp_flags);
+extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
+/* Allocate an io_state for @lseg into *@outp; a non-zero return is treated
+ * as failure by objlayout_alloc_io_state().
+ */
+extern int objio_alloc_io_state(
+        struct pnfs_layout_segment *lseg,
+        struct objlayout_io_state **outp,
+        gfp_t gfp_flags);
+extern void objio_free_io_state(struct objlayout_io_state *state);
+/* Start the I/O described by @ol_state; the return value is stored as the
+ * request's pnfs_error by the objlayout_*_pagelist() callers.
+ */
+extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
+extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
+                                    bool stable);
+/*
+ * callback API
+ */
+/* Record (or clear, when @osd_error is 0) the error descriptor of component
+ * @index of @state.
+ */
+extern void objlayout_io_set_result(struct objlayout_io_state *state,
+                        unsigned index, struct pnfs_osd_objid *pooid,
+                        int osd_error, u64 offset, u64 length, bool is_write);
+/*
+ * Add @space_used to the layout's running delta-space total and mark the
+ * total valid, unless it has already been marked invalid.
+ */
+static inline void
+objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
+{
+        struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
+        spin_lock(&objlay->lock);
+        /* If one of the I/Os errored out and the delta_space_used was
+         * invalid we render the complete report as invalid. Protocol mandate
+         * the DSU be accurate or not reported.
+         */
+        if (objlay->delta_space_valid == OBJ_DSU_INVALID) {
+                spin_unlock(&objlay->lock);
+                return;
+        }
+        objlay->delta_space_valid = OBJ_DSU_VALID;
+        objlay->delta_space_used += space_used;
+        spin_unlock(&objlay->lock);
+}
+/* I/O completion entry points; @state must not be used after the call. */
+extern void objlayout_read_done(struct objlayout_io_state *state,
+                                ssize_t status, bool sync);
+extern void objlayout_write_done(struct objlayout_io_state *state,
+                                 ssize_t status, bool sync);
+/* Fetch/decode a device address; pair every successful get with a put. */
+extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
+        struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
+        gfp_t gfp_flags);
+extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
+/*
+ * exported generic objects function vectors
+ *
+ * These are the callbacks installed in the driver's
+ * struct pnfs_layoutdriver_type.
+ */
+extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
+extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
+extern struct pnfs_layout_segment *objlayout_alloc_lseg(
+        struct pnfs_layout_hdr *,
+        struct nfs4_layoutget_res *,
+        gfp_t gfp_flags);
+extern void objlayout_free_lseg(struct pnfs_layout_segment *);
+extern enum pnfs_try_status objlayout_read_pagelist(
+        struct nfs_read_data *);
+extern enum pnfs_try_status objlayout_write_pagelist(
+        struct nfs_write_data *,
+        int how);
+extern void objlayout_encode_layoutcommit(
+        struct pnfs_layout_hdr *,
+        struct xdr_stream *,
+        const struct nfs4_layoutcommit_args *);
+extern void objlayout_encode_layoutreturn(
+        struct pnfs_layout_hdr *,
+        struct xdr_stream *,
+        const struct nfs4_layoutreturn_args *);
+#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
+/*
+ *  Object-Based pNFS Layout XDR layer
+ *
+ *  Copyright (C) 2007 Panasas Inc. [year of first publication]
+ *  All rights reserved.
+ *
+ *  Benny Halevy <bhalevy@panasas.com>
+ *  Boaz Harrosh <bharrosh@panasas.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2
+ *  See the file COPYING included with this distribution for more details.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the Panasas company nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <linux/pnfs_osd_xdr.h>
+#define NFSDBG_FACILITY         NFSDBG_PNFS_LD
+/*
+ * The following implementation is based on RFC5664
+ */
+/*
+ * struct pnfs_osd_objid {
+ *      struct nfs4_deviceid    oid_device_id;
+ *      u64                     oid_partition_id;
+ *      u64                     oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+/*
+ * Decode one pnfs_osd_objid starting at @p: the fixed-size device id,
+ * then the partition id, then the object id.
+ * Returns the pointer handed back by the last decode helper.
+ */
+static __be32 *
+_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
+{
+        __be32 *cur;
+        cur = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
+                                      sizeof(objid->oid_device_id.data));
+        cur = xdr_decode_hyper(cur, &objid->oid_partition_id);
+        return xdr_decode_hyper(cur, &objid->oid_object_id);
+}
+/*
+ * struct pnfs_osd_opaque_cred {
+ *      u32 cred_len;
+ *      void *cred;
+ * }; // xdr size [variable]
+ * The return pointers are from the xdr buffer
+ */
+/*
+ * Decode one variable-length opaque credential from @xdr: a 32-bit length
+ * word followed by cred_len bytes of credential data.
+ * On success opaque_cred->cred points at the bytes returned by
+ * xdr_inline_decode() (nothing is copied) and 0 is returned.
+ * Returns -EINVAL when the stream cannot supply the length word or the
+ * credential bytes.
+ */
+static int
+_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
+                            struct xdr_stream *xdr)
+{
+        /* The length is a full 32-bit XDR word, so ask for 4 bytes, not 1 */
+        __be32 *p = xdr_inline_decode(xdr, 4);
+        if (!p)
+                return -EINVAL;
+        opaque_cred->cred_len = be32_to_cpup(p);
+        p = xdr_inline_decode(xdr, opaque_cred->cred_len);
+        if (!p)
+                return -EINVAL;
+        opaque_cred->cred = p;
+        return 0;
+}
+/*
+ * struct pnfs_osd_object_cred {
+ *      struct pnfs_osd_objid           oc_object_id;
+ *      u32                             oc_osd_version;
+ *      u32                             oc_cap_key_sec;
+ *      struct pnfs_osd_opaque_cred     oc_cap_key
+ *      struct pnfs_osd_opaque_cred     oc_cap;
+ * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
+ */
+/*
+ * Decode one pnfs_osd_object_cred from @xdr: the fixed 32 + 4 + 4 byte
+ * head (object id, osd version, cap key security) followed by the two
+ * variable-length opaque credentials oc_cap_key and oc_cap.
+ * Returns 0 on success, -EIO when the fixed head is not available, or the
+ * error of the failing opaque credential decode.
+ */
+static int
+_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
+                            struct xdr_stream *xdr)
+{
+        int status;
+        __be32 *cur = xdr_inline_decode(xdr, 32 + 4 + 4);
+        if (cur == NULL)
+                return -EIO;
+        cur = _osd_xdr_decode_objid(cur, &comp->oc_object_id);
+        comp->oc_osd_version = be32_to_cpup(cur);
+        comp->oc_cap_key_sec = be32_to_cpup(cur + 1);
+        status = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
+        if (unlikely(status))
+                return status;
+        return _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
+}
+/*
+ * struct pnfs_osd_data_map {
+ *      u32     odm_num_comps;
+ *      u64     odm_stripe_unit;
+ *      u32     odm_group_width;
+ *      u32     odm_group_depth;
+ *      u32     odm_mirror_cnt;
+ *      u32     odm_raid_algorithm;
+ * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
+ */
+/* XDR size in bytes of struct pnfs_osd_data_map (see the layout above) */
+static inline int
+_osd_data_map_xdr_sz(void)
+{
+        /* one u32, one u64, then four more u32 fields */
+        const int xdr_bytes = 4 + 8 + 4 + 4 + 4 + 4;
+        return xdr_bytes;
+}
+/*
+ * Decode a pnfs_osd_data_map from the words at @p and log the decoded
+ * fields through dprintk().
+ * Returns the position just past the last 32-bit field read.
+ */
+static __be32 *
+_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
+{
+        __be32 *cur;
+        data_map->odm_num_comps = be32_to_cpup(p);
+        cur = xdr_decode_hyper(p + 1, &data_map->odm_stripe_unit);
+        /* four consecutive 32-bit words follow the stripe unit */
+        data_map->odm_group_width = be32_to_cpup(cur);
+        data_map->odm_group_depth = be32_to_cpup(cur + 1);
+        data_map->odm_mirror_cnt = be32_to_cpup(cur + 2);
+        data_map->odm_raid_algorithm = be32_to_cpup(cur + 3);
+        dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
+                "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
+                __func__,
+                data_map->odm_num_comps,
+                (unsigned long long)data_map->odm_stripe_unit,
+                data_map->odm_group_width,
+                data_map->odm_group_depth,
+                data_map->odm_mirror_cnt,
+                data_map->odm_raid_algorithm);
+        return cur + 4;
+}
+/*
+ * Decode the fixed head of a layout: the data map followed by
+ * olo_comps_index and olo_num_comps. @iter is zeroed and its total_comps
+ * is set to the decoded olo_num_comps.
+ * Returns 0 on success or -EINVAL when the stream is too short.
+ */
+int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
+        struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
+{
+        __be32 *cur;
+        memset(iter, 0, sizeof(*iter));
+        cur = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
+        if (unlikely(cur == NULL))
+                return -EINVAL;
+        cur = _osd_xdr_decode_data_map(cur, &layout->olo_map);
+        layout->olo_comps_index = be32_to_cpup(cur);
+        layout->olo_num_comps = be32_to_cpup(cur + 1);
+        iter->total_comps = layout->olo_num_comps;
+        return 0;
+}
+/*
+ * Decode the next layout component into @comp.
+ * Returns true when a component was decoded and counted in @iter.
+ * Returns false when every component has already been decoded (*err is
+ * left untouched) or when decoding failed (*err holds the error).
+ */
+bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
+        struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
+        int *err)
+{
+        int rc;
+        BUG_ON(iter->decoded_comps > iter->total_comps);
+        if (iter->decoded_comps == iter->total_comps)
+                return false;
+        rc = _osd_xdr_decode_object_cred(comp, xdr);
+        *err = rc;
+        if (unlikely(rc)) {
+                dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
+                        "total_comps=%d\n", __func__, rc,
+                        iter->decoded_comps, iter->total_comps);
+                return false; /* stop the loop */
+        }
+        dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
+                "key_len=%u cap_len=%u\n",
+                __func__,
+                _DEVID_LO(&comp->oc_object_id.oid_device_id),
+                _DEVID_HI(&comp->oc_object_id.oid_device_id),
+                comp->oc_object_id.oid_partition_id,
+                comp->oc_object_id.oid_object_id,
+                comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
+        iter->decoded_comps += 1;
+        return true;
+}
+/*
+ * Get Device Information Decoding
+ *
+ * Note: since Device Information is currently done synchronously, all
+ *       variable strings fields are left inside the rpc buffer and are only
+ *       pointed to by the pnfs_osd_deviceaddr members. So the read buffer
+ *       should not be freed while the returned information is in use.
+ */
+/*
+ *struct nfs4_string {
+ *      unsigned int len;
+ *      char *data;
+ *}; // size [variable]
+ * NOTE: Returned string points to inside the XDR buffer
+ */
+/*
+ * Read a length-prefixed opaque at @p into @str without copying:
+ * str->data points at the word after the length.
+ * Returns the position after the data, rounded up to whole XDR words.
+ */
+static __be32 *
+__read_u8_opaque(__be32 *p, struct nfs4_string *str)
+{
+        __be32 *payload = p + 1;
+        str->len = be32_to_cpup(p);
+        str->data = (char *)payload;
+        return payload + XDR_QUADLEN(str->len);
+}
+/*
+ * struct pnfs_osd_targetid {
+ *      u32                     oti_type;
+ *      struct nfs4_string      oti_scsi_device_id;
+ * };// size 4 + [variable]
+ */
+/*
+ * Read a pnfs_osd_targetid at @p. The device id string is read only for
+ * the OBJ_TARGET_SCSI_NAME and OBJ_TARGET_SCSI_DEVICE_ID types; for any
+ * other type just the type word is consumed.
+ */
+static __be32 *
+__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
+{
+        u32 type = be32_to_cpup(p);
+        __be32 *next = p + 1;
+        targetid->oti_type = type;
+        if (type == OBJ_TARGET_SCSI_NAME ||
+            type == OBJ_TARGET_SCSI_DEVICE_ID)
+                next = __read_u8_opaque(next, &targetid->oti_scsi_device_id);
+        return next;
+}
+/*
+ * struct pnfs_osd_net_addr {
+ *      struct nfs4_string      r_netid;
+ *      struct nfs4_string      r_addr;
+ * };
+ */
+/* Read r_netid and then r_addr, both length-prefixed opaques, from @p */
+static __be32 *
+__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
+{
+        __be32 *after_netid = __read_u8_opaque(p, &netaddr->r_netid);
+        return __read_u8_opaque(after_netid, &netaddr->r_addr);
+}
+/*
+ * struct pnfs_osd_targetaddr {
+ *      u32                             ota_available;
+ *      struct pnfs_osd_net_addr        ota_netaddr;
+ * };
+ */
+/*
+ * Read a pnfs_osd_targetaddr at @p: the ota_available word and, only when
+ * it is non-zero, the network address that follows it.
+ */
+static __be32 *
+__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
+{
+        u32 available = be32_to_cpup(p);
+        __be32 *next = p + 1;
+        targetaddr->ota_available = available;
+        if (!available)
+                return next;
+        return __read_net_addr(next, &targetaddr->ota_netaddr);
+}
+/*
+ * struct pnfs_osd_deviceaddr {
+ *      struct pnfs_osd_targetid        oda_targetid;
+ *      struct pnfs_osd_targetaddr      oda_targetaddr;
+ *      u8                              oda_lun[8];
+ *      struct nfs4_string              oda_systemid;
+ *      struct pnfs_osd_object_cred     oda_root_obj_cred;
+ *      struct nfs4_string              oda_osdname;
+ * };
+ */
+/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
+ * not have an xdr_stream
+ */
+/*
+ * Pointer-based variant of the opaque credential decode: stores the length
+ * and a pointer to the bytes that follow it (no copy, no bounds check).
+ * Returns the position after the credential, rounded up to whole words.
+ */
+static __be32 *
+__read_opaque_cred(__be32 *p,
+                              struct pnfs_osd_opaque_cred *opaque_cred)
+{
+        __be32 *payload = p + 1;
+        opaque_cred->cred_len = be32_to_cpup(p);
+        opaque_cred->cred = payload;
+        return payload + XDR_QUADLEN(opaque_cred->cred_len);
+}
+/*
+ * Pointer-based decode of a pnfs_osd_object_cred: object id, osd version,
+ * cap key security, then the oc_cap_key and oc_cap opaque credentials.
+ */
+static __be32 *
+__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
+{
+        __be32 *cur = _osd_xdr_decode_objid(p, &comp->oc_object_id);
+        comp->oc_osd_version = be32_to_cpup(cur);
+        comp->oc_cap_key_sec = be32_to_cpup(cur + 1);
+        cur = __read_opaque_cred(cur + 2, &comp->oc_cap_key);
+        return __read_opaque_cred(cur, &comp->oc_cap);
+}
+/*
+ * Decode a pnfs_osd_deviceaddr from the words at @p, field by field in
+ * declaration order. String members keep pointing into the buffer at @p.
+ */
+void pnfs_osd_xdr_decode_deviceaddr(
+        struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
+{
+        struct nfs4_string *osdname = &deviceaddr->oda_osdname;
+        p = __read_targetid(p, &deviceaddr->oda_targetid);
+        p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
+        p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
+                                    sizeof(deviceaddr->oda_lun));
+        p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
+        p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
+        __read_u8_opaque(p, osdname);
+        /*
+         * libosd wants the name NUL-terminated for debug output. The name
+         * is the last field, so the byte after it is not part of any other
+         * decoded member.
+         */
+        osdname->data[osdname->len] = 0;
+}
+/*
+ * struct pnfs_osd_layoutupdate {
+ *      u32     dsu_valid;
+ *      s64     dsu_delta;
+ *      u32     olu_ioerr_flag;
+ * }; xdr size 4 + [8 only when dsu_valid] + 4
+ */
+/*
+ * Encode @lou into @xdr. dsu_delta goes on the wire only when dsu_valid is
+ * set, so only the bytes that are actually written get reserved; reserving
+ * the full 16 bytes unconditionally would leave 8 unwritten bytes in the
+ * stream whenever dsu_valid is 0.
+ * Returns 0 on success or -E2BIG when the stream has no room.
+ */
+int
+pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
+                                 struct pnfs_osd_layoutupdate *lou)
+{
+        size_t nbytes = 4 + (lou->dsu_valid ? 8 : 0) + 4;
+        __be32 *p = xdr_reserve_space(xdr, nbytes);
+        if (!p)
+                return -E2BIG;
+        *p++ = cpu_to_be32(lou->dsu_valid);
+        if (lou->dsu_valid)
+                p = xdr_encode_hyper(p, lou->dsu_delta);
+        *p = cpu_to_be32(lou->olu_ioerr_flag);
+        return 0;
+}
+/*
+ * struct pnfs_osd_objid {
+ *      struct nfs4_deviceid    oid_device_id;
+ *      u64                     oid_partition_id;
+ *      u64                     oid_object_id;
+ * }; // xdr size 32 bytes
+ */
+/*
+ * Encode a pnfs_osd_objid at @p: device id bytes, partition id, object id.
+ * Returns the pointer handed back by the last encode helper.
+ */
+static inline __be32 *
+pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
+{
+        __be32 *cur;
+        cur = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
+                                      sizeof(object_id->oid_device_id.data));
+        cur = xdr_encode_hyper(cur, object_id->oid_partition_id);
+        return xdr_encode_hyper(cur, object_id->oid_object_id);
+}
+/*
+ * struct pnfs_osd_ioerr {
+ *      struct pnfs_osd_objid   oer_component;
+ *      u64                     oer_comp_offset;
+ *      u64                     oer_comp_length;
+ *      u32                     oer_iswrite;
+ *      u32                     oer_errno;
+ * }; // xdr size 32 + 24 bytes
+ */
+/*
+ * Encode @ioerr at @p: component object id, offset, length, then the
+ * oer_iswrite and oer_errno words. No space is reserved or checked here.
+ */
+void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
+{
+        __be32 *cur = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
+        cur = xdr_encode_hyper(cur, ioerr->oer_comp_offset);
+        cur = xdr_encode_hyper(cur, ioerr->oer_comp_length);
+        cur[0] = cpu_to_be32(ioerr->oer_iswrite);
+        cur[1] = cpu_to_be32(ioerr->oer_errno);
+}
+/*
+ * Reserve the 32 + 24 bytes of one encoded pnfs_osd_ioerr in @xdr.
+ * Returns the reserved position, or NULL (after a debug message) when the
+ * stream has no room.
+ */
+__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
+{
+        __be32 *space = xdr_reserve_space(xdr, 32 + 24);
+        if (unlikely(space == NULL))
+                dprintk("%s: out of xdr space\n", __func__);
+        return space;
+}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e2213..7913961aff22 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
                        TASK_UNINTERRUPTIBLE);
 }
+/*
+ * Default coalescing test: @req may join the descriptor only when the
+ * descriptor's block size is at least a page and the accumulated byte
+ * count plus the new request still fits in pg_bsize.
+ */
+static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+{
+        /*
+         * FIXME: ideally we should be able to coalesce all requests
+         * that are not block boundary aligned, but currently this
+         * is problematic for the case of bsize < PAGE_CACHE_SIZE,
+         * since nfs_flush_multi and nfs_pagein_multi assume you
+         * can have only one struct nfs_page.
+         */
+        if (desc->pg_bsize >= PAGE_SIZE)
+                return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+        return false;
+}
 /**
 * nfs_pageio_init - initialise a page io descriptor
 * @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
        desc->pg_ioflags = io_flags;
        desc->pg_error = 0;
        desc->pg_lseg = NULL;
+        desc->pg_test = nfs_generic_pg_test;
+        pnfs_pageio_init(desc, inode);
 }
 /**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
 *
 * Return 'true' if this is the case, else return 'false'.
 */
-static int nfs_can_coalesce_requests(struct nfs_page *prev,
+static bool nfs_can_coalesce_requests(struct nfs_page *prev,
-                                     struct nfs_page *req,
+                                      struct nfs_page *req,
-                                     struct nfs_pageio_descriptor *pgio)
+                                      struct nfs_pageio_descriptor *pgio)
 {
        if (req->wb_context->cred != prev->wb_context->cred)
-                return 0;
+                return false;
        if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
-                return 0;
+                return false;
        if (req->wb_context->state != prev->wb_context->state)
-                return 0;
+                return false;
        if (req->wb_index != (prev->wb_index + 1))
-                return 0;
+                return false;
        if (req->wb_pgbase != 0)
-                return 0;
+                return false;
        if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
-                return 0;
+                return false;
-        /*
+        return pgio->pg_test(pgio, prev, req);
-         * Non-whole file layouts need to check that req is inside of
-         * pgio->pg_lseg.
-         */
-        if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
-                return 0;
-        return 1;
 }
 /**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
 static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
                                     struct nfs_page *req)
 {
-        size_t newlen = req->wb_bytes;
        if (desc->pg_count != 0) {
                struct nfs_page *prev;
-                /*
-                 * FIXME: ideally we should be able to coalesce all requests
-                 * that are not block boundary aligned, but currently this
-                 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
-                 * since nfs_flush_multi and nfs_pagein_multi assume you
-                 * can have only one struct nfs_page.
-                 */
-                if (desc->pg_bsize < PAGE_SIZE)
-                        return 0;
-                newlen += desc->pg_count;
-                if (newlen > desc->pg_bsize)
-                        return 0;
                prev = nfs_list_entry(desc->pg_list.prev);
                if (!nfs_can_coalesce_requests(prev, req, desc))
                        return 0;
-        } else
+        } else {
                desc->pg_base = req->wb_pgbase;
+        }
        nfs_list_remove_request(req);
        nfs_list_add_request(req, &desc->pg_list);
-        desc->pg_count = newlen;
+        desc->pg_count += req->wb_bytes;
        return 1;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a520..8c1309d852a6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
        atomic_inc(&lo->plh_refcount);
 }
+/*
+ * Allocate a layout header for @ino: through the layout driver's
+ * alloc_layout_hdr hook when it provides one, otherwise a zeroed plain
+ * struct pnfs_layout_hdr.
+ */
+static struct pnfs_layout_hdr *
+pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
+{
+        struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
+        if (ld->alloc_layout_hdr)
+                return ld->alloc_layout_hdr(ino, gfp_flags);
+        return kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+}
+/*
+ * Free a layout header obtained from pnfs_alloc_layout_hdr().
+ * The choice mirrors the allocation: a driver that supplies
+ * alloc_layout_hdr is expected to supply free_layout_hdr as well,
+ * otherwise the header came from kzalloc() and is released with kfree().
+ */
+static void
+pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+        struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
+        /* plain statements: a void function must not return an expression */
+        if (ld->alloc_layout_hdr)
+                ld->free_layout_hdr(lo);
+        else
+                kfree(lo);
+}
 static void
 destroy_layout_hdr(struct pnfs_layout_hdr *lo)
 {
        dprintk("%s: freeing layout cache %p\n", __func__, lo);
        BUG_ON(!list_empty(&lo->plh_layouts));
        NFS_I(lo->plh_inode)->layout = NULL;
-        kfree(lo);
+        pnfs_free_layout_hdr(lo);
 }
 static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
 {
        struct inode *inode = lseg->pls_layout->plh_inode;
-        BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+        WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
        list_del_init(&lseg->pls_list);
        if (list_empty(&lseg->pls_layout->plh_segs)) {
                set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
 }
 EXPORT_SYMBOL_GPL(put_lseg);
+/*
+ * Exclusive end of the range [start, start + len), clamped to
+ * NFS4_MAX_UINT64 when the sum wraps around.
+ */
+static inline u64
+end_offset(u64 start, u64 len)
+{
+        u64 sum = start + len;
+        if (sum < start)
+                return NFS4_MAX_UINT64;
+        return sum;
+}
+/*
+ * Offset of the last octet in the range; @len must be non-zero.
+ * Yields NFS4_MAX_UINT64 when start + len wraps around.
+ */
+static inline u64
+last_byte_offset(u64 start, u64 len)
+{
+        u64 sum;
+        BUG_ON(!len);
+        sum = start + len;
+        if (sum <= start)
+                return NFS4_MAX_UINT64;
+        return sum - 1;
+}
+/*
+ * is l2 fully contained in l1?
+ *   start1                             end1
+ *   [----------------------------------)
+ *           start2           end2
+ *           [----------------)
+ */
+/* Returns 1 when l2 starts no earlier and ends no later than l1, else 0 */
+static inline int
+lo_seg_contained(struct pnfs_layout_range *l1,
+                 struct pnfs_layout_range *l2)
+{
+        if (l2->offset < l1->offset)
+                return 0;
+        return end_offset(l1->offset, l1->length) >=
+               end_offset(l2->offset, l2->length);
+}
+/*
+ * are l1 and l2 intersecting?
+ *   start1                             end1
+ *   [----------------------------------)
+ *                              start2           end2
+ *                              [----------------)
+ */
+/*
+ * Returns 1 when the two ranges overlap, else 0. An end equal to
+ * NFS4_MAX_UINT64 is treated as extending past any start.
+ */
+static inline int
+lo_seg_intersecting(struct pnfs_layout_range *l1,
+                    struct pnfs_layout_range *l2)
+{
+        u64 end1 = end_offset(l1->offset, l1->length);
+        u64 end2 = end_offset(l2->offset, l2->length);
+        /* l1 stops at or before the start of l2 */
+        if (end1 != NFS4_MAX_UINT64 && end1 <= l2->offset)
+                return 0;
+        /* l2 stops at or before the start of l1 */
+        if (end2 != NFS4_MAX_UINT64 && end2 <= l1->offset)
+                return 0;
+        return 1;
+}
 static bool
-should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
+should_free_lseg(struct pnfs_layout_range *lseg_range,
+                 struct pnfs_layout_range *recall_range)
 {
-        return (recall_iomode == IOMODE_ANY ||
+        return (recall_range->iomode == IOMODE_ANY ||
-                lseg_iomode == recall_iomode);
+                lseg_range->iomode == recall_range->iomode) &&
+               lo_seg_intersecting(lseg_range, recall_range);
 }
 /* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 int
 mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                            struct list_head *tmp_list,
-                            u32 iomode)
+                            struct pnfs_layout_range *recall_range)
 {
        struct pnfs_layout_segment *lseg, *next;
        int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                return 0;
        }
        list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
-                if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
+                if (!recall_range ||
+                    should_free_lseg(&lseg->pls_range, recall_range)) {
                        dprintk("%s: freeing lseg %p iomode %d "
                                "offset %llu length %llu\n", __func__,
                                lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
        lo = nfsi->layout;
        if (lo) {
                lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-                mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
+                mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
        }
        spin_unlock(&nfsi->vfs_inode.i_lock);
        pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
           struct nfs_open_context *ctx,
-           u32 iomode,
+           struct pnfs_layout_range *range,
           gfp_t gfp_flags)
 {
        struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
                        goto out_err_free;
        }
-        lgp->args.minlength = NFS4_MAX_UINT64;
+        lgp->args.minlength = PAGE_CACHE_SIZE;
+        if (lgp->args.minlength > range->length)
+                lgp->args.minlength = range->length;
        lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-        lgp->args.range.iomode = iomode;
+        lgp->args.range = *range;
-        lgp->args.range.offset = 0;
-        lgp->args.range.length = NFS4_MAX_UINT64;
        lgp->args.type = server->pnfs_curr_ld->id;
        lgp->args.inode = ino;
        lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
        nfs4_proc_layoutget(lgp);
        if (!lseg) {
                /* remember that LAYOUTGET failed and suspend trying */
-                set_bit(lo_fail_bit(iomode), &lo->plh_flags);
+                set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
        }
        /* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
        return NULL;
 }
+/*
+ * Initiates a LAYOUTRETURN(FILE)
+ *
+ * Invalidates every layout segment of @ino and, when
+ * mark_matching_lsegs_invalid() reports a non-zero count, sends a
+ * LAYOUTRETURN carrying the layout stateid sampled under i_lock.
+ * Returns 0 when there is nothing to return, -ENOMEM when the request
+ * cannot be allocated, otherwise the status of nfs4_proc_layoutreturn().
+ */
+int
+_pnfs_return_layout(struct inode *ino)
+{
+        struct pnfs_layout_hdr *lo = NULL;
+        struct nfs_inode *nfsi = NFS_I(ino);
+        LIST_HEAD(tmp_list);
+        struct nfs4_layoutreturn *lrp;
+        nfs4_stateid stateid;
+        int status = 0;
+        dprintk("--> %s\n", __func__);
+        spin_lock(&ino->i_lock);
+        lo = nfsi->layout;
+        if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
+                spin_unlock(&ino->i_lock);
+                /*
+                 * Segments may have been moved onto tmp_list even though
+                 * no LAYOUTRETURN is sent; release them instead of leaking
+                 * them. An empty list is fine here, as in
+                 * pnfs_destroy_layout().
+                 */
+                pnfs_free_lseg_list(&tmp_list);
+                dprintk("%s: no layout segments to return\n", __func__);
+                goto out;
+        }
+        stateid = lo->plh_stateid;
+        /* Reference matched in nfs4_layoutreturn_release */
+        get_layout_hdr(lo);
+        spin_unlock(&ino->i_lock);
+        pnfs_free_lseg_list(&tmp_list);
+        WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
+        lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
+        if (unlikely(lrp == NULL)) {
+                /*
+                 * No RPC will run, so nfs4_layoutreturn_release will never
+                 * drop the reference taken above; drop it here.
+                 */
+                put_layout_hdr(lo);
+                status = -ENOMEM;
+                goto out;
+        }
+        lrp->args.stateid = stateid;
+        lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+        lrp->args.inode = ino;
+        lrp->clp = NFS_SERVER(ino)->nfs_client;
+        status = nfs4_proc_layoutreturn(lrp);
+out:
+        dprintk("<-- %s status: %d\n", __func__, status);
+        return status;
+}
 bool pnfs_roc(struct inode *ino)
 {
        struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
 * are seen first.
 */
 static s64
-cmp_layout(u32 iomode1, u32 iomode2)
+cmp_layout(struct pnfs_layout_range *l1,
+           struct pnfs_layout_range *l2)
 {
+        s64 d;
+        /* high offset > low offset */
+        d = l1->offset - l2->offset;
+        if (d)
+                return d;
+        /* short length > long length */
+        d = l2->length - l1->length;
+        if (d)
+                return d;
        /* read > read/write */
-        return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ);
+        return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
 }
 static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                   struct pnfs_layout_segment *lseg)
 {
        struct pnfs_layout_segment *lp;
-        int found = 0;
        dprintk("%s:Begin\n", __func__);
        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lp, &lo->plh_segs, pls_list) {
-                if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0)
+                if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
                        continue;
                list_add_tail(&lseg->pls_list, &lp->pls_list);
                dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
                        lseg->pls_range.offset, lseg->pls_range.length,
                        lp, lp->pls_range.iomode, lp->pls_range.offset,
                        lp->pls_range.length);
-                found = 1;
+                goto out;
-                break;
-        }
-        if (!found) {
-                list_add_tail(&lseg->pls_list, &lo->plh_segs);
-                dprintk("%s: inserted lseg %p "
-                        "iomode %d offset %llu length %llu at tail\n",
-                        __func__, lseg, lseg->pls_range.iomode,
-                        lseg->pls_range.offset, lseg->pls_range.length);
        }
+        list_add_tail(&lseg->pls_list, &lo->plh_segs);
+        dprintk("%s: inserted lseg %p "
+                "iomode %d offset %llu length %llu at tail\n",
+                __func__, lseg, lseg->pls_range.iomode,
+                lseg->pls_range.offset, lseg->pls_range.length);
+out:
        get_layout_hdr(lo);
        dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
 {
        struct pnfs_layout_hdr *lo;
-        lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
+        lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
        if (!lo)
                return NULL;
        atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
        if (likely(nfsi->layout == NULL))       /* Won the race? */
                nfsi->layout = new;
        else
-                kfree(new);
+                pnfs_free_layout_hdr(new);
        return nfsi->layout;
 }
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
 * READ         RW      true
 */
 static int
-is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode)
+is_matching_lseg(struct pnfs_layout_range *ls_range,
+                 struct pnfs_layout_range *range)
 {
-        return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW);
+        struct pnfs_layout_range range1;
+        if ((range->iomode == IOMODE_RW &&
+             ls_range->iomode != IOMODE_RW) ||
+            !lo_seg_intersecting(ls_range, range))
+                return 0;
+        /* range1 covers only the first byte in the range */
+        range1 = *range;
+        range1.length = 1;
+        return lo_seg_contained(ls_range, &range1);
 }
 /*
 * lookup range in layout
 */
 static struct pnfs_layout_segment *
-pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
+pnfs_find_lseg(struct pnfs_layout_hdr *lo,
+                struct pnfs_layout_range *range)
 {
        struct pnfs_layout_segment *lseg, *ret = NULL;
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
        assert_spin_locked(&lo->plh_inode->i_lock);
        list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
                if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
-                    is_matching_lseg(lseg, iomode)) {
+                    is_matching_lseg(&lseg->pls_range, range)) {
                        ret = get_lseg(lseg);
                        break;
                }
-                if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
+                if (cmp_layout(range, &lseg->pls_range) > 0)
                        break;
        }
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino,
                   struct nfs_open_context *ctx,
+                   loff_t pos,
+                   u64 count,
                   enum pnfs_iomode iomode,
                   gfp_t gfp_flags)
 {
+        struct pnfs_layout_range arg = {
+                .iomode = iomode,
+                .offset = pos,
+                .length = count,
+        };
+        unsigned pg_offset;
        struct nfs_inode *nfsi = NFS_I(ino);
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
                goto out_unlock;
        /* Check to see if the layout for the given range already exists */
-        lseg = pnfs_find_lseg(lo, iomode);
+        lseg = pnfs_find_lseg(lo, &arg);
        if (lseg)
                goto out_unlock;
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
                spin_unlock(&clp->cl_lock);
        }
-        lseg = send_layoutget(lo, ctx, iomode, gfp_flags);
+        pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+        if (pg_offset) {
+                arg.offset -= pg_offset;
+                arg.length += pg_offset;
+        }
+        arg.length = PAGE_CACHE_ALIGN(arg.length);
+        lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
        if (!lseg && first) {
                spin_lock(&clp->cl_lock);
                list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
        struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
        int status = 0;
-        /* Verify we got what we asked for.
-         * Note that because the xdr parsing only accepts a single
-         * element array, this can fail even if the server is behaving
-         * correctly.
-         */
-        if (lgp->args.range.iomode > res->range.iomode ||
-            res->range.offset != 0 ||
-            res->range.length != NFS4_MAX_UINT64) {
-                status = -EINVAL;
-                goto out;
-        }
        /* Inject layout blob into I/O device driver */
        lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
        if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
        goto out;
 }
-static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
+bool
-                             struct nfs_page *prev,
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
-                             struct nfs_page *req)
+                     struct nfs_page *req)
 {
+        enum pnfs_iomode access_type;
+        gfp_t gfp_flags;
+        /* We assume that pg_ioflags == 0 iff we're reading a page */
+        if (pgio->pg_ioflags == 0) {
+                access_type = IOMODE_READ;
+                gfp_flags = GFP_KERNEL;
+        } else {
+                access_type = IOMODE_RW;
+                gfp_flags = GFP_NOFS;
+        }
        if (pgio->pg_count == prev->wb_bytes) {
                /* This is first coelesce call for a series of nfs_pages */
                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
                                                   prev->wb_context,
-                                                   IOMODE_READ,
+                                                   req_offset(req),
-                                                   GFP_KERNEL);
+                                                   pgio->pg_count,
+                                                   access_type,
+                                                   gfp_flags);
+                return true;
        }
-        return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
-void
+        if (pgio->pg_lseg &&
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+            req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
-{
+                                         pgio->pg_lseg->pls_range.length))
-        struct pnfs_layoutdriver_type *ld;
+                return false;
-        ld = NFS_SERVER(inode)->pnfs_curr_ld;
+        return true;
-        pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
 }
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
-static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
+/*
-                              struct nfs_page *prev,
+ * Called by non rpc-based layout drivers
-                              struct nfs_page *req)
+ */
+int
+pnfs_ld_write_done(struct nfs_write_data *data)
 {
-        if (pgio->pg_count == prev->wb_bytes) {
+        int status;
-                /* This is first coelesce call for a series of nfs_pages */
-                pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
-                                                   prev->wb_context,
-                                                   IOMODE_RW,
-                                                   GFP_NOFS);
-        }
-        return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
-}
-void
+        if (!data->pnfs_error) {
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
+                pnfs_set_layoutcommit(data);
-{
+                data->mds_ops->rpc_call_done(&data->task, data);
-        struct pnfs_layoutdriver_type *ld;
+                data->mds_ops->rpc_release(data);
+                return 0;
+        }
-        ld = NFS_SERVER(inode)->pnfs_curr_ld;
+        dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
-        pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
+                data->pnfs_error);
+        status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
+                                    data->mds_ops, NFS_FILE_SYNC);
+        return status ? : -EAGAIN;
 }
+EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
 enum pnfs_try_status
 pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
 }
 /*
+ * Called by non rpc-based layout drivers
+ */
+int
+pnfs_ld_read_done(struct nfs_read_data *data)
+{
+        int status;
+        if (!data->pnfs_error) {
+                __nfs4_read_done_cb(data);
+                data->mds_ops->rpc_call_done(&data->task, data);
+                data->mds_ops->rpc_release(data);
+                return 0;
+        }
+        dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
+                data->pnfs_error);
+        status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
+                                   data->mds_ops);
+        return status ? : -EAGAIN;
+}
+EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
+/*
 * Call the appropriate parallel I/O subsystem read function.
 */
 enum pnfs_try_status
@@ -1009,7 +1193,7 @@ void
 pnfs_set_layoutcommit(struct nfs_write_data *wdata)
 {
        struct nfs_inode *nfsi = NFS_I(wdata->inode);
-        loff_t end_pos = wdata->args.offset + wdata->res.count;
+        loff_t end_pos = wdata->mds_offset + wdata->res.count;
        bool mark_as_dirty = false;
        spin_lock(&nfsi->vfs_inode.i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7a..48d0a8e4d062 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
 #ifndef FS_NFS_PNFS_H
 #define FS_NFS_PNFS_H
+#include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 enum {
@@ -64,17 +65,29 @@ enum {
        NFS_LAYOUT_DESTROYED,           /* no new use of layout allowed */
 };
+enum layoutdriver_policy_flags {
+        /* Should the pNFS client commit and return the layout upon a setattr */
+        PNFS_LAYOUTRET_ON_SETATTR       = 1 << 0,
+};
+struct nfs4_deviceid_node;
 /* Per-layout driver specific registration structure */
 struct pnfs_layoutdriver_type {
        struct list_head pnfs_tblid;
        const u32 id;
        const char *name;
        struct module *owner;
+        unsigned flags;
+        struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
+        void (*free_layout_hdr) (struct pnfs_layout_hdr *);
        struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
        void (*free_lseg) (struct pnfs_layout_segment *lseg);
        /* test for nfs page cache coalescing */
-        int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
+        bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
        /* Returns true if layoutdriver wants to divert this request to
         * driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
         */
        enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
        enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+        void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+        void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_layoutreturn_args *args);
+        void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+                                     struct xdr_stream *xdr,
+                                     const struct nfs4_layoutcommit_args *args);
 };
 struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
                                   struct pnfs_device *dev);
 extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
 void get_layout_hdr(struct pnfs_layout_hdr *lo);
 void put_lseg(struct pnfs_layout_segment *lseg);
 struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-                   enum pnfs_iomode access_type, gfp_t gfp_flags);
+                   loff_t pos, u64 count, enum pnfs_iomode access_type,
+                   gfp_t gfp_flags);
 void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
                                             const struct rpc_call_ops *, int);
 enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
                                            const struct rpc_call_ops *);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
+bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
 int pnfs_layout_process(struct nfs4_layoutget *lgp);
 void pnfs_free_lseg_list(struct list_head *tmp_list);
 void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
                                  struct nfs4_state *open_state);
 int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
                                struct list_head *tmp_list,
-                                u32 iomode);
+                                struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
 void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
+int _pnfs_return_layout(struct inode *);
+int pnfs_ld_write_done(struct nfs_write_data *);
+int pnfs_ld_read_done(struct nfs_read_data *);
+/* pnfs_dev.c */
+struct nfs4_deviceid_node {
+        struct hlist_node               node;
+        const struct pnfs_layoutdriver_type *ld;
+        const struct nfs_client         *nfs_client;
+        struct nfs4_deviceid            deviceid;
+        atomic_t                        ref;
+};
+void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
+struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
+                             const struct pnfs_layoutdriver_type *,
+                             const struct nfs_client *,
+                             const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
+bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
+void nfs4_deviceid_purge_client(const struct nfs_client *);
 static inline int lo_fail_bit(u32 iomode)
 {
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
                put_lseg(req->wb_commit_lseg);
 }
+/* Should the pNFS client commit and return the layout upon a setattr */
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+        if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+                return false;
+        return NFS_SERVER(inode)->pnfs_curr_ld->flags &
+                PNFS_LAYOUTRET_ON_SETATTR;
+}
+static inline int pnfs_return_layout(struct inode *ino)
+{
+        struct nfs_inode *nfsi = NFS_I(ino);
+        struct nfs_server *nfss = NFS_SERVER(ino);
+        if (pnfs_enabled_sb(nfss) && nfsi->layout)
+                return _pnfs_return_layout(ino);
+        return 0;
+}
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
+                                    struct inode *inode)
+{
+        struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+        if (ld)
+                pgio->pg_test = ld->pg_test;
+}
 #else  /* CONFIG_NFS_V4_1 */
 static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
 static inline struct pnfs_layout_segment *
 pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
-                   enum pnfs_iomode access_type, gfp_t gfp_flags)
+                   loff_t pos, u64 count, enum pnfs_iomode access_type,
+                   gfp_t gfp_flags)
 {
        return NULL;
 }
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
        return PNFS_NOT_ATTEMPTED;
 }
+static inline int pnfs_return_layout(struct inode *ino)
+{
+        return 0;
+}
+static inline bool
+pnfs_ld_layoutret_on_setattr(struct inode *inode)
+{
+        return false;
+}
 static inline bool
 pnfs_roc(struct inode *ino)
 {
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
 {
 }
-static inline void
+static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
+                                    struct inode *inode)
-{
-        pgio->pg_test = NULL;
-}
-static inline void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
 {
-        pgio->pg_test = NULL;
 }
 static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
        return 0;
 }
+static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 #endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000000..c65e133ce9c0
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
+/*
+ *  Device operations for the pnfs client.
+ *
+ *  Copyright (c) 2002
+ *  The Regents of the University of Michigan
+ *  All Rights Reserved
+ *
+ *  Dean Hildebrand <dhildebz@umich.edu>
+ *  Garth Goodson   <Garth.Goodson@netapp.com>
+ *
+ *  Permission is granted to use, copy, create derivative works, and
+ *  redistribute this software and such derivative works for any purpose,
+ *  so long as the name of the University of Michigan is not used in
+ *  any advertising or publicity pertaining to the use or distribution
+ *  of this software without specific, written prior authorization. If
+ *  the above copyright notice or any other identification of the
+ *  University of Michigan is included in any copy of any portion of
+ *  this software, then the disclaimer below must also be included.
+ *
+ *  This software is provided as is, without representation or warranty
+ *  of any kind either express or implied, including without limitation
+ *  the implied warranties of merchantability, fitness for a particular
+ *  purpose, or noninfringement.  The Regents of the University of
+ *  Michigan shall not be liable for any damages, including special,
+ *  indirect, incidental, or consequential damages, with respect to any
+ *  claim arising out of or in connection with the use of the software,
+ *  even if it has been or is hereafter advised of the possibility of
+ *  such damages.
+ */
+#include "pnfs.h"
+#define NFSDBG_FACILITY         NFSDBG_PNFS
+/*
+ * Device ID RCU cache. A device ID is unique per server and layout type.
+ */
+#define NFS4_DEVICE_ID_HASH_BITS        5
+#define NFS4_DEVICE_ID_HASH_SIZE        (1 << NFS4_DEVICE_ID_HASH_BITS)
+#define NFS4_DEVICE_ID_HASH_MASK        (NFS4_DEVICE_ID_HASH_SIZE - 1)
+static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfs4_deviceid_lock);
+void
+nfs4_print_deviceid(const struct nfs4_deviceid *id)
+{
+        u32 *p = (u32 *)id;
+        dprintk("%s: device id= [%x%x%x%x]\n", __func__,
+                p[0], p[1], p[2], p[3]);
+}
+EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
+static inline u32
+nfs4_deviceid_hash(const struct nfs4_deviceid *id)
+{
+        unsigned char *cptr = (unsigned char *)id->data;
+        unsigned int nbytes = NFS4_DEVICEID4_SIZE;
+        u32 x = 0;
+        while (nbytes--) {
+                x *= 37;
+                x += *cptr++;
+        }
+        return x & NFS4_DEVICE_ID_HASH_MASK;
+}
+static struct nfs4_deviceid_node *
+_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
+                 const struct nfs_client *clp, const struct nfs4_deviceid *id,
+                 long hash)
+{
+        struct nfs4_deviceid_node *d;
+        struct hlist_node *n;
+        hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+                if (d->ld == ld && d->nfs_client == clp &&
+                    !memcmp(&d->deviceid, id, sizeof(*id))) {
+                        if (atomic_read(&d->ref))
+                                return d;
+                        else
+                                continue;
+                }
+        return NULL;
+}
+/*
+ * Lookup a deviceid in cache and get a reference count on it if found
+ *
+ * @clp nfs_client associated with deviceid
+ * @id deviceid to look up
+ */
+struct nfs4_deviceid_node *
+_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+                   const struct nfs_client *clp, const struct nfs4_deviceid *id,
+                   long hash)
+{
+        struct nfs4_deviceid_node *d;
+        rcu_read_lock();
+        d = _lookup_deviceid(ld, clp, id, hash);
+        if (d && !atomic_inc_not_zero(&d->ref))
+                d = NULL;
+        rcu_read_unlock();
+        return d;
+}
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
+                       const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+        return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+}
+EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
+/*
+ * Unhash and put deviceid
+ *
+ * @clp nfs_client associated with deviceid
+ * @id the deviceid to unhash
+ *
+ * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
+ */
+struct nfs4_deviceid_node *
+nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
+                         const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+        struct nfs4_deviceid_node *d;
+        spin_lock(&nfs4_deviceid_lock);
+        rcu_read_lock();
+        d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+        rcu_read_unlock();
+        if (!d) {
+                spin_unlock(&nfs4_deviceid_lock);
+                return NULL;
+        }
+        hlist_del_init_rcu(&d->node);
+        spin_unlock(&nfs4_deviceid_lock);
+        synchronize_rcu();
+        /* balance the initial ref set in pnfs_insert_deviceid */
+        if (atomic_dec_and_test(&d->ref))
+                return d;
+        return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
+/*
+ * Delete a deviceid from cache
+ *
+ * @clp struct nfs_client qualifying the deviceid
+ * @id deviceid to delete
+ */
+void
+nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
+                     const struct nfs_client *clp, const struct nfs4_deviceid *id)
+{
+        struct nfs4_deviceid_node *d;
+        d = nfs4_unhash_put_deviceid(ld, clp, id);
+        if (!d)
+                return;
+        d->ld->free_deviceid_node(d);
+}
+EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
+void
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
+                        const struct pnfs_layoutdriver_type *ld,
+                        const struct nfs_client *nfs_client,
+                        const struct nfs4_deviceid *id)
+{
+        INIT_HLIST_NODE(&d->node);
+        d->ld = ld;
+        d->nfs_client = nfs_client;
+        d->deviceid = *id;
+        atomic_set(&d->ref, 1);
+}
+EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
+/*
+ * Uniquely initialize and insert a deviceid node into cache
+ *
+ * @new new deviceid node
+ *      Note that the caller must set up the following members:
+ *        new->ld
+ *        new->nfs_client
+ *        new->deviceid
+ *
+ * @ret the inserted node, if none found, otherwise, the found entry.
+ */
+struct nfs4_deviceid_node *
+nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
+{
+        struct nfs4_deviceid_node *d;
+        long hash;
+        spin_lock(&nfs4_deviceid_lock);
+        hash = nfs4_deviceid_hash(&new->deviceid);
+        d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
+        if (d) {
+                spin_unlock(&nfs4_deviceid_lock);
+                return d;
+        }
+        hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+        spin_unlock(&nfs4_deviceid_lock);
+        return new;
+}
+EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
+/*
+ * Dereference a deviceid node and delete it when its reference count drops
+ * to zero.
+ *
+ * @d deviceid node to put
+ *
+ * @ret true iff the node was deleted
+ */
+bool
+nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
+{
+        if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
+                return false;
+        hlist_del_init_rcu(&d->node);
+        spin_unlock(&nfs4_deviceid_lock);
+        synchronize_rcu();
+        d->ld->free_deviceid_node(d);
+        return true;
+}
+EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
+static void
+_deviceid_purge_client(const struct nfs_client *clp, long hash)
+{
+        struct nfs4_deviceid_node *d;
+        struct hlist_node *n, *next;
+        HLIST_HEAD(tmp);
+        rcu_read_lock();
+        hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
+                if (d->nfs_client == clp && atomic_read(&d->ref)) {
+                        hlist_del_init_rcu(&d->node);
+                        hlist_add_head(&d->node, &tmp);
+                }
+        rcu_read_unlock();
+        if (hlist_empty(&tmp))
+                return;
+        synchronize_rcu();
+        hlist_for_each_entry_safe(d, n, next, &tmp, node)
+                if (atomic_dec_and_test(&d->ref))
+                        d->ld->free_deviceid_node(d);
+}
+void
+nfs4_deviceid_purge_client(const struct nfs_client *clp)
+{
+        long h;
+        spin_lock(&nfs4_deviceid_lock);
+        for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
+                _deviceid_purge_client(clp, h);
+        spin_unlock(&nfs4_deviceid_lock);
+}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a1..20a7f952e244 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
        atomic_set(&req->wb_complete, requests);
        BUG_ON(desc->pg_lseg != NULL);
-        lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+        lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                  req_offset(req), desc->pg_count,
+                                  IOMODE_READ, GFP_KERNEL);
        ClearPageError(page);
        offset = 0;
        nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
        }
        req = nfs_list_entry(data->pages.next);
        if ((!lseg) && list_is_singular(&data->pages))
-                lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL);
+                lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                          req_offset(req), desc->pg_count,
+                                          IOMODE_READ, GFP_KERNEL);
        ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
                                0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
        if (ret == 0)
                goto read_complete; /* all pages were read */
-        pnfs_pageio_init_read(&pgio, inode);
        if (rsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
        else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa7..ce40e5c568ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
 #include "iostat.h"
 #include "internal.h"
 #include "fscache.h"
+#include "pnfs.h"
 #define NFSDBG_FACILITY         NFSDBG_VFS
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
        return 0;
 }
+#ifdef CONFIG_NFS_V4_1
+void show_sessions(struct seq_file *m, struct nfs_server *server)
+{
+        if (nfs4_has_session(server->nfs_client))
+                seq_printf(m, ",sessions");
+}
+#else
+void show_sessions(struct seq_file *m, struct nfs_server *server) {}
+#endif
+#ifdef CONFIG_NFS_V4_1
+void show_pnfs(struct seq_file *m, struct nfs_server *server)
+{
+        seq_printf(m, ",pnfs=");
+        if (server->pnfs_curr_ld)
+                seq_printf(m, "%s", server->pnfs_curr_ld->name);
+        else
+                seq_printf(m, "not configured");
+}
+#else  /* CONFIG_NFS_V4_1 */
+void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
+#endif /* CONFIG_NFS_V4_1 */
 static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
 {
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
                seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
                seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
                seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
+                show_sessions(m, nfss);
+                show_pnfs(m, nfss);
        }
 #endif
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac92..e268e3b23497 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
        atomic_set(&req->wb_complete, requests);
        BUG_ON(desc->pg_lseg);
-        lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+        lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                  req_offset(req), desc->pg_count,
+                                  IOMODE_RW, GFP_NOFS);
        ClearPageError(page);
        offset = 0;
        nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
        }
        req = nfs_list_entry(data->pages.next);
        if ((!lseg) && list_is_singular(&data->pages))
-                lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS);
+                lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
+                                          req_offset(req), desc->pg_count,
+                                          IOMODE_RW, GFP_NOFS);
        if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
            (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
 {
        size_t wsize = NFS_SERVER(inode)->wsize;
-        pnfs_pageio_init_write(pgio, inode);
        if (wsize < PAGE_CACHE_SIZE)
                nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
        else
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a2..b9566e46219f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
        if (IS_ERR(exp))
                return nfserrno(PTR_ERR(exp));
        rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
-        if (rv)
-                goto out;
-        rv = check_nfsd_access(exp, rqstp);
-        if (rv)
-                fh_put(fhp);
-out:
        exp_put(exp);
        return rv;
 }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e9..9095f3c21df9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
        }
        /* Now create the file and set attributes */
-        nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len,
+        nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
                                attr, newfhp,
                                argp->createmode, argp->verf, NULL, NULL);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index ad48faca20fc..08c6e36ab2eb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
        return rv;
 }
-__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
+static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
 {
        struct svc_fh   fh;
        int err;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e3..3a6dbd70b34b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
                /*
                 * Note: create modes (UNCHECKED,GUARDED...) are the same
-                 * in NFSv4 as in v3.
+                 * in NFSv4 as in v3 except EXCLUSIVE4_1.
                 */
-                status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data,
+                status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
                                        open->op_fname.len, &open->op_iattr,
                                        &resfh, open->op_createmode,
                                        (u32 *)open->op_verf.data,
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
        memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
               putfh->pf_fhlen);
-        return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
+        return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
 }
 static __be32
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        __be32 err;
        fh_init(&resfh, NFS4_FHSIZE);
+        err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
+        if (err)
+                return err;
        err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
                                    secinfo->si_name, secinfo->si_namelen,
                                    &exp, &dentry);
@@ -986,6 +989,9 @@ enum nfsd4_op_flags {
        ALLOWED_WITHOUT_FH = 1 << 0,    /* No current filehandle required */
        ALLOWED_ON_ABSENT_FS = 1 << 1,  /* ops processed on absent fs */
        ALLOWED_AS_FIRST_OP = 1 << 2,   /* ops reqired first in compound */
+        /* For rfc 5661 section 2.6.3.1.1: */
+        OP_HANDLES_WRONGSEC = 1 << 3,
+        OP_IS_PUTFH_LIKE = 1 << 4,
 };
 struct nfsd4_operation {
@@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
        return nfs_ok;
 }
+static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
+{
+        return &nfsd4_ops[op->opnum];
+}
+static bool need_wrongsec_check(struct svc_rqst *rqstp)
+{
+        struct nfsd4_compoundres *resp = rqstp->rq_resp;
+        struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+        struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+        struct nfsd4_op *next = &argp->ops[resp->opcnt];
+        struct nfsd4_operation *thisd;
+        struct nfsd4_operation *nextd;
+        thisd = OPDESC(this);
+        /*
+         * Most ops check wrongsec on their own; only the putfh-like ops
+         * have special rules.
+         */
+        if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
+                return false;
+        /*
+         * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
+         * put-filehandle operation if we're not going to use the
+         * result:
+         */
+        if (argp->opcnt == resp->opcnt)
+                return false;
+        nextd = OPDESC(next);
+        /*
+         * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
+         * errors themselves as necessary; others should check for them
+         * now:
+         */
+        return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+}
 /*
 * COMPOUND call.
 */
@@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
                        goto encode_op;
                }
-                opdesc = &nfsd4_ops[op->opnum];
+                opdesc = OPDESC(op);
                if (!cstate->current_fh.fh_dentry) {
                        if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
@@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
                else
                        BUG_ON(op->status == nfs_ok);
+                if (!op->status && need_wrongsec_check(rqstp))
+                        op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
 encode_op:
                /* Only from SEQUENCE */
                if (resp->cstate.status == nfserr_replay_cache) {
@@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
        },
        [OP_LOOKUP] = {
                .op_func = (nfsd4op_func)nfsd4_lookup,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_LOOKUP",
        },
        [OP_LOOKUPP] = {
                .op_func = (nfsd4op_func)nfsd4_lookupp,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_LOOKUPP",
        },
        [OP_NVERIFY] = {
@@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        },
        [OP_OPEN] = {
                .op_func = (nfsd4op_func)nfsd4_open,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_OPEN",
        },
        [OP_OPEN_CONFIRM] = {
@@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
        },
        [OP_PUTFH] = {
                .op_func = (nfsd4op_func)nfsd4_putfh,
-                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+                                | OP_IS_PUTFH_LIKE,
                .op_name = "OP_PUTFH",
        },
        [OP_PUTPUBFH] = {
                .op_func = (nfsd4op_func)nfsd4_putrootfh,
-                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+                                | OP_IS_PUTFH_LIKE,
                .op_name = "OP_PUTPUBFH",
        },
        [OP_PUTROOTFH] = {
                .op_func = (nfsd4op_func)nfsd4_putrootfh,
-                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+                                | OP_IS_PUTFH_LIKE,
                .op_name = "OP_PUTROOTFH",
        },
        [OP_READ] = {
@@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
        },
        [OP_RESTOREFH] = {
                .op_func = (nfsd4op_func)nfsd4_restorefh,
-                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+                .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
+                                | OP_IS_PUTFH_LIKE,
                .op_name = "OP_RESTOREFH",
        },
        [OP_SAVEFH] = {
                .op_func = (nfsd4op_func)nfsd4_savefh,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SAVEFH",
        },
        [OP_SECINFO] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO",
        },
        [OP_SETATTR] = {
@@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
        },
        [OP_SECINFO_NO_NAME] = {
                .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
+                .op_flags = OP_HANDLES_WRONGSEC,
                .op_name = "OP_SECINFO_NO_NAME",
        },
 };
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4cf04e11c66c..e98f3c2e9492 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        bool confirm_me = false;
        int status = 0;
+        if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
+                return nfserr_inval;
        nfs4_lock_state();
        unconf = find_unconfirmed_client(&cr_ses->clientid);
        conf = find_confirmed_client(&cr_ses->clientid);
@@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
                return nfserr_badsession;
        status = nfsd4_map_bcts_dir(&bcts->dir);
-        nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+        if (!status)
-        return nfs_ok;
+                nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
+        return status;
 }
 static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
        return;
 }
+static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
+{
+        struct nfsd4_compoundargs *args = rqstp->rq_argp;
+        return args->opcnt > session->se_fchannel.maxops;
+}
 __be32
 nfsd4_sequence(struct svc_rqst *rqstp,
               struct nfsd4_compound_state *cstate,
@@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        if (!session)
                goto out;
+        status = nfserr_too_many_ops;
+        if (nfsd4_session_too_many_ops(rqstp, session))
+                goto out;
        status = nfserr_badslot;
        if (seq->slotid >= session->se_fchannel.maxreqs)
                goto out;
@@ -1808,6 +1823,8 @@ out:
 __be32
 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
 {
+        int status = 0;
        if (rc->rca_one_fs) {
                if (!cstate->current_fh.fh_dentry)
                        return nfserr_nofilehandle;
@@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
                 */
                 return nfs_ok;
        }
        nfs4_lock_state();
-        if (is_client_expired(cstate->session->se_client)) {
+        status = nfserr_complete_already;
-                nfs4_unlock_state();
+        if (cstate->session->se_client->cl_firststate)
+                goto out;
+        status = nfserr_stale_clientid;
+        if (is_client_expired(cstate->session->se_client))
                /*
                 * The following error isn't really legal.
                 * But we only get here if the client just explicitly
@@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
                 * error it gets back on an operation for the dead
                 * client.
                 */
-                return nfserr_stale_clientid;
+                goto out;
-        }
+        status = nfs_ok;
        nfsd4_create_clid_dir(cstate->session->se_client);
+out:
        nfs4_unlock_state();
-        return nfs_ok;
+        return status;
 }
 __be32
@@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
        return NULL;
 }
-int share_access_to_flags(u32 share_access)
+static int share_access_to_flags(u32 share_access)
 {
        share_access &= ~NFS4_SHARE_WANT_MASK;
@@ -2882,7 +2906,7 @@ out:
        return status;
 }
-struct lock_manager nfsd4_manager = {
+static struct lock_manager nfsd4_manager = {
 };
 static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d98..990181103214 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
 static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
 {
        DECODE_HEAD;
-        u32 dummy;
        READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
        COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
        READ32(bcts->dir);
-        /* XXX: Perhaps Tom Tucker could help us figure out how we
+        /* XXX: skipping ctsa_use_conn_in_rdma_mode.  Perhaps Tom Tucker
-         * should be using ctsa_use_conn_in_rdma_mode: */
+         * could help us figure out how we should be using it. */
-        READ32(dummy);
        DECODE_TAIL;
 }
@@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
        READ_BUF(lockt->lt_owner.len);
        READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
-        if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
-                return nfserr_inval;
        DECODE_TAIL;
 }
@@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
        return nfserr;
 }
-__be32
+static __be32
 nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
                      struct nfsd4_sequence *seq)
 {
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0be..90c6aa6d5e0f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
         * which clients virtually always use auth_sys for,
         * even while using RPCSEC_GSS for NFS.
         */
-        if (access & NFSD_MAY_LOCK)
+        if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
                goto skip_pseudoflavor_check;
        /*
         * Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 129f3c9f62d5..d5718273bb32 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
        struct svc_export       *exp;
        struct dentry           *dparent;
        struct dentry           *dentry;
-        __be32                  err;
        int                     host_err;
        dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
-        /* Obtain dentry and export. */
-        err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
-        if (err)
-                return err;
        dparent = fhp->fh_dentry;
        exp  = fhp->fh_export;
        exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
        struct dentry           *dentry;
        __be32 err;
+        err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+        if (err)
+                return err;
        err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
        if (err)
                return err;
@@ -877,13 +874,11 @@ static __be32
 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
              loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
 {
-        struct inode *inode;
        mm_segment_t    oldfs;
        __be32          err;
        int             host_err;
        err = nfserr_perm;
-        inode = file->f_path.dentry->d_inode;
        if (file->f_op->splice_read && rqstp->rq_splice_ok) {
                struct splice_desc sd = {
@@ -1340,11 +1335,18 @@ out_nfserr:
 }
 #ifdef CONFIG_NFSD_V3
+static inline int nfsd_create_is_exclusive(int createmode)
+{
+        return createmode == NFS3_CREATE_EXCLUSIVE
+               || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+}
 /*
- * NFSv3 version of nfsd_create
+ * NFSv3 and NFSv4 version of nfsd_create
 */
 __be32
-nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
+do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                char *fname, int flen, struct iattr *iap,
                struct svc_fh *resfhp, int createmode, u32 *verifier,
                int *truncp, int *created)
@@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (err)
                goto out;
-        if (createmode == NFS3_CREATE_EXCLUSIVE) {
+        if (nfsd_create_is_exclusive(createmode)) {
                /* solaris7 gets confused (bugid 4218508) if these have
                 * the high bit set, so just clear the high bits. If this is
                 * ever changed to use different attrs for storing the
@@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
                            && dchild->d_inode->i_atime.tv_sec == v_atime
                            && dchild->d_inode->i_size  == 0 )
                                break;
+                case NFS4_CREATE_EXCLUSIVE4_1:
+                        if (   dchild->d_inode->i_mtime.tv_sec == v_mtime
+                            && dchild->d_inode->i_atime.tv_sec == v_atime
+                            && dchild->d_inode->i_size  == 0 )
+                                goto set_attr;
                         /* fallthru */
                case NFS3_CREATE_GUARDED:
                        err = nfserr_exist;
@@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
        nfsd_check_ignore_resizing(iap);
-        if (createmode == NFS3_CREATE_EXCLUSIVE) {
+        if (nfsd_create_is_exclusive(createmode)) {
                /* Cram the verifier into atime/mtime */
                iap->ia_valid = ATTR_MTIME|ATTR_ATIME
                        | ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        struct inode    *inode = dentry->d_inode;
        int             err;
-        if (acc == NFSD_MAY_NOP)
+        if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
                return 0;
 #if 0
        dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b7..e0bbac04d1dd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,10 +17,14 @@
 #define NFSD_MAY_SATTR          8
 #define NFSD_MAY_TRUNC          16
 #define NFSD_MAY_LOCK           32
+#define NFSD_MAY_MASK           63
+/* extra hints to permission and open routines: */
 #define NFSD_MAY_OWNER_OVERRIDE 64
 #define NFSD_MAY_LOCAL_ACCESS   128 /* IRIX doing local access check on device special file*/
 #define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
 #define NFSD_MAY_NOT_BREAK_LEASE 512
+#define NFSD_MAY_BYPASS_GSS     1024
 #define NFSD_MAY_CREATE         (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
 #define NFSD_MAY_REMOVE         (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -54,7 +58,7 @@ __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
                                int type, dev_t rdev, struct svc_fh *res);
 #ifdef CONFIG_NFSD_V3
 __be32          nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
-__be32          nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+__be32          do_nfsd_create(struct svc_rqst *, struct svc_fh *,
                                char *name, int len, struct iattr *attrs,
                                struct svc_fh *res, int createmode,
                                u32 *verifier, int *truncp, int *created);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f18432832..b954878ad6ce 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 * construction. This function can be called both as a single operation
 * and as a part of indivisible file operations.
 */
-void nilfs_dirty_inode(struct inode *inode)
+void nilfs_dirty_inode(struct inode *inode, int flags)
 {
        struct nilfs_transaction_info ti;
        struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb744..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
        struct nilfs_transaction_info ti;
        int err;
-        dentry_unhash(dentry);
        err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
        if (err)
                return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct nilfs_transaction_info ti;
        int err;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
        if (unlikely(err))
                return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80c..f02b9ad43a21 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
 extern int nilfs_mark_inode_dirty(struct inode *);
-extern void nilfs_dirty_inode(struct inode *);
+extern void nilfs_dirty_inode(struct inode *, int flags);
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 __u64 start, __u64 len);
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a1..3b8d3979e03b 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
        int ret;
-        if (S_ISDIR(inode->i_mode)) {
+        if (S_ISDIR(inode->i_mode) &&
-                dentry_unhash(dentry);
+            !omfs_dir_is_empty(inode))
-                if (!omfs_dir_is_empty(inode))
+                return -ENOTEMPTY;
-                        return -ENOTEMPTY;
-        }
        ret = omfs_delete_entry(dentry);
        if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        int err;
        if (new_inode) {
-                if (S_ISDIR(new_inode->i_mode))
-                        dentry_unhash(new_dentry);
                /* overwriting existing file/dir */
                err = omfs_remove(new_dir, new_dentry);
                if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f82e762eeca2..d545e97d99c3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
 {
        struct hd_struct *p = dev_to_part(dev);
-        struct gendisk *disk = dev_to_disk(dev);
+        return sprintf(buf, "%u\n", p->discard_alignment);
-        unsigned int alignment = 0;
-        if (disk->queue)
-                alignment = queue_limit_discard_alignment(&disk->queue->limits,
-                                                                p->start_sect);
-        return sprintf(buf, "%u\n", alignment);
 }
 ssize_t part_stat_show(struct device *dev,
@@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
        p->start_sect = start;
        p->alignment_offset =
                queue_limit_alignment_offset(&disk->queue->limits, start);
+        p->discard_alignment =
+                queue_limit_discard_alignment(&disk->queue->limits, start);
        p->nr_sects = len;
        p->partno = partno;
        p->policy = get_disk_ro(disk);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ede550517a6..14def991d9dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#ifdef CONFIG_HARDWALL
+#include <asm/hardwall.h>
+#endif
 #include "internal.h"
 /* NOTE:
@@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
        INF("io",       S_IRUGO, proc_tgid_io_accounting),
 #endif
+#ifdef CONFIG_HARDWALL
+        INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
 };
 static int proc_tgid_base_readdir(struct file * filp,
@@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_TASK_IO_ACCOUNTING
        INF("io",       S_IRUGO, proc_tid_io_accounting),
 #endif
+#ifdef CONFIG_HARDWALL
+        INF("hardwall",   S_IRUGO, proc_pid_hardwall),
+#endif
 };
 static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d5651..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
        INITIALIZE_PATH(path);
        struct reiserfs_dir_entry de;
-        dentry_unhash(dentry);
        /* we will be doing 2 balancings and update 2 stat data, we change quotas
         * of the owner of the directory and of the owner of the parent directory.
         * The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        unsigned long savelink = 1;
        struct timespec ctime;
-        if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
-                dentry_unhash(new_dentry);
        /* three balancings: (1) old name removal, (2) new name insertion
           and (3) maybe "save" link insertion
           stat data updates: (1) old directory,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c9..aa91089162cb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
 }
 /* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode(struct inode *inode)
+static void reiserfs_dirty_inode(struct inode *inode, int flags)
 {
        struct reiserfs_transaction_handle th;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1cd..e8a62f41b458 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
        reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
                                        I_MUTEX_CHILD, dir->i_sb);
-        dentry_unhash(dentry);
        error = dir->i_op->rmdir(dir, dentry);
        if (!error)
                dentry->d_inode->i_flags |= S_DEAD;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9b..5e1101ff276f 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
         * table[0] points to the first inode lookup table metadata block,
         * this should be less than lookup_table_start
         */
-        if (!IS_ERR(table) && table[0] >= lookup_table_start) {
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bfb..0ed6edbc5c71 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
         * table[0] points to the first fragment table metadata block, this
         * should be less than fragment_table_start
         */
-        if (!IS_ERR(table) && table[0] >= fragment_table_start) {
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb44..d38ea3dab951 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
         * table[0] points to the first id lookup table metadata block, this
         * should be less than id_table_start
         */
-        if (!IS_ERR(table) && table[0] >= id_table_start) {
+        if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
                kfree(table);
                return ERR_PTR(-EINVAL);
        }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee3597..7438850c62d0 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
                msblk->id_table = NULL;
                goto failed_mount;
        }
-        next_table = msblk->id_table[0];
+        next_table = le64_to_cpu(msblk->id_table[0]);
        /* Handle inode lookup table */
        lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
                msblk->inode_lookup_table = NULL;
                goto failed_mount;
        }
-        next_table = msblk->inode_lookup_table[0];
+        next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
        sb->s_export_op = &squashfs_export_ops;
@@ -286,7 +286,7 @@ handle_fragments:
                msblk->fragment_index = NULL;
                goto failed_mount;
        }
-        next_table = msblk->fragment_index[0];
+        next_table = le64_to_cpu(msblk->fragment_index[0]);
 check_directory_table:
        /* Sanity check directory_table */
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b1..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
        struct inode *inode = dentry->d_inode;
        int err = -ENOTEMPTY;
-        dentry_unhash(dentry);
        if (sysv_empty_dir(inode)) {
                err = sysv_unlink(dir, dentry);
                if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
        struct sysv_dir_entry * old_de;
        int err = -ENOENT;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        old_de = sysv_find_entry(old_dentry, &old_page);
        if (!old_de)
                goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560d..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
        struct ubifs_inode *dir_ui = ubifs_inode(dir);
        struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
-        dentry_unhash(dentry);
        /*
         * Budget request settings: deletion direntry, deletion inode and
         * changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
        struct timespec time;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        /*
         * Budget request settings: deletion direntry, new direntry, removing
         * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd3..3be645e012c9 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
        ubifs_assert(wbuf->size % c->min_io_size == 0);
        ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
        ubifs_assert(!c->ro_media && !c->ro_mount);
+        ubifs_assert(!c->space_fixup);
        if (c->leb_size - wbuf->offs >= c->max_write_size)
                ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
        ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
        ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
        ubifs_assert(!c->ro_media && !c->ro_mount);
+        ubifs_assert(!c->space_fixup);
        if (c->ro_error)
                return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3a..cef0460f4c54 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
 out_release:
        release_head(c, BASEHD);
+        kfree(dent);
 out_ro:
        ubifs_ro_mode(c, err);
        if (last_reference)
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a8..a5422fffbd69 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
                if (IS_ERR(sleb)) {
                        if (PTR_ERR(sleb) == -EUCLEAN)
                                sleb = ubifs_recover_leb(c, lnum, 0,
-                                                         c->sbuf, 0);
+                                                         c->sbuf, -1);
                        if (IS_ERR(sleb)) {
                                err = PTR_ERR(sleb);
                                break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b50..783d8e0beb76 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 }
 /**
- * drop_last_node - drop the last node or group of nodes.
+ * drop_last_group - drop the last group of nodes.
 * @sleb: scanned LEB information
 * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
 *
 * This is a helper function for 'ubifs_recover_leb()' which drops the last
- * node of the scanned LEB or the last group of nodes if @grouped is not zero.
+ * group of nodes of the scanned LEB.
- * This function returns %1 if a node was dropped and %0 otherwise.
 */
-static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
+static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
 {
-        int dropped = 0;
        while (!list_empty(&sleb->nodes)) {
                struct ubifs_scan_node *snod;
                struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
                                  list);
                ch = snod->node;
                if (ch->group_type != UBIFS_IN_NODE_GROUP)
-                        return dropped;
+                        break;
-                dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+                dbg_rcvry("dropping grouped node at %d:%d",
+                          sleb->lnum, snod->offs);
+                *offs = snod->offs;
+                list_del(&snod->list);
+                kfree(snod);
+                sleb->nodes_cnt -= 1;
+        }
+}
+/**
+ * drop_last_node - drop the last node.
+ * @sleb: scanned LEB information
+ * @offs: offset of the dropped node is returned here
+ *
+ * This is a helper function for 'ubifs_recover_leb()' which drops the last
+ * node of the scanned LEB.
+ */
+static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
+{
+        struct ubifs_scan_node *snod;
+        if (!list_empty(&sleb->nodes)) {
+                snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+                                  list);
+                dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
                *offs = snod->offs;
                list_del(&snod->list);
                kfree(snod);
                sleb->nodes_cnt -= 1;
-                dropped = 1;
-                if (!grouped)
-                        break;
        }
-        return dropped;
 }
 /**
@@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 * @lnum: LEB number
 * @offs: offset
 * @sbuf: LEB-sized buffer to use
- * @grouped: nodes may be grouped for recovery
+ * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
+ *         belong to any journal head)
 *
 * This function does a scan of a LEB, but caters for errors that might have
 * been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
 * found, and a negative error code in case of failure.
 */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-                                         int offs, void *sbuf, int grouped)
+                                         int offs, void *sbuf, int jhead)
 {
        int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
+        int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
        struct ubifs_scan_leb *sleb;
        void *buf = sbuf + offs;
-        dbg_rcvry("%d:%d", lnum, offs);
+        dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
        sleb = ubifs_start_scan(c, lnum, offs, sbuf);
        if (IS_ERR(sleb))
@@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                 * Scan quietly until there is an error from which we cannot
                 * recover
                 */
-                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+                ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
                if (ret == SCANNED_A_NODE) {
                        /* A valid node, and not a padding node */
                        struct ubifs_ch *ch = buf;
@@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                 * If nodes are grouped, always drop the incomplete group at
                 * the end.
                 */
-                drop_last_node(sleb, &offs, 1);
+                drop_last_group(sleb, &offs);
-        /*
+        if (jhead == GCHD) {
-         * While we are in the middle of the same min. I/O unit keep dropping
+                /*
-         * nodes. So basically, what we want is to make sure that the last min.
+                 * If this LEB belongs to the GC head then while we are in the
-         * I/O unit where we saw the corruption is dropped completely with all
+                 * middle of the same min. I/O unit keep dropping nodes. So
-         * the uncorrupted node which may possibly sit there.
+                 * basically, what we want is to make sure that the last min.
-         *
+                 * I/O unit where we saw the corruption is dropped completely
-         * In other words, let's name the min. I/O unit where the corruption
+                 * with all the uncorrupted nodes which may possibly sit there.
-         * starts B, and the previous min. I/O unit A. The below code tries to
+                 *
-         * deal with a situation when half of B contains valid nodes or the end
+                 * In other words, let's name the min. I/O unit where the
-         * of a valid node, and the second half of B contains corrupted data or
+                 * corruption starts B, and the previous min. I/O unit A. The
-         * garbage. This means that UBIFS had been writing to B just before the
+                 * below code tries to deal with a situation when half of B
-         * power cut happened. I do not know how realistic is this scenario
+                 * contains valid nodes or the end of a valid node, and the
-         * that half of the min. I/O unit had been written successfully and the
+                 * second half of B contains corrupted data or garbage. This
-         * other half not, but this is possible in our 'failure mode emulation'
+                 * means that UBIFS had been writing to B just before the power
-         * infrastructure at least.
+                 * cut happened. I do not know how realistic is this scenario
-         *
+                 * that half of the min. I/O unit had been written successfully
-         * So what is the problem, why we need to drop those nodes? Whey can't
+                 * and the other half not, but this is possible in our 'failure
-         * we just clean-up the second half of B by putting a padding node
+                 * mode emulation' infrastructure at least.
-         * there? We can, and this works fine with one exception which was
+                 *
-         * reproduced with power cut emulation testing and happens extremely
+                 * So what is the problem, why we need to drop those nodes? Why
-         * rarely. The description follows, but it is worth noting that that is
+                 * can't we just clean-up the second half of B by putting a
-         * only about the GC head, so we could do this trick only if the bud
+                 * padding node there? We can, and this works fine with one
-         * belongs to the GC head, but it does not seem to be worth an
+                 * exception which was reproduced with power cut emulation
-         * additional "if" statement.
+                 * testing and happens extremely rarely.
-         *
+                 *
-         * So, imagine the file-system is full, we run GC which is moving valid
+                 * Imagine the file-system is full, we run GC which starts
-         * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head
+                 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
-         * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X
+                 * the current GC head LEB). The @c->gc_lnum is -1, which means
-         * and will try to continue. Imagine that LEB X is currently the
+                 * that GC will retain LEB X and will try to continue. Imagine
-         * dirtiest LEB, and the amount of used space in LEB Y is exactly the
+                 * that LEB X is currently the dirtiest LEB, and the amount of
-         * same as amount of free space in LEB X.
+                 * used space in LEB Y is exactly the same as amount of free
-         *
+                 * space in LEB X.
-         * And a power cut happens when nodes are moved from LEB X to LEB Y. We
+                 *
-         * are here trying to recover LEB Y which is the GC head LEB. We find
+                 * And a power cut happens when nodes are moved from LEB X to
-         * the min. I/O unit B as described above. Then we clean-up LEB Y by
+                 * LEB Y. We are here trying to recover LEB Y which is the GC
-         * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function
+                 * head LEB. We find the min. I/O unit B as described above.
-         * fails, because it cannot find a dirty LEB which could be GC'd into
+                 * Then we clean-up LEB Y by padding min. I/O unit. And later
-         * LEB Y! Even LEB X does not match because the amount of valid nodes
+                 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
-         * there does not fit the free space in LEB Y any more! And this is
+                 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
-         * because of the padding node which we added to LEB Y. The
+                 * does not match because the amount of valid nodes there does
-         * user-visible effect of this which I once observed and analysed is
+                 * not fit the free space in LEB Y any more! And this is
-         * that we cannot mount the file-system with -ENOSPC error.
+                 * because of the padding node which we added to LEB Y. The
-         *
+                 * user-visible effect of this which I once observed and
-         * So obviously, to make sure that situation does not happen we should
+                 * analysed is that we cannot mount the file-system with
-         * free min. I/O unit B in LEB Y completely and the last used min. I/O
+                 * -ENOSPC error.
-         * unit in LEB Y should be A. This is basically what the below code
+                 *
-         * tries to do.
+                 * So obviously, to make sure that situation does not happen we
-         */
+                 * should free min. I/O unit B in LEB Y completely and the last
-        while (min_io_unit == round_down(offs, c->min_io_size) &&
+                 * used min. I/O unit in LEB Y should be A. This is basically
-               min_io_unit != offs &&
+                 * what the below code tries to do.
-               drop_last_node(sleb, &offs, grouped));
+                 */
+                while (offs > min_io_unit)
+                        drop_last_node(sleb, &offs);
+        }
        buf = sbuf + offs;
        len = c->leb_size - offs;
@@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
                }
                ubifs_scan_destroy(sleb);
        }
-        return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+        return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
 }
 /**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d1679..5e97161ce4d3 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
                 * these LEBs could possibly be written to at the power cut
                 * time.
                 */
-                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf,
+                sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
-                                         b->bud->jhead != GCHD);
        else
                sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
        if (IS_ERR(sleb))
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c003236..9e1d05666fed 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,13 +277,18 @@ static int kick_a_thread(void)
        return 0;
 }
-int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 {
+        int nr = sc->nr_to_scan;
        int freed, contention = 0;
        long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
        if (nr == 0)
-                return clean_zn_cnt;
+                /*
+                 * Due to the way UBIFS updates the clean znode counter it may
+                 * temporarily be negative.
+                 */
+                return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
        if (!clean_zn_cnt) {
                /*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f74..b5aeb5a8ebed 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
        end_writeback(inode);
 }
-static void ubifs_dirty_inode(struct inode *inode)
+static void ubifs_dirty_inode(struct inode *inode, int flags)
 {
        struct ubifs_inode *ui = ubifs_inode(inode);
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
                c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
                c->jheads[i].wbuf.jhead = i;
+                c->jheads[i].grouped = 1;
        }
        c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
        /*
         * Garbage Collector head likely contains long-term data and
-         * does not need to be synchronized by timer.
+         * does not need to be synchronized by timer. Also GC head nodes are
+         * not grouped.
         */
        c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
        c->jheads[GCHD].wbuf.no_timer = 1;
+        c->jheads[GCHD].grouped = 0;
        return 0;
 }
@@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c)
        if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
                ubifs_msg("recovery needed");
                c->need_recovery = 1;
-                if (!c->ro_mount) {
+        }
-                        err = ubifs_recover_inl_heads(c, c->sbuf);
-                        if (err)
+        if (c->need_recovery && !c->ro_mount) {
-                                goto out_master;
+                err = ubifs_recover_inl_heads(c, c->sbuf);
-                }
+                if (err)
-        } else if (!c->ro_mount) {
+                        goto out_master;
+        }
+        err = ubifs_lpt_init(c, 1, !c->ro_mount);
+        if (err)
+                goto out_master;
+        if (!c->ro_mount && c->space_fixup) {
+                err = ubifs_fixup_free_space(c);
+                if (err)
+                        goto out_master;
+        }
+        if (!c->ro_mount) {
                /*
                 * Set the "dirty" flag so that if we reboot uncleanly we
                 * will notice this immediately on the next mount.
@@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c)
                c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
                err = ubifs_write_master(c);
                if (err)
-                        goto out_master;
+                        goto out_lpt;
        }
-        err = ubifs_lpt_init(c, 1, !c->ro_mount);
-        if (err)
-                goto out_lpt;
        err = dbg_check_idx_size(c, c->bi.old_idx_sz);
        if (err)
                goto out_lpt;
@@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
        } else
                ubifs_assert(c->lst.taken_empty_lebs > 0);
-        if (!c->ro_mount && c->space_fixup) {
-                err = ubifs_fixup_free_space(c);
-                if (err)
-                        goto out_infos;
-        }
        err = dbg_check_filesystem(c);
        if (err)
                goto out_infos;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d94..91b4213dde84 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
 */
 void ubifs_tnc_close(struct ubifs_info *c)
 {
-        long clean_freed;
        tnc_destroy_cnext(c);
        if (c->zroot.znode) {
-                clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+                long n;
-                atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+                ubifs_destroy_tnc_subtree(c->zroot.znode);
+                n = atomic_long_read(&c->clean_zn_cnt);
+                atomic_long_sub(n, &ubifs_clean_zn_cnt);
        }
        kfree(c->gap_lebs);
        kfree(c->ilebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f0..f79983d6f860 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
 * struct ubifs_jhead - journal head.
 * @wbuf: head's write-buffer
 * @buds_list: list of bud LEBs belonging to this journal head
+ * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
 *
 * Note, the @buds list is protected by the @c->buds_lock.
 */
 struct ubifs_jhead {
        struct ubifs_wbuf wbuf;
        struct list_head buds_list;
+        unsigned int grouped:1;
 };
 /**
@@ -1614,7 +1616,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
 /* commit.c */
 int ubifs_bg_thread(void *info);
@@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
 int ubifs_recover_master_node(struct ubifs_info *c);
 int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
-                                         int offs, void *sbuf, int grouped);
+                                         int offs, void *sbuf, int jhead);
 struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
                                             int offs, void *sbuf);
 int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8f..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
        struct fileIdentDesc *fi, cfi;
        struct kernel_lb_addr tloc;
-        dentry_unhash(dentry);
        retval = -ENOENT;
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
        if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct kernel_lb_addr tloc;
        struct udf_inode_info *old_iinfo = UDF_I(old_inode);
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
        if (ofi) {
                if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf7..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
        struct inode * inode = dentry->d_inode;
        int err= -ENOTEMPTY;
-        dentry_unhash(dentry);
        lock_ufs(dir->i_sb);
        if (ufs_empty_dir (inode)) {
                err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct ufs_dir_entry *old_de;
        int err = -ENOENT;
-        if (new_inode && S_ISDIR(new_inode->i_mode))
-                dentry_unhash(new_dentry);
        old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
        if (!old_de)
                goto out;
diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974dea..f060663ab70c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
                return 0;
        /*
-         * The trusted.* namespace can only be accessed by a privileged user.
+         * The trusted.* namespace can only be accessed by privileged users.
         */
-        if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
+        if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
-                return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM);
+                if (!capable(CAP_SYS_ADMIN))
+                        return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
+                return 0;
+        }
-        /* In user.* namespace, only regular files and directories can have
+        /*
+         * In the user.* namespace, only regular files and directories can have
         * extended attributes. For sticky directories, only the owner and
-         * privileged user can write attributes.
+         * privileged users can write attributes.
         */
        if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
-                        return -EPERM;
+                        return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
                if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
                    (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
                        return -EPERM;
@@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
 {
        struct inode *inode = dentry->d_inode;
        int error = -EOPNOTSUPP;
+        int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
+                                   XATTR_SECURITY_PREFIX_LEN);
+        if (issec)
+                inode->i_flags &= ~S_NOSEC;
        if (inode->i_op->setxattr) {
                error = inode->i_op->setxattr(dentry, name, value, size, flags);
                if (!error) {
@@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
                        security_inode_post_setxattr(dentry, name, value,
                                                     size, flags);
                }
-        } else if (!strncmp(name, XATTR_SECURITY_PREFIX,
+        } else if (issec) {
-                                XATTR_SECURITY_PREFIX_LEN)) {
                const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
                error = security_inode_setsecurity(inode, suffix, value,
                                                   size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf1..1e3a7ce804dc 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
 */
 STATIC void
 xfs_fs_dirty_inode(
-        struct inode    *inode)
+        struct inode    *inode,
+        int             flags)
 {
        barrier();
        XFS_I(inode)->i_update_core = 1;
author	Tony Lindgren <tony@atomide.com>	2011-06-13 10:40:25 -0400
committer	Tony Lindgren <tony@atomide.com>	2011-06-13 10:40:25 -0400
commit	c8e0bf95fc01d6e2ca585fe08010800b6c56e823 (patch)
tree	f901bdcb5b20e93261cf9cf324ebbcf3fd24ce58 /fs
parent	9d5ae7cd6cb9ead43336fec1094184d1dc740fbd (diff)
parent	345f79b3de7f6d651e4dba794af7c7303bdfd649 (diff)